**Задача**: на основе накопленных данных идентифицировать посетителя по характерному времени его прохода на территорию организации, исключив вероятность передачи пропуска одним сотрудником другому. Таким образом, необходимо исключить ситуации, когда один сотрудник прикладывает пропуск за другого, который даже не появляется в организации, но по цифровому следу присутствует на её территории.

## EDA

In [1]:
!pip install -q catboost

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import (
    date,
    datetime)
from dateutil import relativedelta
from tqdm.notebook import tqdm
tqdm.pandas()

from sklearn.preprocessing import (
    MinMaxScaler,
    PolynomialFeatures)
from sklearn.feature_selection import (
    SelectKBest,
    f_regression,
    chi2)
from sklearn.model_selection import (
    GridSearchCV,
    RandomizedSearchCV,
    StratifiedKFold)
from sklearn import set_config
from sklearn.pipeline import make_pipeline
from imblearn.pipeline import Pipeline as imb_pipeline
from sklearn.compose import (
    ColumnTransformer,
    make_column_selector as selector)

from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from catboost import CatBoostClassifier

from sklearn.metrics import (
    roc_auc_score,
    classification_report,
    confusion_matrix,
    ConfusionMatrixDisplay)

set_config(display="diagram")

import warnings
warnings.filterwarnings('ignore')

RANDOM_STATE = 1

In [None]:
! gdown 1-UkWwmStmUN9LftkimdKQ2CmYP-n3RDt
! unzip -o /content/turnstiles.zip

In [None]:
train = pd.read_csv('/content/turnstiles/train.csv', index_col = 0)
test = pd.read_csv('/content/turnstiles/test.csv', index_col = 0)

In [None]:
train.tail()

In [None]:
test.head()

In [None]:
train.shape, test.shape

In [None]:
train.dtypes

In [None]:
test.dtypes

In [None]:
train['ts'] = pd.to_datetime(
    train['ts'],
    format = '%Y-%m-%d %H:%M:%S')

test['ts'] = pd.to_datetime(
    test['ts'],
    format = '%Y-%m-%d %H:%M:%S')

In [None]:
(
    train['ts'].is_monotonic_increasing,
    test['ts'].is_monotonic_increasing
)

In [None]:
# сведения о самой ранней и поздней дате наблюдения
print(f'Самая ранняя: {train.ts.min()}')
print(f'Самая поздняя: {train.ts.max()}')

In [None]:
# Length of the observation period covered by the train dataset.
# The bounds come from the data itself (truncated to midnight so only whole
# calendar days are compared) instead of being retyped by hand.
date_min = train['ts'].min().normalize().to_pydatetime()
date_max = train['ts'].max().normalize().to_pydatetime()
difference = relativedelta.relativedelta(date_max, date_min)
print('Число дней наблюдения по train')
print(difference.months, 'months', difference.days, 'days')

In [None]:
# сведения о самой ранней и поздней дате наблюдения по test
print(f'Самая ранняя дата наблюдений: {test.ts.min()}')
print(f'Самая поздняя дата наблюдений: {test.ts.max()}')

In [None]:
# Length of the observation period covered by the test dataset.
# The bounds come from the data itself (truncated to midnight so only whole
# calendar days are compared) instead of being retyped by hand.
date_min = test['ts'].min().normalize().to_pydatetime()
date_max = test['ts'].max().normalize().to_pydatetime()
difference = relativedelta.relativedelta(date_max, date_min)
print('Число дней наблюдения по test')
print(difference.months, 'months', difference.days, 'days')

In [None]:
# вычислим уникальные id пунктов и количество пересечений каждого пункта
train_gate = train['gate_id'].value_counts(dropna = False).sort_index()
test_gate = test['gate_id'].value_counts(dropna = False).sort_index()

In [None]:
test_gate

In [None]:
train_gate

In [None]:
gate_only_in_train = ~np.isin(train_gate.index, test_gate.index)
gate_only_in_test = ~np.isin(test_gate.index, train_gate.index)

In [None]:
test_gate[gate_only_in_test].index

In [None]:
train_gate[gate_only_in_train].index

In [None]:
#id пропускных пунктов на удаление
list_gate_id_del = [-1, 0, 1, 2, 14, 16]

In [None]:
def del_gate(data, number_list):
    """Return ``data`` without the rows whose ``gate_id`` is in ``number_list``.

    Args:
        data: DataFrame that has a ``gate_id`` column.
        number_list: iterable of gate ids to exclude.

    Returns:
        A new DataFrame with the matching rows removed; ``data`` itself is
        left unchanged.
    """
    # A single vectorised mask replaces one string-built query per id, so the
    # frame is scanned once and ids of any type are handled.
    return data[~data['gate_id'].isin(list(number_list))]

In [None]:
train = del_gate(train, list_gate_id_del)

In [None]:
test = del_gate(test, list_gate_id_del)

In [None]:
train['user_id'].value_counts(ascending = False).sort_index()

In [None]:
test['user_word'].value_counts(ascending = False)

In [None]:
train['user_id'].nunique()

In [None]:
test['user_word'].nunique()

In [None]:
train.duplicated().sum()

In [None]:
test.duplicated().sum()

In [None]:
train = train.drop_duplicates()
test = test.drop_duplicates()

In [None]:
train.isna().sum(), test.isna().sum()

## FEATURES

---



In [None]:
train.head(2)

In [None]:
train['day'] = train['ts'].dt.day
train['weekday'] = train['ts'].dt.weekday
train['hour'] = train['ts'].dt.hour
train['minute'] = train['ts'].dt.minute

In [None]:
test['day'] = test['ts'].dt.day
test['weekday'] = test['ts'].dt.weekday
test['hour'] = test['ts'].dt.hour
test['minute'] = test['ts'].dt.minute

In [None]:
train['week_number'] = train['ts'].dt.isocalendar().week
test['week_number'] = test['ts'].dt.isocalendar().week

In [None]:
train['week_number'].value_counts().sort_index().plot(
    grid = True,
    color = 'orangered',
    linewidth = 3)
plt.title('Общее число проходов через турникеты \n (датасет train)')
plt.xlabel('Номер недели')
plt.ylabel('Число пересечений турникетов')
plt.show()

In [None]:
train.loc[train['week_number'] == 30]['weekday'].value_counts().sort_index()

In [None]:
train.loc[train['week_number'] == 30]['day'].unique()

In [None]:
print(
    train.loc[train['week_number'] == 30]['ts'].min(),
    train.loc[train['week_number'] == 30]['ts'].max()
)

In [None]:
train.loc[train['week_number'] == 45]['weekday'].value_counts().sort_index()

In [None]:
train.loc[train['week_number'] == 45]['day'].unique()

In [None]:
print(
    train.loc[train['week_number'] == 45]['ts'].min(),
    train.loc[train['week_number'] == 45]['ts'].max()
)

In [None]:
train = train.astype({'week_number': 'Int64'})

In [None]:
train = train.query('week_number != 45', engine='python')

In [None]:
train.shape

In [None]:
test['week_number'].value_counts().sort_index().plot(
    grid = True,
    color = 'darkcyan',
    linewidth = 3)
plt.title('Общее число проходов через турникеты \n (датасет test)')
plt.xlabel('Номер недели')
plt.ylabel('Число пересечений турникетов')
plt.show()

In [None]:
weeks_del = [6, 7, 8]

In [None]:
# Drop every test row that falls into one of the excluded weeks in a single
# pass instead of re-filtering the whole frame once per week number.
test = test[~test['week_number'].isin(weeks_del)]

In [None]:
test.shape

In [None]:
train = train.drop(columns = 'ts')
test = test.drop(columns = 'ts')

In [None]:
train = train.drop(columns = 'week_number')
test = test.drop(columns = 'week_number')

In [None]:
train.head(2)

In [None]:
np.sort(train['hour'].unique())

In [None]:
np.sort(test['hour'].unique())

In [None]:
night = [0, 1, 3, 22, 23]
morning = [6, 7, 8, 9, 10, 11]
time_day = [12, 13, 14, 15, 16, 17]
evening = [18, 19, 20, 21]

In [None]:
day_hours = [night, morning, time_day, evening]

day_hours_name = ['night', 'morning', 'time_day', 'evening']

In [None]:

# Add one 0/1 column per part of the day (names come from `day_hours_name`,
# hour lists from `day_hours`): 1 when the record's hour is in that list.
# A vectorised membership test replaces the row-by-row apply of a closure
# that was redefined on every iteration.
# NOTE(review): hours 2, 4 and 5 are in none of the lists, so such rows would
# get 0 in every column - confirm these hours never occur in the data.
for part_name, part_hours in zip(day_hours_name, day_hours):
    train[part_name] = train['hour'].isin(part_hours).astype('int64')
    test[part_name] = test['hour'].isin(part_hours).astype('int64')

In [None]:
train.head(7)

In [None]:
temp = pd.DataFrame()

temp['sum'] = train[day_hours_name].sum(axis = 1)
print(temp.sum(axis = 0) / train.shape[0])

del temp

In [None]:
np.sort(train['day'].unique())
np.sort(test['day'].unique())

In [None]:
train['day'].nunique(), test['day'].nunique()

In [None]:
day_list = list(np.sort(train['day'].unique()))
day_list = [int(x) for x in day_list]
type(day_list[2])

In [None]:

# One-hot encode the day of the month: one 0/1 column per day found in train.
# Each column is named after the day VALUE (not its position in `day_list`),
# because the follow-up cell rebuilds the column names from the values; the
# two only coincide when no day is missing from train.
for day_value in day_list:
    day_column = 'day_' + str(day_value)
    train[day_column] = (train['day'] == day_value).astype('int64')
    test[day_column] = (test['day'] == day_value).astype('int64')

In [None]:
train.columns

In [None]:
day_list = ['day_' + str(x) for x in day_list]
type(day_list[2])

In [None]:
temp = pd.DataFrame()

temp['sum'] = train[day_list].sum(axis = 1)
print(temp.sum(axis = 0) / train.shape[0])

del temp

temp = pd.DataFrame()

temp['sum'] = test[day_list].sum(axis = 1)
print(temp.sum(axis = 0) / test.shape[0])

del temp

In [None]:
list_weekday = np.sort(train['weekday'].unique())
list_weekday = [int(x) for x in list_weekday]
list_weekday

In [None]:
train['weekday'].value_counts().sort_index()

In [None]:
test['weekday'].value_counts().sort_index()

In [None]:
# One-hot encode the weekday: one 0/1 column per weekday found in train.
# Each column is named after the weekday VALUE (not its position in
# `list_weekday`), because the follow-up cell rebuilds the names from the
# values; the two only coincide when no weekday is missing from train.
for weekday_value in list_weekday:
    weekday_column = 'weekday_' + str(weekday_value)
    train[weekday_column] = (train['weekday'] == weekday_value).astype('int64')
    test[weekday_column] = (test['weekday'] == weekday_value).astype('int64')

In [None]:
train.columns

In [None]:
list_weekday = ['weekday_' + str(x) for x in list_weekday]
type(list_weekday[2])

In [None]:
temp = pd.DataFrame()

temp['sum'] = train[list_weekday].sum(axis = 1)
print(temp.sum(axis = 0) / train.shape[0])

del temp

temp = pd.DataFrame()

temp['sum'] = test[list_weekday].sum(axis = 1)
print(temp.sum(axis = 0) / test.shape[0])

del temp

In [None]:
list_hours_train = np.sort(train['hour'].unique())
list_hours_train = [int(x) for x in list_hours_train]
list_hours_train

In [None]:
list_hours_test = np.sort(test['hour'].unique())
list_hours_test = [int(x) for x in list_hours_test]
list_hours_test

In [None]:
# Union of the hours seen in train or test. Sorting makes the order of the
# generated hour_* columns deterministic (a plain set gives no ordering
# guarantee); both inputs already hold Python ints, so no conversion is needed.
common_list_hours = sorted(set(list_hours_train) | set(list_hours_test))
common_list_hours

In [None]:
# One-hot encode the hour of the pass: one 0/1 column per hour that occurs in
# either dataset, named `hour_<value>`.
for hour_value in common_list_hours:
    hour_column = 'hour_' + str(hour_value)
    train[hour_column] = (train['hour'] == hour_value).astype('int64')
    test[hour_column] = (test['hour'] == hour_value).astype('int64')

In [None]:
train.columns

In [None]:
len(test.columns) == len(train.columns)

In [None]:
common_list_hours = ['hour_' + str(x) for x in common_list_hours]
type(common_list_hours[2])

In [None]:
set(list_hours_test) - set(list_hours_train)

In [None]:
set(list_hours_train) - set(list_hours_test)

In [None]:
temp = pd.DataFrame()

temp['sum'] = train[common_list_hours].sum(axis = 1)
print(temp.sum(axis = 0) / train.shape[0])

del temp

temp = pd.DataFrame()

temp['sum'] = test[common_list_hours].sum(axis = 1)
print(temp.sum(axis = 0) / test.shape[0])

del temp

In [None]:
gate_id_train = list(np.sort(train['gate_id'].unique()))
gate_id_train

In [None]:
gate_id_test = list(np.sort(test['gate_id'].unique()))
gate_id_test

In [None]:
# Union of the gate ids seen in train or test as plain Python ints. Sorting
# makes the order of the generated gate_id_* columns deterministic.
common_gate_list = sorted({int(gate) for gate in gate_id_train + gate_id_test})
print(common_gate_list)
# Inspect the first element so the type check also works when fewer than six
# gates remain.
print(type(common_gate_list[0]))

In [None]:
# One-hot encode the gate: one 0/1 column per gate id that occurs in either
# dataset, named `gate_id_<value>`.
for gate_value in common_gate_list:
    gate_column = 'gate_id_' + str(gate_value)
    train[gate_column] = (train['gate_id'] == gate_value).astype('int64')
    test[gate_column] = (test['gate_id'] == gate_value).astype('int64')

In [None]:
train.columns

In [None]:
len(test.columns) == len(train.columns)

In [None]:
common_gate_list = ['gate_id_' + str(x) for x in common_gate_list]
type(common_gate_list[2])

In [None]:
temp = pd.DataFrame()

temp['sum'] = train[common_gate_list].sum(axis = 1)
print(temp.sum(axis = 0) / train.shape[0])

del temp

temp = pd.DataFrame()

temp['sum'] = test[common_gate_list].sum(axis = 1)
print(temp.sum(axis = 0) / test.shape[0])

del temp

In [None]:
working_days = [0, 1, 2, 3, 4]
weekends = [5, 6]

In [None]:
week_days = [working_days, weekends]

week_days_name = ['working_days', 'weekends']

In [None]:

# Add one 0/1 column per group in `week_days` (named by `week_days_name`):
# 1 when the record's weekday belongs to that group.
# A vectorised membership test replaces the row-by-row apply of a closure
# that was redefined on every iteration.
for group_name, group_days in zip(week_days_name, week_days):
    train[group_name] = train['weekday'].isin(group_days).astype('int64')
    test[group_name] = test['weekday'].isin(group_days).astype('int64')

In [None]:
temp = pd.DataFrame()

temp['sum'] = train[week_days_name].sum(axis = 1)
print(temp.sum(axis = 0) / train.shape[0])

del temp

temp = pd.DataFrame()

temp['sum'] = test[week_days_name].sum(axis = 1)
print(temp.sum(axis = 0) / test.shape[0])

del temp

In [None]:
train['minute'].nunique() == test['minute'].nunique()

In [None]:
first_quarter = list(range(0, 15))
second_quarter = list(range(15, 30))
third_quarter = list(range(30, 45))
fourth_quarter = list(range(45, 60))

In [None]:
hour_minutes = [
    first_quarter, second_quarter,
    third_quarter, fourth_quarter
]

hour_minutes_names = [
    'first_quarter', 'second_quarter',
    'third_quarter', 'fourth_quarter'
]

In [None]:
# Add one 0/1 column per quarter of the hour (named by `hour_minutes_names`):
# 1 when the record's minute falls into that quarter's minute list.
# A vectorised membership test replaces the row-by-row apply of a closure
# that was redefined on every iteration.
for quarter_name, quarter_minutes in zip(hour_minutes_names, hour_minutes):
    train[quarter_name] = train['minute'].isin(quarter_minutes).astype('int64')
    test[quarter_name] = test['minute'].isin(quarter_minutes).astype('int64')

In [None]:
train.columns

In [None]:
len(test.columns) == len(train.columns)

In [None]:
temp = pd.DataFrame()

temp['sum'] = train[hour_minutes_names].sum(axis = 1)
print(temp.sum(axis = 0) / train.shape[0])

del temp

temp = pd.DataFrame()

temp['sum'] = test[hour_minutes_names].sum(axis = 1)
print(temp.sum(axis = 0) / test.shape[0])

del temp

In [None]:
temp_train = pd.DataFrame(train.sum(), columns = ['sum_total'])
temp_train.loc[temp_train['sum_total'] == 0]

In [None]:
temp_test = pd.DataFrame(test.sum(), columns = ['sum_total'])
temp_test.loc[temp_test['sum_total'] == 0]

In [None]:
# Columns whose total is zero in train, followed by those whose total is zero
# in test; every listed column is dropped from both frames further down.
zero_sum_in_train = temp_train.index[temp_train['sum_total'] == 0]
zero_sum_in_test = temp_test.index[temp_test['sum_total'] == 0]
columns_to_del = [*zero_sum_in_train, *zero_sum_in_test]
columns_to_del

In [None]:
train.shape, test.shape

In [None]:
train = train.drop(columns_to_del, axis = 1)
test = test.drop(columns_to_del, axis = 1)

In [None]:
train.shape, test.shape

In [None]:
del temp_train
del temp_test

In [None]:
temp_train_isna = pd.DataFrame(train.isna().sum(), columns = ['isna_sum'])
temp_train_isna['isna_sum'].sum()

In [None]:
temp_test_isna = pd.DataFrame(test.isna().sum(), columns = ['isna_sum'])
temp_test_isna['isna_sum'].sum()

In [None]:
del temp_train_isna
del temp_test_isna

In [None]:
train.corr().style.background_gradient(cmap = 'magma')

In [None]:
columns_to_del = [
    'hour_22',
    'weekday_5',
    'weekends'
]

In [None]:
train.shape, test.shape

In [None]:
train = train.drop(columns = columns_to_del, axis = 1)
test = test.drop(columns = columns_to_del, axis = 1)

In [None]:
train.shape, test.shape

In [None]:
features_train = train.drop(columns = 'user_id')

In [None]:
target = train['user_id']

In [None]:
features_train.shape, target.shape

In [None]:
features_test = test.drop(columns = 'user_word')

In [None]:
user_word = test['user_word']

In [None]:
features_test.shape, user_word.shape

## MODEL
---

###  LogisticRegression  (multi_class='multinomial')

---

In [None]:
features_train_mn = features_train.copy()
features_test_mn = features_test.copy()

In [None]:
scaler_mn = MinMaxScaler()

scaler_mn.fit(features_train_mn)
features_train_mn = scaler_mn.transform(features_train_mn)

In [None]:
features_test_mn = scaler_mn.transform(features_test_mn)

In [None]:
# Multinomial (softmax) logistic regression over all users at once.
# The estimator is used directly: wrapping it in OneVsRestClassifier would
# hand each inner model a binary target, so `multi_class='multinomial'` would
# never take effect and this section would just repeat the OVR one.
lm = LogisticRegression(
    multi_class='multinomial',
    solver='lbfgs',
    random_state = RANDOM_STATE)

In [None]:
lm.fit(features_train_mn, target)

In [None]:
preds_train = lm.predict(features_train_mn)

In [None]:
plt.figure(figsize = (20, 15))

confmatrix = confusion_matrix(
        target,
        preds_train)
f = sns.heatmap(confmatrix,
                annot = True,
                cmap = 'YlOrRd',
                fmt = 'g')

f.set_title('Матрица ошибок (Multinomial)')
f.set_xlabel('Предсказания')
f.set_ylabel('Значения целевого признака')
plt.yticks(rotation = 0)
plt.show()

In [None]:
print(classification_report(
    target,
    preds_train
))

In [None]:
preds = lm.predict(features_test_mn)

### LogisticRegression (multi_class='ovr')

---

In [None]:
features_train_ovr = features_train.copy()
features_test_ovr = features_test.copy()

In [None]:
scaler_ovr = MinMaxScaler()

scaler_ovr.fit(features_train_ovr)
features_train_ovr = scaler_ovr.transform(features_train_ovr)

In [None]:
features_test_ovr = scaler_ovr.transform(features_test_ovr)

In [None]:
# One-vs-rest logistic regression: OneVsRestClassifier fits one binary
# LogisticRegression per class.
# NOTE(review): the inner `multi_class='ovr'` looks redundant, since the
# wrapper already hands each inner model a binary target - confirm.
lm = OneVsRestClassifier(
    LogisticRegression(
        multi_class='ovr',
        solver='lbfgs',
        random_state = RANDOM_STATE))

In [None]:
lm.fit(features_train_ovr, target)

In [None]:
preds_train = lm.predict(features_train_ovr)

In [None]:
plt.figure(figsize = (17, 15))

confmatrix = confusion_matrix(
        target,
        preds_train)
f = sns.heatmap(confmatrix,
                annot = True,
                cmap = 'YlOrRd',
                fmt = 'g')

f.set_title('Матрица ошибок (OVR)')
f.set_xlabel('Предсказания')
f.set_ylabel('Значения целевого признака')
plt.yticks(rotation = 0)
plt.show()

In [None]:
print(classification_report(
    target,
    preds_train
))

In [None]:
preds = lm.predict(features_test_ovr)

In [None]:
submit = pd.DataFrame()

submit['user_word'] = user_word
submit['preds'] = preds

In [None]:
submit.head()

In [None]:
user_word_frequency = pd.DataFrame(
    data = submit['user_word'].value_counts(ascending = False)
)
user_word_frequency.columns = ['quantity']
user_word_frequency

In [None]:
# `user_word` labels that occur fewer than 51 times in the test predictions.
# A boolean mask selects them by label; the previous `series[i]` lookup relied
# on integer-position fallback of `[]` on a label-indexed Series, which is
# deprecated in pandas 2.x.
less_than_51 = user_word_frequency.index[
    user_word_frequency['quantity'] < 51
].tolist()

In [None]:
less_than_51.sort()
less_than_51

In [None]:
submit['preds'].unique()

In [None]:
submit['preds'].nunique()

In [None]:
submit_file = pd.DataFrame(
    submit.groupby('user_word')['preds'].agg(
        lambda x: x.value_counts().index[0])
    )

In [None]:
# Replace the prediction with -999 for every `user_word` listed in
# `less_than_51` and keep the majority prediction for all others.
# NOTE(review): -999 presumably marks "too few records to identify" - confirm
# against the expected submission format.
# Selection is by index label; the previous `series[i]` lookup relied on the
# deprecated integer-position fallback of `[]` on a label-indexed Series.
submit_file['new_preds'] = submit_file['preds'].mask(
    submit_file.index.isin(less_than_51),
    -999)

In [None]:
submit_file = submit_file.drop(columns = ['preds'])

submit_file.columns = ['preds']

In [None]:
submit_file

In [None]:
submit_file['preds'].nunique()

In [None]:
submit_file.to_csv('submit_file.csv')

In [None]:
data = pd.read_csv('/content/submit_file.csv')

In [None]:
data

## BOOSTING

---

In [None]:
catboost = CatBoostClassifier(
    random_state = RANDOM_STATE,
    logging_level = 'Silent',
    loss_function = 'MultiClassOneVsAll'
)

In [None]:
pipe_catboost = imb_pipeline([
    ('feature_select', SelectKBest(score_func = chi2)),
    ('catboost', catboost)
])

In [None]:
pipe_catboost

In [None]:
pd.DataFrame(pipe_catboost.get_params().keys())

In [None]:
params_catboost = {
    'feature_select__k': [60],
    #'catboost__learning_rate': [0.0001, 0.001, 0.01, 0.1]
}

In [None]:
# Cross-validation splitter for the search. It has to be created here: no
# earlier cell defines `cv`, so the search would fail with a NameError.
# NOTE(review): users with fewer than `n_splits` records cannot be present in
# every fold - confirm the fold count against the rarest class.
cv = StratifiedKFold(
    n_splits = 5,
    shuffle = True,
    random_state = RANDOM_STATE)

# Search over the pipeline parameters. The target (`user_id`) is multiclass,
# so the one-vs-rest ROC AUC scorer is used; plain 'roc_auc' cannot score a
# multiclass target.
grid_catboost = RandomizedSearchCV(
    pipe_catboost,
    params_catboost,
    n_iter = 10,
    scoring = 'roc_auc_ovr',
    verbose = 1,
    cv = cv,
    refit = True,
    random_state = RANDOM_STATE,
    n_jobs = -1)

In [None]:
start = datetime.now()

grid_catboost.fit(features_train, target)

catboost_time_spent = datetime.now() - start
print('Время обучения модели CatBoostClassifier:\n',
      catboost_time_spent)

In [None]:
best_catboost = grid_catboost.best_estimator_
best_catboost

In [None]:
grid_catboost.best_params_

In [None]:
preds_train = best_catboost.predict(features_train)

In [None]:
# Confusion matrix of the CatBoost pipeline on the training data, drawn as an
# annotated heat map.
plt.figure(figsize = (17, 15))

confmatrix = confusion_matrix(
        target,
        preds_train)
f = sns.heatmap(confmatrix,
                annot = True,
                cmap = 'YlOrRd',
                fmt = 'g')

# The title names the model actually plotted here; it used to carry the
# "(OVR)" label copied from the logistic-regression section.
f.set_title('Матрица ошибок (CatBoost)')
f.set_xlabel('Предсказания')
f.set_ylabel('Значения целевого признака')
plt.yticks(rotation = 0)
plt.show()

In [None]:
print(classification_report(
    target,
    preds_train
))

In [None]:
# Predict the user for every test record. The result is flattened to 1-D
# because CatBoost can return multiclass labels as an (n, 1) column vector,
# while the value is stored as a single DataFrame column afterwards.
preds = np.ravel(best_catboost.predict(features_test))

In [None]:
submit = pd.DataFrame()

submit['user_word'] = user_word
submit['preds'] = preds

In [None]:
submit.head()

In [None]:
np.sort(submit['preds'].unique())

In [None]:
submit['preds'].nunique()

In [None]:
submit_file = pd.DataFrame(
    submit.groupby('user_word')['preds'].agg(
        lambda x: x.value_counts().index[0])
    )

In [None]:
submit_file

In [None]:
submit_file['preds'].nunique()

In [None]:
submit_file.to_csv('submit_file.csv')

In [None]:
data = pd.read_csv('/content/submit_file.csv')

In [None]:
data