In [33]:
import pandas as pd
import sklearn
from sklearn.preprocessing import StandardScaler

#models
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression

##### We will use last month records for validation

In [29]:
train = pd.read_csv("data/train_preprocessed.csv")

val_index = train['ts'] > '2022-11-30'
validation = train[val_index].copy()
train = train[~val_index].copy()

print("Validation percent: ", validation.shape[0] / train.shape[0])

Validation percent:  0.2286883903717046


In [30]:
X = train.drop(['user_id', 'ts'], axis=1)
y = train['user_id']

X_val = validation.drop(['user_id', 'ts'], axis=1)
y_val = validation['user_id']

##### Also we will scale everything

In [32]:
scaler_X = StandardScaler()
scaler_X_val = StandardScaler()

X = pd.DataFrame(scaler_X.fit_transform(X), columns=X.columns)
X_val = pd.DataFrame(scaler_X_val.fit_transform(X_val), columns=X_val.columns)

##### Try various models with different hyperparameters

In [42]:

# Деревья уходили в переобучения. И ничего хорошего не выходило
model = KNeighborsClassifier(n_neighbors=10, metric='cosine') #104 14
#model = LogisticRegression()
#model = CatBoostClassifier(learning_rate=0.001, depth=6, l2_leaf_reg=3, iterations=1000)
#model = RandomForestClassifier(n_estimators=100, class_weight='balanced', max_depth=21, criterion='entropy')
clf = model.fit(X, y)

y_pred = clf.predict(X)

y_val_pred = clf.predict(X_val)


##### Evaluate quality

In [36]:
# Делаем валидацию нашей модель

# Создаем новые датасеты
y_word = pd.DataFrame()
y_val_word = pd.DataFrame()


# Что типа id в тестовой выборке
y_word['word'] = 'user_' + y.astype(str)
y_val_word['word'] = 'user_' + y_val.astype(str) 

# Целевая переменная ground truth
y_word['true'] = y
y_val_word['true'] = y_val

# Добавим предсказания
y_word['preds'] = y_pred
y_val_word['preds'] = y_val_pred



# Делаем датасеты с предсказаниями
y_pred_word = pd.DataFrame(y_word.groupby('word')[['true','preds']].agg(lambda x: x.value_counts().index[0]))
y_val_pred_word = pd.DataFrame(y_val_word.groupby('word')[['true','preds']].agg(lambda x: x.value_counts().index[0]))

# Добавляем сравнение с ground truth
y_pred_word['comp'] = y_pred_word['preds'] == y_pred_word['true']
y_val_pred_word['comp'] = y_val_pred_word['preds'] == y_val_pred_word['true']


# Веса юзеров мы не знаем, давайте возьмем равные веса для простоты = 1. Можно и не брать пролли
y_pred_word['norm'] = 1
y_val_pred_word['norm'] = 1


In [37]:
# Evaluate on train

true_answers = (y_pred_word['comp'] * y_pred_word['norm']).sum()
total_answers = y_pred_word['norm'].sum()
precent_true = round((true_answers/total_answers)*100, 1)

print('Accuracy train2', true_answers, total_answers, precent_true)

Оценка train2 50 50 100.0


In [39]:
# Evaluate on val

true_answers_val = (y_val_pred_word['comp'] * y_val_pred_word['norm']).sum()
total_answers_val = y_val_pred_word['norm'].sum()
precent_true_val = round((true_answers_val/total_answers_val)*100, 1)

print('Accuracy val', true_answers_val, total_answers_val, precent_true_val)

Accuracy val 34 43 79.1
