In [57]:
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.model_selection import TimeSeriesSplit

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import GridSearchCV

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.metrics import precision_score, recall_score, f1_score

In [2]:
def dedupe_df(df, time_delta="00:00:15", user="user_id"):
    dups =  df[df.groupby([user, "gate_id"])["ts"].diff().lt(pd.Timedelta(time_delta))]
    return df.drop(index=dups.index).reset_index(drop=True)

In [3]:
def pivot_group_date_df(df, user='user_id'):
    df_prep = df.copy()
    
    df_prep['date'] = df_prep['ts'].dt.date.values

    df_res = df_prep.set_index([user, 'date', df_prep.groupby([user, 'date']).cumcount()+1]).unstack().sort_index(level=1, axis=1)
    df_res.columns = df_res.columns.map('{0[0]}_{0[1]}'.format)
    df_res.reset_index(inplace=True)

    df_res = df_res.drop(columns=['date'])

    df_res = df_res.sort_values(by='ts_1', ascending=True)
    df_res.reset_index(drop=True, inplace=True)
    
    return df_res

In [4]:
def add_gate_features_df(df):
    df['first_gate_ts'] = df.loc[:, df.columns.str.startswith("ts_")].min(axis=1)
    df['last_gate_ts'] = df.loc[:, df.columns.str.startswith("ts_")].max(axis=1)

    df['duration'] = (df['last_gate_ts'] - df['first_gate_ts']).dt.seconds

    df['num_gates_passed'] = df.loc[:, df.columns.str.startswith('gate_id_')].notna().sum(axis=1)

    df['gate_seq'] = df.loc[:, df.columns.str.startswith('gate_id_')].fillna('').astype(str).apply(' '.join, 1).apply(lambda x: x.strip())

In [5]:
def add_time_features_df(df):
    df['min_1'] = df['first_gate_ts'].dt.minute.values
    df['hour_1'] = df['first_gate_ts'].dt.hour.values

    df['min_2'] = df['last_gate_ts'].dt.minute.values
    df['hour_2'] = df['last_gate_ts'].dt.hour.values
    
    df['day'] = df['first_gate_ts'].dt.day.values
    df['month'] = df['first_gate_ts'].dt.month.values
    df['dayofweek'] = df['first_gate_ts'].dt.dayofweek.values

In [6]:
def add_cv_features_df(train, test, ngr_rng=(1, 3), max_feat=None):
    
    vectorizer = CountVectorizer(ngram_range=ngr_rng, tokenizer=lambda s: s.split(), max_features=max_feat)

    vectorizer.fit(train['gate_seq'])
    vec_cols = vectorizer.get_feature_names_out()

    vec_train = pd.DataFrame(vectorizer.transform(train['gate_seq']).todense(), columns=vec_cols)
    vec_test = pd.DataFrame(vectorizer.transform(test['gate_seq']).todense(), columns=vec_cols)

    train = pd.concat([train, vec_train], axis=1)
    test = pd.concat([test, vec_test], axis=1)

    return train,  test 

In [7]:
def add_ohe_features_df(train, test, ctg_cols):
    
    encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    encoder.fit(train[ctg_cols])

    cols = []
    for ctg in encoder.categories_:
        cols = np.concatenate((cols, ctg))
    
    ohe_train = pd.DataFrame(encoder.transform(train[ctg_cols]), columns=cols)
    ohe_test = pd.DataFrame(encoder.transform(test[ctg_cols]), columns=cols)

    train = pd.concat([train, ohe_train], axis=1)
    test = pd.concat([test, ohe_test], axis=1)

    return train,  test

In [8]:
def prepare_eval_df(y_train, y_val, y_val_pred, y_pred):
#used code from baseline notebook provided by organisers
    user_dict_val = dict() 
    inverse_user_dict_val = dict()
    
    y_val_list = list(y_val.unique())
    for i in range(len(y_val_list)):
        user_dict_val[y_val_list[i]] = 'user_'+str(i)
        inverse_user_dict_val['user_'+str(i)] = y_val_list[i]
        
    user_dict_train = dict() 
    inverse_user_dict_train = dict()
    
    y_train_list = list(y_train.unique())
    for i in range(len(y_train_list)):
        user_dict_train[y_train_list[i]] = 'train_user_'+str(i)
        inverse_user_dict_train['train_user_'+str(i)] = y_train_list[i]
        
    y_val_word = y_val.copy()
    
    for y_val_id in y_val_list:
        y_val_word.loc[y_val == y_val_id] = user_dict_val[y_val_id]
    
    y_word = y_train.copy()

    for y_train_id in y_train_list:
        y_word.loc[y_train == y_train_id] = user_dict_train[y_train_id]

    val_words = pd.DataFrame()
    val_words['word'] = y_val_word
    val_words['true'] = y_val
    val_words['preds'] = y_val_pred

    train_words = pd.DataFrame()
    train_words['word'] = y_word
    train_words['true'] = y_train
    train_words['preds'] = y_pred

    return train_words, user_dict_train, inverse_user_dict_train, val_words, user_dict_val, inverse_user_dict_val


In [9]:
def calc_score(train_words, user_dict_train, inverse_user_dict_train, val_words, user_dict_val, inverse_user_dict_val):
    #used code from baseline notebook provided by organisers
    comp_df = pd.DataFrame(val_words.groupby('word')['preds'].agg(lambda x: x.value_counts().index[0]))
    comp_df_train = pd.DataFrame(train_words.groupby('word')['preds'].agg(lambda x: x.value_counts().index[0]))

    for idx in comp_df.index:
        comp_df.loc[idx, 'true'] = inverse_user_dict_val[idx]   
    comp_df = comp_df.astype(int)

    for idx in comp_df_train.index:
        comp_df_train.loc[idx, 'true'] = inverse_user_dict_train[idx]
    comp_df_train = comp_df_train.astype(int)

    comp_df['comp'] = comp_df['preds'] == comp_df['true']
    comp_df_train['comp'] = comp_df_train['preds'] == comp_df_train['true']

    comp_df['norm'] = 1
    comp_df_train['norm'] = 1

    true_answers = (comp_df['comp'] * comp_df['norm']).sum()
    total_answers = comp_df['norm'].sum()
    precent_true = round((true_answers/total_answers)*100, 1)

    true_answers_train = (comp_df_train['comp'] * comp_df_train['norm']).sum()
    total_answers_train = comp_df_train['norm'].sum()
    precent_true_train = round((true_answers_train/total_answers_train)*100, 1)

    return true_answers, total_answers, precent_true, true_answers_train, total_answers_train, precent_true_train

#### 1. Load the data

In [10]:
train = pd.read_csv('data/train.csv', index_col=0)
test = pd.read_csv('data/test.csv', index_col=0)

In [11]:
train['ts'] = pd.to_datetime(train['ts'])
test['ts'] = pd.to_datetime(test['ts'])

In [16]:
train.head()

Unnamed: 0,user_id,ts,gate_id
0,18,2022-07-29 09:08:54,7
1,18,2022-07-29 09:09:54,9
2,18,2022-07-29 09:09:54,9
3,18,2022-07-29 09:10:06,5
4,18,2022-07-29 09:10:08,5


In [17]:
test.head()

Unnamed: 0,ts,gate_id,user_word
37518,2023-01-03 08:21:00,9,gini
37519,2023-01-03 08:21:00,9,gini
37520,2023-01-03 08:21:18,5,gini
37521,2023-01-03 08:21:19,5,gini
37522,2023-01-03 08:21:39,10,gini


#### 2. Transform the data

In [18]:
print(train.shape)
print(test.shape)

(37518, 3)
(7125, 3)


In [38]:
train_dd = dedupe_df(train, user="user_id")
test_dd = dedupe_df(test, user="user_word")

print(train_dd.shape)
print(test_dd.shape)

(24771, 3)
(4718, 3)


In [39]:
# creating pivoted dataframes, grouping by user_id and date, resulting in dataframe with rows where for a given user_id all gate data within a day is combined 
train_piv = pivot_group_date_df(train, user="user_id")
train_dd_piv = pivot_group_date_df(train_dd, user="user_id")

print(train_piv.shape)
print(train_dd_piv.shape)

(2521, 141)
(2521, 115)


In [40]:
test_piv = pivot_group_date_df(test, user="user_word")
test_dd_piv = pivot_group_date_df(test_dd, user="user_word")

print(test_piv.shape)
print(test_dd_piv.shape)

(538, 113)
(538, 73)


In [41]:
#adding time features + gate sequence
add_gate_features_df(train_piv)
add_gate_features_df(train_dd_piv)

add_time_features_df(train_piv)
add_time_features_df(train_dd_piv)


add_gate_features_df(test_piv)
add_gate_features_df(test_dd_piv)

add_time_features_df(test_piv)
add_time_features_df(test_dd_piv)

In [42]:
train_dd_piv.columns

Index(['user_id', 'gate_id_1', 'ts_1', 'gate_id_2', 'ts_2', 'gate_id_3',
       'ts_3', 'gate_id_4', 'ts_4', 'gate_id_5',
       ...
       'duration', 'num_gates_passed', 'gate_seq', 'min_1', 'hour_1', 'min_2',
       'hour_2', 'day', 'month', 'dayofweek'],
      dtype='object', length=127)

In [43]:
train_dd_piv.head(3)

Unnamed: 0,user_id,gate_id_1,ts_1,gate_id_2,ts_2,gate_id_3,ts_3,gate_id_4,ts_4,gate_id_5,...,duration,num_gates_passed,gate_seq,min_1,hour_1,min_2,hour_2,day,month,dayofweek
0,18,7.0,2022-07-29 09:08:54,9.0,2022-07-29 09:09:54,5.0,2022-07-29 09:10:06,10.0,2022-07-29 09:10:34,11.0,...,34382,38,7.0 9.0 5.0 10.0 11.0 4.0 9.0 5.0 10.0 12.0 11...,8,9,41,18,29,7,4
1,1,7.0,2022-07-29 09:33:16,9.0,2022-07-29 09:34:18,5.0,2022-07-29 09:34:32,10.0,2022-07-29 09:35:00,11.0,...,31935,10,7.0 9.0 5.0 10.0 11.0 4.0 3.0 10.0 11.0 6.0,33,9,25,18,29,7,4
2,3,7.0,2022-07-29 09:40:40,9.0,2022-07-29 09:42:49,5.0,2022-07-29 09:43:01,10.0,2022-07-29 09:43:29,11.0,...,31905,11,7.0 9.0 5.0 10.0 11.0 4.0 7.0 3.0 10.0 11.0 4.0,40,9,32,18,29,7,4


In [44]:
train_cv, test_cv = add_cv_features_df(train_dd_piv, test_dd_piv, ngr_rng=(1, 5), max_feat=100)



In [45]:
print(train_cv.shape)
print(test_cv.shape)

(2521, 227)
(538, 185)


In [46]:
train_fin, test_fin = add_ohe_features_df(train_cv, test_cv, ctg_cols = ['gate_id_1', 'hour_1', 'hour_2', 'dayofweek'])

In [47]:
print(train_fin.shape)
print(test_fin.shape)

(2521, 276)
(538, 234)


#### 3. Preparing train, val, test sets

In [48]:
val_index = train_fin['ts_1'] > '2022-11-30'

In [49]:
train_df = train_fin[~val_index].copy()
val_df = train_fin[val_index].copy()
test_df = test_fin.copy()

y_train = train_df['user_id'].astype(int)
y_val = val_df['user_id'].astype(int)

In [50]:
print(train_df.shape)
print(val_df.shape)
print(test_df.shape)

(1982, 276)
(539, 276)
(538, 234)


In [51]:
print(train_df.columns[115:130])

Index(['first_gate_ts', 'last_gate_ts', 'duration', 'num_gates_passed',
       'gate_seq', 'min_1', 'hour_1', 'min_2', 'hour_2', 'day', 'month',
       'dayofweek', '10.0', '10.0 10.0', '10.0 11.0'],
      dtype='object')


In [52]:
train_cols = []

for c in train_df.columns[127:]:
    train_cols.append(c)

for c in train_df.columns[117:119]:
    train_cols.append(c)

#train_cols

In [53]:
train_df.loc[:, train_cols]

Unnamed: 0,10.0,10.0 10.0,10.0 11.0,10.0 11.0 4.0,10.0 11.0 4.0 3.0,10.0 11.0 4.0 3.0 10.0,10.0 11.0 4.0 5.0,10.0 11.0 4.0 7.0,10.0 11.0 4.0 7.0 3.0,10.0 11.0 4.0 7.0 5.0,...,3.0,3.0.1,4.0,4.0.1,5.0,5.0.1,6.0,6.0.1,duration,num_gates_passed
0,6,0,1,1,0,0,0,0,0,0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,34382,38
1,2,0,2,1,1,1,0,0,0,0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,31935,10
2,2,0,2,2,0,0,0,1,1,0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,31905,11
3,4,0,1,1,0,0,0,1,1,0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,31705,27
4,1,0,1,1,0,0,0,0,0,0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,32560,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1977,1,0,1,1,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,28678,7
1978,1,0,1,1,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,29695,5
1979,2,0,2,1,0,0,0,1,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,13920,11
1980,1,0,1,1,0,0,0,0,0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9397,4


In [54]:
X_train = train_df[train_cols].astype(int)
X_val = val_df[train_cols].astype(int)

X_train.columns = X_train.columns.astype(str)
X_val.columns = X_val.columns.astype(str)

In [55]:
scaler_X = StandardScaler()
scaler_X_val = StandardScaler()

scaler_X.fit(X_train)
scaler_X_val.fit(X_val)

X_scaled = scaler_X.transform(X_train)
X_val_scaled = scaler_X_val.transform(X_val)

In [59]:
clf = LogisticRegression()
clf_param_grid = {
    'C': [0.01, 0.1, 1, 10],
    'penalty': ['l1', 'l2'],
    'class_weight': ['balanced'],
    #'max_iter': list(range(500, 1000, 100)),
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'saga']
}
param_search = GridSearchCV(clf, param_grid=clf_param_grid, refit=True, verbose=2, cv=5)

# fitting the model for grid search 
param_search.fit(X_scaled, y_train)

Fitting 5 folds for each of 32 candidates, totalling 160 fits
[CV] END C=0.01, class_weight=balanced, penalty=l1, solver=newton-cg; total time=   0.0s
[CV] END C=0.01, class_weight=balanced, penalty=l1, solver=newton-cg; total time=   0.0s
[CV] END C=0.01, class_weight=balanced, penalty=l1, solver=newton-cg; total time=   0.0s
[CV] END C=0.01, class_weight=balanced, penalty=l1, solver=newton-cg; total time=   0.0s
[CV] END C=0.01, class_weight=balanced, penalty=l1, solver=newton-cg; total time=   0.0s
[CV] END C=0.01, class_weight=balanced, penalty=l1, solver=lbfgs; total time=   0.0s
[CV] END C=0.01, class_weight=balanced, penalty=l1, solver=lbfgs; total time=   0.0s
[CV] END C=0.01, class_weight=balanced, penalty=l1, solver=lbfgs; total time=   0.0s
[CV] END C=0.01, class_weight=balanced, penalty=l1, solver=lbfgs; total time=   0.0s
[CV] END C=0.01, class_weight=balanced, penalty=l1, solver=lbfgs; total time=   0.0s




[CV] END C=0.01, class_weight=balanced, penalty=l1, solver=liblinear; total time=   0.2s
[CV] END C=0.01, class_weight=balanced, penalty=l1, solver=liblinear; total time=   0.1s
[CV] END C=0.01, class_weight=balanced, penalty=l1, solver=liblinear; total time=   0.2s
[CV] END C=0.01, class_weight=balanced, penalty=l1, solver=liblinear; total time=   0.2s
[CV] END C=0.01, class_weight=balanced, penalty=l1, solver=liblinear; total time=   0.2s




[CV] END C=0.01, class_weight=balanced, penalty=l1, solver=saga; total time=   5.6s




[CV] END C=0.01, class_weight=balanced, penalty=l1, solver=saga; total time=   5.6s




[CV] END C=0.01, class_weight=balanced, penalty=l1, solver=saga; total time=   5.5s




[CV] END C=0.01, class_weight=balanced, penalty=l1, solver=saga; total time=   5.8s




[CV] END C=0.01, class_weight=balanced, penalty=l1, solver=saga; total time=   5.7s
[CV] END C=0.01, class_weight=balanced, penalty=l2, solver=newton-cg; total time=   1.2s
[CV] END C=0.01, class_weight=balanced, penalty=l2, solver=newton-cg; total time=   1.6s
[CV] END C=0.01, class_weight=balanced, penalty=l2, solver=newton-cg; total time=   0.7s
[CV] END C=0.01, class_weight=balanced, penalty=l2, solver=newton-cg; total time=   1.8s
[CV] END C=0.01, class_weight=balanced, penalty=l2, solver=newton-cg; total time=   1.6s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV] END C=0.01, class_weight=balanced, penalty=l2, solver=lbfgs; total time=   0.5s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV] END C=0.01, class_weight=balanced, penalty=l2, solver=lbfgs; total time=   0.5s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV] END C=0.01, class_weight=balanced, penalty=l2, solver=lbfgs; total time=   0.5s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV] END C=0.01, class_weight=balanced, penalty=l2, solver=lbfgs; total time=   0.5s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV] END C=0.01, class_weight=balanced, penalty=l2, solver=lbfgs; total time=   0.5s
[CV] END C=0.01, class_weight=balanced, penalty=l2, solver=liblinear; total time=   0.9s
[CV] END C=0.01, class_weight=balanced, penalty=l2, solver=liblinear; total time=   0.8s
[CV] END C=0.01, class_weight=balanced, penalty=l2, solver=liblinear; total time=   0.8s
[CV] END C=0.01, class_weight=balanced, penalty=l2, solver=liblinear; total time=   0.9s
[CV] END C=0.01, class_weight=balanced, penalty=l2, solver=liblinear; total time=   0.9s




[CV] END C=0.01, class_weight=balanced, penalty=l2, solver=saga; total time=   3.5s




[CV] END C=0.01, class_weight=balanced, penalty=l2, solver=saga; total time=   3.5s




[CV] END C=0.01, class_weight=balanced, penalty=l2, solver=saga; total time=   3.5s




[CV] END C=0.01, class_weight=balanced, penalty=l2, solver=saga; total time=   3.5s




[CV] END C=0.01, class_weight=balanced, penalty=l2, solver=saga; total time=   3.5s
[CV] END C=0.1, class_weight=balanced, penalty=l1, solver=newton-cg; total time=   0.0s
[CV] END C=0.1, class_weight=balanced, penalty=l1, solver=newton-cg; total time=   0.0s
[CV] END C=0.1, class_weight=balanced, penalty=l1, solver=newton-cg; total time=   0.0s
[CV] END C=0.1, class_weight=balanced, penalty=l1, solver=newton-cg; total time=   0.0s
[CV] END C=0.1, class_weight=balanced, penalty=l1, solver=newton-cg; total time=   0.0s
[CV] END C=0.1, class_weight=balanced, penalty=l1, solver=lbfgs; total time=   0.0s
[CV] END C=0.1, class_weight=balanced, penalty=l1, solver=lbfgs; total time=   0.0s
[CV] END C=0.1, class_weight=balanced, penalty=l1, solver=lbfgs; total time=   0.0s
[CV] END C=0.1, class_weight=balanced, penalty=l1, solver=lbfgs; total time=   0.0s
[CV] END C=0.1, class_weight=balanced, penalty=l1, solver=lbfgs; total time=   0.0s
[CV] END C=0.1, class_weight=balanced, penalty=l1, solve



[CV] END C=0.1, class_weight=balanced, penalty=l1, solver=saga; total time=   6.2s




[CV] END C=0.1, class_weight=balanced, penalty=l1, solver=saga; total time=   6.2s




[CV] END C=0.1, class_weight=balanced, penalty=l1, solver=saga; total time=   6.2s




[CV] END C=0.1, class_weight=balanced, penalty=l1, solver=saga; total time=   6.3s




[CV] END C=0.1, class_weight=balanced, penalty=l1, solver=saga; total time=   6.4s
[CV] END C=0.1, class_weight=balanced, penalty=l2, solver=newton-cg; total time=   3.2s
[CV] END C=0.1, class_weight=balanced, penalty=l2, solver=newton-cg; total time=   2.9s
[CV] END C=0.1, class_weight=balanced, penalty=l2, solver=newton-cg; total time=   2.2s
[CV] END C=0.1, class_weight=balanced, penalty=l2, solver=newton-cg; total time=   3.8s
[CV] END C=0.1, class_weight=balanced, penalty=l2, solver=newton-cg; total time=   2.3s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV] END C=0.1, class_weight=balanced, penalty=l2, solver=lbfgs; total time=   0.5s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV] END C=0.1, class_weight=balanced, penalty=l2, solver=lbfgs; total time=   0.5s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV] END C=0.1, class_weight=balanced, penalty=l2, solver=lbfgs; total time=   0.5s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV] END C=0.1, class_weight=balanced, penalty=l2, solver=lbfgs; total time=   0.5s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV] END C=0.1, class_weight=balanced, penalty=l2, solver=lbfgs; total time=   0.5s
[CV] END C=0.1, class_weight=balanced, penalty=l2, solver=liblinear; total time=   1.6s
[CV] END C=0.1, class_weight=balanced, penalty=l2, solver=liblinear; total time=   1.5s
[CV] END C=0.1, class_weight=balanced, penalty=l2, solver=liblinear; total time=   1.5s
[CV] END C=0.1, class_weight=balanced, penalty=l2, solver=liblinear; total time=   1.5s
[CV] END C=0.1, class_weight=balanced, penalty=l2, solver=liblinear; total time=   1.5s




[CV] END C=0.1, class_weight=balanced, penalty=l2, solver=saga; total time=   3.5s




[CV] END C=0.1, class_weight=balanced, penalty=l2, solver=saga; total time=   3.4s




[CV] END C=0.1, class_weight=balanced, penalty=l2, solver=saga; total time=   3.4s




[CV] END C=0.1, class_weight=balanced, penalty=l2, solver=saga; total time=   3.5s




[CV] END C=0.1, class_weight=balanced, penalty=l2, solver=saga; total time=   3.6s
[CV] END C=1, class_weight=balanced, penalty=l1, solver=newton-cg; total time=   0.0s
[CV] END C=1, class_weight=balanced, penalty=l1, solver=newton-cg; total time=   0.0s
[CV] END C=1, class_weight=balanced, penalty=l1, solver=newton-cg; total time=   0.0s
[CV] END C=1, class_weight=balanced, penalty=l1, solver=newton-cg; total time=   0.0s
[CV] END C=1, class_weight=balanced, penalty=l1, solver=newton-cg; total time=   0.0s
[CV] END C=1, class_weight=balanced, penalty=l1, solver=lbfgs; total time=   0.0s
[CV] END C=1, class_weight=balanced, penalty=l1, solver=lbfgs; total time=   0.0s
[CV] END C=1, class_weight=balanced, penalty=l1, solver=lbfgs; total time=   0.0s
[CV] END C=1, class_weight=balanced, penalty=l1, solver=lbfgs; total time=   0.0s
[CV] END C=1, class_weight=balanced, penalty=l1, solver=lbfgs; total time=   0.0s
[CV] END C=1, class_weight=balanced, penalty=l1, solver=liblinear; total time



[CV] END C=1, class_weight=balanced, penalty=l1, solver=saga; total time=  12.5s




[CV] END C=1, class_weight=balanced, penalty=l1, solver=saga; total time=  12.8s




[CV] END C=1, class_weight=balanced, penalty=l1, solver=saga; total time=  13.7s




[CV] END C=1, class_weight=balanced, penalty=l1, solver=saga; total time=  12.9s




[CV] END C=1, class_weight=balanced, penalty=l1, solver=saga; total time=  12.9s
[CV] END C=1, class_weight=balanced, penalty=l2, solver=newton-cg; total time=   7.2s
[CV] END C=1, class_weight=balanced, penalty=l2, solver=newton-cg; total time=   6.9s
[CV] END C=1, class_weight=balanced, penalty=l2, solver=newton-cg; total time=   2.8s
[CV] END C=1, class_weight=balanced, penalty=l2, solver=newton-cg; total time=   5.3s
[CV] END C=1, class_weight=balanced, penalty=l2, solver=newton-cg; total time=   6.1s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV] END C=1, class_weight=balanced, penalty=l2, solver=lbfgs; total time=   0.5s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV] END C=1, class_weight=balanced, penalty=l2, solver=lbfgs; total time=   0.5s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV] END C=1, class_weight=balanced, penalty=l2, solver=lbfgs; total time=   0.6s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV] END C=1, class_weight=balanced, penalty=l2, solver=lbfgs; total time=   0.5s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV] END C=1, class_weight=balanced, penalty=l2, solver=lbfgs; total time=   0.5s
[CV] END C=1, class_weight=balanced, penalty=l2, solver=liblinear; total time=   2.9s
[CV] END C=1, class_weight=balanced, penalty=l2, solver=liblinear; total time=   2.7s
[CV] END C=1, class_weight=balanced, penalty=l2, solver=liblinear; total time=   2.7s
[CV] END C=1, class_weight=balanced, penalty=l2, solver=liblinear; total time=   2.8s
[CV] END C=1, class_weight=balanced, penalty=l2, solver=liblinear; total time=   2.7s




[CV] END C=1, class_weight=balanced, penalty=l2, solver=saga; total time=   3.5s




[CV] END C=1, class_weight=balanced, penalty=l2, solver=saga; total time=   3.5s




[CV] END C=1, class_weight=balanced, penalty=l2, solver=saga; total time=   3.6s




[CV] END C=1, class_weight=balanced, penalty=l2, solver=saga; total time=   3.6s




[CV] END C=1, class_weight=balanced, penalty=l2, solver=saga; total time=   3.7s
[CV] END C=10, class_weight=balanced, penalty=l1, solver=newton-cg; total time=   0.0s
[CV] END C=10, class_weight=balanced, penalty=l1, solver=newton-cg; total time=   0.0s
[CV] END C=10, class_weight=balanced, penalty=l1, solver=newton-cg; total time=   0.0s
[CV] END C=10, class_weight=balanced, penalty=l1, solver=newton-cg; total time=   0.0s
[CV] END C=10, class_weight=balanced, penalty=l1, solver=newton-cg; total time=   0.0s
[CV] END C=10, class_weight=balanced, penalty=l1, solver=lbfgs; total time=   0.0s
[CV] END C=10, class_weight=balanced, penalty=l1, solver=lbfgs; total time=   0.0s
[CV] END C=10, class_weight=balanced, penalty=l1, solver=lbfgs; total time=   0.0s
[CV] END C=10, class_weight=balanced, penalty=l1, solver=lbfgs; total time=   0.0s
[CV] END C=10, class_weight=balanced, penalty=l1, solver=lbfgs; total time=   0.0s
[CV] END C=10, class_weight=balanced, penalty=l1, solver=liblinear; t



[CV] END C=10, class_weight=balanced, penalty=l1, solver=liblinear; total time=  27.9s




[CV] END C=10, class_weight=balanced, penalty=l1, solver=liblinear; total time=  30.9s
[CV] END C=10, class_weight=balanced, penalty=l1, solver=liblinear; total time=  29.7s




[CV] END C=10, class_weight=balanced, penalty=l1, solver=liblinear; total time=  29.2s




[CV] END C=10, class_weight=balanced, penalty=l1, solver=saga; total time=  10.5s




[CV] END C=10, class_weight=balanced, penalty=l1, solver=saga; total time=  10.0s




[CV] END C=10, class_weight=balanced, penalty=l1, solver=saga; total time=  10.4s




[CV] END C=10, class_weight=balanced, penalty=l1, solver=saga; total time=  10.6s




[CV] END C=10, class_weight=balanced, penalty=l1, solver=saga; total time=  10.7s
[CV] END C=10, class_weight=balanced, penalty=l2, solver=newton-cg; total time=  11.8s
[CV] END C=10, class_weight=balanced, penalty=l2, solver=newton-cg; total time=  10.7s
[CV] END C=10, class_weight=balanced, penalty=l2, solver=newton-cg; total time=  10.1s
[CV] END C=10, class_weight=balanced, penalty=l2, solver=newton-cg; total time=  13.9s
[CV] END C=10, class_weight=balanced, penalty=l2, solver=newton-cg; total time=   8.6s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV] END C=10, class_weight=balanced, penalty=l2, solver=lbfgs; total time=   0.5s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV] END C=10, class_weight=balanced, penalty=l2, solver=lbfgs; total time=   0.5s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV] END C=10, class_weight=balanced, penalty=l2, solver=lbfgs; total time=   0.5s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV] END C=10, class_weight=balanced, penalty=l2, solver=lbfgs; total time=   0.5s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV] END C=10, class_weight=balanced, penalty=l2, solver=lbfgs; total time=   0.5s
[CV] END C=10, class_weight=balanced, penalty=l2, solver=liblinear; total time=   5.5s
[CV] END C=10, class_weight=balanced, penalty=l2, solver=liblinear; total time=   4.7s
[CV] END C=10, class_weight=balanced, penalty=l2, solver=liblinear; total time=   4.9s
[CV] END C=10, class_weight=balanced, penalty=l2, solver=liblinear; total time=   5.1s
[CV] END C=10, class_weight=balanced, penalty=l2, solver=liblinear; total time=   4.9s




[CV] END C=10, class_weight=balanced, penalty=l2, solver=saga; total time=   3.7s




[CV] END C=10, class_weight=balanced, penalty=l2, solver=saga; total time=   3.6s




[CV] END C=10, class_weight=balanced, penalty=l2, solver=saga; total time=   3.6s




[CV] END C=10, class_weight=balanced, penalty=l2, solver=saga; total time=   3.6s


40 fits failed out of a total of 160.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
20 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\ivand\Code\Py-Projects\.venv\lib\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\ivand\Code\Py-Projects\.venv\lib\site-packages\sklearn\base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "C:\Users\ivand\Code\Py-Projects\.venv\lib\site-packages\sklearn\linear_model\_logistic.py", line 1169, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\Users\ivand\Code\Py-Projects\.venv\lib\site-packages\s

[CV] END C=10, class_weight=balanced, penalty=l2, solver=saga; total time=   3.6s




In [60]:
print('Mean Accuracy: %.3f' % param_search.best_score_)
print('Config: %s' % param_search.best_params_)

Mean Accuracy: 0.444
Config: {'C': 1, 'class_weight': 'balanced', 'penalty': 'l2', 'solver': 'saga'}


In [61]:
clf = LogisticRegression(random_state=1, C=1, class_weight='balanced', max_iter=3000, penalty='l2', solver='saga').fit(X_scaled, y_train)
clf_dflt = LogisticRegression(random_state=1, solver='saga').fit(X_scaled, y_train)

y_pred = clf.predict(X_scaled)
y_val_pred = clf.predict(X_val_scaled)

y_pred_dflt = clf_dflt.predict(X_scaled)
y_val_pred_dflt = clf_dflt.predict(X_val_scaled)



In [62]:
print('Tuned model F1 scores:')
print('Train score: %.3f' % f1_score(y_pred, y_train, average='weighted'))
print('Val score: %.3f' % f1_score(y_val_pred, y_val, average='weighted'))

print('Default model F1 scores:')
print('Train score: %.3f' % f1_score(y_pred_dflt, y_train, average='weighted'))
print('Val score: %.3f' % f1_score(y_val_pred_dflt, y_val, average='weighted'))

Tuned model F1 scores:
Train score: 0.774
Val score: 0.305
Default model F1 scores:
Train score: 0.684
Val score: 0.373


In [65]:
#calculating scores
train_words, user_dict_train, inverse_user_dict_train, val_words, user_dict_val, inverse_user_dict_val = prepare_eval_df(y_train, y_val, y_val_pred, y_pred)
true_answers, total_answers, precent_true, true_answers_train, total_answers_train, precent_true_train = calc_score(train_words, user_dict_train, inverse_user_dict_train, val_words, user_dict_val, inverse_user_dict_val)

print('Tuned model')
print('Score train:', true_answers_train, total_answers_train, precent_true_train)
print('Score val:', true_answers, total_answers, precent_true)

Tuned model
Score train: 50 50 100.0
Score val: 16 43 37.2


  y_val_word.loc[y_val == y_val_id] = user_dict_val[y_val_id]
  y_word.loc[y_train == y_train_id] = user_dict_train[y_train_id]


In [66]:
#calculating scores
train_words, user_dict_train, inverse_user_dict_train, val_words, user_dict_val, inverse_user_dict_val = prepare_eval_df(y_train, y_val, y_val_pred_dflt, y_pred_dflt)
true_answers, total_answers, precent_true, true_answers_train, total_answers_train, precent_true_train = calc_score(train_words, user_dict_train, inverse_user_dict_train, val_words, user_dict_val, inverse_user_dict_val)

print('Default model')
print('Score train:', true_answers_train, total_answers_train, precent_true_train)
print('Score val:', true_answers, total_answers, precent_true)

Default model
Score train: 44 50 88.0
Score val: 17 43 39.5


  y_val_word.loc[y_val == y_val_id] = user_dict_val[y_val_id]
  y_word.loc[y_train == y_train_id] = user_dict_train[y_train_id]


#### 4. Running test prediction

In [67]:
X = train_fin.copy()
y = X['user_id'].astype(int)

X_test = test_fin.copy()

In [68]:
X = X[train_cols].astype(int)
X_test = X_test[train_cols].astype(int)

In [69]:
X.columns = X.columns.astype(str)
X_test.columns = X_test.columns.astype(str)

In [70]:
scaler_X = StandardScaler()
#scaler_X_test = StandardScaler()

scaler_X.fit(X)
scaler_X.fit(X_test)

X_scaled = scaler_X.transform(X)
X_test_scaled = scaler_X.transform(X_test)

In [71]:
clf = LogisticRegression(random_state=1, class_weight='balanced', solver='saga').fit(X_scaled, y)



In [72]:
y_pred = clf.predict(X_scaled)
y_test_pred = clf.predict(X_test_scaled)

y_pred_proba = clf.predict_proba(X_scaled)
y_test_pred_proba = clf.predict_proba(X_test_scaled)

In [73]:
test_words = pd.DataFrame()

test_words['user_word'] = test_fin['user_word']
test_words['preds'] = y_test_pred

In [74]:
comp_df_test = pd.DataFrame(test_words.groupby('user_word')['preds'].agg(lambda x: x.value_counts().index[0]))

In [75]:
comp_df_test.shape[0]

34

In [76]:
comp_df_test

Unnamed: 0_level_0,preds
user_word,Unnamed: 1_level_1
aucroc,49
binary,12
blue,51
categorical,14
coefficient,15
collinear,33
distributed,0
epsilon,1
f1,6
fit,2


In [77]:
comp_df_test.to_csv('ans-11-23-23.csv')