# Моя версия реализации логистической регрессии.
Оригинал: https://github.com/Yorko/mlcourse.ai/blob/master/jupyter_russian/topic04_linear_models/lesson4_practice_alice_benchmarks.ipynb

In [258]:
import pickle
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook
from scipy.sparse import csr_matrix, hstack
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
%matplotlib inline
from matplotlib import pyplot as plt
import seaborn as sns

In [259]:
# Read the training and test session tables, keyed by session_id.
train_df = pd.read_csv(
    '../mlprojects/data/train_sessions.csv', index_col='session_id'
)
test_df = pd.read_csv(
    '../mlprojects/data/test_sessions.csv', index_col='session_id'
)

# Parse the time1..time10 columns of both tables into datetimes.
times = ['time%s' % i for i in range(1, 11)]
for frame in (train_df, test_df):
    frame[times] = frame[times].apply(pd.to_datetime)

# Order the training sessions by the time of their first visit.
train_df = train_df.sort_values(by='time1')

# Preview the first rows of the training table.
train_df.head()

Unnamed: 0_level_0,site1,time1,site2,time2,site3,time3,site4,time4,site5,time5,...,time6,site7,time7,site8,time8,site9,time9,site10,time10,target
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
21669,56,2013-01-12 08:05:57,55.0,2013-01-12 08:05:57,,NaT,,NaT,,NaT,...,NaT,,NaT,,NaT,,NaT,,NaT,0
54843,56,2013-01-12 08:37:23,55.0,2013-01-12 08:37:23,56.0,2013-01-12 09:07:07,55.0,2013-01-12 09:07:09,,NaT,...,NaT,,NaT,,NaT,,NaT,,NaT,0
77292,946,2013-01-12 08:50:13,946.0,2013-01-12 08:50:14,951.0,2013-01-12 08:50:15,946.0,2013-01-12 08:50:15,946.0,2013-01-12 08:50:16,...,2013-01-12 08:50:16,948.0,2013-01-12 08:50:16,784.0,2013-01-12 08:50:16,949.0,2013-01-12 08:50:17,946.0,2013-01-12 08:50:17,0
114021,945,2013-01-12 08:50:17,948.0,2013-01-12 08:50:17,949.0,2013-01-12 08:50:18,948.0,2013-01-12 08:50:18,945.0,2013-01-12 08:50:18,...,2013-01-12 08:50:18,947.0,2013-01-12 08:50:19,945.0,2013-01-12 08:50:19,946.0,2013-01-12 08:50:19,946.0,2013-01-12 08:50:20,0
146670,947,2013-01-12 08:50:20,950.0,2013-01-12 08:50:20,948.0,2013-01-12 08:50:20,947.0,2013-01-12 08:50:21,950.0,2013-01-12 08:50:21,...,2013-01-12 08:50:21,946.0,2013-01-12 08:50:21,951.0,2013-01-12 08:50:22,946.0,2013-01-12 08:50:22,947.0,2013-01-12 08:50:22,0


In [260]:
# Cast site1..site10 to integers; missing visits become 0.
sites = ['site%s' % i for i in range(1, 11)]
for frame in (train_df, test_df):
    frame[sites] = frame[sites].fillna(0).astype('int')

# Load the site dictionary.
# NOTE(review): pickle.load can execute code from the file; only use it on
# a trusted data file.
with open(r"../mlprojects/data/site_dic.pkl", "rb") as input_file:
    site_dict = pickle.load(input_file)

# Turn the dictionary into a data frame: the keys fill the 'site' column
# and the values become the index.
sites_dict_df = pd.DataFrame(
    {'site': list(site_dict.keys())},
    index=list(site_dict.values()),
)
print(u'всего сайтов:', sites_dict_df.shape[0])
sites_dict_df.head()

всего сайтов: 48371


Unnamed: 0,site
25075,www.abmecatronique.com
13997,groups.live.com
42436,majeureliguefootball.wordpress.com
30911,cdt46.media.tourinsoft.eu
8104,www.hdwallpapers.eu


Выделим целевую переменную и объединим выборки, чтобы вместе привести их к разреженному формату.

In [261]:
# Target variable.
y_train = train_df['target']

# Number of training rows; rows from this position on in the combined
# table belong to the test set.
idx_split = train_df.shape[0]

# Combined table of raw features: training rows (without the target)
# followed by test rows.
train_features = train_df.drop('target', axis=1)
full_df = pd.concat([train_features, test_df])

In [262]:
# Table of the site ids visited in each session (columns site1..site10).
full_sites = full_df.loc[:, sites]
full_sites.head()

Unnamed: 0_level_0,site1,site2,site3,site4,site5,site6,site7,site8,site9,site10
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
21669,56,55,0,0,0,0,0,0,0,0
54843,56,55,56,55,0,0,0,0,0,0
77292,946,946,951,946,946,945,948,784,949,946
114021,945,948,949,948,945,946,947,945,946,946
146670,947,950,948,947,950,952,946,951,946,947


In [263]:
from scipy.sparse import csr_matrix

In [264]:
# csr_matrix?

In [265]:
# Flatten the session table row by row into one sequence of site ids.
sites_flatten = full_sites.values.flatten()

# Build the CSR matrix from (data, indices, indptr): a 1 per visit, the site
# id as the column index and a row pointer every 10 entries. The first
# column (id 0) is then dropped.
n_visits = sites_flatten.shape[0]
visit_marks = [1] * n_visits
row_pointers = range(0, n_visits + 10, 10)
full_sites_sparse = csr_matrix((visit_marks, sites_flatten, row_pointers))
full_sites_sparse = full_sites_sparse[:, 1:]

In [266]:
# Rows before idx_split are the training part, the remaining rows the test part.
X_train_sparse, X_test_sparse = (
    full_sites_sparse[:idx_split],
    full_sites_sparse[idx_split:],
)

In [267]:
def get_auc_lr_valid(X, y, C=1.0, ratio=0.9, seed=17):
    """Fit logistic regression on the leading rows and score it on the rest.

    The rows are split by position, without shuffling: the first
    ``int(ratio * n_rows)`` rows are used for training and the remaining
    rows form the validation part.

    Parameters
    ----------
    X : matrix supporting ``X[:k, :]`` row slicing
        Feature matrix.
    y : array-like
        Target labels, one per row of ``X`` and in the same order.
    C : float
        Regularization parameter passed to ``LogisticRegression``.
    ratio : float
        Share of the rows used for training.
    seed : int
        ``random_state`` passed to ``LogisticRegression``.

    Returns
    -------
    tuple
        ``(roc_auc, model)``: ROC AUC on the validation rows and the fitted
        ``LogisticRegression`` instance.

    Raises
    ------
    ValueError
        If the split leaves the training or the validation part empty.
    """
    n_rows = X.shape[0]
    train_len = int(ratio * n_rows)
    if not 0 < train_len < n_rows:
        raise ValueError(
            'ratio=%r leaves an empty training or validation part '
            'for %d rows' % (ratio, n_rows)
        )

    # Work on a plain array so the split is positional no matter how ``y``
    # is indexed (e.g. a pandas Series whose index holds integer ids).
    y = np.asarray(y)

    X_train, X_valid = X[:train_len, :], X[train_len:, :]
    y_train, y_valid = y[:train_len], y[train_len:]

    logit = LogisticRegression(C=C, n_jobs=-1, random_state=seed)
    logit.fit(X_train, y_train)

    # Probability of the positive class (second column of predict_proba).
    valid_pred = logit.predict_proba(X_valid)[:, 1]

    return roc_auc_score(y_valid, valid_pred), logit

In [268]:
%%time
get_auc_lr_valid(X_train_sparse, y_train)



CPU times: user 146 ms, sys: 180 ms, total: 326 ms
Wall time: 4.38 s


(0.9197957084494166, LogisticRegression(n_jobs=-1, random_state=17))

In [269]:
# Fit the sites-only model on the whole training matrix and take the
# predicted probability of class 1 for every test session.
logit = LogisticRegression(n_jobs=-1, random_state=17).fit(X_train_sparse, y_train)
test_proba = logit.predict_proba(X_test_sparse)
test_pred = test_proba[:, 1]
test_pred.shape

(82797,)

In [270]:
# Write the predictions as a two-column CSV (session_id, target), with
# session ids numbered from 1.
submission = pd.Series(
    test_pred,
    index=range(1, test_pred.shape[0] + 1),
    name='target',
)
submission.to_csv('logit.csv', header=True, index_label='session_id')
                                                                

In [271]:
!head -10 logit.csv

session_id,target
1,0.0022195973555361945
2,2.518934875590028e-09
3,6.160145453086596e-09
4,1.3226767899696187e-08
5,2.729074909736547e-05
6,0.00015117963506244763
7,0.0004423768406002131
8,0.0001012456925994951
9,0.0007773309339707889


In [272]:
# Names of the ten timestamp columns, then a preview of them.
time = ['time%d' % n for n in range(1, 11)]
train_df.loc[:, time].head()

Unnamed: 0_level_0,time1,time2,time3,time4,time5,time6,time7,time8,time9,time10
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
21669,2013-01-12 08:05:57,2013-01-12 08:05:57,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT
54843,2013-01-12 08:37:23,2013-01-12 08:37:23,2013-01-12 09:07:07,2013-01-12 09:07:09,NaT,NaT,NaT,NaT,NaT,NaT
77292,2013-01-12 08:50:13,2013-01-12 08:50:14,2013-01-12 08:50:15,2013-01-12 08:50:15,2013-01-12 08:50:16,2013-01-12 08:50:16,2013-01-12 08:50:16,2013-01-12 08:50:16,2013-01-12 08:50:17,2013-01-12 08:50:17
114021,2013-01-12 08:50:17,2013-01-12 08:50:17,2013-01-12 08:50:18,2013-01-12 08:50:18,2013-01-12 08:50:18,2013-01-12 08:50:18,2013-01-12 08:50:19,2013-01-12 08:50:19,2013-01-12 08:50:19,2013-01-12 08:50:20
146670,2013-01-12 08:50:20,2013-01-12 08:50:20,2013-01-12 08:50:20,2013-01-12 08:50:21,2013-01-12 08:50:21,2013-01-12 08:50:21,2013-01-12 08:50:21,2013-01-12 08:50:22,2013-01-12 08:50:22,2013-01-12 08:50:22


In [276]:
# Empty feature tables sharing the index of the train / test sessions.
new_feat_train, new_feat_test = (
    pd.DataFrame(index=sessions.index) for sessions in (train_df, test_df)
)

In [277]:
# Encode the first-visit timestamp as the integer YYYYMM for both tables.
for feats, sessions in ((new_feat_train, train_df), (new_feat_test, test_df)):
    feats['year_month'] = sessions['time1'].apply(
        lambda ts: 100 * ts.year + ts.month
    )

In [278]:
new_feat_train['year_month'].values.reshape(-1,1)

array([[201301],
       [201301],
       [201301],
       ...,
       [201404],
       [201404],
       [201404]])

In [279]:
# Standardize year_month: the scaler is fitted on the training column only
# and then applied to both the training and the test column.
train_year_month = new_feat_train['year_month'].values.reshape(-1, 1)
test_year_month = new_feat_test['year_month'].values.reshape(-1, 1)

scaler = StandardScaler()
scaler.fit(train_year_month)

new_feat_train['year_month_scaled'] = scaler.transform(train_year_month)
new_feat_test['year_month_scaled'] = scaler.transform(test_year_month)

In [280]:
train_df['time1'].apply(lambda ts:100 * ts.year + ts.month ).head()

session_id
21669     201301
54843     201301
77292     201301
114021    201301
146670    201301
Name: time1, dtype: int64

In [281]:
new_feat_train['year_month_scaled'].values.reshape(-1,1)

array([[-1.74440496],
       [-1.74440496],
       [-1.74440496],
       ...,
       [ 0.68162559],
       [ 0.68162559],
       [ 0.68162559]])

In [282]:
# Append the scaled year_month column to the right of the site matrix.
year_month_col = new_feat_train['year_month_scaled'].values.reshape(-1, 1)
X_train_sparse_new = csr_matrix(hstack([X_train_sparse, year_month_col]))

#get_auc_lr_valid(X_train_sparse_new, y_train)

In [283]:
get_auc_lr_valid(X_train_sparse_new, y_train)

(0.9198902054055882, LogisticRegression(n_jobs=-1, random_state=17))

In [284]:
X_train_sparse.shape, new_feat_train['year_month_scaled'].values.reshape(-1,1).shape

((253561, 48371), (253561, 1))

In [342]:
# Hour of the first visit in the session.
new_feat_train['star_hours'] = train_df['time1'].apply(lambda x: x.hour)
new_feat_test['star_hours'] = test_df['time1'].apply(lambda x: x.hour)

# 1 when the session starts at hour 11 or earlier, else 0.
# The column is compared directly: Series.apply has no axis parameter, so
# no extra positional argument is passed.
new_feat_train['morning'] = (new_feat_train['star_hours'] <= 11).astype(int)
new_feat_test['morning'] = (new_feat_test['star_hours'] <= 11).astype(int)

# Day of the month of the first visit.
new_feat_train['day'] = train_df['time1'].apply(lambda x: x.day)
new_feat_test['day'] = test_df['time1'].apply(lambda x: x.day)

# 1 when the session starts on day 10 of the month or earlier, else 0.
new_feat_train['star_month'] = (new_feat_train['day'] <= 10).astype(int)
new_feat_test['star_month'] = (new_feat_test['day'] <= 10).astype(int)


In [329]:
# new_feat_train.drop('day', axis=1, inplace=True)
# new_feat_test.drop('day', axis=1, inplace=True)

In [343]:
# Standardize star_month in place: fit on the training column, then
# transform the training and the test column with the same scaler.
scaler = StandardScaler().fit(new_feat_train['star_month'].values.reshape(-1, 1))

for feats in (new_feat_train, new_feat_test):
    feats['star_month'] = scaler.transform(feats['star_month'].values.reshape(-1, 1))

In [344]:
# Standardize morning in place: fit on the training column, then
# transform the training and the test column with the same scaler.
scaler = StandardScaler().fit(new_feat_train['morning'].values.reshape(-1, 1))

for feats in (new_feat_train, new_feat_test):
    feats['morning'] = scaler.transform(feats['morning'].values.reshape(-1, 1))

In [345]:
# Standardize star_hours in place: fit on the training column, then
# transform the training and the test column with the same scaler.
scaler = StandardScaler().fit(new_feat_train['star_hours'].values.reshape(-1, 1))

for feats in (new_feat_train, new_feat_test):
    feats['star_hours'] = scaler.transform(feats['star_hours'].values.reshape(-1, 1))

In [249]:
# def turning_hours(data):
#     scaler = StandardScaler()
#     for x in range(1,11):
#         scaler.fit(data[f'star_hour_{x}'.format(x)].values.reshape(-1,1))
#         data[f'star_hour_{x}'.format(x)] = scaler.transform(data[f'star_hour_{x}'\
#                                                                                      .format(x)].values.reshape(-1,1))



In [346]:
new_feat_test.shape

(82797, 6)

In [347]:
# Собираем всё вместе
X_train_sparse_new = csr_matrix(hstack([X_train_sparse, 
                            new_feat_train['morning'].values.reshape(-1,1)]))
X_test_sparse_new = csr_matrix(hstack([X_test_sparse, 
                            new_feat_test['morning'].values.reshape(-1,1)]))
X_train_sparse_new = csr_matrix(hstack([X_train_sparse, 
                            new_feat_train['star_hours'].values.reshape(-1,1)]))
X_test_sparse_new = csr_matrix(hstack([X_test_sparse, 
                            new_feat_test['star_hours'].values.reshape(-1,1)]))


In [348]:
# Rebuild the extended matrices from the base site matrices plus the
# 'star_month' column only. Any X_train_sparse_new / X_test_sparse_new
# assigned earlier is replaced, so no other extra feature is included.
X_train_sparse_new = csr_matrix(hstack([X_train_sparse, 
                            new_feat_train['star_month'].values.reshape(-1,1)]))
X_test_sparse_new = csr_matrix(hstack([X_test_sparse, 
                            new_feat_test['star_month'].values.reshape(-1,1)]))

In [349]:
get_auc_lr_valid(X_train_sparse_new, y_train)

(0.9218267137158861, LogisticRegression(n_jobs=-1, random_state=17))

In [245]:
# def data_merging(data):
#     for x in range(1,11):
#         X_train_sparse_new = X_train_sparse_new
#         X_train_sparse_new = csr_matrix(hstack([X_train_sparse, 
#                             data[f'star_hour_{x}'.format(x)].values.reshape(-1,1)]))
#     return X_train_sparse_new.shape

In [352]:
# The validation ROC AUC went up compared with the sites-only model
# (0.9198 -> 0.9218 in the recorded outputs). Fit on the whole training
# matrix and predict class-1 probabilities for the test set, to be
# submitted to Kaggle.
logit_1 = LogisticRegression(n_jobs=-1, random_state=17).fit(X_train_sparse_new, y_train)
test_proba_2 = logit_1.predict_proba(X_test_sparse_new)
test_pred_2 = test_proba_2[:, 1]


In [334]:
# Write the predictions of the extended model as a two-column CSV
# (session_id, target), with session ids numbered from 1. The index length
# comes from test_pred_2 itself, the series being written.
pd.Series(test_pred_2, index=range(1, test_pred_2.shape[0] + 1),
          name='target').to_csv('logit_morning.csv', header=True,
                                index_label='session_id')
                                                                

In [338]:
get_auc_lr_valid(X_train_sparse_new, y_train)

(0.9218267137158861, LogisticRegression(n_jobs=-1, random_state=17))

In [336]:
X_train_sparse_new.shape, new_feat_train.shape

((253561, 48372), (253561, 5))

In [339]:
get_auc_lr_valid(X_train_sparse_new, y_train, C=1)

(0.9218267137158861, LogisticRegression(C=1, n_jobs=-1, random_state=17))

In [350]:
# Try 10 values of C spaced logarithmically between 1e-3 and 1e1 and print
# the (validation ROC AUC, fitted model) pair for each.
for C in np.logspace(-3, 1, 10):
    auc_and_model = get_auc_lr_valid(X_train_sparse_new, y_train, C=C)
    print(auc_and_model)

(0.8866524106082947, LogisticRegression(C=0.001, n_jobs=-1, random_state=17))
(0.897604547447133, LogisticRegression(C=0.0027825594022071257, n_jobs=-1, random_state=17))
(0.908693297297689, LogisticRegression(C=0.007742636826811269, n_jobs=-1, random_state=17))
(0.9167066693717553, LogisticRegression(C=0.021544346900318832, n_jobs=-1, random_state=17))
(0.921524353646858, LogisticRegression(C=0.05994842503189409, n_jobs=-1, random_state=17))
(0.9224735499094883, LogisticRegression(C=0.1668100537200059, n_jobs=-1, random_state=17))
(0.9215349203991454, LogisticRegression(C=0.46415888336127775, n_jobs=-1, random_state=17))
(0.9218890575543826, LogisticRegression(C=1.2915496650148828, n_jobs=-1, random_state=17))
(0.9226149934365373, LogisticRegression(C=3.593813663804626, n_jobs=-1, random_state=17))
(0.9224584545490773, LogisticRegression(C=10.0, n_jobs=-1, random_state=17))


In [354]:
# Final model: C=3.5938 is the grid value with the highest validation
# ROC AUC in the recorded outputs. random_state=17 matches the other models
# in this notebook.
logit = LogisticRegression(n_jobs=-1, C=3.5938, random_state=17)
logit.fit(X_train_sparse_new, y_train)

# Probability of class 1 for every test session.
preds = logit.predict_proba(X_test_sparse_new)[:, 1]

# Write a two-column CSV (session_id, target), with session ids numbered
# from 1. The index length comes from preds itself, the series being written.
pd.Series(preds, index=range(1, preds.shape[0] + 1),
          name='target').to_csv('logit_finish.csv', header=True,
                                index_label='session_id')
                                                                