In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import os
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler, Normalizer
from sklearn.metrics import roc_auc_score
import json
from tqdm import tqdm
import matplotlib.pyplot as plt
from scipy.sparse import csr_matrix
from scipy.sparse import hstack, vstack
from sklearn.linear_model import LogisticRegression

from collections import OrderedDict
from operator import itemgetter
%matplotlib inline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import DictVectorizer

PATH = './data/'

In [2]:
train_df = pd.read_csv(os.path.join(PATH, 'train_sample.csv'), nrows=200000)

In [3]:
train_df.shape

(200000, 6)

In [4]:
train_df = train_df.join(pd.read_csv(os.path.join(PATH,'mlboot_train_answers.tsv'), delimiter='\t').set_index('cuid'), on='cuid', how='inner')

In [5]:
train_df.head()

Unnamed: 0,cuid,cat,cnt1,cnt2,cnt3,data_diff,target
0,d241d1f6f360965c303a30f8b471b095,1,"{""2025582"":1,""2048310"":2,""651030"":1,""1108422"":...","{""31297"":1,""28598"":1,""504988"":1,""6498"":1,""1438...",{},15,0
1,d241d1f6f360965c303a30f8b471b095,1,"{""38462"":2,""2048310"":3,""1108422"":1,""2025582"":1...","{""264772"":2,""124932"":2,""31297"":1,""19791"":1,""18...",{},16,0
2,d241d1f6f360965c303a30f8b471b095,1,"{""2025582"":4}","{""552308"":3,""186181"":2,""28598"":4,""504988"":3,""1...",{},17,0
3,d241d1f6f360965c303a30f8b471b095,1,"{""304818"":2,""815630"":1,""2048310"":4,""1108422"":1...","{""124932"":2,""31297"":1,""186181"":2,""14263"":1,""59...",{},18,0
4,d241d1f6f360965c303a30f8b471b095,1,"{""1184422"":1,""2048310"":4,""809001"":1,""2025582"":...","{""31297"":1,""19791"":1,""186181"":2,""599535"":2,""97...",{},19,0


** tf-idf **
Номер счетчика + значение

In [310]:
train_df['cnt1'] = train_df['cnt1'].apply(lambda x: x[1:-1])

In [311]:
tokenizer = lambda doc: doc[1:-1].split(',')
tf_idf = TfidfVectorizer(tokenizer = tokenizer, max_df = 0.70, min_df = 0.01, ngram_range=(1,1))

In [312]:
new_df_cnt1 = train_df.groupby(['cuid'])['cnt1'].sum()
tf_idf_cnt1 = tf_idf.fit_transform(new_df_cnt1)

In [313]:
target = train_df.groupby(['cuid'])['target'].min()

In [315]:
X_train, X_test, y_train, y_test = train_test_split(tf_idf_cnt1, target, test_size=0.2, random_state=17)
n = Normalizer()
X_train = n.fit_transform(X_train)
X_test = n.transform(X_test)

In [316]:
lr = LogisticRegression(penalty='l2', C=1)
cv = cross_val_score(lr,X_train, y_train, scoring='roc_auc', cv=5, n_jobs=-1)
print (cv.mean(), cv.std())
lr.fit(X_train,y_train)
pred = lr.predict_proba(X_test)
print (roc_auc_score(y_test,pred[:, 1]))

0.5593467681568368 0.020877833285020797
0.5463620673994004


** tf-idf2 **
Смотрим просто номер счетчика

In [146]:
import re

In [147]:
train_df['cnt1'] = train_df['cnt1'].apply(lambda x: ' '.join(re.findall(r'"\d+"', x)))

In [148]:
tokenizer = lambda doc: doc[1:-1].split(' ')
tf_idf = TfidfVectorizer(tokenizer = tokenizer, max_df = 0.75, min_df = 0.01, ngram_range=(1,1))

tf_idf_cnt1 = tf_idf.fit_transform(train_df.groupby(['cuid'])['cnt1'].sum())

In [149]:
X_train, X_test, y_train, y_test = train_test_split(tf_idf_cnt1, train_df.groupby(['cuid'])['target'].min(), test_size=0.2, random_state=17)
lr = LogisticRegression(penalty='l2', C=1)
cv = cross_val_score(lr,X_train, y_train, scoring='roc_auc', cv=5, n_jobs=-1)
print (cv.mean(), cv.std())
lr.fit(X_train,y_train)
pred = lr.predict_proba(X_test)
print (roc_auc_score(y_test,pred[:, 1]))

0.558682240718854 0.03188071548519023
0.6002634208374965


** А если ещё отсортировать **

In [150]:
train_df.head()

Unnamed: 0,cuid,cat,cnt1,cnt2,cnt3,data_diff,target
0,d241d1f6f360965c303a30f8b471b095,1,"""2025582"" ""2048310"" ""651030"" ""1108422"" ""412027""","{""31297"":1,""28598"":1,""504988"":1,""6498"":1,""1438...",{},15,0
1,d241d1f6f360965c303a30f8b471b095,1,"""38462"" ""2048310"" ""1108422"" ""2025582"" ""1661409...","{""264772"":2,""124932"":2,""31297"":1,""19791"":1,""18...",{},16,0
2,d241d1f6f360965c303a30f8b471b095,1,"""2025582""","{""552308"":3,""186181"":2,""28598"":4,""504988"":3,""1...",{},17,0
3,d241d1f6f360965c303a30f8b471b095,1,"""304818"" ""815630"" ""2048310"" ""1108422"" ""2025582...","{""124932"":2,""31297"":1,""186181"":2,""14263"":1,""59...",{},18,0
4,d241d1f6f360965c303a30f8b471b095,1,"""1184422"" ""2048310"" ""809001"" ""2025582"" ""38462""...","{""31297"":1,""19791"":1,""186181"":2,""599535"":2,""97...",{},19,0


In [151]:
train_df.sort_values(by='data_diff', ascending=True, inplace=True)

In [152]:
tf_idf = TfidfVectorizer(tokenizer = tokenizer, max_df = 0.75, min_df = 0.01, ngram_range=(1,1))

new_df_cnt1 = train_df.groupby(['cuid'])['cnt1'].sum()
tf_idf_cnt1 = tf_idf.fit_transform(new_df_cnt1)

In [153]:
X_train, X_test, y_train, y_test = train_test_split(tf_idf_cnt1, train_df.groupby(['cuid'])['target'].min(), test_size=0.2, random_state=17)
lr = LogisticRegression(penalty='l2', C=1)
cv = cross_val_score(lr,X_train, y_train, scoring='roc_auc', cv=5, n_jobs=-1)
print (cv.mean(), cv.std())
lr.fit(X_train,y_train)
pred = lr.predict_proba(X_test)
print (roc_auc_score(y_test,pred[:, 1]))

0.5587867383061891 0.032211563928363544
0.5989735670814788


** DictVectorizer **

In [7]:
train_df = pd.read_csv(os.path.join(PATH, 'train_sample.csv'), nrows=200000)
train_df = train_df.join(pd.read_csv(os.path.join(PATH,'mlboot_train_answers.tsv'), delimiter='\t').set_index('cuid'), on='cuid', how='inner')

In [8]:
train_df['int_id'] = range(len(train_df))
train_df.head()

Unnamed: 0,cuid,cat,cnt1,cnt2,cnt3,data_diff,target,int_id
0,d241d1f6f360965c303a30f8b471b095,1,"{""2025582"":1,""2048310"":2,""651030"":1,""1108422"":...","{""31297"":1,""28598"":1,""504988"":1,""6498"":1,""1438...",{},15,0,0
1,d241d1f6f360965c303a30f8b471b095,1,"{""38462"":2,""2048310"":3,""1108422"":1,""2025582"":1...","{""264772"":2,""124932"":2,""31297"":1,""19791"":1,""18...",{},16,0,1
2,d241d1f6f360965c303a30f8b471b095,1,"{""2025582"":4}","{""552308"":3,""186181"":2,""28598"":4,""504988"":3,""1...",{},17,0,2
3,d241d1f6f360965c303a30f8b471b095,1,"{""304818"":2,""815630"":1,""2048310"":4,""1108422"":1...","{""124932"":2,""31297"":1,""186181"":2,""14263"":1,""59...",{},18,0,3
4,d241d1f6f360965c303a30f8b471b095,1,"{""1184422"":1,""2048310"":4,""809001"":1,""2025582"":...","{""31297"":1,""19791"":1,""186181"":2,""599535"":2,""97...",{},19,0,4


In [9]:
train_df['cnt1'] = train_df['cnt1'].apply(lambda x: json.loads(x))

In [287]:
row_dict = dict()
for i in tqdm(train_df['cuid'].unique()):
    row_dict[i] = list(train_df[train_df['cuid']==i]['int_id'].values)

100%|██████████| 15220/15220 [04:55<00:00, 51.54it/s]


In [288]:
# Use DictVectorizer
dv = DictVectorizer(separator=':')
dv_cnt1 = dv.fit_transform(train_df['cnt1'])

In [289]:
dv_cnt1_group = csr_matrix(np.sum(dv_cnt1[row_dict[list(row_dict.keys())[0]]], axis=0))
for i in tqdm(range(1,len(row_dict.keys()))):
    dv_cnt1_group = vstack([dv_cnt1_group,csr_matrix(np.sum(dv_cnt1[row_dict[list(row_dict.keys())[i]]], axis=0))])

100%|██████████| 15219/15219 [01:01<00:00, 248.05it/s]


In [290]:
df = pd.DataFrame()
df['cuid'] = list(row_dict.keys())
df = df.join(train_df.groupby(['cuid'])['cuid','target'].min().set_index('cuid'), on='cuid', how='inner')

In [291]:
df.head()

Unnamed: 0,cuid,target
0,d559100ff3cd915d8e83f315f9a8e486,0
1,d8a827376aeb2a95ca1249674fa41c0e,0
2,d9bb2787568f8a7b60d0db7ef5bc4b96,0
3,d4e87b059ad8ec1a77d906e2abb4c564,0
4,d3d02f1ac1da74ee68be2f1a9139a9af,0


In [292]:
X_train, X_test, y_train, y_test = train_test_split(dv_cnt1_group, df.target, test_size=0.2, random_state=17)
n = Normalizer()
X_train = n.fit_transform(X_train)
X_test = n.transform(X_test)

In [293]:
lr = LogisticRegression(penalty='l2', C=1)
cv = cross_val_score(lr,X_train, y_train, scoring='roc_auc', cv=5, n_jobs=-1)
print (cv.mean(), cv.std())
lr.fit(X_train,y_train)
pred = lr.predict_proba(X_test)
print (roc_auc_score(y_test,pred[:, 1]))

0.5796496505795112 0.026305534899416727
0.578347661870943


** DictVectorizer2 **

In [2]:
train_df = pd.read_csv(os.path.join(PATH, 'train_sample.csv'), nrows=200000)
train_df = train_df.join(pd.read_csv(os.path.join(PATH,'mlboot_train_answers.tsv'), delimiter='\t').set_index('cuid'), on='cuid', how='inner')
train_df['cnt1'] = train_df['cnt1'].apply(lambda x: x[1:-1]+',' if len(x)>2 else '')
train_df['cnt2'] = train_df['cnt2'].apply(lambda x: x[1:-1]+',' if len(x)>2 else '')
train_df['cnt3'] = train_df['cnt3'].apply(lambda x: x[1:-1]+',' if len(x)>2 else '')

In [3]:
train_df.head()

Unnamed: 0,cuid,cat,cnt1,cnt2,cnt3,data_diff,target
0,d241d1f6f360965c303a30f8b471b095,1,"""2025582"":1,""2048310"":2,""651030"":1,""1108422"":2...","""31297"":1,""28598"":1,""504988"":1,""6498"":1,""14389...",,15,0
1,d241d1f6f360965c303a30f8b471b095,1,"""38462"":2,""2048310"":3,""1108422"":1,""2025582"":1,...","""264772"":2,""124932"":2,""31297"":1,""19791"":1,""186...",,16,0
2,d241d1f6f360965c303a30f8b471b095,1,"""2025582"":4,","""552308"":3,""186181"":2,""28598"":4,""504988"":3,""14...",,17,0
3,d241d1f6f360965c303a30f8b471b095,1,"""304818"":2,""815630"":1,""2048310"":4,""1108422"":1,...","""124932"":2,""31297"":1,""186181"":2,""14263"":1,""599...",,18,0
4,d241d1f6f360965c303a30f8b471b095,1,"""1184422"":1,""2048310"":4,""809001"":1,""2025582"":3...","""31297"":1,""19791"":1,""186181"":2,""599535"":2,""978...",,19,0


In [4]:
df = pd.DataFrame()
df['cuid'] = train_df.groupby(['cuid'])['cuid'].min()
df['target'] = train_df.groupby(['cuid'])['target'].min()

df['cnt1'] = train_df.groupby(['cuid'])['cnt1'].sum().apply(lambda x: x[:-1] if (len(x)>0 and x[-1]==',') else x)
df['cnt1'] = df['cnt1'].apply(lambda x: json.loads('{'+x+'}'))

df['cnt2'] = train_df.groupby(['cuid'])['cnt2'].sum().apply(lambda x: x[:-1] if (len(x)>0 and x[-1]==',') else x)
df['cnt2'] = df['cnt2'].apply(lambda x: json.loads('{'+x+'}'))

df['cnt3'] = train_df.groupby(['cuid'])['cnt3'].sum().apply(lambda x: x[:-1] if (len(x)>0 and x[-1]==',') else x)
df['cnt3'] = df['cnt3'].apply(lambda x: json.loads('{'+x+'}'))

In [5]:
df.head()

Unnamed: 0_level_0,cuid,target,cnt1,cnt2,cnt3
cuid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
d241d1f6f360965c303a30f8b471b095,d241d1f6f360965c303a30f8b471b095,0,"{u'911052': 1, u'1680022': 1, u'1610623': 1, u...","{u'20447': 1, u'20914': 1, u'48447': 1, u'2933...",{}
d2422273015f17d3b0323794d72267af,d2422273015f17d3b0323794d72267af,0,"{u'747319': 1, u'1633230': 1, u'292387': 1, u'...","{u'217672': 1, u'131444': 1, u'102722': 1, u'1...","{u'644856': 1, u'426842': 1, u'595423': 1, u'3..."
d24229997aaa398a23a50327b780ee1a,d24229997aaa398a23a50327b780ee1a,0,"{u'483271': 1, u'103705': 1, u'809001': 1, u'7...","{u'104481': 1, u'7938': 1, u'44187': 1, u'3032...",{}
d24237114486ca83116b146e7cd2aa9f,d24237114486ca83116b146e7cd2aa9f,0,"{u'741147': 1, u'1829710': 1, u'1489844': 1, u...","{u'38671': 2, u'1866': 2, u'13736': 1, u'11767...","{u'932673': 4, u'941437': 2, u'852719': 1, u'8..."
d242667ee758a8974dae6d1bda7bfae6,d242667ee758a8974dae6d1bda7bfae6,1,"{u'329362': 1, u'1849384': 1, u'20677': 1, u'1...","{u'7817': 1, u'60981': 1, u'74065': 2, u'42876...","{u'818712': 1, u'827802': 1, u'50857': 2, u'41..."


In [29]:
dv = DictVectorizer(separator=':')
dv_cnt1 = dv.fit_transform(df['cnt1'])
dv_cnt2 = dv.fit_transform(df['cnt2'])
dv_cnt3 = dv.fit_transform(df['cnt3'])

In [7]:
X_train, X_test, y_train, y_test = train_test_split(hstack([dv_cnt1,dv_cnt2,dv_cnt3]), df.target, test_size=0.2, random_state=17)
n = Normalizer()
X_train = n.fit_transform(X_train)
X_test = n.transform(X_test)

In [8]:
lr = LogisticRegression(penalty='l2', C=1)
cv = cross_val_score(lr,X_train, y_train, scoring='roc_auc', cv=5, n_jobs=-1)
print (cv.mean(), cv.std())
lr.fit(X_train,y_train)
pred = lr.predict_proba(X_test)
print (roc_auc_score(y_test,pred[:, 1]))

(0.6258443216095808, 0.06082239722682662)
0.5751957519575195


In [9]:
X_train.shape

(3413, 173325)

In [10]:
#SVD
from sklearn.decomposition import TruncatedSVD
from scipy import sparse as sp

In [30]:
svd = TruncatedSVD(n_components=100, random_state=17, n_iter=5)
dv_cnt1 = svd.fit_transform(dv_cnt1)
dv_cnt2 = svd.fit_transform(dv_cnt2)
dv_cnt3 = svd.fit_transform(dv_cnt3)

In [35]:
X_train, X_test, y_train, y_test = train_test_split(np.hstack([dv_cnt1,dv_cnt2,dv_cnt3]), df.target, test_size=0.2, random_state=17)
n = Normalizer()
X_train = n.fit_transform(X_train)
X_test = n.transform(X_test)

In [36]:
lr = LogisticRegression(penalty='l2', C=1)
cv = cross_val_score(lr,X_train, y_train, scoring='roc_auc', cv=5, n_jobs=-1)
print (cv.mean(), cv.std())
lr.fit(X_train,y_train)
pred = lr.predict_proba(X_test)
print (roc_auc_score(y_test,pred[:, 1]))

(0.622330957284063, 0.05187117952752747)
0.6175261752617526
