In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import os
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import roc_auc_score
import json
from tqdm import tqdm
import matplotlib.pyplot as plt
from scipy.sparse import csr_matrix
from scipy.sparse import hstack

from collections import OrderedDict
from operator import itemgetter
%matplotlib inline

PATH = './data/'

In [2]:
train_df = pd.read_csv(os.path.join(PATH, 'train_sample.csv'), nrows=200000)

In [3]:
train_df.shape

(200000, 6)

In [4]:
train_df = train_df.join(pd.read_csv(os.path.join(PATH,'mlboot_train_answers.tsv'), delimiter='\t').set_index('cuid'), on='cuid', how='inner')

In [5]:
train_df.head()

Unnamed: 0,cuid,cat,cnt1,cnt2,cnt3,data_diff,target
0,d241d1f6f360965c303a30f8b471b095,1,"{""2025582"":1,""2048310"":2,""651030"":1,""1108422"":...","{""31297"":1,""28598"":1,""504988"":1,""6498"":1,""1438...",{},15,0
1,d241d1f6f360965c303a30f8b471b095,1,"{""38462"":2,""2048310"":3,""1108422"":1,""2025582"":1...","{""264772"":2,""124932"":2,""31297"":1,""19791"":1,""18...",{},16,0
2,d241d1f6f360965c303a30f8b471b095,1,"{""2025582"":4}","{""552308"":3,""186181"":2,""28598"":4,""504988"":3,""1...",{},17,0
3,d241d1f6f360965c303a30f8b471b095,1,"{""304818"":2,""815630"":1,""2048310"":4,""1108422"":1...","{""124932"":2,""31297"":1,""186181"":2,""14263"":1,""59...",{},18,0
4,d241d1f6f360965c303a30f8b471b095,1,"{""1184422"":1,""2048310"":4,""809001"":1,""2025582"":...","{""31297"":1,""19791"":1,""186181"":2,""599535"":2,""97...",{},19,0


** tf-idf **
Номер счетчика + значение

In [10]:
train_df['cnt1'] = train_df['cnt1'].apply(lambda x: x[1:-1])

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
tokenizer = lambda doc: doc[1:-1].split(',')
tf_idf = TfidfVectorizer(tokenizer = tokenizer, max_df = 0.75, min_df = 0.01, ngram_range=(1,1))

In [12]:
new_df_cnt1 = train_df.groupby(['cuid'])['cnt1'].sum()
tf_idf_cnt1 = tf_idf.fit_transform(new_df_cnt1)

In [15]:
target = train_df.groupby(['cuid'])['target'].min()

In [16]:
from sklearn.linear_model import LogisticRegression

In [17]:
X_train, X_test, y_train, y_test = train_test_split(tf_idf_cnt1, target, test_size=0.2, random_state=17)

In [23]:
lr = LogisticRegression(penalty='l2', C=1)
cv = cross_val_score(lr,X_train, y_train, scoring='roc_auc', cv=5, n_jobs=-1)
print (cv.mean(), cv.std())
lr.fit(X_train,y_train)
pred = lr.predict_proba(X_test)
print (roc_auc_score(y_test,pred[:, 1]))

(0.5782739699960068, 0.05846832494114528)
0.5024750247502474


** tf-idf2 **
Смотрим просто номер счетчика

In [29]:
train_df['cnt1'][0].split(',')

['"2025582":1', '"2048310":2', '"651030":1,"1108422":2,"412027":1']

In [30]:
import re

In [33]:
re.findall(r'"\d+"', train_df['cnt1'][0])

['"2025582"', '"2048310"', '"651030"', '"1108422"', '"412027"']

In [35]:
' '.join(re.findall(r'"\d+"', train_df['cnt1'][0]))

'"2025582" "2048310" "651030" "1108422" "412027"'

In [41]:
train_df['cnt1'] = train_df['cnt1'].apply(lambda x: ' '.join(re.findall(r'"\d+"', x)))

In [44]:
tokenizer = lambda doc: doc[1:-1].split(' ')
tf_idf = TfidfVectorizer(tokenizer = tokenizer, max_df = 0.75, min_df = 0.01, ngram_range=(1,1))

new_df_cnt1 = train_df.groupby(['cuid'])['cnt1'].sum()
tf_idf_cnt1 = tf_idf.fit_transform(new_df_cnt1)

In [46]:
X_train, X_test, y_train, y_test = train_test_split(tf_idf_cnt1, target, test_size=0.2, random_state=17)
lr = LogisticRegression(penalty='l2', C=1)
cv = cross_val_score(lr,X_train, y_train, scoring='roc_auc', cv=5, n_jobs=-1)
print (cv.mean(), cv.std())
lr.fit(X_train,y_train)
pred = lr.predict_proba(X_test)
print (roc_auc_score(y_test,pred[:, 1]))

(0.6046213376523429, 0.046087909125584074)
0.45657456574565747


** А если ещё отсортировать **

In [66]:
train_df.head()

Unnamed: 0,cuid,cat,cnt1,cnt2,cnt3,data_diff,target
0,d241d1f6f360965c303a30f8b471b095,1,"{""2025582"":1,""2048310"":2,""651030"":1,""1108422"":...","{""31297"":1,""28598"":1,""504988"":1,""6498"":1,""1438...",{},15,0
1,d241d1f6f360965c303a30f8b471b095,1,"{""38462"":2,""2048310"":3,""1108422"":1,""2025582"":1...","{""264772"":2,""124932"":2,""31297"":1,""19791"":1,""18...",{},16,0
2,d241d1f6f360965c303a30f8b471b095,1,"{""2025582"":4}","{""552308"":3,""186181"":2,""28598"":4,""504988"":3,""1...",{},17,0
3,d241d1f6f360965c303a30f8b471b095,1,"{""304818"":2,""815630"":1,""2048310"":4,""1108422"":1...","{""124932"":2,""31297"":1,""186181"":2,""14263"":1,""59...",{},18,0
4,d241d1f6f360965c303a30f8b471b095,1,"{""1184422"":1,""2048310"":4,""809001"":1,""2025582"":...","{""31297"":1,""19791"":1,""186181"":2,""599535"":2,""97...",{},19,0


In [55]:
train_df.sort_values(by='data_diff', ascending=True, inplace=True)

In [56]:
new_df_cnt1 = train_df.groupby(['cuid'])['cnt1'].sum()
tf_idf_cnt1 = tf_idf.fit_transform(new_df_cnt1)

In [57]:
X_train, X_test, y_train, y_test = train_test_split(tf_idf_cnt1, train_df.groupby(['cuid'])['target'].min(), test_size=0.2, random_state=17)
lr = LogisticRegression(penalty='l2', C=1)
cv = cross_val_score(lr,X_train, y_train, scoring='roc_auc', cv=5, n_jobs=-1)
print (cv.mean(), cv.std())
lr.fit(X_train,y_train)
pred = lr.predict_proba(X_test)
print (roc_auc_score(y_test,pred[:, 1]))

(0.6044909548547224, 0.046015901032640395)
0.4574445744457445


** DictVectorizer **

Запоминаем номера строк для каждого cuid, use DictVectorizer

In [6]:
from sklearn.feature_extraction import DictVectorizer

In [24]:
train_df['int_id'] = range(len(train_df))
train_df.head()

Unnamed: 0,cuid,cat,cnt1,cnt2,cnt3,data_diff,target,int_id
0,d241d1f6f360965c303a30f8b471b095,1,"{u'1108422': 2, u'651030': 1, u'412027': 1, u'...","{""31297"":1,""28598"":1,""504988"":1,""6498"":1,""1438...",{},15,0,0
1,d241d1f6f360965c303a30f8b471b095,1,"{u'1108422': 1, u'1661409': 1, u'2025582': 1, ...","{""264772"":2,""124932"":2,""31297"":1,""19791"":1,""18...",{},16,0,1
2,d241d1f6f360965c303a30f8b471b095,1,{u'2025582': 4},"{""552308"":3,""186181"":2,""28598"":4,""504988"":3,""1...",{},17,0,2
3,d241d1f6f360965c303a30f8b471b095,1,"{u'1108422': 1, u'1673256': 1, u'2025582': 2, ...","{""124932"":2,""31297"":1,""186181"":2,""14263"":1,""59...",{},18,0,3
4,d241d1f6f360965c303a30f8b471b095,1,"{u'1474584': 1, u'1849384': 1, u'1673256': 1, ...","{""31297"":1,""19791"":1,""186181"":2,""599535"":2,""97...",{},19,0,4


In [11]:
train_df['cnt1'] = train_df['cnt1'].apply(lambda x: json.loads(x))

In [29]:
row_dict = dict()
for i in train_df['cuid'].unique():
    row_dict[i] = list(train_df[train_df['cuid']==i]['int_id'].values)

In [13]:
# Use DictVectorizer
dv = DictVectorizer(separator=':')
dv_cnt1 = dv.fit_transform(train_df['cnt1'])

In [39]:
for key in row_dict.keys():
    

['d349fc7cab24e1c2545c5660b4823950',
 'd339d1dd96ad066d722bc1e1bdde3d9e',
 'd491d87d56d1be4c23a58a697d4446b9',
 'd383d9f19132c89fb15bfeeea2301510',
 'd2ba56677159f7b32a5a31d0b2c12fb7',
 'd324ed580fb94a3aeddb87fb48250e1d',
 'd382058074ba55b36332eb35c78a5337',
 'd289cf50fa605db482daf1289cc0120a',
 'd49dc3aad5d7d7eff4a1669b192b2b64',
 'd4ad24bc039d7ae351e063265cedc32b',
 'd47c4b42a11d508284e630c4662b0e24',
 'd2c827f2fbab87cbb12140fbfca14f81',
 'd4be8eada8d4aeb5ea7d03edacaf9c6d',
 'd341bdc28387b62b38b7a87fb02bd7c6',
 'd3f3a2688783bd5ec0a5603ae6c6a5ca',
 'd276cf8f91ec88023346d2cbfe79b56f',
 'd40a7ced6ac1e8ae9ceee0b3abc4b8e8',
 'd2752c5afc649d79f88ed1a3e7b2965a',
 'd2a92a9b815af3f496a65c6523618d9a',
 'd28c9b6edf7a1683001a8ea9b73a6c7a',
 'd3615f3bf75c27c603ff80afa1fc468c',
 'd4510a60b2e6fbddc818f5726c4dddd4',
 'd2caf44ef1965cdcc320e41b665e355c',
 'd25d5875909314c6ff8783f7a4ce2d18',
 'd3bef1c1cf7a58ae6e853e69f6c3dfa0',
 'd4821d713aee890cb68d2dbdd7b284f4',
 'd33041d73e6711edab5f3186298b192d',
 

In [47]:
np.sum(dv_cnt1[row_dict[row_dict.keys()[0]]], axis=0).shape

(1, 142250)

In [18]:
csr_matrix.todense(dv_cnt1[[0,2]]+dv_cnt1[[0,1]])


matrix([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]])

In [37]:
hstack([train_df['int_id'].reshape(-1,1), dv_cnt1])

  """Entry point for launching an IPython kernel.


<128430x142251 sparse matrix of type '<type 'numpy.float64'>'
	with 1268090 stored elements in COOrdinate format>

In [34]:
train_df['cuid'].reshape(-1,1).shape

  """Entry point for launching an IPython kernel.


(128430, 1)

In [32]:
dv_cnt1.shape

(128430, 142250)