In [7]:
import pandas as pd
import pickle
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
import eli5
import numpy as np

import warnings


In [2]:
def site_sparse_matrix(path_to_train, path_to_test, pickle_file, vector_params):
    
    # all site columns
    sites = ['site%d' %i for i in range(1,11)]
    times = ['time%d' %i for i in range(1,11)]
    
    train_df = pd.read_csv(path_to_train, index_col='session_id', parse_dates=times)
    test_df = pd.read_csv(path_to_test, index_col='session_id', parse_dates=times)
    
    train_df = train_df.sort_values(by='time1')
    # clean data 
    train_df[sites] = train_df[sites].fillna(0).astype('int')
    test_df[sites] = test_df[sites].fillna(0).astype('int')
    
    train_df[times] = train_df[times].fillna(0)
    test_df[times] = test_df[times].fillna(0)
    
    with open(pickle_file,'rb') as input_file:
        site_id_dict = pickle.load(input_file)
    
    site_map={}
    for _name in site_id_dict:
        site_map[site_id_dict[_name]] = _name
    
    site_map[0]='unknown'

    #convert site id -> site name
    # it will help us to analysis after EDA
    # pd.dataframe .apply (axis=1) mean process each row
    train_session = train_df[sites].apply(lambda row: ' '.join([site_map[i] for i in row]), axis=1).tolist()
    test_session = test_df[sites].apply(lambda row: ' '.join([site_map[i] for i in row]), axis=1).tolist()
    
    # have word list now process bag of word 
    vectorizer = TfidfVectorizer(**vector_params)
    X_train = vectorizer.fit_transform(train_session)
    X_test = vectorizer.transform(test_session)
    y_train = train_df['target'].astype('int').values
    train_times = train_df[times]
    test_times = test_df[times]
    
    
    return X_train, X_test, y_train, vectorizer, train_times, test_times

# Function for writing predictions to a file
def write_to_submission_file(predicted_labels, out_file,
                             target='target', index_label="session_id"):
    predicted_df = pd.DataFrame(predicted_labels,
                                index = np.arange(1, predicted_labels.shape[0] + 1),
                                columns=[target])
    predicted_df.to_csv(out_file, index_label=index_label)
    
    

In [3]:
# prepare site sparse matrix
train_session, test_session, y_train, tfv, train_times, test_times = site_sparse_matrix('train_sessions.csv','test_sessions.csv','site_dic.pkl',
                                                                                       vector_params={'ngram_range':(1,5),'max_features':50000,'tokenizer':lambda s: s.split(" ")})             

In [8]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_score,TimeSeriesSplit

logit = LogisticRegression(C=1.0, random_state=17, solver='liblinear', max_iter=100, n_jobs=-1)
time_split = TimeSeriesSplit(n_splits=10)
cv_scores1 = cross_val_score(logit, train_session, y_train, cv=time_split, 
                            scoring='roc_auc', n_jobs=4)
print(cv_scores1)
logit.fit(train_session, y_train)
eli5.show_weights(estimator=logit, feature_names=tfv.get_feature_names(), top=30)

[0.83124023 0.65993466 0.85673565 0.92824237 0.84777206 0.88954524
 0.88829128 0.87710523 0.92023038 0.92624125]


Weight?,Feature
+5.880,youwatch.org
+5.380,cid-ed6c3e6a5c6608a4.users.storage.live.com
+5.222,fr.glee.wikia.com
+5.114,vk.com
+4.875,www.info-jeunes.net
+4.499,www.banque-chalus.fr
+4.220,www.express.co.uk
+4.147,www.audienceinsights.net
+4.089,www.melty.fr
+4.003,glee.hypnoweb.net


In [75]:
y_test_true = logit.predict_proba(test_session)[:,1]
write_to_submission_file(y_test_true, 'baseline2.csv')

In [5]:
test_session.shape

(82797, 50000)

In [5]:
# only use site feature
# For text feature, we want to use test count as feature for training
# ngram_range -> how many vocabulary for each element in matrix (1,3)-> 1 to 3 words
# fit & fit_transform
vectorizer = CountVectorizer(ngram_range=(1, 5), max_features=50000)

alice_df[sites].fillna(0).to_csv('train_session_text.txt', sep=' ', index=None, header=None)
test_df[sites].fillna(0).to_csv('test_session.txt', sep=' ', index=None, header=None)

with open('train_session_text.txt') as input_train_file:
    train_site_cnt_matrix = vectorizer.fit_transform(input_train_file)
    
with open('test_session.txt') as inp_test_file:
    test_site_cnt_matrix = vectorizer.transform(inp_test_file)

In [6]:
def get_auc_lr_valid(X, y, C=1.0, seed=17, ratio=0.9):
    train_index = int(round(X.shape[0]*ratio))
    X_train, y_train = X[:train_index], y[:train_index]
    lr = LogisticRegression(random_state=seed, C=C, n_jobs=-1, solver='lbfgs',max_iter=500).fit(X_train, y_train)
    y_predict = lr.predict_proba(X[train_index:])[:,1]
    # Note: roc_auc_score input y_label and y-predict proba not binary value
    roc_auc_val = roc_auc_score(y[train_index:], y_predict)
    print('ROC value: {: .2f} %'.format(roc_auc_val*100))
    
    return lr
    
    
def write_to_submission_file(predict_labels, out_file, target='target', index_label='session_id'):
    predict_df = pd.DataFrame(predict_labels, index=np.arange(1,predict_labels.shape[0]+1), columns=[target])
    predict_df.to_csv(out_file, index_label=index_label)   

In [7]:
%%time
y = alice_df['target'].values
model = get_auc_lr_valid(train_site_cnt_matrix, y)




ROC value:  91.15 %
Wall time: 13 s


In [117]:
model = LogisticRegression(random_state=17, C=1.0, n_jobs=-1, solver='lbfgs',max_iter=500).fit(train_site_cnt_matrix, y)
y_test_predict = model.predict_proba(test_site_cnt_matrix)[:,1]
write_to_submission_file(y_test_predict, 'BW_submission.csv')

In [111]:
test_site_cnt_matrix.shape

(82797, 50000)

In [118]:
# Function for writing predictions to a file
def write_to_submission_file(predicted_labels, out_file,
                             target='target', index_label="session_id"):
    predicted_df = pd.DataFrame(predicted_labels,
                                index = np.arange(1, predicted_labels.shape[0] + 1),
                                columns=[target])
    predicted_df.to_csv(out_file, index_label=index_label)


In [122]:
# Train the model on the whole training data set
# Use random_state=17 for reproducibility
# Parameter C=1 by default, but here we set it explicitly
lr = LogisticRegression(C=1.0, random_state=17, solver='lbfgs', max_iter=500).fit(train_site_cnt_matrix, y)

# Make a prediction for test data set
y_test = lr.predict_proba(test_site_cnt_matrix)[:, 1]

# Write it to the file which could be submitted
write_to_submission_file(y_test, 'baseline_1.csv')

In [1]:
vectorizer = CountVectorizer(max_features=10)

NameError: name 'CountVectorizer' is not defined

In [8]:
a = {'1':2,'2':3}

In [13]:
for i in zip(*a):
    print(i)

('1', '2')


In [14]:
# Load websites dictionary
with open(r'site_dic.pkl','rb') as input_file:
    site_dict = pickle.load(input_file)

In [22]:
df = pd.DataFrame({'site_id': list(site_dict.values()), 'site_name': list(site_dict.keys())})

In [27]:
alice_df[sites] = alice_df[sites].apply(lambda )

Unnamed: 0_level_0,site1,time1,site2,time2,site3,time3,site4,time4,site5,time5,...,time6,site7,time7,site8,time8,site9,time9,site10,time10,target
session_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
21669,56,2013-01-12 08:05:57,55,2013-01-12 08:05:57,0,,0,,0,,...,,0,,0,,0,,0,,0
54843,56,2013-01-12 08:37:23,55,2013-01-12 08:37:23,56,2013-01-12 09:07:07,55,2013-01-12 09:07:09,0,,...,,0,,0,,0,,0,,0
77292,946,2013-01-12 08:50:13,946,2013-01-12 08:50:14,951,2013-01-12 08:50:15,946,2013-01-12 08:50:15,946,2013-01-12 08:50:16,...,2013-01-12 08:50:16,948,2013-01-12 08:50:16,784,2013-01-12 08:50:16,949,2013-01-12 08:50:17,946,2013-01-12 08:50:17,0
114021,945,2013-01-12 08:50:17,948,2013-01-12 08:50:17,949,2013-01-12 08:50:18,948,2013-01-12 08:50:18,945,2013-01-12 08:50:18,...,2013-01-12 08:50:18,947,2013-01-12 08:50:19,945,2013-01-12 08:50:19,946,2013-01-12 08:50:19,946,2013-01-12 08:50:20,0
146670,947,2013-01-12 08:50:20,950,2013-01-12 08:50:20,948,2013-01-12 08:50:20,947,2013-01-12 08:50:21,950,2013-01-12 08:50:21,...,2013-01-12 08:50:21,946,2013-01-12 08:50:21,951,2013-01-12 08:50:22,946,2013-01-12 08:50:22,947,2013-01-12 08:50:22,0


In [35]:
site_df['site_id'==1]

KeyError: False

In [36]:
site_dict

{'www.abmecatronique.com': 25075,
 'groups.live.com': 13997,
 'majeureliguefootball.wordpress.com': 42436,
 'cdt46.media.tourinsoft.eu': 30911,
 'www.hdwallpapers.eu': 8104,
 'img378.imageshack.us': 37491,
 'ecologie.nature.free.fr': 5462,
 'www.ibcn.intec.ugent.be': 35425,
 'kissanime.com': 30151,
 'www.carolineconduiteformation.com': 38268,
 'images.mystockphoto.com': 43641,
 'journalph.csphares.qc.ca': 36959,
 'www.uqo.ca': 40935,
 'd8d94e0wul1nb.cloudfront.net': 12346,
 'openapi.elong.com': 31023,
 'flamenco-o.blogspot.com': 31114,
 'www.pages-annuaire.net': 28165,
 'smart2000.pagesperso-orange.fr': 31877,
 'fast.forbes.com': 31070,
 'i1-js-14-3-01-10077-536503633-i.init.cedexis-radar.net': 12938,
 'i1-js-14-3-01-11074-716595896-i.init.cedexis-radar.net': 42002,
 'www.pacajob.com': 5671,
 'mathaa.epfl.ch': 32074,
 'cbv.sfr.bench.cedexis.com': 7104,
 'fbcdn-sphotos-b-a.akamaihd.net': 1939,
 'www.mystere-tv.com': 6708,
 'www.mon-ip.fr': 43589,
 'www.aqua-passion.com': 45241,
 'reunio

In [42]:
site_map = {}
for _name in site_dict:
    site_map[site_dict[_name]] = _name
site_map[0]='unknown'

In [43]:
train_session = alice_df[sites].apply(lambda row: ' '.join([site_map[i] for i in row]), axis=1).tolist()

In [44]:
train_session

['safebrowsing.clients.google.com safebrowsing-cache.google.com unknown unknown unknown unknown unknown unknown unknown unknown',
 'safebrowsing.clients.google.com safebrowsing-cache.google.com safebrowsing.clients.google.com safebrowsing-cache.google.com unknown unknown unknown unknown unknown unknown',
 'www.apache.org www.apache.org download.eclipse.org www.apache.org www.apache.org www.webtide.com download.oracle.com javadl-esd-secure.oracle.com www.caucho.com www.apache.org',
 'www.webtide.com download.oracle.com www.caucho.com download.oracle.com www.webtide.com www.apache.org public.dhe.ibm.com www.webtide.com www.apache.org www.apache.org',
 'public.dhe.ibm.com jope.ow2.org download.oracle.com public.dhe.ibm.com jope.ow2.org master.dl.sourceforge.net www.apache.org download.eclipse.org www.apache.org public.dhe.ibm.com',
 'master.dl.sourceforge.net public.dhe.ibm.com download.jboss.org www.apache.org public.dhe.ibm.com www.apache.org download.jboss.org dfn.dl.sourceforge.net ww

In [27]:
def func(a,b,c):
    return a+b+c

params = {'a':1,'b': 2,'c':3}

In [28]:
func(**params)

6