In [106]:
import pandas as pd
import pickle
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
import numpy as np

import warnings


In [3]:
# Training sessions: indexed by session_id, with the `time1` column parsed as
# a datetime so the frame can be sorted chronologically further down.
alice_df = pd.read_csv(
    "train_sessions.csv",
    index_col='session_id',
    parse_dates=['time1'],
)
# Test sessions: same index column; no date parsing is requested here.
test_df = pd.read_csv(
    "test_sessions.csv",
    index_col='session_id',
)

In [5]:
# Data manipulation
# Names of the ten site columns: site1 ... site10.
sites = ['site{}'.format(i) for i in range(1, 11)]

# Replace missing site values with 0 and cast every site column to integer.
# The same transformation is applied to the training and the test frame.
alice_df[sites] = alice_df[sites].fillna(0).astype(int)
test_df[sites] = test_df[sites].fillna(0).astype(int)

# Put the training sessions in chronological order (ascending `time1`).
alice_df = alice_df.sort_values('time1')

In [6]:
# Load the websites dictionary from disk.
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only open site_dic.pkl when it comes from a trusted source.
with open(r'site_dic.pkl', 'rb') as pkl_file:
    site_dict = pickle.load(pkl_file)

# Two-column lookup table built from the dictionary: its values populate
# `site_id` and its keys populate `site_name`.
site_ids = list(site_dict.values())
site_names = list(site_dict.keys())
site_df = pd.DataFrame({'site_id': site_ids, 'site_name': site_names})

In [121]:
# Use only the site columns as features. Each session is written out as a
# line of text whose "words" are site ids, and token counts are the features.
# ngram_range=(1, 3) counts runs of 1 to 3 consecutive tokens;
# max_features=50000 caps the size of the vocabulary.
# fit_transform learns the vocabulary from the training text, whereas
# transform re-uses that learned vocabulary for the test text.
# NOTE(review): CountVectorizer's default token_pattern only keeps tokens of
# two or more characters, so single-digit ids (including the 0 used to fill
# missing sites) appear to be ignored -- confirm this is intended.
vectorizer = CountVectorizer(ngram_range=(1, 3), max_features=50000)

# Dump every session as one space-separated line, without index or header.
for frame, text_path in ((alice_df, 'train_session_text.txt'),
                         (test_df, 'test_session.txt')):
    frame[sites].fillna(0).to_csv(text_path, sep=' ', index=False, header=False)

# Iterating over the open file yields one line (= one session) per document.
with open('train_session_text.txt') as train_text:
    train_site_cnt_matrix = vectorizer.fit_transform(train_text)

with open('test_session.txt') as test_text:
    test_site_cnt_matrix = vectorizer.transform(test_text)

In [114]:
def get_auc_lr_valid(X, y, C=1.0, seed=17, ratio=0.9):
    """Fit logistic regression on the head of the data and report hold-out AUC.

    The first ``ratio`` share of the rows is used for training and the
    remaining rows for validation. Rows are split by position; nothing is
    shuffled.

    Parameters
    ----------
    X : feature matrix supporting row slicing and ``.shape``.
    y : labels aligned with the rows of ``X``.
    C : regularisation parameter forwarded to ``LogisticRegression``.
    seed : value forwarded as ``random_state``.
    ratio : fraction of rows, counted from the start, used for training.

    Returns
    -------
    The fitted ``LogisticRegression`` model. The ROC AUC on the validation
    rows is printed as a percentage; it is not returned.
    """
    split_at = int(round(X.shape[0] * ratio))
    X_fit, X_valid = X[:split_at], X[split_at:]
    y_fit, y_valid = y[:split_at], y[split_at:]

    clf = LogisticRegression(C=C, random_state=seed, solver='lbfgs',
                             max_iter=500, n_jobs=-1)
    clf.fit(X_fit, y_fit)

    # roc_auc_score is given predicted probabilities (column 1 of
    # predict_proba), not hard 0/1 predictions.
    valid_scores = clf.predict_proba(X_valid)[:, 1]
    auc = roc_auc_score(y_valid, valid_scores)
    print('ROC value: {: .2f} %'.format(auc * 100))

    return clf
    
    
def write_to_submission_file(predict_labels, out_file, target='target', index_label='session_id'):
    """Write predictions to a CSV submission file.

    Rows are labelled 1..N in the order in which the predictions are given.

    Parameters
    ----------
    predict_labels : array-like of N predictions (ndarray, list, Series, ...).
    out_file : path or writable text buffer handed to ``DataFrame.to_csv``.
    target : name of the prediction column.
    index_label : header written for the index column.
    """
    # Coerce to a plain ndarray first: a list has no ``.shape``, and a pandas
    # Series would be aligned on its own index labels rather than by position
    # once the new 1..N index is supplied.
    values = np.asarray(predict_labels)
    row_ids = np.arange(1, values.shape[0] + 1)
    predict_df = pd.DataFrame(values, index=row_ids, columns=[target])
    predict_df.to_csv(out_file, index_label=index_label)

In [120]:
%%time
# Target labels as a NumPy array, in the current row order of alice_df.
y = alice_df['target'].values
# Fit on the leading 90% of rows (the function's default ratio), print the
# ROC AUC measured on the remaining rows, and keep the fitted model.
model = get_auc_lr_valid(train_site_cnt_matrix, y)




ROC value:  91.33 %
Wall time: 11.2 s


In [117]:
# Refit on the full training matrix (no hold-out), using the same
# hyper-parameters as the validation run, then score the test sessions.
model = LogisticRegression(C=1.0, random_state=17, solver='lbfgs',
                           max_iter=500, n_jobs=-1)
model.fit(train_site_cnt_matrix, y)

# Keep column 1 of predict_proba, i.e. the probability of the second class.
y_test_predict = model.predict_proba(test_site_cnt_matrix)[:, 1]
write_to_submission_file(y_test_predict, out_file='BW_submission.csv')

In [111]:
# Sanity check on the vectorised test set: one row per line of the test text
# file, one column per vocabulary term (capped by max_features=50000).
test_site_cnt_matrix.shape

(82797, 50000)

In [118]:
# Function for writing predictions to a file
def write_to_submission_file(predicted_labels, out_file,
                             target='target', index_label="session_id"):
    """Save predictions as a CSV file with a 1-based row id column.

    NOTE(review): a function with this name is also defined earlier in the
    notebook; whichever definition is executed last is the one in effect.

    Parameters
    ----------
    predicted_labels : array-like of N predictions (ndarray, list, Series, ...).
    out_file : path or writable text buffer handed to ``DataFrame.to_csv``.
    target : name of the prediction column.
    index_label : header written for the index column.
    """
    # Convert to an ndarray up front so that lists (which lack ``.shape``)
    # are accepted and a pandas Series is taken by position instead of being
    # aligned on its own index labels against the new 1..N index.
    predictions = np.asarray(predicted_labels)
    predicted_df = pd.DataFrame(predictions,
                                index=np.arange(1, predictions.shape[0] + 1),
                                columns=[target])
    predicted_df.to_csv(out_file, index_label=index_label)


In [122]:
# Final model: fit on the entire training set.
# random_state=17 keeps the run reproducible; C=1.0 is already the default
# value and is spelled out only for clarity.
lr = LogisticRegression(C=1.0, random_state=17, solver='lbfgs', max_iter=500)
lr.fit(train_site_cnt_matrix, y)

# Predicted probabilities (column 1 of predict_proba) for the test set
y_test = lr.predict_proba(test_site_cnt_matrix)[:, 1]

# Store them in a file that can be submitted
write_to_submission_file(y_test, out_file='baseline_1.csv')