In [10]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
# Show which competition files are available before loading anything.
input_files = os.listdir("../input")
print(input_files)

# Any results you write to the current directory are saved as output.

['train_sessions.csv', 'test_sessions.csv', 'train.zip', 'site_dic.pkl', 'sample_submission.csv']


## Tools

In [11]:
import pandas as pd
import numpy as np
import re

def write_to_submission_file(predicted_labels, out_file, target='target', index_label='session_id'):
    """Write predictions to a CSV file in submission format.

    Rows are labelled 1..N in the order the predictions are given.

    Parameters
    ----------
    predicted_labels : array-like
        Predictions, one per row (NumPy array, list or pandas Series).
    out_file : str or file-like
        Destination passed straight to ``DataFrame.to_csv``.
    target : str
        Name of the prediction column.
    index_label : str
        Header used for the 1-based row-id column.
    """
    # np.asarray lets plain lists through (they have no .shape) and strips any
    # pandas index, so values are labelled by position instead of being
    # re-aligned against the 1..N index below.
    values = np.asarray(predicted_labels)
    predicted_df = pd.DataFrame(values,
                                index=np.arange(1, values.shape[0] + 1),
                                columns=[target])

    predicted_df.to_csv(out_file, index_label=index_label)


def get_domen(url):
    """Return the text after the last dot of ``url``, or 'ip' when it has no ASCII letters."""
    has_letter = re.search('[a-zA-Z]', url) is not None
    return url.rsplit('.', 1)[-1] if has_letter else 'ip'


def get_part_of_day(x):
    """Map the ``hour`` attribute of ``x`` to a day-part code.

    0: hours 0-11, 1: hours 12-14, 2: hours 15-17, 3: hours 18-20,
    4: everything else.
    """
    hour = int(x.hour)
    if hour < 0:
        return 4
    # Exclusive upper bounds of buckets 0..3, checked in ascending order.
    for bucket, upper_bound in enumerate((12, 15, 18, 21)):
        if hour < upper_bound:
            return bucket
    return 4


def one_site_mean_duration(x):
    """Mean gap, in seconds, between consecutive non-null timestamps in ``x``.

    Parameters
    ----------
    x : iterable of timestamps
        Visit times of one session in order; null entries (NaT/None) are skipped.

    Returns
    -------
    The mean of the consecutive differences in seconds, or 0 when fewer than
    two timestamps are present.
    """
    visit_times = [moment for moment in x if not pd.isnull(moment)]

    # total_seconds() keeps whole days and the sign of the gap; the ``seconds``
    # attribute only holds the within-day remainder. This also matches how the
    # session-duration feature is computed.
    gaps = [(later - earlier).total_seconds()
            for earlier, later in zip(visit_times, visit_times[1:])]

    if gaps:
        return np.mean(gaps)

    return 0


def count_alice_top_sites(top_sites, x):
    """Count how many entries of ``x`` (repeats included) are members of ``top_sites``."""
    return sum(1 for site in x if site in top_sites)

## Preprocessing

In [12]:
import pickle
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from scipy.sparse import hstack
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import TruncatedSVD
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier


def base_preprocessing(train, test):
    """Parse timestamps, fill missing sites and split features from the target.

    Parameters
    ----------
    train : pandas.DataFrame
        Frame with columns ``site1..site10`` and ``time1..time10``; its last
        column is returned as the target.
    test : pandas.DataFrame
        Frame with the same ``site*`` / ``time*`` columns.

    Returns
    -------
    X_train : pandas.DataFrame
        ``train`` sorted by ``time1``, without its last column.
    test : pandas.DataFrame
        Converted copy of ``test`` in its original row order.
    y_train : pandas.Series
        Last column of the sorted ``train``.
    """
    # Work on copies so the caller's frames stay untouched and the cell can be
    # re-run against the raw data.
    train = train.copy()
    test = test.copy()

    times = ['time%s' % i for i in range(1, 11)]
    train[times] = train[times].apply(pd.to_datetime)
    test[times] = test[times].apply(pd.to_datetime)

    train = train.sort_values(by='time1')

    # Missing site slots become id 0 so the columns can be stored as integers.
    sites = ['site%s' % i for i in range(1, 11)]
    train[sites] = train[sites].fillna(0).astype('int')
    test[sites] = test[sites].fillna(0).astype('int')

    X_train = train.iloc[:, :-1]
    y_train = train.iloc[:, -1]

    return X_train, test, y_train

def _join_row_tokens(frame):
    """Render every row of ``frame`` as one space-separated string."""
    return pd.Series([' '.join(map(str, row)) for row in frame.values], index=frame.index)


def _one_hot_train_test(train_values, test_values):
    """One-hot encode both series using the categories observed in ``train_values``."""
    train_cat = train_values.astype('category')
    # Test values unseen in train become missing categories, i.e. all-zero rows,
    # so both outputs always have the same columns.
    test_cat = pd.Series(pd.Categorical(test_values, categories=train_cat.cat.categories),
                         index=test_values.index)
    return pd.get_dummies(train_cat), pd.get_dummies(test_cat)


def _standardize_train_test(train_frame, test_frame):
    """Fit a StandardScaler on ``train_frame`` and apply it to both frames."""
    scaler = StandardScaler().fit(train_frame)
    return scaler.transform(train_frame), scaler.transform(test_frame)


def main_preprocessing(X_train, X_test, y_train):
    '''
    Build the sparse train/test feature matrices.

    Sites TfidfVectorizer + Domens CountVectorizer + Start Hour (OHE) + Start Session Time (yyyy/mm) (OHE/OHE) + Start Part Of Day (OHE) + Start Day Of Week (OHE)
    + Session Duration (seconds) + Mean Time On One Site (Seconds) + Count Alice Top Sites(10) + Start Day Of Year (OHE)

    X_train / X_test are frames with integer site1..site10 and datetime
    time1..time10 columns; y_train holds one label per X_train row, in the
    same row order. Every vectorizer, category set and scaler is fitted on
    the train part only. Returns (X_train, X_test) as CSR matrices.
    '''
    sites = ['site%s' % i for i in range(1, 11)]
    times = ['time%s' % i for i in range(1, 11)]

    # NOTE(review): unpickling can execute arbitrary code; only load a trusted site_dic.pkl.
    with open(r'../input/site_dic.pkl', 'rb') as input_file:
        site_dict = pickle.load(input_file)

    # Sites
    X_train_sites = _join_row_tokens(X_train[sites])
    X_test_sites = _join_row_tokens(X_test[sites])

    sites_vectorizer = TfidfVectorizer(max_features=10000, max_df=0.1, ngram_range=(1, 2)).fit(X_train_sites)
    sites_train = sites_vectorizer.transform(X_train_sites)
    sites_test = sites_vectorizer.transform(X_test_sites)

    # Domens: translate every site id to the last label of its URL once, then
    # look ids up per column (ids missing from the dictionary become 'nan').
    domen_by_site_id = {site_id: get_domen(url) for url, site_id in site_dict.items()}

    def to_domen(site_id):
        return domen_by_site_id.get(site_id, 'nan')

    X_train_domens = _join_row_tokens(X_train[sites].apply(lambda column: column.map(to_domen)))
    X_test_domens = _join_row_tokens(X_test[sites].apply(lambda column: column.map(to_domen)))

    domens_vectorizer = CountVectorizer(max_df=0.1, min_df=0.02).fit(X_train_domens)
    domens_train = domens_vectorizer.transform(X_train_domens)
    domens_test = domens_vectorizer.transform(X_test_domens)

    # Calendar features of the session start (all one-hot encoded)
    start_train = X_train['time1']
    start_test = X_test['time1']

    start_hour_train, start_hour_test = _one_hot_train_test(start_train.dt.hour, start_test.dt.hour)
    start_year_train, start_year_test = _one_hot_train_test(start_train.dt.year, start_test.dt.year)
    start_month_train, start_month_test = _one_hot_train_test(start_train.dt.month, start_test.dt.month)
    part_of_day_train, part_of_day_test = _one_hot_train_test(start_train.apply(get_part_of_day),
                                                              start_test.apply(get_part_of_day))
    day_of_week_train, day_of_week_test = _one_hot_train_test(start_train.dt.dayofweek, start_test.dt.dayofweek)
    day_of_year_train, day_of_year_test = _one_hot_train_test(start_train.dt.dayofyear, start_test.dt.dayofyear)

    # Session Duration (Seconds)
    X_train_times = X_train[times]
    X_test_times = X_test[times]

    duration_train, duration_test = _standardize_train_test(
        pd.DataFrame((X_train_times.max(axis=1) - X_train_times.min(axis=1)).dt.total_seconds()),
        pd.DataFrame((X_test_times.max(axis=1) - X_test_times.min(axis=1)).dt.total_seconds()))

    # Mean Time On One Site (Seconds)
    one_site_mean_duration_train, one_site_mean_duration_test = _standardize_train_test(
        pd.DataFrame(X_train_times.apply(one_site_mean_duration, axis=1)),
        pd.DataFrame(X_test_times.apply(one_site_mean_duration, axis=1)))

    # Count Alice Top Sites(10)
    # Select the positive sessions with a positional boolean mask: the frame is
    # indexed by session id and sorted by time, so index labels are not row positions.
    is_alice = np.asarray(y_train) == 1
    alice_sites = X_train.loc[is_alice, sites].stack().value_counts()
    # value_counts is ordered by frequency and indexed by site id. Id 0 is the
    # placeholder for an empty slot, not a real site, so it is excluded explicitly.
    alice_top_sites = set(alice_sites.drop(0, errors='ignore').index[:10])

    alice_sites_train, alice_sites_test = _standardize_train_test(
        pd.DataFrame(X_train[sites].apply((lambda x: count_alice_top_sites(alice_top_sites, x)), axis=1)),
        pd.DataFrame(X_test[sites].apply((lambda x: count_alice_top_sites(alice_top_sites, x)), axis=1)))

    # Prefinal concat
    X_train = csr_matrix(hstack([sites_train, domens_train, start_hour_train, start_year_train, start_month_train, part_of_day_train, day_of_week_train, duration_train, one_site_mean_duration_train, alice_sites_train, day_of_year_train]))
    X_test = csr_matrix(hstack([sites_test, domens_test, start_hour_test, start_year_test, start_month_test, part_of_day_test, day_of_week_test, duration_test, one_site_mean_duration_test, alice_sites_test, day_of_year_test]))

    return X_train, X_test

## MAIN

In [None]:
import warnings
import pandas as pd
import numpy as np
warnings.filterwarnings('ignore')

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from scipy import io
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import RandomOverSampler

# Load the raw sessions and build the feature matrices.
train = pd.read_csv('../input/train_sessions.csv', index_col='session_id')
test = pd.read_csv('../input/test_sessions.csv', index_col='session_id')

X_train, X_test, y_train = base_preprocessing(train, test)
X_train, X_test = main_preprocessing(X_train, X_test, y_train)

# Balance the classes with borderline SMOTE. One pass is enough: a further
# RandomOverSampler pass with default settings adds no rows once the class
# counts are already equal.
# NOTE(review): newer imbalanced-learn releases appear to spell this
# BorderlineSMOTE(...) / fit_resample(...) — confirm against the installed version.
smote = SMOTE(kind='borderline1', random_state=17)
X_train, y_train = smote.fit_sample(X_train, y_train)

# Best configuration recorded from an earlier evolutionary hyper-parameter search:
# {'penalty': 'l2', 'max_iter': 100, 'solver': 'sag', 'class_weight': 'balanced', 'C': 37} with fitness: 0.9947851116234038

# SUBMIT
# Seeded like the other stochastic steps so the submission is reproducible.
logit = LogisticRegression(n_jobs=-1, random_state=17)
logit.fit(X_train, y_train)

predictions = logit.predict_proba(X_test)[:, 1]

write_to_submission_file(predictions, 'submission_1.csv')

In [None]:
# NOTE(review): this repeats the export done at the end of the previous cell, but
# targets ../input/, which the notebook header describes as the input-data
# directory (results are expected in the current directory) — confirm this
# location is writable and that the second copy is intended.
write_to_submission_file(predictions, '../input/submission_1.csv')

In [None]:
# Grid search over the regularisation strength C of a class-weighted logistic
# regression, scored by ROC AUC with 3-fold cross-validation.
# GridSearchCV is taken from sklearn.model_selection (as in the cell above);
# the legacy sklearn.grid_search module is no longer shipped with scikit-learn.
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ShuffleSplit
lr = LogisticRegression(n_jobs=-1)
params = {'penalty': ['l2'],
          'C': np.logspace(-3, 10, 20),
          'class_weight': ['balanced']}
gcv = GridSearchCV(estimator=lr,
                   param_grid=params,
                   scoring='roc_auc',
                   cv=3,
                   verbose=True)
gcv.fit(X_train, y_train)
# Report from the fitted search object (the original referenced an undefined name `gsv`).
print(gcv.best_params_, gcv.best_score_)