In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.cross_validation import train_test_split
from sklearn.metrics import roc_auc_score

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import Lasso

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

In [2]:
# date-parser
date_parser_func = lambda x: pd.datetime.strptime(x, '%d-%b-%y')

In [3]:
data = pd.read_csv('./data/Train_seers_accuracy.csv', parse_dates=['Transaction_Date', 'DOB'], date_parser=date_parser_func)
sub = pd.read_csv('./data/Sample_K7zT2mf.csv')

In [4]:
data.head()

Unnamed: 0,Transaction_ID,Transaction_Date,Store_ID,Number_of_EMI,Purchased_in_Sale,Var1,Var2,Var3,Client_ID,Gender,DOB,Referred_Friend,Sales_Executive_ID,Sales_Executive_Category,Lead_Source_Category,Payment_Mode,Product_Category,Transaction_Amount
0,TRA98825550,2003-01-01,STO1281,2,N,1,1,1,345821599,F,1971-08-19,NO,SD23011859,B,Advertisment,Credit/Debit Card,Cat A,17455
1,TRA98825710,2003-01-01,STO1247,2,N,1,2,1,345821734,M,1976-07-03,NO,SD23000293,B,Advertisment,Credit/Debit Card,Cat A,16503
2,TRA98823874,2003-01-01,STO1244,2,N,1,1,1,345820365,F,2059-01-04,NO,SD23011768,B,Advertisment,Credit/Debit Card,Cat A,15012
3,TRA98823889,2003-01-01,STO1256,2,N,1,1,1,345820377,M,2060-05-11,NO,SD23011691,B,Advertisment,Credit/Debit Card,Cat A,16051
4,TRA98824521,2003-01-01,STO1445,2,N,1,2,1,345820841,F,2062-03-24,NO,SD23003031,B,Reference,Cheque,Cat A,15108


In [5]:
data.loc[:, 'transaction_year'] = data.Transaction_Date.dt.year
data.loc[:, 'transaction_month'] = data.Transaction_Date.dt.month
data.loc[:, 'store_id_number'] = data.Store_ID.map(lambda x: int(x[3:]))

In [6]:
transaction_count_map = data.Client_ID.value_counts().to_dict()

In [7]:
data.loc[:, 'transaction_count'] = data.Client_ID.map(lambda x: int(transaction_count_map[x] > 1))

In [8]:
mask = (data.Transaction_Date >= pd.to_datetime('2006-01-01')) & (data.Transaction_Date <= pd.to_datetime('2006-12-31'))

In [9]:
data_train = data[~mask]
data_test = data[mask]

In [10]:
data_train.shape, data_test.shape

((364000, 22), (75541, 22))

In [11]:
features = ['Client_ID', 'transaction_year']

In [12]:
X_train = data_train[features]
y_train =  data_train.transaction_count

X_test = data_test[features]
y_test = data_test.transaction_count

In [13]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((364000, 2), (75541, 2), (364000,), (75541,))

In [26]:
model = RandomForestClassifier(n_estimators=100, max_depth=4, n_jobs=-1)
model.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=4, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [27]:
predsTrain = model.predict_proba(X_train)[:, 1]
predsTest = model.predict_proba(X_test)[:, 1]

In [28]:
print 'ROC AUC Score on training set %f ' %(roc_auc_score(y_train, predsTrain))
print 'ROC AUC Score on test set %f ' %(roc_auc_score(y_test, predsTest))

ROC AUC Score on training set 0.657612 
ROC AUC Score on test set 0.648783 


In [29]:
test = data.Client_ID.value_counts().reset_index().rename(columns={'index': 'Client_ID', 'Client_ID': 'freq'})

In [30]:
test.loc[:, 'transaction_year'] = 2007

In [31]:
test.loc[:, 'Cross_Sell'] = model.predict_proba(test[features])[:, 1]

In [32]:
test[['Client_ID', 'Cross_Sell']].to_csv('./submissions/hundred_estimators_max_depth_4.csv', index=False)

In [33]:
test[features]

Unnamed: 0,Client_ID,transaction_year
0,345629580,2007
1,345724132,2007
2,346234361,2007
3,346021065,2007
4,345913383,2007
5,346100533,2007
6,345709003,2007
7,346008305,2007
8,346159301,2007
9,345844309,2007
