In [1]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score, accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
%matplotlib inline
from matplotlib import pyplot as plt
import seaborn as sns

In [2]:
data = pd.read_csv('../../data/mini.csv')
data.head()

Unnamed: 0,customer_id,occurence,cost,item_id
0,416705,2017-05-07 21:58:10.000000,299.0,515274
1,13891,2018-02-10 17:35:11.000000,1090.0,828115
2,9081,2017-12-21 17:13:44.000000,499.0,695501
3,470904,2017-10-31 10:39:49.000000,290.0,899821
4,58500,2018-03-09 20:57:29.000000,150.0,518554


In [3]:
data['occurence'] = pd.to_datetime(data['occurence'])

In [4]:
hz = pd.DataFrame(data.groupby('customer_id')['cost'].mean())
hz['number of transactions'] = data.groupby('customer_id')['cost'].count()
hz['total'] = data.groupby('customer_id')['cost'].sum()

In [5]:
data['month'] = pd.to_datetime(data['occurence']).dt.month
data['year'] = pd.to_datetime(data['occurence']).dt.year


In [6]:
data = data[data['year'] == 2018]
data = data[data['month'] <= 6]
data.head()

Unnamed: 0,customer_id,occurence,cost,item_id,month,year
1,13891,2018-02-10 17:35:11,1090.0,828115,2,2018
4,58500,2018-03-09 20:57:29,150.0,518554,3,2018
5,572181,2018-05-06 20:25:39,990.0,656673,5,2018
7,114804,2018-05-06 13:39:20,3990.0,958574,5,2018
16,568563,2018-06-21 00:57:26,1050.0,969194,6,2018


In [7]:
first_five_month = data[data['month'] <= 5]
# first_five_month = first_five_month.drop_duplicates('customer_id')
first_five_month = first_five_month[['customer_id']].drop_duplicates(subset='customer_id', keep=False)

In [8]:
first_five_month.head()

Unnamed: 0,customer_id
60,568402
61,530299
136,112048
152,561707
207,560111


In [9]:
target_month = data[data['month'] == 6]
# target_month = target_month.drop_duplicates('customer_id')
target_month = target_month[['customer_id']].drop_duplicates(subset='customer_id', keep=False)

In [10]:
target_month['target'] = 'Yes'

In [11]:
target_data = pd.merge(first_five_month, target_month, how='left', on=['customer_id'])
target_data['target'] = (target_data.target.isnull()).astype('int')
target_data.target.value_counts()

1    42830
0     3229
Name: target, dtype: int64

In [12]:
df = pd.merge(target_data, hz, how='inner', on=['customer_id'])
df.head()

Unnamed: 0,customer_id,target,cost,number of transactions,total
0,568402,1,590.0,1,590.0
1,530299,1,500.0,1,500.0
2,112048,1,3858.333333,3,11575.0
3,561707,1,1346.666667,3,4040.0
4,560111,1,3890.0,1,3890.0


In [13]:
y = df['target']
X = df.drop(['customer_id', 'target'], axis=1)

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2)

In [15]:
clf = DecisionTreeClassifier(criterion='entropy', max_depth=3, random_state=17)
clf.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=3,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=17,
            splitter='best')

In [16]:
prediction = clf.predict(X_test)

In [17]:
from ipywidgets import Image
from io import StringIO
import pydotplus
from sklearn.tree import export_graphviz

dot_data = StringIO()
export_graphviz(clf, feature_names=[(i) for i in X_train.columns], 
                out_file=dot_data, filled=True)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
Image(value=graph.create_png())

Image(value=b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x03\x01\x00\x00\x01\xf1\x08\x06\x00\x00\x005\x12\xb0…

In [18]:
accuracy_score(prediction, y_test)

0.9283543204515848

In [24]:
tree_params = {'max_depth': list(range(1, 11)), 
               'min_samples_leaf': list(range(1, 5)),}

tree_grid = GridSearchCV(clf, tree_params,
                         cv=5, n_jobs=-1,
                        verbose=True)
tree_grid.fit(X_train, y_train)

Fitting 5 folds for each of 40 candidates, totalling 200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:    2.9s finished


GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=3,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=17,
            splitter='best'),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 'min_samples_leaf': [1, 2, 3, 4]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=True)

In [25]:
tree_grid.best_params_

{'max_depth': 1, 'min_samples_leaf': 1}

In [26]:
tree_grid.best_score_

0.9302792628979293

In [22]:
prediction = tree_grid.predict(X_test)

In [23]:
accuracy_score(prediction, y_test)

0.9283543204515848