In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation

In [2]:
train = pd.read_csv('readyfortrain.csv')

X = train[['regular_points_received_sum',
       'regular_points_received_mean', 'express_points_received_sum',
       'express_points_received_mean', 'regular_points_spent_sum',
       'regular_points_spent_mean', 'express_points_spent_sum',
       'express_points_spent_mean', 'purchase_sum_sum', 'purchase_sum_mean',
       'product_quantity_sum', 'product_quantity_mean', 'trn_sum_from_iss_sum',
       'trn_sum_from_iss_mean', 'trn_sum_from_red_sum',
       'trn_sum_from_red_mean', 'netto_sum', 'netto_mean',
       'is_own_trademark_sum', 'is_own_trademark_mean', 'is_alcohol_sum',
       'is_alcohol_mean', 'age', 'gender', 'dif_date']].values

Y = train['res'].values.astype(int)

test = pd.read_csv('readyforpredict.csv')

test = test.drop(labels = 'dif_date_sum', axis = 1)

test.columns = ['client_id', 'regular_points_received_sum',
       'regular_points_received_mean', 'express_points_received_sum',
       'express_points_received_mean', 'regular_points_spent_sum',
       'regular_points_spent_mean', 'express_points_spent_sum',
       'express_points_spent_mean', 'purchase_sum_sum', 'purchase_sum_mean',
       'product_quantity_sum', 'product_quantity_mean', 'trn_sum_from_iss_sum',
       'trn_sum_from_iss_mean', 'trn_sum_from_red_sum',
       'trn_sum_from_red_mean', 'netto_sum', 'netto_mean',
       'is_own_trademark_sum', 'is_own_trademark_mean', 'is_alcohol_sum',
       'is_alcohol_mean', 'age', 'gender', 'dif_date']

X_pred = test[['regular_points_received_sum',
       'regular_points_received_mean', 'express_points_received_sum',
       'express_points_received_mean', 'regular_points_spent_sum',
       'regular_points_spent_mean', 'express_points_spent_sum',
       'express_points_spent_mean', 'purchase_sum_sum', 'purchase_sum_mean',
       'product_quantity_sum', 'product_quantity_mean', 'trn_sum_from_iss_sum',
       'trn_sum_from_iss_mean', 'trn_sum_from_red_sum',
       'trn_sum_from_red_mean', 'netto_sum', 'netto_mean',
       'is_own_trademark_sum', 'is_own_trademark_mean', 'is_alcohol_sum',
       'is_alcohol_mean', 'age', 'gender', 'dif_date']].values

ID = test['client_id'].values

In [3]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.4)

## Decision Tree

In [147]:
dtc = DecisionTreeClassifier(max_depth=2, max_features=10, max_leaf_nodes=4)
dtc = dtc.fit(X_train, y_train)

In [148]:
#Predict the response for test dataset
dtc_res = dtc.predict(X_test)

In [149]:
#dtc_res

In [150]:
metrics.accuracy_score(y_test, dtc_res)

0.6817761447710458

In [37]:
pd.DataFrame({'client_id': ID, 'uplift': dtc_res[:,0]}).to_csv('DTree_maxdepth2_68acc.csv', index = False)

## Logistic Regression

In [4]:
from sklearn.linear_model import LogisticRegression

In [206]:
logreg = LogisticRegression(penalty = 'elasticnet', solver = 'saga', l1_ratio = 0)

In [207]:
logreg.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=0, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='elasticnet',
                   random_state=None, solver='saga', tol=0.0001, verbose=0,
                   warm_start=False)

In [208]:
logreg_pred = logreg.predict(X_test)

In [209]:
metrics.accuracy_score(y_test,logreg_pred)

0.6809263147370526

In [None]:
pd.DataFrame({'client_id': ID, 'uplift': babah[:,0]}).to_csv('submissionBLYAT.csv', index = False)

## random forest

In [5]:
from sklearn.ensemble import RandomForestClassifier

In [224]:
rfc = RandomForestClassifier(max_depth=2)
rfc.fit(X_train, y_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=2, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [227]:
rfc_res = rfc.predict_proba(X_pred)

In [226]:
metrics.accuracy_score(y_test,rfc_res)

0.6827134573085383

In [228]:
pd.DataFrame({'client_id': ID, 'uplift': rfc_res[:,0]}).to_csv('RandomForClass_maxdepth2_acc682713.csv', index = False)

## XGBOOOOOOOST

In [6]:
from xgboost import XGBClassifier

In [25]:
model = XGBClassifier(max_depth=1, seed = 100)
model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=1,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=100,
              silent=None, subsample=1, verbosity=1)

In [26]:
boost_res = model.predict(X_test)

In [12]:
metrics.accuracy_score(y_test, boost_res) #3

0.6800639872025595

In [9]:
metrics.accuracy_score(y_test, boost_res) #maxdepth 1 seed =1

0.6807263547290542

In [27]:
metrics.accuracy_score(y_test, boost_res)

0.6807263547290542

In [None]:
pd.DataFrame({'client_id': ids, 'uplift': babahboost[:,0]}).to_csv('xgboost.csv')

## CATBOOST

In [286]:
from catboost import CatBoostClassifier

In [337]:
model = CatBoostClassifier(iterations=7,
                           depth=4,
                           learning_rate=0.2,
                           loss_function='CrossEntropy',
                           verbose=True)

In [338]:
model.fit(X_train, y_train)

0:	learn: 0.6613390	total: 34.7ms	remaining: 208ms
1:	learn: 0.6407545	total: 117ms	remaining: 293ms
2:	learn: 0.6278008	total: 149ms	remaining: 198ms
3:	learn: 0.6198459	total: 179ms	remaining: 134ms
4:	learn: 0.6142409	total: 213ms	remaining: 85ms
5:	learn: 0.6104997	total: 256ms	remaining: 42.7ms
6:	learn: 0.6082272	total: 289ms	remaining: 0us


<catboost.core.CatBoostClassifier at 0x1d400d0d9e8>

In [339]:
cat = model.predict(X_test)

In [None]:
cat = cat[:,0]

In [None]:
pd.DataFrame({'client_id': ids, 'uplift': cat}).to_csv('catboost.csv') 10 000 2 0.2 logloss 

In [329]:
metrics.accuracy_score(y_test, cat) #15 2 0.2 crossEntropy 

0.6827134573085383

In [340]:
metrics.accuracy_score(y_test, cat) #30 2 0.2 crossentropy

0.6827134573085383

## Adaboost

In [252]:
from sklearn.ensemble import AdaBoostClassifier

In [282]:
ada = AdaBoostClassifier(n_estimators=12, learning_rate=0.3)

In [283]:
ada.fit(X_train, y_train)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=0.3,
                   n_estimators=12, random_state=None)

In [284]:
ada_res = ada.predict(X_test)

In [260]:
metrics.accuracy_score(y_test, ada_res) 100 0,2

0.6810387922415517

In [264]:
metrics.accuracy_score(y_test, ada_res) 50 0,2

0.6816386722655469

In [268]:
metrics.accuracy_score(y_test, ada_res) 25 0,2

0.6827134573085383

In [273]:
metrics.accuracy_score(y_test, ada_res) #12 0,2

0.6827134573085383

In [285]:
metrics.accuracy_score(y_test, ada_res) 

0.6827134573085383