In [101]:
%matplotlib inline

from pathlib import Path

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from dmba import classificationSummary

In [102]:
# Read the hair-care product records from the CSV in the working directory.
hcp = pd.read_csv(Path('Hair-Care-Product.csv'))

In [103]:
# Preview the first five records to check the columns that were loaded.
hcp.head(n=5)

Unnamed: 0,Purchase,Age,Hair Color,U.S. Region,Validation,Promotion_ord,Gender_ord,Residence_ord
0,0,25,Black,Southwest,1,1,0,1
1,0,30,Black,Northwest,1,0,0,1
2,0,45,Red,Northeast,1,0,0,0
3,0,35,Blond,Southwest,0,0,0,1
4,0,33,Brown,Southwest,0,1,0,1


In [104]:
# Number of records (rows) in the data set.
hcp.shape[0]

10000

In [105]:
# Distinct hair-colour levels, in order of first appearance.
hcp.loc[:, 'Hair Color'].unique()

array(['Black', 'Red', 'Blond', 'Brown'], dtype=object)

In [106]:
# Distinct region levels, in order of first appearance.
hcp.loc[:, 'U.S. Region'].unique()

array(['Southwest', 'Northwest', 'Northeast', 'Southeast'], dtype=object)

In [107]:
# Dummy-encode the non-numeric columns (presumably 'Hair Color' and
# 'U.S. Region'), dropping the first level of each so the indicators are not
# redundant. Note that this rebinds `hcp` to the encoded frame.
hcp = pd.get_dummies(data=hcp, drop_first=True)

In [118]:
# Disabled, kept for reference only: would select the rows where Promotion_ord == 1.
#treatment = hcp[hcp['Promotion_ord'] == 1]

In [117]:
# Disabled, kept for reference only: would select the rows where Promotion_ord == 0.
#notreatment = hcp[hcp['Promotion_ord'] == 0]

To understand propensity modelling, read this: https://datascience.foundation/sciencewhitepaper/propensity-modelling-for-business

In [110]:
# Partition the data into predictors (X) and the Purchase target (y).
# 'Unnamed: 0' appears to be the row number saved with the CSV (it mirrors the
# default index in the preview), not a customer attribute, so it is excluded
# from the predictors. errors='ignore' keeps the cell working if that column
# has already been removed upstream.
# NOTE(review): 'Validation' looks like a pre-assigned partition flag rather
# than a customer attribute -- confirm whether it belongs among the predictors.
X = hcp.drop(columns=['Purchase', 'Unnamed: 0'], errors='ignore')
y = hcp['Purchase']

# Display labels for the confusion matrix, in ascending order so they line up
# with the sorted label order of the matrix. unique() returns values in order
# of first appearance, which is not guaranteed to be sorted.
classes = sorted(y.unique())

# Hold out 40% of the records for validation; the seed fixes the split.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.40, random_state=3)


# Train a random forest classifier using the training set
rfModel = RandomForestClassifier(n_estimators=100, random_state=1)
rfModel.fit(X_train, y_train)

# Report the confusion matrix and accuracy on the held-out validation set.
classificationSummary(y_valid, rfModel.predict(X_valid), class_names=classes)

Confusion Matrix (Accuracy 0.9835)

       Prediction
Actual    0    1
     0 3934   21
     1   45    0


In [111]:
# Score every validation record twice with the random forest -- once with the
# promotion flag forced on and once forced off -- and report the difference in
# the predicted probability as the uplift.
uplift_df = X_valid.copy()  # work on a copy so X_valid itself is left untouched

uplift_df['Promotion_ord'] = 1
predTreatment = rfModel.predict_proba(uplift_df)
uplift_df['Promotion_ord'] = 0
predControl = rfModel.predict_proba(uplift_df)

# Column 1 of predict_proba is the second class of the fitted model
# (presumably Purchase == 1).
prob_with_promo = predTreatment[:, 1]
prob_without_promo = predControl[:, 1]

upliftResult_df = pd.DataFrame(
    {
        'probPromotion': prob_with_promo,
        'probNoPromotion': prob_without_promo,
        'uplift': prob_with_promo - prob_without_promo,
    },
    index=uplift_df.index,
)
print(upliftResult_df.head(3))

      probPromotion  probNoPromotion  uplift
5876           0.00              0.0    0.00
6555           0.01              0.0    0.01
1448           0.00              0.0    0.00


The promotion has only a slight effect on member 6555 (an uplift of 0.01) and no effect on the other two, so for these three members there is no need to spend money on a sales promotion: it barely changes their predicted purchase behaviour.

In [112]:
# Partition the data into predictors (X) and the Purchase target (y).
# 'Unnamed: 0' appears to be the row number saved with the CSV (it mirrors the
# default index in the preview), not a customer attribute. Left in, its large
# range would dominate the nearest-neighbour distance, so it is excluded.
# errors='ignore' keeps the cell working if that column has already been
# removed upstream.
# NOTE(review): 'Validation' looks like a pre-assigned partition flag rather
# than a customer attribute -- confirm whether it belongs among the predictors.
X = hcp.drop(columns=['Purchase', 'Unnamed: 0'], errors='ignore')
y = hcp['Purchase']

# Display labels for the confusion matrix, in ascending order so they line up
# with the sorted label order of the matrix. unique() returns values in order
# of first appearance, which is not guaranteed to be sorted.
classes = sorted(y.unique())

# Hold out 40% of the records for validation; the seed fixes the split.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.40, random_state=3)


# Train a 3-nearest-neighbours classifier using the training set
# NOTE(review): the predictors are not standardised, so Age outweighs the 0/1
# indicator columns (including Promotion_ord) in the distance -- consider
# scaling before fitting.
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)

# Report the confusion matrix and accuracy on the held-out validation set.
classificationSummary(y_valid, knn.predict(X_valid), class_names=classes)

Confusion Matrix (Accuracy 0.9872)

       Prediction
Actual    0    1
     0 3949    6
     1   45    0


In [113]:
# Score every validation record twice with the k-NN model -- once with the
# promotion flag forced on and once forced off -- and report the difference in
# the predicted probability as the uplift.
uplift_df = X_valid.copy()  # work on a copy so X_valid itself is left untouched

uplift_df['Promotion_ord'] = 1
predTreatment = knn.predict_proba(uplift_df)
uplift_df['Promotion_ord'] = 0
predControl = knn.predict_proba(uplift_df)

# Column 1 of predict_proba is the second class of the fitted model
# (presumably Purchase == 1).
prob_with_promo = predTreatment[:, 1]
prob_without_promo = predControl[:, 1]

upliftResult_df = pd.DataFrame(
    {
        'probPromotion': prob_with_promo,
        'probNoPromotion': prob_without_promo,
        'uplift': prob_with_promo - prob_without_promo,
    },
    index=uplift_df.index,
)
print(upliftResult_df.head(3))

      probPromotion  probNoPromotion  uplift
5876            0.0              0.0     0.0
6555            0.0              0.0     0.0
1448            0.0              0.0     0.0


The promotion has no effect on the first three members, so there is no need to spend money on a sales promotion for them: it does not change their predicted purchase behaviour.