In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [2]:
# Load the cleaned dataset and discard its 'Unnamed: 0' column in one step.
# NOTE(review): 'Unnamed: 0' is presumably a row index written out by an
# earlier to_csv call — confirm against the notebook that produced the file.
df = pd.read_csv('cleaned_df.csv').drop(columns='Unnamed: 0')

# Render every column of a DataFrame instead of truncating wide frames.
pd.set_option('display.max_columns', None)

In [3]:
# List the column names of the loaded frame.
df.columns

Index(['Age', 'Education', 'Marital_Status', 'Income', 'Kidhome', 'Teenhome',
       'Recency', 'Days_As_Customer', 'MntWines', 'MntFruits',
       'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts',
       'MntGoldProds', 'Total_Purchases', 'NumDealsPurchases',
       'NumWebPurchases', 'NumCatalogPurchases', 'NumStorePurchases',
       'NumWebVisitsMonth', 'AcceptedCmp1', 'AcceptedCmp2', 'AcceptedCmp3',
       'AcceptedCmp4', 'AcceptedCmp5', 'Response', 'Total_AcceptedCmp',
       'AcceptedAnyCmp', 'Complain'],
      dtype='object')

In [4]:
# Remove every campaign-outcome column except the target, 'AcceptedAnyCmp'.
# Author's rationale: with these columns left in, the model scores 100% accuracy.
# NOTE(review): presumably because they encode the target directly (leakage) —
# confirm how 'AcceptedAnyCmp' was derived in the cleaning step.
cols_to_drop = [
    'AcceptedCmp1',
    'AcceptedCmp2',
    'AcceptedCmp3',
    'AcceptedCmp4',
    'AcceptedCmp5',
    'Response',
    'Total_AcceptedCmp',
]

df = df.drop(columns=cols_to_drop)

In [5]:
# Preview the first five rows of df after the campaign columns were dropped.
df.head()

Unnamed: 0,Age,Education,Marital_Status,Income,Kidhome,Teenhome,Recency,Days_As_Customer,MntWines,MntFruits,MntMeatProducts,MntFishProducts,MntSweetProducts,MntGoldProds,Total_Purchases,NumDealsPurchases,NumWebPurchases,NumCatalogPurchases,NumStorePurchases,NumWebVisitsMonth,AcceptedAnyCmp,Complain
0,67.0,Graduation,Single,58138.0,0,0,58,4357,635,88,546,172,88,88,1617,3,8,10,4,7,1,0
1,70.0,Graduation,Single,46344.0,1,1,38,3807,11,1,6,2,1,6,27,2,1,1,2,5,0,0
2,59.0,Graduation,Together,71613.0,0,0,26,4006,426,49,127,111,21,42,776,1,8,2,10,4,0,0
3,40.0,Graduation,Together,26646.0,1,0,26,3833,11,4,20,10,3,5,53,2,2,0,4,6,0,0
4,43.0,PhD,Married,58293.0,1,0,94,3855,173,43,118,46,27,15,422,5,5,3,6,5,0,0


In [6]:
# One-hot encode the two categorical columns into integer (0/1) indicator
# columns; all other columns are carried over unchanged into ml_df.
categorical_cols = ['Marital_Status', 'Education']

ml_df = pd.get_dummies(df, columns=categorical_cols, dtype=int)

In [7]:
# Preview the first five rows of the one-hot encoded frame.
ml_df.head()

Unnamed: 0,Age,Income,Kidhome,Teenhome,Recency,Days_As_Customer,MntWines,MntFruits,MntMeatProducts,MntFishProducts,MntSweetProducts,MntGoldProds,Total_Purchases,NumDealsPurchases,NumWebPurchases,NumCatalogPurchases,NumStorePurchases,NumWebVisitsMonth,AcceptedAnyCmp,Complain,Marital_Status_Divorced,Marital_Status_Married,Marital_Status_Single,Marital_Status_Together,Marital_Status_Widow,Education_2n Cycle,Education_Basic,Education_Graduation,Education_Master,Education_PhD
0,67.0,58138.0,0,0,58,4357,635,88,546,172,88,88,1617,3,8,10,4,7,1,0,0,0,1,0,0,0,0,1,0,0
1,70.0,46344.0,1,1,38,3807,11,1,6,2,1,6,27,2,1,1,2,5,0,0,0,0,1,0,0,0,0,1,0,0
2,59.0,71613.0,0,0,26,4006,426,49,127,111,21,42,776,1,8,2,10,4,0,0,0,0,0,1,0,0,0,1,0,0
3,40.0,26646.0,1,0,26,3833,11,4,20,10,3,5,53,2,2,0,4,6,0,0,0,0,0,1,0,0,0,1,0,0
4,43.0,58293.0,1,0,94,3855,173,43,118,46,27,15,422,5,5,3,6,5,0,0,0,1,0,0,0,0,0,0,0,1


In [8]:
# Split the encoded frame into the target ('AcceptedAnyCmp') and the
# predictors (every remaining column), then hold out 20% of the rows as a
# test set. The fixed random_state keeps the split reproducible.
y = ml_df['AcceptedAnyCmp']
X = ml_df.drop(columns='AcceptedAnyCmp')

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [9]:
# Baseline: a random forest left at its default hyperparameters (only the
# seed is fixed), trained on the training split and scored by accuracy on
# the held-out test split.
# Note: despite the 'rfr' name, this is a classifier, not a regressor.
rfr = RandomForestClassifier(random_state=42).fit(X_train, y_train)

results = rfr.predict(X_test)
acc_score = accuracy_score(y_true=y_test, y_pred=results)

print(f'Accuracy score of plain model: {round(acc_score, 3)}')

Accuracy score of plain model: 0.793


In [10]:
# Tune the random forest with an exhaustive grid search over tree depth,
# forest size and the minimum number of samples needed to split a node
# (4 * 4 * 4 = 64 candidate settings, each scored with GridSearchCV's default
# cross-validation on the training split only).
#
# Recorded outcome: the test accuracy matched the untuned model (0.793 for both).
# NOTE(review): the grid contains max_depth=None, n_estimators=100 and
# min_samples_split=2, which look like scikit-learn's defaults, so the search
# may simply be re-selecting the baseline — check the printed best parameters.
gcv_rfr = {'max_depth':[None, 3, 5, 7], 'n_estimators':[50, 100, 150, 200], 'min_samples_split':[2,3,4,5]}

# n_jobs=-1 runs the independent candidate fits on all available CPU cores.
# This only shortens the wall-clock time: `rfr` carries a fixed random_state,
# so the selected model is the same as with a serial search.
grid_rfr = GridSearchCV(rfr, gcv_rfr, n_jobs=-1)
grid_rfr.fit(X_train, y_train)

# predict() on the fitted search uses the best estimator found by the search.
new_results = grid_rfr.predict(X_test)
new_acc_score = accuracy_score(y_test, new_results)

print(f'Accuracy score of optimized model: {round(new_acc_score, 3)}')

# Show what the search actually picked, so the comparison with the plain
# model can be interpreted rather than judged from one test-set number.
print(f'Best parameters: {grid_rfr.best_params_}')
print(f'Mean cross-validated score of best parameters: {round(grid_rfr.best_score_, 3)}')

Accuracy score of optimized model: 0.793


In [11]:
# Rank the predictors by the importance the tuned forest assigned to them
# and display the ten highest.
best_forest = grid_rfr.best_estimator_
important_features = best_forest.feature_importances_

# Pair each importance with its column name. X_test is used only for its
# column labels here.
ft_imp = {'Features': X_test.columns, 'Importance': important_features}
Feature_Importance = pd.DataFrame(ft_imp)

ranked_importance = Feature_Importance.sort_values(by='Importance', ascending=False)
ranked_importance.iloc[:10]

Unnamed: 0,Features,Importance
6,MntWines,0.134118
12,Total_Purchases,0.093134
1,Income,0.072836
8,MntMeatProducts,0.068478
5,Days_As_Customer,0.062629
11,MntGoldProds,0.060875
4,Recency,0.06073
15,NumCatalogPurchases,0.052666
0,Age,0.047204
9,MntFishProducts,0.045245
