###Importing data

In [11]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [12]:
%cd /content/drive/My Drive/Colab Notebooks/Analytics_Enabled_Marketing
# !pwd

/content/drive/My Drive/Colab Notebooks/Analytics_Enabled_Marketing


In [13]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.linear_model import LogisticRegression

In [14]:
#import dataset
dataset=pd.read_excel("a1_Dataset_10Percent.xlsx")
# !ls

In [15]:
# shows count of rows and columns
dataset.shape

(22223, 11)

In [16]:
#shows first few rows of the code
dataset.head()

Unnamed: 0,ID,DemAffl,DemAge,DemClusterGroup,DemGender,DemReg,DemTVReg,LoyalClass,LoyalSpend,LoyalTime,TargetBuy
0,17147654,5.0,,,,,,Tin,0.01,5.0,0
1,8415498,15.0,,,M,,,Gold,8000.0,5.0,1
2,12107603,,,,M,Midlands,East,Tin,0.01,,1
3,14400995,8.0,28.0,,F,,,Tin,0.01,,1
4,28724674,14.0,67.0,,,,,Tin,0.01,7.0,0


### Data preparation

In [17]:
#dropping customer ID column from the dataset

dataset=dataset.drop(['ID'],axis=1)

dataset.head()

Unnamed: 0,DemAffl,DemAge,DemClusterGroup,DemGender,DemReg,DemTVReg,LoyalClass,LoyalSpend,LoyalTime,TargetBuy
0,5.0,,,,,,Tin,0.01,5.0,0
1,15.0,,,M,,,Gold,8000.0,5.0,1
2,,,,M,Midlands,East,Tin,0.01,,1
3,8.0,28.0,,F,,,Tin,0.01,,1
4,14.0,67.0,,,,,Tin,0.01,7.0,0


In [18]:
# explore missing values

dataset.isna().sum()

DemAffl            1085
DemAge             1508
DemClusterGroup     674
DemGender          2512
DemReg              465
DemTVReg            465
LoyalClass            0
LoyalSpend            0
LoyalTime           281
TargetBuy             0
dtype: int64

In [19]:
# filling missing values with mean/mode*
# all categorical variables we are imputing with mode, and the continiuous ones are inputed with mean value
dataset['DemAffl']=dataset['DemAffl'].fillna(dataset['DemAffl'].mode()[0])
dataset['DemAge']=dataset['DemAge'].fillna(dataset['DemAge'].mode()[0])
dataset['DemClusterGroup']=dataset['DemClusterGroup'].fillna(dataset['DemClusterGroup'].mode()[0])
dataset['DemGender']=dataset['DemGender'].fillna(dataset['DemGender'].mode()[0])
dataset['DemReg']=dataset['DemReg'].fillna(dataset['DemReg'].mode()[0])
dataset['DemTVReg']=dataset['DemTVReg'].fillna(dataset['DemTVReg'].mode()[0])
dataset['LoyalTime']=dataset['LoyalTime'].fillna(dataset['LoyalTime'].mean())

In [20]:
# explore missing values post missing value fix

dataset.isna().sum()

DemAffl            0
DemAge             0
DemClusterGroup    0
DemGender          0
DemReg             0
DemTVReg           0
LoyalClass         0
LoyalSpend         0
LoyalTime          0
TargetBuy          0
dtype: int64

In [21]:
dataset.head()

Unnamed: 0,DemAffl,DemAge,DemClusterGroup,DemGender,DemReg,DemTVReg,LoyalClass,LoyalSpend,LoyalTime,TargetBuy
0,5.0,51.0,C,F,South East,London,Tin,0.01,5.0,0
1,15.0,51.0,C,M,South East,London,Gold,8000.0,5.0,1
2,8.0,51.0,C,M,Midlands,East,Tin,0.01,6.56467,1
3,8.0,28.0,C,F,South East,London,Tin,0.01,6.56467,1
4,14.0,67.0,C,F,South East,London,Tin,0.01,7.0,0


###Coverting category to numeric

In [23]:
# converting to mumeric

from sklearn.preprocessing import LabelEncoder
number = LabelEncoder()

dataset['DemClusterGroup'] = number.fit_transform(dataset['DemClusterGroup'].astype('str'))
integer_mapping = {l: i for i, l in enumerate(number.classes_)}
print(integer_mapping)

dataset['DemGender'] = number.fit_transform(dataset['DemGender'].astype('str'))
integer_mapping = {l: i for i, l in enumerate(number.classes_)}
print(integer_mapping)

dataset['DemReg'] = number.fit_transform(dataset['DemReg'].astype('str'))
integer_mapping = {l: i for i, l in enumerate(number.classes_)}
print(integer_mapping)

dataset['DemTVReg'] = number.fit_transform(dataset['DemTVReg'].astype('str'))
integer_mapping = {l: i for i, l in enumerate(number.classes_)}
print(integer_mapping)

dataset['LoyalClass'] = number.fit_transform(dataset['LoyalClass'].astype('str'))
integer_mapping = {l: i for i, l in enumerate(number.classes_)}
print(integer_mapping)

{'0': 0, '1': 1, '2': 2, '3': 3, '4': 4, '5': 5, '6': 6}
{'0': 0, '1': 1, '2': 2}
{'0': 0, '1': 1, '2': 2, '3': 3, '4': 4}
{'0': 0, '1': 1, '10': 2, '11': 3, '12': 4, '2': 5, '3': 6, '4': 7, '5': 8, '6': 9, '7': 10, '8': 11, '9': 12}
{'0': 0, '1': 1, '2': 2, '3': 3}


In [24]:
dataset.head()

Unnamed: 0,DemAffl,DemAge,DemClusterGroup,DemGender,DemReg,DemTVReg,LoyalClass,LoyalSpend,LoyalTime,TargetBuy
0,5.0,51.0,2,0,3,6,3,0.01,5.0,0
1,15.0,51.0,2,1,3,6,0,8000.0,5.0,1
2,8.0,51.0,2,1,0,5,3,0.01,6.56467,1
3,8.0,28.0,2,0,3,6,3,0.01,6.56467,1
4,14.0,67.0,2,0,3,6,3,0.01,7.0,0


###Checking for Multicollinearity

In [25]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
#feachers should have minimal correlation among them=> multicollinearity
#calculter vif:variance_inflation_factor for each input variable and ensure that are under permissible value:10

def calc_vif(z):

    # Calculating VIF
    vif = pd.DataFrame()
    vif["variables"] = z.columns

    vif["VIF"] = [variance_inflation_factor(z.values, i) for i in range(z.shape[1])]

    return(vif)

  import pandas.util.testing as tm


In [26]:
z = dataset.iloc[:,0:9]
calc_vif(z)

Unnamed: 0,variables,VIF
0,DemAffl,6.30171
1,DemAge,10.852084
2,DemClusterGroup,3.646834
3,DemGender,1.435901
4,DemReg,2.835402
5,DemTVReg,6.949169
6,LoyalClass,3.881907
7,LoyalSpend,1.864635
8,LoyalTime,3.152996


### Variable selection

In [27]:
y = dataset.iloc[:, 9].values
X = dataset.iloc[:, 0:9].values

In [28]:
# splitting dataset into training and test (in ratio 80:20)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

### Modelling

In [29]:
classifier =  LogisticRegression(max_iter=200)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
print(y_pred)

[0 0 0 ... 0 0 0]


In [30]:
# Exporting Logistic Regression Classifier to later use in prediction
import joblib
joblib.dump(classifier, './c2_Classifier_LoyalCustomers')

['./c2_Classifier_LoyalCustomers']

In [31]:
#check model performance
print(confusion_matrix(y_test,y_pred))

[[3212  155]
 [ 716  362]]


In [32]:
print(accuracy_score(y_test, y_pred))

0.8040494938132733


In [33]:
predictions = classifier.predict_proba(X_test)
predictions

array([[0.80248539, 0.19751461],
       [0.72193259, 0.27806741],
       [0.59701057, 0.40298943],
       ...,
       [0.83676099, 0.16323901],
       [0.85061901, 0.14938099],
       [0.89324246, 0.10675754]])

In [34]:
# writing model output file

df_prediction_prob = pd.DataFrame(predictions, columns = ['prob_0', 'prob_1'])
df_test_dataset = pd.DataFrame(y_test,columns= ['Actual Outcome'])
df_x_test = pd.DataFrame(X_test)
df_prediction= pd.DataFrame(y_pred,columns= ['prediction(0/1)'])
dfx=pd.concat([df_x_test,df_test_dataset,df_prediction, df_prediction_prob], axis=1)

dfx.to_excel("c1_ModelOutput_10Percent.xlsx")

dfx.tail(10)



Unnamed: 0,0,1,2,3,4,5,6,7,8,Actual Outcome,prediction(0/1),prob_0,prob_1
4435,8.0,58.0,1.0,2.0,1.0,10.0,2.0,3000.0,3.0,0,0,0.864693,0.135307
4436,7.0,51.0,2.0,0.0,0.0,2.0,1.0,20513.95,3.0,0,0,0.820155,0.179845
4437,8.0,75.0,2.0,1.0,1.0,4.0,0.0,12000.0,4.0,0,0,0.940867,0.059133
4438,7.0,51.0,2.0,1.0,1.0,10.0,0.0,12000.0,2.0,0,0,0.83879,0.16121
4439,16.0,50.0,3.0,0.0,1.0,10.0,0.0,6053.35,6.0,1,1,0.374114,0.625886
4440,7.0,55.0,3.0,0.0,0.0,5.0,2.0,50.0,10.0,0,0,0.856749,0.143251
4441,4.0,65.0,0.0,0.0,3.0,6.0,2.0,3000.0,11.0,0,0,0.958787,0.041213
4442,10.0,62.0,0.0,1.0,1.0,10.0,0.0,6000.0,4.0,0,0,0.836761,0.163239
4443,7.0,54.0,3.0,1.0,3.0,6.0,0.0,9250.0,2.0,0,0,0.850619,0.149381
4444,5.0,54.0,5.0,0.0,3.0,6.0,3.0,0.01,6.0,0,0,0.893242,0.106758


In [35]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
knn = KNeighborsClassifier(n_neighbors=2)

def K_NN(X_train, X_test, y_train, y_test ,k):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    score = accuracy_score(y_test, y_pred)
    print(confusion_matrix(y_test,y_pred))
    return score

param=[{"n_neighbors":list(range(1,15))}]
knn = GridSearchCV(KNeighborsClassifier(),param,cv=5,scoring='accuracy')
Res_KNN = knn.fit(X_train, y_train)
best_k = Res_KNN.best_params_["n_neighbors"]
print("Meilleure valeur de k : ",best_k)

print(K_NN(X_train, X_test, y_train, y_test ,best_k))

Meilleure valeur de k :  13
[[3135  232]
 [ 759  319]]
0.777052868391451


Coding ends here