# Customer Conversion Prediction

## 1. Importing important libraries

In [33]:
#Importing the important libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, roc_auc_score, confusion_matrix, f1_score
from sklearn.preprocessing import StandardScaler
from imblearn.combine import SMOTEENN
import seaborn
import matplotlib.pyplot as plt

## 2. Importing the dataset and displaying the summary

In [34]:
#Importing my traing dataset and getting an intial look on how data looks
act_data = pd.read_csv("C:\\Users\\ADMIN\\Downloads\\train.csv")
act_data = act_data.copy()

In [35]:
act_data

Unnamed: 0,age,job,marital,education_qual,call_type,day,mon,dur,num_calls,prev_outcome,y
0,58,management,married,tertiary,unknown,5,may,261,1,unknown,no
1,44,technician,single,secondary,unknown,5,may,151,1,unknown,no
2,33,entrepreneur,married,secondary,unknown,5,may,76,1,unknown,no
3,47,blue-collar,married,unknown,unknown,5,may,92,1,unknown,no
4,33,unknown,single,unknown,unknown,5,may,198,1,unknown,no
...,...,...,...,...,...,...,...,...,...,...,...
45206,51,technician,married,tertiary,cellular,17,nov,977,3,unknown,yes
45207,71,retired,divorced,primary,cellular,17,nov,456,2,unknown,yes
45208,72,retired,married,secondary,cellular,17,nov,1127,5,success,yes
45209,57,blue-collar,married,secondary,telephone,17,nov,508,4,unknown,no


In [36]:
act_data.shape

(45211, 11)

In [37]:
act_data.describe()

Unnamed: 0,age,day,dur,num_calls
count,45211.0,45211.0,45211.0,45211.0
mean,40.93621,15.806419,258.16308,2.763841
std,10.618762,8.322476,257.527812,3.098021
min,18.0,1.0,0.0,1.0
25%,33.0,8.0,103.0,1.0
50%,39.0,16.0,180.0,2.0
75%,48.0,21.0,319.0,3.0
max,95.0,31.0,4918.0,63.0


## 3. Data Jar

#### 3.1 Cleaning the data

##### 3.1.1 Finding the missing values

In [38]:
#Finding missing values in our data
act_data.isnull().sum()
#There are no missing values in our dataset

age               0
job               0
marital           0
education_qual    0
call_type         0
day               0
mon               0
dur               0
num_calls         0
prev_outcome      0
y                 0
dtype: int64

##### 3.1.2 Finding and deleting duplicate rows

In [39]:
act_data.shape

(45211, 11)

In [40]:
#Finding duplicate values and droping them
act_data = act_data.drop_duplicates()

In [41]:
act_data.shape

(45205, 11)

##### 3.1.3 Removing outliers from our numerical data

In [42]:
act_data.describe()

Unnamed: 0,age,day,dur,num_calls
count,45205.0,45205.0,45205.0,45205.0
mean,40.937087,15.80688,258.183055,2.763898
std,10.61913,8.32234,257.538504,3.098189
min,18.0,1.0,0.0,1.0
25%,33.0,8.0,103.0,1.0
50%,39.0,16.0,180.0,2.0
75%,48.0,21.0,319.0,3.0
max,95.0,31.0,4918.0,63.0


In [43]:
for i in act_data.select_dtypes(include=['int64', 'float64']):
        iqr = act_data[i].quantile(0.75) - act_data[i].quantile(0.25)
        upper_threshold = act_data[i].quantile(0.75) + (1.5 * iqr) # q3 + 1.5iqr
        lower_threshold = act_data[i].quantile(0.25) - (1.5 * iqr) # q1 - 1.5iqr
        act_data = act_data.copy()
        act_data[i] = act_data[i].clip(lower_threshold, upper_threshold)

In [44]:
act_data.describe()

Unnamed: 0,age,day,dur,num_calls
count,45205.0,45205.0,45205.0,45205.0
mean,40.869052,15.80688,234.9562,2.392235
std,10.395247,8.32234,176.75476,1.600152
min,18.0,1.0,0.0,1.0
25%,33.0,8.0,103.0,1.0
50%,39.0,16.0,180.0,2.0
75%,48.0,21.0,319.0,3.0
max,70.5,31.0,643.0,6.0


##### 3.1.4 Handling invalid data

In [45]:
#Frequency of unique elements in catogorical column
cat_cols = act_data.select_dtypes(include=object).columns.tolist()
(pd.DataFrame(act_data[cat_cols].melt(var_name='column', value_name='value').value_counts()).rename(columns={0: 'counts'}).sort_values(by=['column', 'counts']))

Unnamed: 0_level_0,Unnamed: 1_level_0,counts
column,value,Unnamed: 2_level_1
call_type,telephone,2906
call_type,unknown,13017
call_type,cellular,29282
education_qual,unknown,1857
education_qual,primary,6850
education_qual,tertiary,13299
education_qual,secondary,23199
job,unknown,288
job,student,938
job,housemaid,1240


In [46]:
#As we can see in education_qual and job column their are few unknown elements which can be replaced with mode
#Replacing unknown values in education_qual and job columns with mode
act_data = act_data.copy()
act_data['education_qual']=act_data['education_qual'].replace('unknown', act_data['education_qual'].mode()[0])
act_data['job']=act_data['job'].replace('unknown', act_data['job'].mode()[0])

#Also call_type and prev_outcome column have unknown values but their frequency is higher and can be treated as seprated element itself
#Renaming the unknown values so as to better analyse and does not coincide with each other
act_data['call_type']=act_data['call_type'].replace('unknown', 'unknown_call_type')
act_data['prev_outcome']=act_data['prev_outcome'].replace('unknown', 'unknown_prev_outcome')

In [47]:
#Frequency of unique elements in catogorical column after renaming
cat_cols = act_data.select_dtypes(include=object).columns.tolist()
(pd.DataFrame(act_data[cat_cols].melt(var_name='column', value_name='value').value_counts()).rename(columns={0: 'counts'}).sort_values(by=['column', 'counts']))

Unnamed: 0_level_0,Unnamed: 1_level_0,counts
column,value,Unnamed: 2_level_1
call_type,telephone,2906
call_type,unknown_call_type,13017
call_type,cellular,29282
education_qual,primary,6850
education_qual,tertiary,13299
education_qual,secondary,25056
job,student,938
job,housemaid,1240
job,unemployed,1303
job,entrepreneur,1487


#### 3.2 Encoding our categorical data

In [48]:
act_data

Unnamed: 0,age,job,marital,education_qual,call_type,day,mon,dur,num_calls,prev_outcome,y
0,58.0,management,married,tertiary,unknown_call_type,5,may,261,1,unknown_prev_outcome,no
1,44.0,technician,single,secondary,unknown_call_type,5,may,151,1,unknown_prev_outcome,no
2,33.0,entrepreneur,married,secondary,unknown_call_type,5,may,76,1,unknown_prev_outcome,no
3,47.0,blue-collar,married,secondary,unknown_call_type,5,may,92,1,unknown_prev_outcome,no
4,33.0,blue-collar,single,secondary,unknown_call_type,5,may,198,1,unknown_prev_outcome,no
...,...,...,...,...,...,...,...,...,...,...,...
45206,51.0,technician,married,tertiary,cellular,17,nov,643,3,unknown_prev_outcome,yes
45207,70.5,retired,divorced,primary,cellular,17,nov,456,2,unknown_prev_outcome,yes
45208,70.5,retired,married,secondary,cellular,17,nov,643,5,success,yes
45209,57.0,blue-collar,married,secondary,telephone,17,nov,508,4,unknown_prev_outcome,no


In [49]:
for i in act_data.select_dtypes(include=['object'], exclude=['int64', 'float64']).columns[:-1]:
    
    # Get one hot encoding of job column
    one_hot = pd.get_dummies(act_data[i])
    # Drop column B as it is now encoded
    act_data = act_data.drop(i,axis = 1)
    # Join the encoded df
    act_data = act_data.join(one_hot)

In [50]:
#Get label encoding for y column
act_data["y"] = act_data["y"].map({"yes":1,"no":0}) #encoding binary class data (run only once)

In [51]:
act_data

Unnamed: 0,age,day,dur,num_calls,y,admin.,blue-collar,entrepreneur,housemaid,management,...,jun,mar,may,nov,oct,sep,failure,other,success,unknown_prev_outcome
0,58.0,5,261,1,0,0,0,0,0,1,...,0,0,1,0,0,0,0,0,0,1
1,44.0,5,151,1,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
2,33.0,5,76,1,0,0,0,1,0,0,...,0,0,1,0,0,0,0,0,0,1
3,47.0,5,92,1,0,0,1,0,0,0,...,0,0,1,0,0,0,0,0,0,1
4,33.0,5,198,1,0,0,1,0,0,0,...,0,0,1,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51.0,17,643,3,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
45207,70.5,17,456,2,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
45208,70.5,17,643,5,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
45209,57.0,17,508,4,0,0,1,0,0,0,...,0,0,0,1,0,0,0,0,0,1


#### 3.3 Split the dataset in train and test

In [52]:
#Spliting the dat into train and test
x = act_data[['age', 'day', 'dur', 'num_calls', 'admin.', 'blue-collar','entrepreneur', 'housemaid', 'management', 'retired',
              'self-employed','services', 'student', 'technician', 'unemployed', 'divorced','married', 'single', 'primary',
              'secondary', 'tertiary', 'cellular','telephone', 'unknown_call_type', 'apr', 'aug', 'dec', 'feb', 'jan','jul',
              'jun', 'mar', 'may', 'nov', 'oct', 'sep', 'failure', 'other','success', 'unknown_prev_outcome']].values

y = act_data[['y']].values

x_train, x_test, y_train, y_test = train_test_split(x,y,test_size = 0.3, random_state = 101)

#### 3.4 Balancing the dataset

In [57]:
#Balancing the dataset
smt = SMOTEENN(sampling_strategy='all')
x_train, y_train = smt.fit_resample(x_train, y_train)

#### 3.5 Standardize the dataset

In [62]:
#Standarize the dataset before fitting it into the model
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

### 4. Model, Loss, Learning, and Evaluation

In [63]:
class ClassificationModel:
    
    def __init__(self, x_train, x_test, y_train, y_test):
        self.x_test = x_test
        self.x_train = x_train
        self.y_train = y_train
        self.y_test = y_test
        
        
    def log_reg_model(self):
        from sklearn.linear_model import LogisticRegression

        log_reg = LogisticRegression() # initialise the model, ready to be used
        log_reg.fit(self.x_train, self.y_train) #training of data happens 
        
        y_pred = log_reg.predict_proba(self.x_test)
        roc = roc_auc_score(self.y_test, y_pred[:,1])

        return roc
  

    def knn_model(self):
        from sklearn.neighbors import KNeighborsClassifier
        from sklearn.model_selection import cross_val_score # import all the functions reqd for cross validation

        '''
        hcv = 0
        for i in [1,2,3,4,5,6,7,8,9,10]:
            knn = KNeighborsClassifier(i) #initialising the model
            knn.fit(x_train,y_train) # training the model
            if np.mean(cross_val_score(knn, x_train, y_train, cv=10, scoring = "roc_auc")) > hcv:
                hcv = np.mean(cross_val_score(knn, x_train, y_train, cv=10, scoring = "roc_auc"))
                khp = i
            else:
                break
        '''

        knn = KNeighborsClassifier(6)
        knn.fit(x_train,y_train)
        y_pred = knn.predict(x_test)
        roc = roc_auc_score(y_test, y_pred)
        
        return roc

    
    def dec_tree_model(self):
        from sklearn.tree import DecisionTreeClassifier

        dt = DecisionTreeClassifier(max_depth=9)
        dt.fit(self.x_train, self.y_train) # it will ask all possible questions, compute the information gain and choose the best split

        # Predict test set labels
        y_pred = dt.predict(self.x_test)
        roc = roc_auc_score(self.y_test, y_pred)
        
        return roc
    
    def ens_model(self):
        from sklearn.ensemble import VotingClassifier # this is the function that ensembles my model
        from sklearn.linear_model import LogisticRegression
        from sklearn import tree
        from sklearn.neighbors import KNeighborsClassifier
        from sklearn.metrics import roc_auc_score

        model1 = LogisticRegression(random_state=1)
        model2 = tree.DecisionTreeClassifier(max_depth=9, random_state=1)
        model3 = KNeighborsClassifier(6)
        model = VotingClassifier(estimators=[('lr', model1), ('dt', model2),('knn',model3)], voting='soft') # it will stitich all the models together, voting = hard means max voting 
        # we need to give the list of models that we are trying to combine
       
        model.fit(self.x_train, self.y_train) # to train all of the models
        model.predict(self.x_test)
        y_pred = model.predict_proba(self.x_test)
        roc = roc_auc_score(self.y_test,y_pred[:,1])
        
        return roc
    
    def rf_model(self):
        from sklearn.ensemble import RandomForestClassifier
        rf = RandomForestClassifier(max_depth=10,n_estimators=100, max_features='sqrt')
        rf.fit(self.x_train, self.y_train)
        
        y_pred = rf.predict(self.x_test)
        roc = roc_auc_score(self.y_test, y_pred)
        
        """
        from sklearn.inspection import permutation_importance
        importances = rf.feature_importances_

        sorted_indices = np.argsort(importances)[::-1]

        feat_labels = act_data.columns[1:]

        for f in range(x_train.shape[1]):
            print("%2d) %-*s %f" % (f + 1, 30,
                            feat_labels[sorted_indices[f]],
                            importances[sorted_indices[f]]))
        """
        
        return roc
    
    def xg_model(self):
        from xgboost import XGBClassifier
        
        model = XGBClassifier(learning_rate=0.5,n_estimators=100,verbosity=None)
        model.fit(self.x_train, self.y_train)
       
        y_pred = model.predict(self.x_test)
        roc = roc_auc_score(y_test, y_pred)
        return roc


In [64]:
clsmod = ClassificationModel(x_train, x_test, y_train, y_test)

In [65]:
knn_score = clsmod.knn_model()
print(knn_score)

0.7868251910099828


In [66]:
log_reg_score = clsmod.log_reg_model()
print(log_reg_score)

0.9034467843982364


In [67]:
dec_score = clsmod.dec_tree_model()
print(dec_score)

0.8077389528848978


In [68]:
ens_score = clsmod.ens_model()
print(ens_score)

0.9148680534919513
