In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the travel dataset CSV into a DataFrame and preview the first rows.
# The location is kept in one named constant so it is easy to repoint.
DATA_PATH = '/content/drive/MyDrive/Datasets/Travel.csv'
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,CustomerID,ProdTaken,Age,TypeofContact,CityTier,DurationOfPitch,Occupation,Gender,NumberOfPersonVisiting,NumberOfFollowups,ProductPitched,PreferredPropertyStar,MaritalStatus,NumberOfTrips,Passport,PitchSatisfactionScore,OwnCar,NumberOfChildrenVisiting,Designation,MonthlyIncome
0,200000,1,41.0,Self Enquiry,3,6.0,Salaried,Female,3,3.0,Deluxe,3.0,Single,1.0,1,2,1,0.0,Manager,20993.0
1,200001,0,49.0,Company Invited,1,14.0,Salaried,Male,3,4.0,Deluxe,4.0,Divorced,2.0,0,3,1,2.0,Manager,20130.0
2,200002,1,37.0,Self Enquiry,1,8.0,Free Lancer,Male,3,4.0,Basic,3.0,Single,7.0,1,3,0,0.0,Executive,17090.0
3,200003,0,33.0,Company Invited,1,9.0,Salaried,Female,2,3.0,Basic,3.0,Divorced,2.0,1,5,1,1.0,Executive,17909.0
4,200004,0,,Self Enquiry,1,8.0,Small Business,Male,2,3.0,Basic,4.0,Divorced,1.0,0,5,1,0.0,Executive,18468.0


### **Data Cleaning**

In [None]:
# Print the frequency of every label in each categorical column so that
# inconsistent spellings of the same category become visible.
cat_values = [
    'TypeofContact',
    'Occupation',
    'ProductPitched',
    'MaritalStatus',
    'Designation',
    'Gender',
]

for column_name in cat_values:
  label_counts = df[column_name].value_counts()
  print(label_counts)

TypeofContact
Self Enquiry       3444
Company Invited    1419
Name: count, dtype: int64
Occupation
Salaried          2368
Small Business    2084
Large Business     434
Free Lancer          2
Name: count, dtype: int64
ProductPitched
Basic           1842
Deluxe          1732
Standard         742
Super Deluxe     342
King             230
Name: count, dtype: int64
MaritalStatus
Married      2340
Divorced      950
Single        916
Unmarried     682
Name: count, dtype: int64
Designation
Executive         1842
Manager           1732
Senior Manager     742
AVP                342
VP                 230
Name: count, dtype: int64
Gender
Male       2916
Female     1817
Fe Male     155
Name: count, dtype: int64


In [None]:
# Fold variant labels into the label they are treated as equivalent to:
# 'Unmarried' is merged into 'Single' and the misspelt 'Fe Male' into 'Female'.
marital_fix = {'Unmarried': 'Single'}
gender_fix = {'Fe Male': 'Female'}

df['MaritalStatus'] = df['MaritalStatus'].replace(marital_fix)
df['Gender'] = df['Gender'].replace(gender_fix)

In [None]:
# Sanity check: list the distinct MaritalStatus labels left after the clean-up.
df['MaritalStatus'].unique()

array(['Single', 'Divorced', 'Married'], dtype=object)

In [None]:
# Count missing values per column to decide which columns need imputation.
df.isnull().sum()

Unnamed: 0,0
CustomerID,0
ProdTaken,0
Age,226
TypeofContact,25
CityTier,0
DurationOfPitch,251
Occupation,0
Gender,0
NumberOfPersonVisiting,0
NumberOfFollowups,45


In [None]:
# Impute missing values: numeric columns get their median, the categorical
# TypeofContact column gets its most frequent label.
#
# The result is assigned back to the frame explicitly. Calling
# `df.col.fillna(..., inplace=True)` mutates a temporary Series obtained via
# attribute access; pandas deprecates that chained in-place pattern and under
# Copy-on-Write it leaves `df` unchanged (the warning about it is hidden by the
# global warnings filter in the first cell).
#
# NOTE(review): the medians/mode are computed on the full dataset, before the
# train/test split further down, so test rows influence the imputed values.
median_filled = [
    'Age',
    'DurationOfPitch',
    'NumberOfFollowups',
    'PreferredPropertyStar',
    'NumberOfTrips',
    'NumberOfChildrenVisiting',
    'MonthlyIncome',
]

# DataFrame.fillna with a Series aligns on column labels, so each column is
# filled with its own median.
df[median_filled] = df[median_filled].fillna(df[median_filled].median())
df['TypeofContact'] = df['TypeofContact'].fillna(df['TypeofContact'].mode()[0])

In [None]:
# Re-count missing values per column to confirm the imputation left no gaps.
df.isnull().sum()

Unnamed: 0,0
CustomerID,0
ProdTaken,0
Age,0
TypeofContact,0
CityTier,0
DurationOfPitch,0
Occupation,0
Gender,0
NumberOfPersonVisiting,0
NumberOfFollowups,0


In [None]:
# Remove CustomerID (it looks like a sequential row identifier, not a
# predictor) so it is not fed to the models.
# Reassigning instead of inplace=True, together with errors='ignore', keeps the
# cell safe to re-run: a second execution is a no-op rather than a KeyError.
df = df.drop(columns = 'CustomerID', errors = 'ignore')

In [None]:
# Preview the frame after the cleaning steps above.
df.head()

Unnamed: 0,ProdTaken,Age,TypeofContact,CityTier,DurationOfPitch,Occupation,Gender,NumberOfPersonVisiting,NumberOfFollowups,ProductPitched,PreferredPropertyStar,MaritalStatus,NumberOfTrips,Passport,PitchSatisfactionScore,OwnCar,NumberOfChildrenVisiting,Designation,MonthlyIncome
0,1,41.0,Self Enquiry,3,6.0,Salaried,Female,3,3.0,Deluxe,3.0,Single,1.0,1,2,1,0.0,Manager,20993.0
1,0,49.0,Company Invited,1,14.0,Salaried,Male,3,4.0,Deluxe,4.0,Divorced,2.0,0,3,1,2.0,Manager,20130.0
2,1,37.0,Self Enquiry,1,8.0,Free Lancer,Male,3,4.0,Basic,3.0,Single,7.0,1,3,0,0.0,Executive,17090.0
3,0,33.0,Company Invited,1,9.0,Salaried,Female,2,3.0,Basic,3.0,Divorced,2.0,1,5,1,1.0,Executive,17909.0
4,0,36.0,Self Enquiry,1,8.0,Small Business,Male,2,3.0,Basic,4.0,Divorced,1.0,0,5,1,0.0,Executive,18468.0


### ***Feature Engineering***

In [None]:
# Combine the two visitor-count columns into a single 'total visiting' feature
# and drop the originals it was derived from.
visitor_total = df['NumberOfPersonVisiting'] + df['NumberOfChildrenVisiting']
source_columns = ['NumberOfPersonVisiting', 'NumberOfChildrenVisiting']

df = df.assign(**{'total visiting': visitor_total}).drop(columns = source_columns)
df.head()

Unnamed: 0,ProdTaken,Age,TypeofContact,CityTier,DurationOfPitch,Occupation,Gender,NumberOfFollowups,ProductPitched,PreferredPropertyStar,MaritalStatus,NumberOfTrips,Passport,PitchSatisfactionScore,OwnCar,Designation,MonthlyIncome,total visiting
0,1,41.0,Self Enquiry,3,6.0,Salaried,Female,3.0,Deluxe,3.0,Single,1.0,1,2,1,Manager,20993.0,3.0
1,0,49.0,Company Invited,1,14.0,Salaried,Male,4.0,Deluxe,4.0,Divorced,2.0,0,3,1,Manager,20130.0,5.0
2,1,37.0,Self Enquiry,1,8.0,Free Lancer,Male,4.0,Basic,3.0,Single,7.0,1,3,0,Executive,17090.0,3.0
3,0,33.0,Company Invited,1,9.0,Salaried,Female,3.0,Basic,3.0,Divorced,2.0,1,5,1,Executive,17909.0,3.0
4,0,36.0,Self Enquiry,1,8.0,Small Business,Male,3.0,Basic,4.0,Divorced,1.0,0,5,1,Executive,18468.0,2.0


In [None]:
# Predictor matrix: every column except the ProdTaken target.
X = df.drop(columns = ['ProdTaken'])

In [None]:
# Target vector: the ProdTaken column.
y = df.loc[:, 'ProdTaken']

In [None]:
# Count rows per target class to see how balanced the two classes are.
y.value_counts()

Unnamed: 0_level_0,count
ProdTaken,Unnamed: 1_level_1
0,3968
1,920


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation.
# - random_state pins the shuffle so the split, and every metric computed from
#   it, is the same on each Restart & Run All.
# - stratify=y keeps the class ratio of ProdTaken the same in both partitions,
#   which matters because the positive class is the minority.
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size = 0.20, random_state = 42, stratify = y)
X_train.shape, X_test.shape

((3910, 17), (978, 17))

### ***Feature Transformation***

In [None]:
# Split predictors by dtype: object columns are treated as categorical and
# one-hot encoded, everything else is treated as numeric and standardised.
cat_features = X.select_dtypes(include = 'object').columns
num_features = X.select_dtypes(exclude= 'object').columns

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer


scaler = StandardScaler()
# drop='first' removes one indicator per feature to avoid redundant columns.
# handle_unknown='ignore' encodes a category that was not seen during fit as
# all zeros instead of raising; without it, a rare label (e.g. an Occupation
# value with only a couple of rows) that lands entirely in the test split makes
# transformer.transform(X_test) fail.
encoder = OneHotEncoder(drop = 'first', handle_unknown = 'ignore')

# Apply the encoder to the categorical columns and the scaler to the numeric
# ones; the outputs are concatenated in that order.
transformer = ColumnTransformer([('OneHotEncoder', encoder, cat_features ), ('StandardScaler',scaler,num_features )])

In [None]:
# Show which columns are routed to the encoder and which to the scaler.
print(transformer)

ColumnTransformer(transformers=[('OneHotEncoder', OneHotEncoder(drop='first'),
                                 Index(['TypeofContact', 'Occupation', 'Gender', 'ProductPitched',
       'MaritalStatus', 'Designation'],
      dtype='object')),
                                ('StandardScaler', StandardScaler(),
                                 Index(['Age', 'CityTier', 'DurationOfPitch', 'NumberOfFollowups',
       'PreferredPropertyStar', 'NumberOfTrips', 'Passport',
       'PitchSatisfactionScore', 'OwnCar', 'MonthlyIncome', 'total visiting'],
      dtype='object'))])


In [None]:
# Learn the encoding/scaling parameters from the training split only, then
# apply the same fitted transformer to the test split.
# NOTE(review): both names are overwritten with the transformed output, so this
# cell is not safe to run twice without re-running the split cell first.
X_train = transformer.fit_transform(X_train)
X_test = transformer.transform(X_test)

### ***Model Building***

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score, classification_report

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

In [None]:
# Baseline comparison: fit each classifier with default hyperparameters on the
# transformed training data and report its performance on the held-out split.
# The randomised tree-based models get a fixed seed so reruns give the same numbers.
models = [ RandomForestClassifier(random_state = 42), GaussianNB(), KNeighborsClassifier(), SVC(), DecisionTreeClassifier(random_state = 42) ]

for model in models:
  model.fit(X_train,y_train)
  y_pred = model.predict(X_test)

  # scikit-learn metrics take (y_true, y_pred) in that order. Passing them the
  # other way round transposes the confusion matrix and swaps precision with
  # recall in the report (accuracy alone is unaffected).
  print(model, '\nConfusion matrix \n', confusion_matrix(y_test, y_pred))
  print('Score' , accuracy_score(y_test, y_pred))
  print('Report', classification_report(y_test, y_pred))
  print('--------------------------------------------------')

RandomForestClassifier() 
Confusion matrix 
 [[775  70]
 [  8 125]]
Score 0.9202453987730062
Report               precision    recall  f1-score   support

           0       0.99      0.92      0.95       845
           1       0.64      0.94      0.76       133

    accuracy                           0.92       978
   macro avg       0.82      0.93      0.86       978
weighted avg       0.94      0.92      0.93       978

--------------------------------------------------
GaussianNB() 
Confusion matrix 
 [[536  60]
 [247 135]]
Score 0.6860940695296524
Report               precision    recall  f1-score   support

           0       0.68      0.90      0.78       596
           1       0.69      0.35      0.47       382

    accuracy                           0.69       978
   macro avg       0.69      0.63      0.62       978
weighted avg       0.69      0.69      0.66       978

--------------------------------------------------
KNeighborsClassifier() 
Confusion matrix 
 [[757  80]
 [

### ***Hyperparameter Tuning***

In [None]:
# Search space for the random forest hyperparameter search.
# - 'log_loss' is the valid criterion name (the previous 'log_los' typo made
#   every candidate that sampled it fail and score NaN).
# - 'sqrt' replaces 'auto' for max_features: for classifiers 'auto' meant
#   'sqrt', and the 'auto' spelling is no longer accepted by recent
#   scikit-learn releases.
params = { 'n_estimators' : [100,120,140,150] ,
          'criterion': ['gini', 'entropy', 'log_loss'],
           'max_depth' : [5,8,10,2],
           'min_samples_split' : [2,8,15,23],
           'max_features': [5,7,10, 'sqrt']
}

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Randomised search over `params`: 25 sampled candidates, each scored by
# 3-fold cross-validated accuracy on the training data.
# random_state is fixed on both the search (which candidates are sampled) and
# the forest (bootstrap / feature sampling) so the selected hyperparameters are
# reproducible between runs.
# NOTE(review): accuracy is used for model selection although the target
# classes are imbalanced; confirm this is the metric that should be optimised.
randomcv = RandomizedSearchCV(
    estimator = RandomForestClassifier(random_state = 42),
    param_distributions = params,
    scoring = 'accuracy',
    cv = 3,
    verbose = 3,
    n_iter = 25,
    random_state = 42,
)


In [None]:
# Run the hyperparameter search on the transformed training split.
# With verbose=3 every CV fit is logged; a score of nan marks a candidate whose fit failed.
randomcv.fit(X_train,y_train)

Fitting 3 folds for each of 25 candidates, totalling 75 fits
[CV 1/3] END criterion=gini, max_depth=8, max_features=10, min_samples_split=2, n_estimators=140;, score=0.873 total time=   0.9s
[CV 2/3] END criterion=gini, max_depth=8, max_features=10, min_samples_split=2, n_estimators=140;, score=0.886 total time=   1.1s
[CV 3/3] END criterion=gini, max_depth=8, max_features=10, min_samples_split=2, n_estimators=140;, score=0.881 total time=   1.1s
[CV 1/3] END criterion=log_los, max_depth=8, max_features=7, min_samples_split=2, n_estimators=140;, score=nan total time=   0.0s
[CV 2/3] END criterion=log_los, max_depth=8, max_features=7, min_samples_split=2, n_estimators=140;, score=nan total time=   0.0s
[CV 3/3] END criterion=log_los, max_depth=8, max_features=7, min_samples_split=2, n_estimators=140;, score=nan total time=   0.0s
[CV 1/3] END criterion=log_los, max_depth=10, max_features=10, min_samples_split=23, n_estimators=150;, score=nan total time=   0.0s
[CV 2/3] END criterion=log

In [None]:
# Predict the held-out split with the fitted search object
# (presumably via the best estimator refit on the training data, the scikit-learn default; confirm).
y_pred = randomcv.predict(X_test)

In [None]:
# Evaluate the tuned model on the held-out split.
# scikit-learn metrics take (y_true, y_pred) in that order; reversing them
# transposes the confusion matrix and swaps precision with recall in the report.
print(confusion_matrix(y_test, y_pred))
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[773  87]
 [ 10 108]]
0.9008179959100204
              precision    recall  f1-score   support

           0       0.99      0.90      0.94       860
           1       0.55      0.92      0.69       118

    accuracy                           0.90       978
   macro avg       0.77      0.91      0.82       978
weighted avg       0.93      0.90      0.91       978



In [None]:
# Hyperparameter combination that achieved the best mean cross-validation score.
randomcv.best_params_

{'n_estimators': 100,
 'min_samples_split': 2,
 'max_features': 10,
 'max_depth': 10,
 'criterion': 'gini'}