### Data Collection & Importing Necessary Libraries

In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns
import plotly.express as px
import warnings

warnings.filterwarnings("ignore")

%matplotlib inline


In [2]:
df = pd.read_csv("Travel.csv")
df.head(2)

Unnamed: 0,CustomerID,ProdTaken,Age,TypeofContact,CityTier,DurationOfPitch,Occupation,Gender,NumberOfPersonVisiting,NumberOfFollowups,ProductPitched,PreferredPropertyStar,MaritalStatus,NumberOfTrips,Passport,PitchSatisfactionScore,OwnCar,NumberOfChildrenVisiting,Designation,MonthlyIncome
0,200000,1,41.0,Self Enquiry,3,6.0,Salaried,Female,3,3.0,Deluxe,3.0,Single,1.0,1,2,1,0.0,Manager,20993.0
1,200001,0,49.0,Company Invited,1,14.0,Salaried,Male,3,4.0,Deluxe,4.0,Divorced,2.0,0,3,1,2.0,Manager,20130.0


### Data Cleaning

#### Handling Missing Values
1. Handling Missing Values
2. Handling Duplicates
3. Check Data Type
4. Understand the Data Set

In [3]:
df.isnull().sum()

CustomerID                    0
ProdTaken                     0
Age                         226
TypeofContact                25
CityTier                      0
DurationOfPitch             251
Occupation                    0
Gender                        0
NumberOfPersonVisiting        0
NumberOfFollowups            45
ProductPitched                0
PreferredPropertyStar        26
MaritalStatus                 0
NumberOfTrips               140
Passport                      0
PitchSatisfactionScore        0
OwnCar                        0
NumberOfChildrenVisiting     66
Designation                   0
MonthlyIncome               233
dtype: int64

In [4]:
df['Gender'].value_counts()

Gender
Male       2916
Female     1817
Fe Male     155
Name: count, dtype: int64

In [5]:
df['Gender'] = df['Gender'].replace('Fe Male', 'Female') # Replacing "Fe Male" with "Female"

In [6]:
df['Gender'].value_counts()

Gender
Male      2916
Female    1972
Name: count, dtype: int64

In [7]:
df['MaritalStatus'].value_counts()

MaritalStatus
Married      2340
Divorced      950
Single        916
Unmarried     682
Name: count, dtype: int64

In [8]:
df['MaritalStatus'] = df['MaritalStatus'].replace('Single','Unmarried')
df['MaritalStatus'].value_counts()

MaritalStatus
Married      2340
Unmarried    1598
Divorced      950
Name: count, dtype: int64

In [9]:
# Checking Missing Values
features_with_nan = [features for features in df.columns if df[features].isnull().sum()>=1]
for feature in features_with_nan:
    print(feature, np.round(df[feature].isnull().mean()*100,5), '% missing values')


Age 4.62357 % missing values
TypeofContact 0.51146 % missing values
DurationOfPitch 5.13502 % missing values
NumberOfFollowups 0.92062 % missing values
PreferredPropertyStar 0.53191 % missing values
NumberOfTrips 2.86416 % missing values
NumberOfChildrenVisiting 1.35025 % missing values
MonthlyIncome 4.76678 % missing values


In [10]:
df[features_with_nan].select_dtypes(exclude = 'object').describe()

Unnamed: 0,Age,DurationOfPitch,NumberOfFollowups,PreferredPropertyStar,NumberOfTrips,NumberOfChildrenVisiting,MonthlyIncome
count,4662.0,4637.0,4843.0,4862.0,4748.0,4822.0,4655.0
mean,37.622265,15.490835,3.708445,3.581037,3.236521,1.187267,23619.853491
std,9.316387,8.519643,1.002509,0.798009,1.849019,0.857861,5380.698361
min,18.0,5.0,1.0,3.0,1.0,0.0,1000.0
25%,31.0,9.0,3.0,3.0,2.0,1.0,20346.0
50%,36.0,13.0,4.0,3.0,3.0,1.0,22347.0
75%,44.0,20.0,4.0,4.0,4.0,2.0,25571.0
max,61.0,127.0,6.0,5.0,22.0,3.0,98678.0


### Imputing Null Values
1. Impute Median Value for Age
2. Impute Mode for TypeofContact
3. Impute Median for DurationOfPitch
4. Impute Mode for NumberOfFollowups as it's Discrete Feature
5. Impute Mode for PreferredPropertyStar
6. Impute Median for NumberOfTrips
7. Impute Mode for NumberOfChildrenVisiting
8. Impute Median for MonthlyIncome

In [11]:
# Age
df['Age'].fillna(df['Age'].median(), inplace=True)

# TypeofContact
df['TypeofContact'].fillna(df['TypeofContact'].mode()[0], inplace=True)

# DurationOfPitch
df['DurationOfPitch'].fillna(df['DurationOfPitch'].median(), inplace=True)

# NumberOfFollowups
df['NumberOfFollowups'].fillna(df['NumberOfFollowups'].mode()[0], inplace=True)

# PreferredPropertyStar
df['PreferredPropertyStar'].fillna(df['PreferredPropertyStar'].mode()[0], inplace = True)

# NumberOfTrips
df['NumberOfTrips'].fillna(df['NumberOfTrips'].median(), inplace = True)

# NumberOfChildrenVisiting
df['NumberOfChildrenVisiting'].fillna(df['NumberOfChildrenVisiting'].mode()[0], inplace=True)

# MonthlyIncome
df['MonthlyIncome'].fillna(df['MonthlyIncome'].median(), inplace=True)

In [12]:
df.isnull().sum()

CustomerID                  0
ProdTaken                   0
Age                         0
TypeofContact               0
CityTier                    0
DurationOfPitch             0
Occupation                  0
Gender                      0
NumberOfPersonVisiting      0
NumberOfFollowups           0
ProductPitched              0
PreferredPropertyStar       0
MaritalStatus               0
NumberOfTrips               0
Passport                    0
PitchSatisfactionScore      0
OwnCar                      0
NumberOfChildrenVisiting    0
Designation                 0
MonthlyIncome               0
dtype: int64

In [13]:
df.drop('CustomerID', axis = 1).head()

Unnamed: 0,ProdTaken,Age,TypeofContact,CityTier,DurationOfPitch,Occupation,Gender,NumberOfPersonVisiting,NumberOfFollowups,ProductPitched,PreferredPropertyStar,MaritalStatus,NumberOfTrips,Passport,PitchSatisfactionScore,OwnCar,NumberOfChildrenVisiting,Designation,MonthlyIncome
0,1,41.0,Self Enquiry,3,6.0,Salaried,Female,3,3.0,Deluxe,3.0,Unmarried,1.0,1,2,1,0.0,Manager,20993.0
1,0,49.0,Company Invited,1,14.0,Salaried,Male,3,4.0,Deluxe,4.0,Divorced,2.0,0,3,1,2.0,Manager,20130.0
2,1,37.0,Self Enquiry,1,8.0,Free Lancer,Male,3,4.0,Basic,3.0,Unmarried,7.0,1,3,0,0.0,Executive,17090.0
3,0,33.0,Company Invited,1,9.0,Salaried,Female,2,3.0,Basic,3.0,Divorced,2.0,1,5,1,1.0,Executive,17909.0
4,0,36.0,Self Enquiry,1,8.0,Small Business,Male,2,3.0,Basic,4.0,Divorced,1.0,0,5,1,0.0,Executive,18468.0


### Feature Engineering

#### Feature Extraction

In [14]:
df['TotalVisiting'] = df['NumberOfChildrenVisiting'] + df['NumberOfPersonVisiting']
df.drop(columns=['NumberOfChildrenVisiting', 'NumberOfPersonVisiting'], axis = 1, inplace = True)

In [15]:
# Selecting all numeric features and storing it in variable named num_features
num_features = [feature for feature in df.columns if df[feature].dtype != 'O']
print("Number of numerical features is :", len(num_features))

Number of numerical features is : 13


In [16]:
# Selecting all categorical features and storing it in variable named cat_features
cat_features = [feature for feature in df.columns if df[feature].dtype == 'O']
print(f"Number of categorical feature is {len(cat_features)}")

Number of categorical feature is 6


In [17]:
# Selecting all discrete features and storing it in discrete_features
discrete_features = [feature for feature in num_features if len(df[feature].unique()) <= 25]
print(f"Number of discrete features is {len(discrete_features)}")

Number of discrete features is 9


In [18]:
# Selecting all continous features and storing in in continous_features
continous_features = [feature for feature in num_features if feature not in discrete_features]
print(f"Number of continous features is {len(continous_features)}")

Number of continous features is 4


### Train Test Split

In [19]:
from sklearn.model_selection import train_test_split

X = df.drop('ProdTaken', axis = 1)
y = df['ProdTaken']

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X,y, train_size=0.8, random_state=42)
X_train.shape, X_test.shape

((3910, 18), (978, 18))

In [21]:
# Creating column transformer with 3 types of transformer
cat_features = X.select_dtypes(include='object').columns
num_features = X.select_dtypes(exclude='object').columns

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

numeric_transformer = StandardScaler()
oh_transformer = OneHotEncoder(drop='first')

preprocessor = ColumnTransformer(
    [
        ("OneHotEncoder", oh_transformer, cat_features),
        ("StandardScaler", numeric_transformer, num_features)
    ]
)

In [22]:
# Applying transformation in training dataset 
# NOTE : in training dateset USE fit_transform() & in test dataset USE transform() to avoid data leakage

X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

### Random Forest Classifier Model Training

In [23]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score,f1_score,roc_curve, roc_auc_score


from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [29]:
# This is an efficient way of training models
models = {
    "Random Forest":RandomForestClassifier(),
    "Decision Tree":DecisionTreeClassifier(),
    "AdaBoost Classifier" : AdaBoostClassifier(),
    "Gradient Boost Classifier":GradientBoostingClassifier()
} # we can add many algorithms here to see which is performing better

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train,y_train) # Model Training

    # Making predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Training set performance
    model_train_accuracy = accuracy_score(y_train,y_train_pred)
    model_train_f1 = f1_score(y_train,y_train_pred,average='weighted')
    model_train_precision = precision_score(y_train,y_train_pred)
    model_train_recall = recall_score(y_train,y_train_pred)
    model_train_roc_auc_score = roc_auc_score(y_train,y_train_pred)

    # Test set performance
    model_test_accuracy = accuracy_score(y_test,y_test_pred)
    model_test_f1 = f1_score(y_test,y_test_pred, average='weighted')
    model_test_precision = precision_score(y_test,y_test_pred)
    model_test_recall = recall_score(y_test,y_test_pred)
    model_test_roc_auc_score = roc_auc_score(y_test,y_test_pred)

    print(list(models.keys())[i])

    print('-----------------------------------')
    print('Model Performance for training set')
    print('- Accuracy : {:.4f}'.format(model_train_accuracy))
    print('- F1 score : {:.4f}'.format(model_train_f1))
    print('- Precision : {:.4f}'.format(model_train_precision))
    print('- Recall : {:.4f}'.format(model_train_recall))
    print('- ROC AUC Score : {:.4f}'.format(model_train_roc_auc_score))

    print('-----------------------------------')
    print('Model Performance for testing set')
    print('- Accuracy : {:.4f}'.format(model_test_accuracy))
    print('- F1 score : {:.4f}'.format(model_test_f1))
    print('- Precision : {:.4f}'.format(model_test_precision))
    print('- Recall : {:.4f}'.format(model_test_recall))
    print('- ROC AUC Score : {:.4f}'.format(model_test_roc_auc_score))


    print('='*35)
    print('\n')

Random Forest
-----------------------------------
Model Performance for training set
- Accuracy : 1.0000
- F1 score : 1.0000
- Precision : 1.0000
- Recall : 1.0000
- ROC AUC Score : 1.0000
-----------------------------------
Model Performance for testing set
- Accuracy : 0.9121
- F1 score : 0.9035
- Precision : 0.9487
- Recall : 0.5812
- ROC AUC Score : 0.7868


Decision Tree
-----------------------------------
Model Performance for training set
- Accuracy : 1.0000
- F1 score : 1.0000
- Precision : 1.0000
- Recall : 1.0000
- ROC AUC Score : 1.0000
-----------------------------------
Model Performance for testing set
- Accuracy : 0.9008
- F1 score : 0.8995
- Precision : 0.7640
- Recall : 0.7120
- ROC AUC Score : 0.8293


AdaBoost Classifier
-----------------------------------
Model Performance for training set
- Accuracy : 0.8691
- F1 score : 0.8518
- Precision : 0.7863
- Recall : 0.4088
- ROC AUC Score : 0.6917
-----------------------------------
Model Performance for testing set
- Acc

#### We can clearly see by using random forest instead of decision tree , our accuracy for test data has increased because decision tree leads to overfitting i.e. low bias and high variance and random forest helps us in making generalized model by converting high variance into low variance. Therefore, by using Random Forest we get generalized model with high train and test accuracy i.e. low bias and low variance

In [31]:
# Hyperparameter Tuning
rf_params = {
    "max_depth":[5,8,15,None,10],
    "max_features":[5,7,"auto",8],
    "min_samples_split":[2,8,15,20],
    "n_estimators":[100,200,500,1000]         
             }
rf_params

adaboost_params = {
    "n_estimators":[50,60,70,80,90],
    "algorithm":['SAMME','SAMME.R']
}

gboost_params = {
    "loss":['log_loss','deviance','exponential'],
    "criterion":['friedman_mse','squared_error','mse'],
    "min_samples_split":[2,8,15,20],
    "n_estimators":[100,200,500,1000],
    "max_depth":[5,8,15,None,10]
}

In [32]:
# Models list for hyperparameter tuning
randomcv_models = [
    ("RF",RandomForestClassifier(),rf_params),
    ("AdaBoost",AdaBoostClassifier(),adaboost_params),
    ("Gradient Boost",GradientBoostingClassifier(),gboost_params)
]

In [33]:
from sklearn.model_selection import RandomizedSearchCV

model_param = {}
for name , model, params in randomcv_models:
    random = RandomizedSearchCV(estimator=model,
                                param_distributions=params,
                                n_iter=100,
                                cv=3,
                                verbose=2,
                                n_jobs=-1)
    
    random.fit(X_train,y_train)
    model_param[name] = random.best_params_

for model_name in model_param:
    print(f"---------------------------Best params for {model_name}---------------------------")
    print(model_param[model_name])

Fitting 3 folds for each of 100 candidates, totalling 300 fits
[CV] END max_depth=8, max_features=5, min_samples_split=2, n_estimators=500; total time=   1.5s
[CV] END max_depth=8, max_features=5, min_samples_split=2, n_estimators=500; total time=   1.5s
[CV] END max_depth=8, max_features=5, min_samples_split=2, n_estimators=500; total time=   1.6s
[CV] END max_depth=5, max_features=8, min_samples_split=2, n_estimators=200; total time=   0.5s
[CV] END max_depth=5, max_features=8, min_samples_split=2, n_estimators=200; total time=   0.5s
[CV] END max_depth=8, max_features=auto, min_samples_split=8, n_estimators=500; total time=   0.0s
[CV] END max_depth=8, max_features=auto, min_samples_split=8, n_estimators=500; total time=   0.0s
[CV] END max_depth=8, max_features=auto, min_samples_split=8, n_estimators=500; total time=   0.0s
[CV] END max_depth=5, max_features=8, min_samples_split=2, n_estimators=200; total time=   0.4s
[CV] END max_depth=8, max_features=8, min_samples_split=20, n_es



[CV] END .................algorithm=SAMME.R, n_estimators=50; total time=   0.2s
[CV] END ...................algorithm=SAMME, n_estimators=90; total time=   0.3s
[CV] END .................algorithm=SAMME.R, n_estimators=50; total time=   0.2s
[CV] END .................algorithm=SAMME.R, n_estimators=50; total time=   0.2s
[CV] END ...................algorithm=SAMME, n_estimators=90; total time=   0.3s
[CV] END ...................algorithm=SAMME, n_estimators=90; total time=   0.3s
[CV] END .................algorithm=SAMME.R, n_estimators=60; total time=   0.2s
[CV] END .................algorithm=SAMME.R, n_estimators=60; total time=   0.2s




[CV] END .................algorithm=SAMME.R, n_estimators=60; total time=   0.3s
[CV] END .................algorithm=SAMME.R, n_estimators=70; total time=   0.3s
[CV] END .................algorithm=SAMME.R, n_estimators=70; total time=   0.3s
[CV] END .................algorithm=SAMME.R, n_estimators=70; total time=   0.3s
[CV] END .................algorithm=SAMME.R, n_estimators=80; total time=   0.2s
[CV] END .................algorithm=SAMME.R, n_estimators=80; total time=   0.3s
[CV] END .................algorithm=SAMME.R, n_estimators=80; total time=   0.3s
[CV] END .................algorithm=SAMME.R, n_estimators=90; total time=   0.3s




[CV] END .................algorithm=SAMME.R, n_estimators=90; total time=   0.2s
[CV] END .................algorithm=SAMME.R, n_estimators=90; total time=   0.2s
Fitting 3 folds for each of 100 candidates, totalling 300 fits
[CV] END criterion=mse, loss=exponential, max_depth=15, min_samples_split=8, n_estimators=200; total time=   0.0s
[CV] END criterion=mse, loss=exponential, max_depth=15, min_samples_split=8, n_estimators=200; total time=   0.0s
[CV] END criterion=mse, loss=exponential, max_depth=15, min_samples_split=8, n_estimators=200; total time=   0.0s
[CV] END criterion=squared_error, loss=deviance, max_depth=8, min_samples_split=8, n_estimators=200; total time=   0.0s
[CV] END criterion=squared_error, loss=deviance, max_depth=8, min_samples_split=8, n_estimators=200; total time=   0.0s
[CV] END criterion=squared_error, loss=deviance, max_depth=8, min_samples_split=8, n_estimators=200; total time=   0.0s
[CV] END criterion=mse, loss=deviance, max_depth=15, min_samples_split=15

In [34]:
# Now training the model using the best param found through RandomizedSearchCV
models = {
    "Random Forest":RandomForestClassifier(n_estimators=500,min_samples_split=2,max_features=8,max_depth=None),
    "AdaBoost Classifier":AdaBoostClassifier(n_estimators=70,algorithm='SAMME.R'),
    "Gradient Boost Classifier":GradientBoostingClassifier(n_estimators=500,min_samples_split=20,max_depth=10,loss='exponential',criterion='squared_error')
}

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train,y_train) # Model Training

    # Making predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Training set performance
    model_train_accuracy = accuracy_score(y_train,y_train_pred)
    model_train_f1 = f1_score(y_train,y_train_pred,average='weighted')
    model_train_precision = precision_score(y_train,y_train_pred)
    model_train_recall = recall_score(y_train,y_train_pred)
    model_train_roc_auc_score = roc_auc_score(y_train,y_train_pred)

    # Test set performance
    model_test_accuracy = accuracy_score(y_test,y_test_pred)
    model_test_f1 = f1_score(y_test,y_test_pred, average='weighted')
    model_test_precision = precision_score(y_test,y_test_pred)
    model_test_recall = recall_score(y_test,y_test_pred)
    model_test_roc_auc_score = roc_auc_score(y_test,y_test_pred)

    print(list(models.keys())[i])

    print('-----------------------------------')
    print('Model Performance for training set')
    print('- Accuracy : {:.4f}'.format(model_train_accuracy))
    print('- F1 score : {:.4f}'.format(model_train_f1))
    print('- Precision : {:.4f}'.format(model_train_precision))
    print('- Recall : {:.4f}'.format(model_train_recall))
    print('- ROC AUC Score : {:.4f}'.format(model_train_roc_auc_score))

    print('-----------------------------------')
    print('Model Performance for testing set')
    print('- Accuracy : {:.4f}'.format(model_test_accuracy))
    print('- F1 score : {:.4f}'.format(model_test_f1))
    print('- Precision : {:.4f}'.format(model_test_precision))
    print('- Recall : {:.4f}'.format(model_test_recall))
    print('- ROC AUC Score : {:.4f}'.format(model_test_roc_auc_score))


    print('='*35)
    print('\n')


Random Forest
-----------------------------------
Model Performance for training set
- Accuracy : 1.0000
- F1 score : 1.0000
- Precision : 1.0000
- Recall : 1.0000
- ROC AUC Score : 1.0000
-----------------------------------
Model Performance for testing set
- Accuracy : 0.9192
- F1 score : 0.9120
- Precision : 0.9590
- Recall : 0.6126
- ROC AUC Score : 0.8031


AdaBoost Classifier
-----------------------------------
Model Performance for training set
- Accuracy : 0.8731
- F1 score : 0.8576
- Precision : 0.7935
- Recall : 0.4321
- ROC AUC Score : 0.7032
-----------------------------------
Model Performance for testing set
- Accuracy : 0.8476
- F1 score : 0.8268
- Precision : 0.7188
- Recall : 0.3613
- ROC AUC Score : 0.6635


Gradient Boost Classifier
-----------------------------------
Model Performance for training set
- Accuracy : 1.0000
- F1 score : 1.0000
- Precision : 1.0000
- Recall : 1.0000
- ROC AUC Score : 1.0000
-----------------------------------
Model Performance for testi