In [14]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [15]:
# Load the raw dataset; the path is relative to the notebook's working directory.
df = pd.read_csv("Travel.csv")

In [16]:
# Remove the identifier column; rebinding `df` instead of mutating it in place.
df = df.drop(columns=["CustomerID"])

In [17]:
# Collapse the two visitor-count columns into a single total.
# A missing value in either source column leaves the total missing as well.
visitor_cols = ["NumberOfPersonVisiting", "NumberOfChildrenVisiting"]
df["NumberOfTotalVisiting"] = df[visitor_cols[0]].add(df[visitor_cols[1]])
df = df.drop(columns=visitor_cols)

In [18]:
# Count missing values per column to decide which columns need imputation.
df.isnull().sum()

ProdTaken                   0
Age                       226
TypeofContact              25
CityTier                    0
DurationOfPitch           251
Occupation                  0
Gender                      0
NumberOfFollowups          45
ProductPitched              0
PreferredPropertyStar      26
MaritalStatus               0
NumberOfTrips             140
Passport                    0
PitchSatisfactionScore      0
OwnCar                      0
Designation                 0
MonthlyIncome             233
NumberOfTotalVisiting      66
dtype: int64

In [19]:
# Numeric columns with gaps: fill each with its own column median.
# NOTE(review): the medians are computed over the full frame, i.e. before the
# train/test split performed later in the notebook — confirm this is acceptable.
median_filled_cols = [
    "Age",
    "DurationOfPitch",
    "NumberOfFollowups",
    "PreferredPropertyStar",
    "NumberOfTrips",
    "MonthlyIncome",
    "NumberOfTotalVisiting",
]
for col in median_filled_cols:
    df[col] = df[col].fillna(df[col].median())

# Categorical column: fill with the most frequent value.
contact_type = df["TypeofContact"]
df["TypeofContact"] = contact_type.fillna(contact_type.mode()[0])

In [20]:
# After imputation these columns hold no NaN, so they can be stored as integers.
# Casting float -> int drops any fractional part.
int_columns = [
    "NumberOfTotalVisiting",
    "MonthlyIncome",
    "PreferredPropertyStar",
    "DurationOfPitch",
    "Age",
    "NumberOfFollowups",
    "NumberOfTrips",
]
df = df.astype({col: int for col in int_columns})

In [21]:
# Inspect the distinct labels in TypeofContact.
df["TypeofContact"].unique()

array(['Self Enquiry', 'Company Invited'], dtype=object)

In [22]:
# Inspect the distinct labels in MaritalStatus.
df["MaritalStatus"].unique()

array(['Single', 'Divorced', 'Married', 'Unmarried'], dtype=object)

In [23]:
# Normalise the misspelled gender label.
df["Gender"] = df["Gender"].replace({"Fe Male": "Female"})

In [24]:
# Fold "Unmarried" into the existing "Single" category.
df["MaritalStatus"] = df["MaritalStatus"].replace({"Unmarried": "Single"})

In [25]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split


# Target column and the feature frame with the target removed.
Y = df["ProdTaken"]
X = df.drop(columns=["ProdTaken"])

# 70/30 hold-out split with a fixed seed.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.30, random_state=42
)

# Column groups are derived from the training frame by dtype.
num_features = X_train.select_dtypes(exclude="O").columns
cat_features = X_train.select_dtypes(include="O").columns

scaler = StandardScaler()
encoder = OneHotEncoder(drop="first")

# One-hot encode the object columns and standardise everything else.
preprocessor = ColumnTransformer(
    transformers=[
        ("cat_features", encoder, cat_features),
        ("num_features", scaler, num_features),
    ]
)

In [26]:
# Display the configured (not yet fitted) ColumnTransformer.
preprocessor

In [27]:
# Fit the encoder/scaler on the training rows only, then apply the same fitted
# transform to the test rows.
# NOTE(review): X_train / X_test are overwritten here, so re-running this cell
# without re-running the split feeds the already-transformed data back into the
# preprocessor — confirm cells are always run top to bottom.
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

In [28]:
# Sanity check: (rows, encoded feature count) of the transformed training data.
X_train.shape

(3421, 26)

In [29]:
# Sanity check: the transformed test data should have the same number of columns.
X_test.shape

(1467, 26)

In [30]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score


def evalute_model(true, predict):
    """Print accuracy, precision, recall and the classification report.

    Parameters
    ----------
    true : array-like
        Ground-truth labels.
    predict : array-like
        Predicted labels, aligned element-wise with ``true``.

    Returns
    -------
    None
        The metrics are printed; nothing is returned.
    """
    print(f"accuracy score : {accuracy_score(true, predict)}")
    # zero_division=0 yields the same 0.0 as the default when a class is never
    # predicted, but without emitting an undefined-metric warning.
    print(f"precision score : {precision_score(true, predict, zero_division=0)}")
    print(f"recall score : {recall_score(true, predict, zero_division=0)}")
    # The report is a multi-line table; start it on its own line so the header
    # row lines up with the columns beneath it.
    print("classification report :")
    print(classification_report(true, predict, zero_division=0))


# Baseline candidates with default hyper-parameters. The randomised estimators
# get the same fixed seed as the train/test split so that a full re-run of the
# notebook reproduces the reported metrics.
models = {
    "Random forest": RandomForestClassifier(random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Logistic": LogisticRegression(),
    "Adaboost": AdaBoostClassifier(random_state=42),
}

for name, model in models.items():
    model.fit(X_train, Y_train)
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)

    print(f"accuracy for {name} model")

    # Metrics on the data the model was fitted on (optimistic by construction).
    print("------- accuracy for train data---------")
    evalute_model(Y_train, y_pred_train)

    # Metrics on the held-out data.
    print("------- accuracy for test data---------")
    evalute_model(Y_test, y_pred_test)
    


accuracy for Random forest model
------- accuracy for train data---------
accuracy score : 1.0
precision score :  : 1.0
recall score :  : 1.0
classification report :  :               precision    recall  f1-score   support

           0       1.00      1.00      1.00      2775
           1       1.00      1.00      1.00       646

    accuracy                           1.00      3421
   macro avg       1.00      1.00      1.00      3421
weighted avg       1.00      1.00      1.00      3421

------- accuracy for test data---------
accuracy score : 0.9250170415814588
precision score :  : 0.9555555555555556
recall score :  : 0.6277372262773723
classification report :  :               precision    recall  f1-score   support

           0       0.92      0.99      0.96      1193
           1       0.96      0.63      0.76       274

    accuracy                           0.93      1467
   macro avg       0.94      0.81      0.86      1467
weighted avg       0.93      0.93      0.92      146



accuracy for Adaboost model
------- accuracy for train data---------
accuracy score : 0.8555977784273604
precision score :  : 0.7303030303030303
recall score :  : 0.37306501547987614
classification report :  :               precision    recall  f1-score   support

           0       0.87      0.97      0.92      2775
           1       0.73      0.37      0.49       646

    accuracy                           0.86      3421
   macro avg       0.80      0.67      0.70      3421
weighted avg       0.84      0.86      0.84      3421

------- accuracy for test data---------
accuracy score : 0.8350374914792092
precision score :  : 0.6081081081081081
recall score :  : 0.3284671532846715
classification report :  :               precision    recall  f1-score   support

           0       0.86      0.95      0.90      1193
           1       0.61      0.33      0.43       274

    accuracy                           0.84      1467
   macro avg       0.73      0.64      0.67      1467
weighted av

## Hyperparameter tuning

In [31]:
# Candidate values for the random-forest search.
# NOTE(review): scikit-learn documents "entropy" and "log_loss" as the same
# split criterion — confirm whether both entries are intended.
rf_params = dict(
    n_estimators=[100, 400],
    criterion=["gini", "entropy", "log_loss"],
    max_depth=[None, 5, 10, 15],
    min_samples_split=[2, 8, 15],
    max_features=["sqrt", "log2", None],
)

# Candidate values for the AdaBoost search.
# NOTE(review): newer scikit-learn releases deprecate "SAMME.R" — verify
# against the installed version.
ada_params = dict(
    n_estimators=[50, 100, 150, 200],
    algorithm=["SAMME", "SAMME.R"],
    learning_rate=[1, 2, 5, 0.5],
)

In [32]:
# (label, estimator, search space) triples consumed by the tuning loop.
# The estimators are seeded so that every candidate in the search is fitted
# reproducibly; the seed matches the one used for the train/test split.
gridcv_models = [
    ("RF", RandomForestClassifier(random_state=42), rf_params),
    ("adaboost", AdaBoostClassifier(random_state=42), ada_params),
]

In [36]:
from sklearn.model_selection import RandomizedSearchCV

# Best hyper-parameters found per model label.
best_params = {}

for name, model, params in gridcv_models:
    # Every search space here is a dict of plain lists, so the number of
    # distinct combinations is the product of the list lengths. Capping n_iter
    # at that number avoids asking for more candidates than exist.
    n_candidates = int(np.prod([len(values) for values in params.values()]))

    grid = RandomizedSearchCV(
        estimator=model,
        param_distributions=params,
        n_iter=min(100, n_candidates),
        scoring="accuracy",
        cv=5,
        n_jobs=-1,
        verbose=2,
        random_state=42,  # fixed seed: the same candidates are sampled on every run
    )
    grid.fit(X_train, Y_train)

    # predict() on the search object delegates to the refitted best estimator.
    y_pred_train = grid.predict(X_train)
    y_pred_test = grid.predict(X_test)

    print(f"accuracy for {name} model")

    # Metrics on the data used for the search.
    print("------- accuracy for train data---------")
    evalute_model(Y_train, y_pred_train)

    # Metrics on the held-out data.
    print("------- accuracy for test data---------")
    evalute_model(Y_test, y_pred_test)

    best_params[name] = grid.best_params_
    


Fitting 5 folds for each of 100 candidates, totalling 500 fits
accuracy for RF model
------- accuracy for train data---------
accuracy score : 1.0
precision score :  : 1.0
recall score :  : 1.0
classification report :  :               precision    recall  f1-score   support

           0       1.00      1.00      1.00      2775
           1       1.00      1.00      1.00       646

    accuracy                           1.00      3421
   macro avg       1.00      1.00      1.00      3421
weighted avg       1.00      1.00      1.00      3421

------- accuracy for test data---------
accuracy score : 0.9359236537150648
precision score :  : 0.9285714285714286
recall score :  : 0.7116788321167883
classification report :  :               precision    recall  f1-score   support

           0       0.94      0.99      0.96      1193
           1       0.93      0.71      0.81       274

    accuracy                           0.94      1467
   macro avg       0.93      0.85      0.88      1467




accuracy for adaboost model
------- accuracy for train data---------
accuracy score : 0.8517977199649225
precision score :  : 0.7725490196078432
recall score :  : 0.30495356037151705
classification report :  :               precision    recall  f1-score   support

           0       0.86      0.98      0.91      2775
           1       0.77      0.30      0.44       646

    accuracy                           0.85      3421
   macro avg       0.82      0.64      0.68      3421
weighted avg       0.84      0.85      0.82      3421

------- accuracy for test data---------
accuracy score : 0.8425357873210634
precision score :  : 0.6972477064220184
recall score :  : 0.2773722627737226
classification report :  :               precision    recall  f1-score   support

           0       0.85      0.97      0.91      1193
           1       0.70      0.28      0.40       274

    accuracy                           0.84      1467
   macro avg       0.78      0.62      0.65      1467
weighted av

In [37]:
# Show the best hyper-parameter combination recorded for each tuned model.
best_params

{'RF': {'n_estimators': 100,
  'min_samples_split': 2,
  'max_features': None,
  'max_depth': None,
  'criterion': 'gini'},
 'adaboost': {'n_estimators': 100, 'learning_rate': 1, 'algorithm': 'SAMME'}}

In [41]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

# Build the final models from the hyper-parameters recorded by the search
# instead of hand-copied literals, so they stay in sync when the search is
# re-run. The fixed seed makes the reported metrics reproducible.
Rf = RandomForestClassifier(**best_params["RF"], random_state=42)
adaboost = AdaBoostClassifier(**best_params["adaboost"], random_state=42)

Rf.fit(X_train, Y_train)

# Evaluate the tuned random forest on the held-out data.
y_pred = Rf.predict(X_test)

evalute_model(Y_test, y_pred)


accuracy score : 0.9406952965235174
precision score :  : 0.9348837209302325
recall score :  : 0.7335766423357665
classification report :  :               precision    recall  f1-score   support

           0       0.94      0.99      0.96      1193
           1       0.93      0.73      0.82       274

    accuracy                           0.94      1467
   macro avg       0.94      0.86      0.89      1467
weighted avg       0.94      0.94      0.94      1467



In [42]:
# Fit the tuned AdaBoost model and score it on the held-out data.
# fit() returns the estimator itself, so the two calls can be chained.
y_pred = adaboost.fit(X_train, Y_train).predict(X_test)
evalute_model(Y_test, y_pred)


accuracy score : 0.8425357873210634
precision score :  : 0.6972477064220184
recall score :  : 0.2773722627737226
classification report :  :               precision    recall  f1-score   support

           0       0.85      0.97      0.91      1193
           1       0.70      0.28      0.40       274

    accuracy                           0.84      1467
   macro avg       0.78      0.62      0.65      1467
weighted avg       0.82      0.84      0.81      1467

