In [62]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [63]:
# Read the dataset from 'travel.csv' (path relative to the working directory)
# and preview the first rows.
data = pd.read_csv('travel.csv')
data.head()

Unnamed: 0,CustomerID,ProdTaken,Age,TypeofContact,CityTier,DurationOfPitch,Occupation,Gender,NumberOfPersonVisiting,NumberOfFollowups,ProductPitched,PreferredPropertyStar,MaritalStatus,NumberOfTrips,Passport,PitchSatisfactionScore,OwnCar,NumberOfChildrenVisiting,Designation,MonthlyIncome
0,200000,1,41.0,Self Enquiry,3,6.0,Salaried,Female,3,3.0,Deluxe,3.0,Single,1.0,1,2,1,0.0,Manager,20993.0
1,200001,0,49.0,Company Invited,1,14.0,Salaried,Male,3,4.0,Deluxe,4.0,Divorced,2.0,0,3,1,2.0,Manager,20130.0
2,200002,1,37.0,Self Enquiry,1,8.0,Free Lancer,Male,3,4.0,Basic,3.0,Single,7.0,1,3,0,0.0,Executive,17090.0
3,200003,0,33.0,Company Invited,1,9.0,Salaried,Female,2,3.0,Basic,3.0,Divorced,2.0,1,5,1,1.0,Executive,17909.0
4,200004,0,,Self Enquiry,1,8.0,Small Business,Male,2,3.0,Basic,4.0,Divorced,1.0,0,5,1,0.0,Executive,18468.0


In [64]:
# Show each column's dtype and non-null count.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4888 entries, 0 to 4887
Data columns (total 20 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   CustomerID                4888 non-null   int64  
 1   ProdTaken                 4888 non-null   int64  
 2   Age                       4662 non-null   float64
 3   TypeofContact             4863 non-null   object 
 4   CityTier                  4888 non-null   int64  
 5   DurationOfPitch           4637 non-null   float64
 6   Occupation                4888 non-null   object 
 7   Gender                    4888 non-null   object 
 8   NumberOfPersonVisiting    4888 non-null   int64  
 9   NumberOfFollowups         4843 non-null   float64
 10  ProductPitched            4888 non-null   object 
 11  PreferredPropertyStar     4862 non-null   float64
 12  MaritalStatus             4888 non-null   object 
 13  NumberOfTrips             4748 non-null   float64
 14  Passport

In [65]:
# Count the missing values in every column.
data.isna().sum()

CustomerID                    0
ProdTaken                     0
Age                         226
TypeofContact                25
CityTier                      0
DurationOfPitch             251
Occupation                    0
Gender                        0
NumberOfPersonVisiting        0
NumberOfFollowups            45
ProductPitched                0
PreferredPropertyStar        26
MaritalStatus                 0
NumberOfTrips               140
Passport                      0
PitchSatisfactionScore        0
OwnCar                        0
NumberOfChildrenVisiting     66
Designation                   0
MonthlyIncome               233
dtype: int64

In [66]:
# Clean up category labels: correct the 'Fe Male' spelling and relabel
# 'Single' as 'Unmarried'.
data['Gender'] = data['Gender'].replace({'Fe Male': 'Female'})
data['MaritalStatus'] = data['MaritalStatus'].replace({'Single': 'Unmarried'})

In [67]:
# Impute missing numeric values with each column's median.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on data[col]: the in-place form operates on an
# intermediate object, raises a FutureWarning in pandas 2.x and no longer
# updates `data` once Copy-on-Write is the default behaviour.
num_cols = data.select_dtypes(include=['number']).columns
for col in num_cols:
    data[col] = data[col].fillna(data[col].median())

# Replace missing values for categorical columns with the mode
cat_cols = data.select_dtypes(include=['object']).columns
for col in cat_cols:
    data[col] = data[col].fillna(data[col].mode()[0])

In [68]:
# Preview the first rows again after the imputation step.
data.head()

Unnamed: 0,CustomerID,ProdTaken,Age,TypeofContact,CityTier,DurationOfPitch,Occupation,Gender,NumberOfPersonVisiting,NumberOfFollowups,ProductPitched,PreferredPropertyStar,MaritalStatus,NumberOfTrips,Passport,PitchSatisfactionScore,OwnCar,NumberOfChildrenVisiting,Designation,MonthlyIncome
0,200000,1,41.0,Self Enquiry,3,6.0,Salaried,Female,3,3.0,Deluxe,3.0,Unmarried,1.0,1,2,1,0.0,Manager,20993.0
1,200001,0,49.0,Company Invited,1,14.0,Salaried,Male,3,4.0,Deluxe,4.0,Divorced,2.0,0,3,1,2.0,Manager,20130.0
2,200002,1,37.0,Self Enquiry,1,8.0,Free Lancer,Male,3,4.0,Basic,3.0,Unmarried,7.0,1,3,0,0.0,Executive,17090.0
3,200003,0,33.0,Company Invited,1,9.0,Salaried,Female,2,3.0,Basic,3.0,Divorced,2.0,1,5,1,1.0,Executive,17909.0
4,200004,0,36.0,Self Enquiry,1,8.0,Small Business,Male,2,3.0,Basic,4.0,Divorced,1.0,0,5,1,0.0,Executive,18468.0


In [69]:
# Remove the CustomerID column from the frame.
# Rebinding `data` (instead of inplace=True) together with errors='ignore'
# keeps this cell re-runnable: executing it a second time is a no-op rather
# than a KeyError on the already-removed column.
data = data.drop(columns=['CustomerID'], errors='ignore')

In [70]:
# Derived feature: element-wise sum of the two "visiting" count columns.
data['Total'] = data['NumberOfPersonVisiting'].add(data['NumberOfChildrenVisiting'])

In [71]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split

In [72]:
# Preview the current state of the frame.
data.head()

Unnamed: 0,ProdTaken,Age,TypeofContact,CityTier,DurationOfPitch,Occupation,Gender,NumberOfPersonVisiting,NumberOfFollowups,ProductPitched,PreferredPropertyStar,MaritalStatus,NumberOfTrips,Passport,PitchSatisfactionScore,OwnCar,NumberOfChildrenVisiting,Designation,MonthlyIncome,Total
0,1,41.0,Self Enquiry,3,6.0,Salaried,Female,3,3.0,Deluxe,3.0,Unmarried,1.0,1,2,1,0.0,Manager,20993.0,3.0
1,0,49.0,Company Invited,1,14.0,Salaried,Male,3,4.0,Deluxe,4.0,Divorced,2.0,0,3,1,2.0,Manager,20130.0,5.0
2,1,37.0,Self Enquiry,1,8.0,Free Lancer,Male,3,4.0,Basic,3.0,Unmarried,7.0,1,3,0,0.0,Executive,17090.0,3.0
3,0,33.0,Company Invited,1,9.0,Salaried,Female,2,3.0,Basic,3.0,Divorced,2.0,1,5,1,1.0,Executive,17909.0,3.0
4,0,36.0,Self Enquiry,1,8.0,Small Business,Male,2,3.0,Basic,4.0,Divorced,1.0,0,5,1,0.0,Executive,18468.0,2.0


In [73]:
# Print the value frequencies of every column listed in cat_cols.
for column_name in cat_cols:
    frequencies = data[column_name].value_counts()
    print(frequencies)

TypeofContact
Self Enquiry       3469
Company Invited    1419
Name: count, dtype: int64
Occupation
Salaried          2368
Small Business    2084
Large Business     434
Free Lancer          2
Name: count, dtype: int64
Gender
Male      2916
Female    1972
Name: count, dtype: int64
ProductPitched
Basic           1842
Deluxe          1732
Standard         742
Super Deluxe     342
King             230
Name: count, dtype: int64
MaritalStatus
Married      2340
Unmarried    1598
Divorced      950
Name: count, dtype: int64
Designation
Executive         1842
Manager           1732
Senior Manager     742
AVP                342
VP                 230
Name: count, dtype: int64


In [74]:
import pickle

In [75]:
# One-hot encode the nominal columns.
# The encoded frame reuses data.index so the column-wise concat lines rows up
# by label even when `data` does not carry a default RangeIndex (for example
# after rows were filtered out earlier); a fresh 0..n-1 index would silently
# misalign rows in that case.
ohe_cols = ['TypeofContact', 'Gender', 'MaritalStatus']
ohe = OneHotEncoder(sparse_output=False)
ohe_encoded = ohe.fit_transform(data[ohe_cols])
ohe_df = pd.DataFrame(
    ohe_encoded,
    columns=ohe.get_feature_names_out(ohe_cols),
    index=data.index,
)
data = pd.concat([data.drop(columns=ohe_cols), ohe_df], axis=1)

# Save One-Hot Encoder
with open('ohe_model.pkl', 'wb') as file:
    pickle.dump(ohe, file)

# Label Encoding for specified columns
le_cols = ['Occupation', 'Designation', 'ProductPitched']
label_encoders = {}
for col in le_cols:
    le = LabelEncoder()
    encoded_col = pd.Series(le.fit_transform(data[col]), index=data.index, name=col)
    # Drop-then-append moves the encoded column to the end of the frame, which
    # keeps the final column order identical to what this cell produced before.
    data = pd.concat([data.drop(columns=col), encoded_col], axis=1)
    label_encoders[col] = le

    # Save each Label Encoder
    with open(f'le_{col}_model.pkl', 'wb') as file:
        pickle.dump(le, file)

data.head()

Unnamed: 0,ProdTaken,Age,CityTier,DurationOfPitch,NumberOfPersonVisiting,NumberOfFollowups,PreferredPropertyStar,NumberOfTrips,Passport,PitchSatisfactionScore,...,TypeofContact_Company Invited,TypeofContact_Self Enquiry,Gender_Female,Gender_Male,MaritalStatus_Divorced,MaritalStatus_Married,MaritalStatus_Unmarried,Occupation,Designation,ProductPitched
0,1,41.0,3,6.0,3,3.0,3.0,1.0,1,2,...,0.0,1.0,1.0,0.0,0.0,0.0,1.0,2,2,1
1,0,49.0,1,14.0,3,4.0,4.0,2.0,0,3,...,1.0,0.0,0.0,1.0,1.0,0.0,0.0,2,2,1
2,1,37.0,1,8.0,3,4.0,3.0,7.0,1,3,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0,1,0
3,0,33.0,1,9.0,2,3.0,3.0,2.0,1,5,...,1.0,0.0,1.0,0.0,1.0,0.0,0.0,2,1,0
4,0,36.0,1,8.0,2,3.0,4.0,1.0,0,5,...,0.0,1.0,0.0,1.0,1.0,0.0,0.0,3,1,0


In [76]:
# Separate the target column (ProdTaken) from the remaining feature columns.
y = data['ProdTaken']
X = data.drop(columns=['ProdTaken'])

In [77]:
# Hold out 25% of the rows as a test set; the fixed random_state makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=16,
)

In [78]:
StandardScaler = StandardScaler()
X_train = pd.DataFrame(StandardScaler.fit_transform(X_train))
X_test = pd.DataFrame(StandardScaler.transform(X_test), columns=X.columns)

In [79]:
with open('scaler.pkl', 'wb') as file:
    pickle.dump(StandardScaler, file)

In [80]:
# Inspect the first rows of the scaled training features.
X_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13,14,15,16,17,18,19,20,21,22
0,0.268661,-0.714038,-0.294906,0.130719,-0.732503,1.804111,0.419739,1.563806,-0.056919,-1.270028,...,-0.638619,0.638619,-0.830302,0.830302,-0.490091,-0.954655,1.426453,-0.533315,-0.762518,-0.932567
1,0.923739,-0.714038,-1.004306,1.511746,-1.747759,-0.722196,-0.128058,-0.639466,0.680416,0.787384,...,-0.638619,0.638619,-0.830302,0.830302,-0.490091,1.047499,-0.70104,-0.533315,2.321768,0.629238
2,-1.150676,-0.714038,-0.886072,-1.250307,-0.732503,-0.722196,-0.128058,-0.639466,-0.056919,-1.270028,...,1.565879,-1.565879,-0.830302,0.830302,-0.490091,-0.954655,1.426453,1.044677,-0.762518,-0.932567
3,-0.713957,-0.714038,1.123894,1.511746,0.282754,-0.722196,-0.128058,-0.639466,-0.794254,0.787384,...,-0.638619,0.638619,1.204381,-1.204381,-0.490091,-0.954655,1.426453,-0.533315,-0.762518,-0.932567
4,-0.168058,-0.714038,1.71506,0.130719,1.298011,-0.722196,0.967536,1.563806,0.680416,0.787384,...,-0.638619,0.638619,-0.830302,0.830302,2.040436,-0.954655,-0.70104,-0.533315,0.265577,-0.151665


In [88]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report,ConfusionMatrixDisplay, precision_score, recall_score, f1_score, roc_auc_score,roc_curve 

In [93]:
# Candidate classifiers, keyed by the display name that the tuning loop and
# `param_grid` use.
# Estimators with a stochastic component receive a fixed random_state so that
# re-running the notebook yields the same fitted models and scores; 16 is the
# seed already used for train_test_split.
MODEL_SEED = 16

models = {
    'Logistic Regression': LogisticRegression(random_state=MODEL_SEED),
    'Decision Tree': DecisionTreeClassifier(random_state=MODEL_SEED),
    'Random Forest': RandomForestClassifier(random_state=MODEL_SEED),
    'Gradient Boosting': GradientBoostingClassifier(random_state=MODEL_SEED),
    'AdaBoost': AdaBoostClassifier(random_state=MODEL_SEED),
    # NOTE(review): 'gpu_hist' needs a CUDA-enabled XGBoost build, and
    # XGBoost 2.0+ deprecates it in favour of tree_method='hist' with
    # device='cuda' -- confirm the installed version before changing it.
    'XGBoost': XGBClassifier(tree_method='gpu_hist', random_state=MODEL_SEED),
    'SVC': SVC(),
    'KNN': KNeighborsClassifier(),
    'Naive Bayes': GaussianNB()
}

In [94]:
# Hyperparameter search space per model, keyed by the same display names used
# in `models`. An empty mapping means "nothing to tune".
param_grid = {
    'Logistic Regression': dict(
        C=[0.01, 0.1, 1, 10],
        solver=['liblinear', 'lbfgs'],
    ),
    'Decision Tree': dict(
        max_depth=[3, 5, 10, None],
        min_samples_split=[2, 5, 10],
        criterion=['gini', 'entropy'],
    ),
    'Random Forest': dict(
        n_estimators=[50, 100, 200],
        max_depth=[3, 5, 10, None],
        min_samples_split=[2, 5, 10],
    ),
    'Gradient Boosting': dict(
        n_estimators=[50, 100, 200],
        learning_rate=[0.01, 0.1, 0.2],
        max_depth=[3, 5, 10],
    ),
    'AdaBoost': dict(
        n_estimators=[50, 100, 200],
        learning_rate=[0.01, 0.1, 1],
    ),
    'XGBoost': dict(
        n_estimators=[50, 100, 200],
        learning_rate=[0.01, 0.1, 0.2],
        max_depth=[3, 5, 10],
    ),
    'SVC': dict(
        C=[0.01, 0.1, 1, 10],
        kernel=['linear', 'rbf'],
    ),
    'KNN': dict(
        n_neighbors=[3, 5, 7, 10],
        weights=['uniform', 'distance'],
    ),
    'Naive Bayes': dict(),  # No hyperparameters for GaussianNB
}


In [95]:
from sklearn.model_selection import GridSearchCV, cross_val_score

# Tune every candidate in `models` and record, per model name, the fitted
# estimator, the chosen hyperparameters and a 5-fold cross-validated accuracy.
best_models = {}
best_params = {}  # Store best params
best_scores = {}  # Store best scores

for model_name, model in models.items():
    grid = param_grid.get(model_name)
    if grid:  # Check if params exist for the model
        grid_search = GridSearchCV(model, grid, cv=5, scoring='accuracy', n_jobs=-1)
        grid_search.fit(X_train, y_train)

        best_models[model_name] = grid_search.best_estimator_
        best_params[model_name] = grid_search.best_params_
        best_scores[model_name] = grid_search.best_score_

    else:
        # No grid to search: score the default model with the same 5-fold CV
        # accuracy that GridSearchCV reports, so all entries in best_scores are
        # comparable and the held-out test set is not used for model selection.
        cv_accuracy = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')
        model.fit(X_train, y_train)
        best_models[model_name] = model  # Use the default model if no params
        best_params[model_name] = "Default Parameters"
        best_scores[model_name] = cv_accuracy.mean()
    print(f"{model_name} has been trained successfully.")
    print(f"Best Parameters: {best_params[model_name]}")
    print(f"Best Accuracy: {best_scores[model_name]}")


Logistic Regression has been trained successfully.
Best Parameters: {'C': 0.01, 'solver': 'lbfgs'}
Best Accuracy: 0.844248376460442
Decision Tree has been trained successfully.
Best Parameters: {'criterion': 'entropy', 'max_depth': None, 'min_samples_split': 2}
Best Accuracy: 0.8857016999304861
Random Forest has been trained successfully.
Best Parameters: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 100}
Best Accuracy: 0.8938909561319054
Gradient Boosting has been trained successfully.
Best Parameters: {'learning_rate': 0.2, 'max_depth': 10, 'n_estimators': 100}
Best Accuracy: 0.932623573013743
AdaBoost has been trained successfully.
Best Parameters: {'learning_rate': 1, 'n_estimators': 50}
Best Accuracy: 0.8491556107371073
XGBoost has been trained successfully.
Best Parameters: {'learning_rate': 0.2, 'max_depth': 10, 'n_estimators': 200}
Best Accuracy: 0.9263513387928375
SVC has been trained successfully.
Best Parameters: {'C': 10, 'kernel': 'rbf'}
Best Accuracy: 0.9088

In [96]:
# Print results
for model_name in best_models.keys():
    print(f"Model: {model_name}")
    print(f"Best Parameters: {best_params[model_name]}")
    print(f"Best Accuracy Score: {best_scores[model_name]:.4f}")
    print("-" * 50)

Model: Logistic Regression
Best Parameters: {'C': 0.01, 'solver': 'lbfgs'}
Best Accuracy Score: 0.8442
--------------------------------------------------
Model: Decision Tree
Best Parameters: {'criterion': 'entropy', 'max_depth': None, 'min_samples_split': 2}
Best Accuracy Score: 0.8857
--------------------------------------------------
Model: Random Forest
Best Parameters: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 100}
Best Accuracy Score: 0.8939
--------------------------------------------------
Model: Gradient Boosting
Best Parameters: {'learning_rate': 0.2, 'max_depth': 10, 'n_estimators': 100}
Best Accuracy Score: 0.9326
--------------------------------------------------
Model: AdaBoost
Best Parameters: {'learning_rate': 1, 'n_estimators': 50}
Best Accuracy Score: 0.8492
--------------------------------------------------
Model: XGBoost
Best Parameters: {'learning_rate': 0.2, 'max_depth': 10, 'n_estimators': 200}
Best Accuracy Score: 0.9264
-----------------------

In [None]:
# Wider search space for the three models kept for the second tuning pass.
# NOTE(review): exhaustive search over these grids is very large -- 14,000
# combinations for Gradient Boosting and 4,536,000 for XGBoost, each fitted
# once per CV fold.
svc_c_values = [0.01, 0.1, 1, 10, 100, 500, 1000]
svc_gamma_values = ['scale', 'auto', 0.001, 0.01, 0.1, 1, 10]

param_grid = {
    'Gradient Boosting': {
        'n_estimators': [50, 120, 180, 250, 400],
        'learning_rate': [0.01, 0.03, 0.05, 0.08, 0.1, 0.25, 0.5],
        'max_depth': [10, 15, 60, 120],
        'min_samples_split': [2, 5, 10, 15, 20],
        'min_samples_leaf': [1, 2, 5, 10],
        'subsample': [0.5, 0.7, 0.8, 0.9, 1.0],
    },
    'XGBoost': {
        'n_estimators': [50, 120, 180, 250, 400],
        'learning_rate': [0.01, 0.03, 0.05, 0.08, 0.1, 0.25, 0.5],
        'max_depth': [10, 15, 60, 120],
        'min_child_weight': [1, 2, 3, 5, 7, 10],
        'gamma': [0, 0.1, 0.2, 0.3, 0.4, 0.5],
        'subsample': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
        'colsample_bytree': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
        'reg_alpha': [0, 0.01, 0.1, 1, 10],
        'reg_lambda': [0, 0.01, 0.1, 1, 10],
    },
    # One sub-grid per kernel family. `degree` only affects the 'poly' kernel
    # and `gamma` is unused by 'linear', so crossing every value with every
    # kernel fitted 784 combinations of which only 301 are distinct models.
    # The search tools accept a list of grids and explore each one in turn.
    'SVC': [
        {'kernel': ['linear'], 'C': svc_c_values},
        {'kernel': ['rbf', 'sigmoid'], 'C': svc_c_values, 'gamma': svc_gamma_values},
        {
            'kernel': ['poly'],
            'C': svc_c_values,
            'gamma': svc_gamma_values,
            'degree': [2, 3, 4, 5],
        },
    ],
}


In [98]:
# Models kept for the second, wider tuning pass.
# The boosting estimators receive a fixed random_state so repeated runs give
# the same fitted models and scores; 16 is the seed already used for
# train_test_split.
MODEL_SEED = 16

models = {
    'Gradient Boosting': GradientBoostingClassifier(random_state=MODEL_SEED),
    # NOTE(review): 'gpu_hist' needs a CUDA-enabled XGBoost build, and
    # XGBoost 2.0+ deprecates it in favour of tree_method='hist' with
    # device='cuda' -- confirm the installed version before changing it.
    'XGBoost': XGBClassifier(tree_method='gpu_hist', random_state=MODEL_SEED),
    'SVC': SVC(),
}

In [None]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, cross_val_score

# Second tuning pass over the wider search spaces.
# The grids used here span thousands to millions of combinations, so an
# exhaustive GridSearchCV cannot finish in practice. A seeded randomized search
# evaluates a fixed budget of sampled combinations instead; when a grid holds
# fewer combinations than the budget, every combination is evaluated.
N_SEARCH_ITER = 50   # sampled hyperparameter combinations per model
SEARCH_SEED = 16     # same seed as the train/test split

best_models = {}
best_params = {}  # Store best params
best_scores = {}  # Store best scores

for model_name, model in models.items():
    grid = param_grid.get(model_name)
    if grid:  # Check if params exist for the model
        search = RandomizedSearchCV(
            model,
            grid,
            n_iter=N_SEARCH_ITER,
            cv=5,
            scoring='accuracy',
            n_jobs=-1,
            random_state=SEARCH_SEED,
        )
        search.fit(X_train, y_train)

        best_models[model_name] = search.best_estimator_
        best_params[model_name] = search.best_params_
        best_scores[model_name] = search.best_score_

    else:
        # No grid to search: score the default model with the same 5-fold CV
        # accuracy the search reports, so all entries in best_scores are
        # comparable and the held-out test set is not used for model selection.
        cv_accuracy = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')
        model.fit(X_train, y_train)
        best_models[model_name] = model  # Use the default model if no params
        best_params[model_name] = "Default Parameters"
        best_scores[model_name] = cv_accuracy.mean()
    print(f"{model_name} has been trained successfully.")
    print(f"Best Parameters: {best_params[model_name]}")
    print(f"Best Accuracy: {best_scores[model_name]}")


In [None]:
# Print results
separator = "-" * 50
for model_name in best_models:
    print(
        f"Model: {model_name}",
        f"Best Parameters: {best_params[model_name]}",
        f"Best Accuracy Score: {best_scores[model_name]:.4f}",
        separator,
        sep="\n",
    )