In [1]:
#import essential libraries
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PowerTransformer
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier,ExtraTreesClassifier,AdaBoostClassifier,GradientBoostingClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

In [2]:
# Read Travel.csv and keep the raw frame in `data`; all cleaning is done on the copy `df`.
pd.set_option('display.max_columns', None)  # show every column when displaying frames
pd.set_option('display.max_rows', None)     # show every row when displaying frames
data = pd.read_csv('Travel.csv')
df = data.copy()
df.head()

Unnamed: 0,CustomerID,ProdTaken,Age,TypeofContact,CityTier,DurationOfPitch,Occupation,Gender,NumberOfPersonVisiting,NumberOfFollowups,ProductPitched,PreferredPropertyStar,MaritalStatus,NumberOfTrips,Passport,PitchSatisfactionScore,OwnCar,NumberOfChildrenVisiting,Designation,MonthlyIncome
0,200000,1,41.0,Self Enquiry,3,6.0,Salaried,Female,3,3.0,Deluxe,3.0,Single,1.0,1,2,1,0.0,Manager,20993.0
1,200001,0,49.0,Company Invited,1,14.0,Salaried,Male,3,4.0,Deluxe,4.0,Divorced,2.0,0,3,1,2.0,Manager,20130.0
2,200002,1,37.0,Self Enquiry,1,8.0,Free Lancer,Male,3,4.0,Basic,3.0,Single,7.0,1,3,0,0.0,Executive,17090.0
3,200003,0,33.0,Company Invited,1,9.0,Salaried,Female,2,3.0,Basic,3.0,Divorced,2.0,1,5,1,1.0,Executive,17909.0
4,200004,0,,Self Enquiry,1,8.0,Small Business,Male,2,3.0,Basic,4.0,Divorced,1.0,0,5,1,0.0,Executive,18468.0


In [3]:
# Checking column dtypes and non-null counts of the data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4888 entries, 0 to 4887
Data columns (total 20 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   CustomerID                4888 non-null   int64  
 1   ProdTaken                 4888 non-null   int64  
 2   Age                       4662 non-null   float64
 3   TypeofContact             4863 non-null   object 
 4   CityTier                  4888 non-null   int64  
 5   DurationOfPitch           4637 non-null   float64
 6   Occupation                4888 non-null   object 
 7   Gender                    4888 non-null   object 
 8   NumberOfPersonVisiting    4888 non-null   int64  
 9   NumberOfFollowups         4843 non-null   float64
 10  ProductPitched            4888 non-null   object 
 11  PreferredPropertyStar     4862 non-null   float64
 12  MaritalStatus             4888 non-null   object 
 13  NumberOfTrips             4748 non-null   float64
 14  Passport

In [4]:
# Missing values per column, sorted from most to fewest
(
    df.isna()
    .sum()
    .to_frame('Null Count')
    .sort_values(by='Null Count', ascending=False)
    .reset_index()
)

Unnamed: 0,index,Null Count
0,DurationOfPitch,251
1,MonthlyIncome,233
2,Age,226
3,NumberOfTrips,140
4,NumberOfChildrenVisiting,66
5,NumberOfFollowups,45
6,PreferredPropertyStar,26
7,TypeofContact,25
8,Designation,0
9,OwnCar,0


In [5]:
# Median pitch duration for each pitched product
(
    df.groupby('ProductPitched')['DurationOfPitch']
    .median()
)

ProductPitched
Basic           13.0
Deluxe          14.0
King            11.0
Standard        14.0
Super Deluxe    13.0
Name: DurationOfPitch, dtype: float64

In [6]:
# Impute missing DurationOfPitch with the median duration of the row's
# ProductPitched group.
# groupby(...).transform('median') broadcasts each group's median back onto
# that group's rows, so one fillna replaces the per-product Python loop
# (which rebuilt the same boolean mask three times per product).
df['DurationOfPitch'] = df['DurationOfPitch'].fillna(
    df.groupby('ProductPitched')['DurationOfPitch'].transform('median')
)

In [7]:
# Confirm the imputation worked: DurationOfPitch should no longer top the null counts
(
    df.isna()
    .sum()
    .to_frame('Null Count')
    .sort_values(by='Null Count', ascending=False)
    .reset_index()
)

Unnamed: 0,index,Null Count
0,MonthlyIncome,233
1,Age,226
2,NumberOfTrips,140
3,NumberOfChildrenVisiting,66
4,NumberOfFollowups,45
5,PreferredPropertyStar,26
6,TypeofContact,25
7,MaritalStatus,0
8,Designation,0
9,OwnCar,0


In [8]:
# Median monthly income for each designation
(
    df.groupby('Designation')['MonthlyIncome']
    .median()
)

Designation
AVP               32181.0
Executive         20689.0
Manager           22922.0
Senior Manager    26425.0
VP                34999.0
Name: MonthlyIncome, dtype: float64

In [9]:
# Impute missing MonthlyIncome with the median income of the row's Designation.
# transform('median') returns one value per row (its group's median), so a
# single vectorised fillna replaces the per-designation loop.
df['MonthlyIncome'] = df['MonthlyIncome'].fillna(
    df.groupby('Designation')['MonthlyIncome'].transform('median')
)

# Remaining missing values per column, largest first
(
    df.isna()
    .sum()
    .to_frame('Null Count')
    .sort_values(by='Null Count', ascending=False)
    .reset_index()
)

Unnamed: 0,index,Null Count
0,Age,226
1,NumberOfTrips,140
2,NumberOfChildrenVisiting,66
3,NumberOfFollowups,45
4,PreferredPropertyStar,26
5,TypeofContact,25
6,CustomerID,0
7,MaritalStatus,0
8,Designation,0
9,OwnCar,0


In [10]:
# Median age for each (Designation, Gender) combination
(
    df.groupby(['Designation', 'Gender'])['Age']
    .median()
)

Designation     Gender 
AVP             Fe Male    40.0
                Female     49.0
                Male       49.0
Executive       Fe Male    32.0
                Female     32.0
                Male       32.0
Manager         Fe Male    36.0
                Female     37.0
                Male       36.0
Senior Manager  Fe Male    36.0
                Female     39.0
                Male       39.0
VP              Female     50.0
                Male       47.0
Name: Age, dtype: float64

In [11]:
# Normalise the misspelt gender label 'Fe Male' to 'Female'.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on `df.Gender`: an in-place call on a selected
# column is chained assignment, which pandas warns about and which leaves
# `df` unchanged once Copy-on-Write is enabled.
df['Gender'] = df['Gender'].replace({'Fe Male': 'Female'})

In [12]:
# Median age per designation, with one column per gender
(
    df.groupby(['Designation', 'Gender'])['Age']
    .median()
    .unstack()
)

Gender,Female,Male
Designation,Unnamed: 1_level_1,Unnamed: 2_level_1
AVP,48.0,49.0
Executive,32.0,32.0
Manager,37.0,36.0
Senior Manager,38.0,39.0
VP,50.0,47.0


In [13]:
# Impute missing Age with the median age of the row's Designation
# (grouping is by Designation only, not by Gender).
# transform('median') aligns each group's median with that group's rows,
# so a single fillna replaces the per-designation loop.
df['Age'] = df['Age'].fillna(
    df.groupby('Designation')['Age'].transform('median')
)

# Remaining missing values per column, largest first
(
    df.isna()
    .sum()
    .to_frame('Null Count')
    .sort_values(by='Null Count', ascending=False)
    .reset_index()
)

Unnamed: 0,index,Null Count
0,NumberOfTrips,140
1,NumberOfChildrenVisiting,66
2,NumberOfFollowups,45
3,PreferredPropertyStar,26
4,TypeofContact,25
5,CustomerID,0
6,Designation,0
7,OwnCar,0
8,PitchSatisfactionScore,0
9,Passport,0


In [14]:
# Distinct values present in NumberOfTrips (including NaN)
df['NumberOfTrips'].unique()

array([ 1.,  2.,  7.,  5.,  6.,  3.,  4., 19., 21.,  8., nan, 20., 22.])

In [15]:
# Median number of trips per designation, with one column per gender
(
    df.groupby(['Designation', 'Gender'])['NumberOfTrips']
    .median()
    .unstack()
)

Gender,Female,Male
Designation,Unnamed: 1_level_1,Unnamed: 2_level_1
AVP,4.0,3.0
Executive,3.0,3.0
Manager,3.0,3.0
Senior Manager,3.0,3.0
VP,4.0,2.0


In [16]:
# Impute missing NumberOfTrips with 3, the median for most
# Designation/Gender groups in the table displayed by the previous cell.
# The filled column is assigned back rather than using fillna(inplace=True)
# on a selected column: that is chained assignment, which pandas warns about
# and which does not modify `df` under Copy-on-Write.
df['NumberOfTrips'] = df['NumberOfTrips'].fillna(3)

In [17]:
# Confirm the imputation worked: NumberOfTrips should now report zero nulls
(
    df.isna()
    .sum()
    .to_frame('Null Count')
    .sort_values(by='Null Count', ascending=False)
    .reset_index()
)

Unnamed: 0,index,Null Count
0,NumberOfChildrenVisiting,66
1,NumberOfFollowups,45
2,PreferredPropertyStar,26
3,TypeofContact,25
4,CustomerID,0
5,Designation,0
6,OwnCar,0
7,PitchSatisfactionScore,0
8,Passport,0
9,NumberOfTrips,0


In [18]:
# Median number of children visiting for each marital status
(
    df.groupby('MaritalStatus')['NumberOfChildrenVisiting']
    .median()
)

MaritalStatus
Divorced     1.0
Married      1.0
Single       1.0
Unmarried    1.0
Name: NumberOfChildrenVisiting, dtype: float64

In [19]:
# Marital-status breakdown of the rows where NumberOfChildrenVisiting is missing
missing_children_mask = df['NumberOfChildrenVisiting'].isna()
df.loc[missing_children_mask, 'MaritalStatus'].value_counts()

Married     29
Single      20
Divorced    17
Name: MaritalStatus, dtype: int64

In [20]:
# Impute missing NumberOfChildrenVisiting with 1, the median for every
# marital status in the grouped medians computed earlier.
# The filled column is assigned back rather than using fillna(inplace=True)
# through attribute access: that is chained assignment, which pandas warns
# about and which does not modify `df` under Copy-on-Write.
df['NumberOfChildrenVisiting'] = df['NumberOfChildrenVisiting'].fillna(1)

In [21]:
# Confirm the imputation worked: NumberOfChildrenVisiting should now report zero nulls
(
    df.isna()
    .sum()
    .to_frame('Null Count')
    .sort_values(by='Null Count', ascending=False)
    .reset_index()
)

Unnamed: 0,index,Null Count
0,NumberOfFollowups,45
1,PreferredPropertyStar,26
2,TypeofContact,25
3,CustomerID,0
4,Designation,0
5,NumberOfChildrenVisiting,0
6,OwnCar,0
7,PitchSatisfactionScore,0
8,Passport,0
9,NumberOfTrips,0


In [22]:
# Median number of follow-ups for each pitch satisfaction score
(
    df.groupby('PitchSatisfactionScore')['NumberOfFollowups']
    .median()
)

PitchSatisfactionScore
1    4.0
2    4.0
3    4.0
4    4.0
5    4.0
Name: NumberOfFollowups, dtype: float64

In [23]:
# Impute missing NumberOfFollowups with 4, the median for every
# PitchSatisfactionScore in the grouped medians computed in the previous cell.
# The filled column is assigned back rather than using fillna(inplace=True)
# through attribute access: that is chained assignment, which pandas warns
# about and which does not modify `df` under Copy-on-Write.
df['NumberOfFollowups'] = df['NumberOfFollowups'].fillna(4)

In [24]:
# Confirm the imputation worked: NumberOfFollowups should now report zero nulls
(
    df.isna()
    .sum()
    .to_frame('Null Count')
    .sort_values(by='Null Count', ascending=False)
    .reset_index()
)

Unnamed: 0,index,Null Count
0,PreferredPropertyStar,26
1,TypeofContact,25
2,CustomerID,0
3,Designation,0
4,NumberOfChildrenVisiting,0
5,OwnCar,0
6,PitchSatisfactionScore,0
7,Passport,0
8,NumberOfTrips,0
9,MaritalStatus,0


In [25]:
# Impute missing PreferredPropertyStar with the column's overall median
df['PreferredPropertyStar'] = df['PreferredPropertyStar'].fillna(
    df['PreferredPropertyStar'].median()
)

# Remaining missing values per column, largest first
(
    df.isna()
    .sum()
    .to_frame('Null Count')
    .sort_values(by='Null Count', ascending=False)
    .reset_index()
)

Unnamed: 0,index,Null Count
0,TypeofContact,25
1,CustomerID,0
2,PreferredPropertyStar,0
3,Designation,0
4,NumberOfChildrenVisiting,0
5,OwnCar,0
6,PitchSatisfactionScore,0
7,Passport,0
8,NumberOfTrips,0
9,MaritalStatus,0


In [26]:
# Contingency table: Occupation (rows) against TypeofContact (columns)
pd.crosstab(index=df['Occupation'], columns=df['TypeofContact'])

TypeofContact,Company Invited,Self Enquiry
Occupation,Unnamed: 1_level_1,Unnamed: 2_level_1
Free Lancer,0,2
Large Business,110,324
Salaried,726,1637
Small Business,583,1481


In [27]:
# Occupation breakdown of the rows where TypeofContact is missing
missing_contact_mask = df['TypeofContact'].isna()
df.loc[missing_contact_mask, 'Occupation'].value_counts()

Small Business    20
Salaried           5
Name: Occupation, dtype: int64

In [28]:
# Impute missing TypeofContact with 'Self Enquiry', the more frequent
# contact type for every occupation in the crosstab computed earlier.
# The filled column is assigned back rather than using fillna(inplace=True)
# through attribute access: that is chained assignment, which pandas warns
# about and which does not modify `df` under Copy-on-Write.
df['TypeofContact'] = df['TypeofContact'].fillna('Self Enquiry')

# Remaining missing values per column, largest first (all should be zero now)
(
    df.isna()
    .sum()
    .to_frame('Null Count')
    .sort_values(by='Null Count', ascending=False)
    .reset_index()
)

Unnamed: 0,index,Null Count
0,CustomerID,0
1,ProdTaken,0
2,Designation,0
3,NumberOfChildrenVisiting,0
4,OwnCar,0
5,PitchSatisfactionScore,0
6,Passport,0
7,NumberOfTrips,0
8,MaritalStatus,0
9,PreferredPropertyStar,0


In [29]:
# Summary (count / unique / top / freq) of the object-typed columns
df.describe(include=['object'])

Unnamed: 0,TypeofContact,Occupation,Gender,ProductPitched,MaritalStatus,Designation
count,4888,4888,4888,4888,4888,4888
unique,2,4,2,5,4,5
top,Self Enquiry,Salaried,Male,Basic,Married,Executive
freq,3469,2368,2916,1842,2340,1842


In [30]:
# Distinct product labels, needed to build the label-encoding map
df['ProductPitched'].unique()

array(['Deluxe', 'Basic', 'Standard', 'Super Deluxe', 'King'],
      dtype=object)

In [31]:
# Label-encode ProductPitched with a hand-picked integer code per product.
# The explicit astype('int64') guarantees an integer column: replace() on an
# object column only yields integers through implicit downcasting, which
# pandas deprecates. Without the cast the column could stay object-typed and
# be one-hot encoded by the later get_dummies call.
df['ProductPitched'] = (
    df['ProductPitched']
    .replace({'Deluxe': 2, 'Basic': 0, 'Standard': 1, 'Super Deluxe': 3, 'King': 4})
    .astype('int64')
)
df['ProductPitched'].unique()

array([2, 0, 1, 3, 4], dtype=int64)

In [32]:
# Distinct designation labels, needed to build the label-encoding map
df['Designation'].unique()

array(['Manager', 'Executive', 'Senior Manager', 'AVP', 'VP'],
      dtype=object)

In [33]:
# Label-encode Designation with a hand-picked integer code per designation.
# The explicit astype('int64') guarantees an integer column: replace() on an
# object column only yields integers through implicit downcasting, which
# pandas deprecates. Without the cast the column could stay object-typed and
# be one-hot encoded by the later get_dummies call.
df['Designation'] = (
    df['Designation']
    .replace({'Manager': 1, 'Executive': 0, 'Senior Manager': 2, 'AVP': 3, 'VP': 4})
    .astype('int64')
)
df['Designation'].unique()

array([1, 0, 2, 3, 4], dtype=int64)

In [34]:
# Divide the data into predictors (X) and target (y).
# CustomerID is dropped along with the target, so it is not used as a feature.
X = df.drop(columns=['ProdTaken', 'CustomerID'])
y = df['ProdTaken']
X.head()

Unnamed: 0,Age,TypeofContact,CityTier,DurationOfPitch,Occupation,Gender,NumberOfPersonVisiting,NumberOfFollowups,ProductPitched,PreferredPropertyStar,MaritalStatus,NumberOfTrips,Passport,PitchSatisfactionScore,OwnCar,NumberOfChildrenVisiting,Designation,MonthlyIncome
0,41.0,Self Enquiry,3,6.0,Salaried,Female,3,3.0,2,3.0,Single,1.0,1,2,1,0.0,1,20993.0
1,49.0,Company Invited,1,14.0,Salaried,Male,3,4.0,2,4.0,Divorced,2.0,0,3,1,2.0,1,20130.0
2,37.0,Self Enquiry,1,8.0,Free Lancer,Male,3,4.0,0,3.0,Single,7.0,1,3,0,0.0,0,17090.0
3,33.0,Company Invited,1,9.0,Salaried,Female,2,3.0,0,3.0,Divorced,2.0,1,5,1,1.0,0,17909.0
4,32.0,Self Enquiry,1,8.0,Small Business,Male,2,3.0,0,4.0,Divorced,1.0,0,5,1,0.0,0,18468.0


In [35]:
# One-hot encode the remaining object-typed predictors; drop_first removes
# the first level of each variable.
X = pd.get_dummies(data=X, drop_first=True)
X.head()

Unnamed: 0,Age,CityTier,DurationOfPitch,NumberOfPersonVisiting,NumberOfFollowups,ProductPitched,PreferredPropertyStar,NumberOfTrips,Passport,PitchSatisfactionScore,OwnCar,NumberOfChildrenVisiting,Designation,MonthlyIncome,TypeofContact_Self Enquiry,Occupation_Large Business,Occupation_Salaried,Occupation_Small Business,Gender_Male,MaritalStatus_Married,MaritalStatus_Single,MaritalStatus_Unmarried
0,41.0,3,6.0,3,3.0,2,3.0,1.0,1,2,1,0.0,1,20993.0,1,0,1,0,0,0,1,0
1,49.0,1,14.0,3,4.0,2,4.0,2.0,0,3,1,2.0,1,20130.0,0,0,1,0,1,0,0,0
2,37.0,1,8.0,3,4.0,0,3.0,7.0,1,3,0,0.0,0,17090.0,1,0,0,0,1,0,1,0
3,33.0,1,9.0,2,3.0,0,3.0,2.0,1,5,1,1.0,0,17909.0,0,0,1,0,0,0,0,0
4,32.0,1,8.0,2,3.0,0,4.0,1.0,0,5,1,0.0,0,18468.0,1,0,0,1,1,0,0,0


In [36]:
# Hold out 20% of the rows as a test set; stratify on y so the split
# preserves the target's class proportions.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [37]:
# Train several classifiers on the untransformed features, each wrapped in a
# single-step Pipeline, and report the F1 score on the test split.
RANDOM_STATE = 42  # fixed seed so the tree/boosting scores are reproducible on re-run

# (display name, pipeline step name, estimator) for every model compared.
estimators = [
    ("LR", "log_reg", LogisticRegression(n_jobs=-1)),
    ("RF", "random_forest", RandomForestClassifier(n_jobs=-1, random_state=RANDOM_STATE)),
    ("GBM", "gradient_boosting", GradientBoostingClassifier(random_state=RANDOM_STATE)),
    ("ADA", "ADA", AdaBoostClassifier(random_state=RANDOM_STATE)),
    ("ETREE", "extra_tree", ExtraTreesClassifier(random_state=RANDOM_STATE)),
    ("XGB", "xgboost", XGBClassifier(eval_metric='logloss', n_jobs=-1, random_state=RANDOM_STATE)),
    ("LGBM", "lgbm", LGBMClassifier(random_state=RANDOM_STATE)),
]

# List of (display name, Pipeline) pairs, one pipeline per estimator.
models = [
    (label, Pipeline(steps=[(step_name, estimator)]))
    for label, step_name, estimator in estimators
]

names = []    # model names, in evaluation order
results = []  # F1 score (in percent) of each model, aligned with `names`
# Loop through all models to get the F1 score on the test split
for name, model in models:
    names.append(name)  # was `names.append(names)`, which appended the list to itself
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    f1 = round(f1_score(y_test, y_pred) * 100, 2)
    results.append(f1)  # previously never populated
    print("F-1 Score of {}: {}".format(name, f1))

F-1 Score of LR: 15.53
F-1 Score of RF: 75.91
F-1 Score of GBM: 47.06
F-1 Score of ADA: 37.04
F-1 Score of ETREE: 79.62
F-1 Score of XGB: 77.98
F-1 Score of LGBM: 69.77


In [38]:
# Train the same classifiers, this time with a PowerTransformer step in front
# of each estimator, and report the F1 score on the test split.
# The transformer is part of each Pipeline, so it is fitted on x_train only
# when the pipeline is fitted.
RANDOM_STATE = 42  # fixed seed so the tree/boosting scores are reproducible on re-run

# (display name, pipeline step name, estimator) for every model compared.
estimators = [
    ("LR", "log_reg", LogisticRegression(n_jobs=-1)),
    ("RF", "random_forest", RandomForestClassifier(n_jobs=-1, random_state=RANDOM_STATE)),
    ("GBM", "gradient_boosting", GradientBoostingClassifier(random_state=RANDOM_STATE)),
    ("ADA", "ADA", AdaBoostClassifier(random_state=RANDOM_STATE)),
    ("ETREE", "extra_tree", ExtraTreesClassifier(random_state=RANDOM_STATE)),
    ("XGB", "xgboost", XGBClassifier(eval_metric='logloss', n_jobs=-1, random_state=RANDOM_STATE)),
    ("LGBM", "lgbm", LGBMClassifier(random_state=RANDOM_STATE)),
]

# List of (display name, Pipeline) pairs; every pipeline gets its own
# PowerTransformer instance followed by the estimator.
models = [
    (label, Pipeline(steps=[("pt", PowerTransformer()), (step_name, estimator)]))
    for label, step_name, estimator in estimators
]

names = []    # model names, in evaluation order
results = []  # F1 score (in percent) of each model, aligned with `names`
# Loop through all models to get the F1 score on the test split
for name, model in models:
    names.append(name)  # was `names.append(names)`, which appended the list to itself
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    f1 = round(f1_score(y_test, y_pred) * 100, 2)
    results.append(f1)  # previously never populated
    print("F-1 Score of {}: {}".format(name, f1))

F-1 Score of LR: 36.43
F-1 Score of RF: 75.5
F-1 Score of GBM: 48.89
F-1 Score of ADA: 38.38
F-1 Score of ETREE: 82.1
F-1 Score of XGB: 80.24
F-1 Score of LGBM: 65.56
