Random Forest Classification

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import warnings 
warnings.filterwarnings('ignore')
%matplotlib inline

In [3]:
df=pd.read_csv('Travel.csv')
df.head()

Unnamed: 0,CustomerID,ProdTaken,Age,TypeofContact,CityTier,DurationOfPitch,Occupation,Gender,NumberOfPersonVisiting,NumberOfFollowups,ProductPitched,PreferredPropertyStar,MaritalStatus,NumberOfTrips,Passport,PitchSatisfactionScore,OwnCar,NumberOfChildrenVisiting,Designation,MonthlyIncome
0,200000,1,41.0,Self Enquiry,3,6.0,Salaried,Female,3,3.0,Deluxe,3.0,Single,1.0,1,2,1,0.0,Manager,20993.0
1,200001,0,49.0,Company Invited,1,14.0,Salaried,Male,3,4.0,Deluxe,4.0,Divorced,2.0,0,3,1,2.0,Manager,20130.0
2,200002,1,37.0,Self Enquiry,1,8.0,Free Lancer,Male,3,4.0,Basic,3.0,Single,7.0,1,3,0,0.0,Executive,17090.0
3,200003,0,33.0,Company Invited,1,9.0,Salaried,Female,2,3.0,Basic,3.0,Divorced,2.0,1,5,1,1.0,Executive,17909.0
4,200004,0,,Self Enquiry,1,8.0,Small Business,Male,2,3.0,Basic,4.0,Divorced,1.0,0,5,1,0.0,Executive,18468.0


# Data Cleaning
    Handling Missing Values
        1.Handling Missing Values
        2.Handling Duplicates
        3.Check data type
        4.Understand the dataset

In [4]:
df.isnull().sum()

CustomerID                    0
ProdTaken                     0
Age                         226
TypeofContact                25
CityTier                      0
DurationOfPitch             251
Occupation                    0
Gender                        0
NumberOfPersonVisiting        0
NumberOfFollowups            45
ProductPitched                0
PreferredPropertyStar        26
MaritalStatus                 0
NumberOfTrips               140
Passport                      0
PitchSatisfactionScore        0
OwnCar                        0
NumberOfChildrenVisiting     66
Designation                   0
MonthlyIncome               233
dtype: int64

In [5]:
## Check all the categories
df['Gender'].value_counts()

Gender
Male       2916
Female     1817
Fe Male     155
Name: count, dtype: int64

In [6]:
df['MaritalStatus'].value_counts()

MaritalStatus
Married      2340
Divorced      950
Single        916
Unmarried     682
Name: count, dtype: int64

In [7]:
df['Gender']=df['Gender'].replace('Fe Male','Female')
df['MaritalStatus']=df['MaritalStatus'].replace('Single','Unmarried')

# Imputing Null Values

In [8]:
# Age
df.Age.fillna(df.Age.median(),inplace=True)

# TypeofContact
df.TypeofContact.fillna(df.TypeofContact.mode()[0],inplace=True)

#DurationofPitch
df.DurationOfPitch.fillna(df.DurationOfPitch.median(),inplace=True)

#NumberOfFollowups
df.NumberOfFollowups.fillna(df.NumberOfFollowups.mode()[0],inplace=True)

#PreferredPropertyStar
df.PreferredPropertyStar.fillna(df.PreferredPropertyStar.mode()[0],inplace=True)

#NumberOfTrips
df.NumberOfTrips.fillna(df.NumberOfTrips.median(0),inplace=True)

#NumberOfChildrenVisiting
df.NumberOfChildrenVisiting.fillna(df.NumberOfChildrenVisiting.mode()[0],inplace=True)

#MonthlyIncome
df.MonthlyIncome.fillna(df.MonthlyIncome.median(),inplace=True)





In [9]:
df.isnull().sum()

CustomerID                  0
ProdTaken                   0
Age                         0
TypeofContact               0
CityTier                    0
DurationOfPitch             0
Occupation                  0
Gender                      0
NumberOfPersonVisiting      0
NumberOfFollowups           0
ProductPitched              0
PreferredPropertyStar       0
MaritalStatus               0
NumberOfTrips               0
Passport                    0
PitchSatisfactionScore      0
OwnCar                      0
NumberOfChildrenVisiting    0
Designation                 0
MonthlyIncome               0
dtype: int64

In [10]:
df.drop('CustomerID',inplace=True,axis=1)

In [11]:
df.head()

Unnamed: 0,ProdTaken,Age,TypeofContact,CityTier,DurationOfPitch,Occupation,Gender,NumberOfPersonVisiting,NumberOfFollowups,ProductPitched,PreferredPropertyStar,MaritalStatus,NumberOfTrips,Passport,PitchSatisfactionScore,OwnCar,NumberOfChildrenVisiting,Designation,MonthlyIncome
0,1,41.0,Self Enquiry,3,6.0,Salaried,Female,3,3.0,Deluxe,3.0,Unmarried,1.0,1,2,1,0.0,Manager,20993.0
1,0,49.0,Company Invited,1,14.0,Salaried,Male,3,4.0,Deluxe,4.0,Divorced,2.0,0,3,1,2.0,Manager,20130.0
2,1,37.0,Self Enquiry,1,8.0,Free Lancer,Male,3,4.0,Basic,3.0,Unmarried,7.0,1,3,0,0.0,Executive,17090.0
3,0,33.0,Company Invited,1,9.0,Salaried,Female,2,3.0,Basic,3.0,Divorced,2.0,1,5,1,1.0,Executive,17909.0
4,0,36.0,Self Enquiry,1,8.0,Small Business,Male,2,3.0,Basic,4.0,Divorced,1.0,0,5,1,0.0,Executive,18468.0


# Feature Engineering

### Feature Extraction

In [12]:
# Merging two columns
df['TotalVisiting']=df['NumberOfChildrenVisiting']+df['NumberOfPersonVisiting']
df.drop(columns=['NumberOfPersonVisiting','NumberOfChildrenVisiting'],axis=1)

Unnamed: 0,ProdTaken,Age,TypeofContact,CityTier,DurationOfPitch,Occupation,Gender,NumberOfFollowups,ProductPitched,PreferredPropertyStar,MaritalStatus,NumberOfTrips,Passport,PitchSatisfactionScore,OwnCar,Designation,MonthlyIncome,TotalVisiting
0,1,41.0,Self Enquiry,3,6.0,Salaried,Female,3.0,Deluxe,3.0,Unmarried,1.0,1,2,1,Manager,20993.0,3.0
1,0,49.0,Company Invited,1,14.0,Salaried,Male,4.0,Deluxe,4.0,Divorced,2.0,0,3,1,Manager,20130.0,5.0
2,1,37.0,Self Enquiry,1,8.0,Free Lancer,Male,4.0,Basic,3.0,Unmarried,7.0,1,3,0,Executive,17090.0,3.0
3,0,33.0,Company Invited,1,9.0,Salaried,Female,3.0,Basic,3.0,Divorced,2.0,1,5,1,Executive,17909.0,3.0
4,0,36.0,Self Enquiry,1,8.0,Small Business,Male,3.0,Basic,4.0,Divorced,1.0,0,5,1,Executive,18468.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4883,1,49.0,Self Enquiry,3,9.0,Small Business,Male,5.0,Deluxe,4.0,Unmarried,2.0,1,1,1,Manager,26576.0,4.0
4884,1,28.0,Company Invited,1,31.0,Salaried,Male,5.0,Basic,3.0,Unmarried,3.0,1,3,1,Executive,21212.0,6.0
4885,1,52.0,Self Enquiry,3,17.0,Salaried,Female,4.0,Standard,4.0,Married,7.0,0,1,1,Senior Manager,31820.0,7.0
4886,1,19.0,Self Enquiry,3,16.0,Small Business,Male,4.0,Basic,3.0,Unmarried,3.0,0,5,0,Executive,20289.0,5.0


## Train Test Split and Model Training

In [13]:
## train test split
from sklearn.model_selection import train_test_split
X=df.drop(['ProdTaken'],axis=1)
y=df['ProdTaken']
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.20,random_state=10)

In [14]:
y.value_counts()

ProdTaken
0    3968
1     920
Name: count, dtype: int64

In [15]:
# Creating Column Transformer with 3 types of Transformer
cat_features=X.select_dtypes(include="object").columns
num_features = X.select_dtypes(exclude="object").columns
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.compose import ColumnTransformer
numeric_transformer=StandardScaler()
ohe_transformer=OneHotEncoder(drop='first')
preprocessor=ColumnTransformer([
    ("OneHotEncoder",ohe_transformer,cat_features),
    ("StandardScaler",numeric_transformer,num_features)
])

In [16]:
X_train

Unnamed: 0,Age,TypeofContact,CityTier,DurationOfPitch,Occupation,Gender,NumberOfPersonVisiting,NumberOfFollowups,ProductPitched,PreferredPropertyStar,MaritalStatus,NumberOfTrips,Passport,PitchSatisfactionScore,OwnCar,NumberOfChildrenVisiting,Designation,MonthlyIncome,TotalVisiting
2237,36.0,Self Enquiry,1,9.0,Salaried,Female,2,3.0,Deluxe,4.0,Unmarried,4.0,0,4,1,0.0,Manager,22347.0,2.0
1156,40.0,Self Enquiry,1,21.0,Salaried,Female,3,4.0,Standard,3.0,Married,2.0,0,1,1,1.0,Senior Manager,25435.0,4.0
3918,28.0,Self Enquiry,1,9.0,Small Business,Male,3,4.0,Basic,4.0,Married,2.0,0,5,1,2.0,Executive,22146.0,5.0
3497,58.0,Self Enquiry,1,7.0,Salaried,Male,4,2.0,Deluxe,3.0,Divorced,2.0,0,3,1,3.0,Manager,23578.0,7.0
3967,36.0,Company Invited,1,18.0,Small Business,Male,4,5.0,Standard,5.0,Married,4.0,1,5,1,3.0,Senior Manager,28562.0,7.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1180,36.0,Self Enquiry,1,13.0,Salaried,Male,3,3.0,Basic,4.0,Unmarried,3.0,0,1,0,2.0,Executive,17013.0,5.0
3441,28.0,Self Enquiry,1,24.0,Salaried,Female,3,4.0,Basic,3.0,Unmarried,3.0,1,5,1,1.0,Executive,21072.0,4.0
1344,37.0,Self Enquiry,1,13.0,Small Business,Male,3,3.0,Deluxe,5.0,Married,6.0,1,3,0,0.0,Manager,22347.0,3.0
4623,32.0,Company Invited,1,16.0,Small Business,Male,3,5.0,Basic,5.0,Unmarried,2.0,1,3,0,2.0,Executive,20999.0,5.0


In [17]:
X_train=preprocessor.fit_transform(X_train)

In [18]:
X_test=preprocessor.transform(X_test)

## Random Forest Classifier

In [19]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix,precision_score,recall_score,f1_score,roc_auc_score

In [20]:
models={
    "Decisiontree":DecisionTreeClassifier(),
    "Random Forest":RandomForestClassifier()}
for i in range(len(list(models))):
    model=list(models.values())[i]
    model.fit(X_train,y_train) ## Training Model

    ## Making Prediction
    y_train_pred=model.predict(X_train)
    y_test_pred=model.predict(X_test)

    ## Training set performance
    model_train_accuracy=accuracy_score(y_train_pred,y_train)
    model_train_f1score=f1_score(y_train_pred,y_train,average='weighted')
    model_train_cr=classification_report(y_train_pred,y_train)
    model_train_recall=recall_score(y_train_pred,y_train)

    
    # ## Testing set performance
    model_test_accuracy=accuracy_score(y_test_pred,y_test)
    model_test_f1score=f1_score(y_test_pred,y_test,average='weighted')
    model_test_cr=classification_report(y_test_pred,y_test)
    model_test_recall=recall_score(y_test_pred,y_test)

    print(list(models.keys())[i])
    print("Model performance for training set")
    print('Accuracy:{:.4f}'.format(model_train_accuracy))
    print('f1score:{:.4f}'.format(model_train_f1score))
    print('recallscore:{:.4f}'.format(model_train_recall))
    
    print('************************')

    print("Model performance for test set")
    print('Accuracy:{:.4f}'.format(model_test_accuracy))
    print('f1score:{:.4f}'.format(model_test_f1score))
    print('recallscore:{:.4f}'.format(model_test_recall))
    #print('='*35)
    print('\n')

Decisiontree
Model performance for training set
Accuracy:1.0000
f1score:1.0000
recallscore:1.0000
************************
Model performance for test set
Accuracy:0.9213
f1score:0.9189
recallscore:0.7234


Random Forest
Model performance for training set
Accuracy:1.0000
f1score:1.0000
recallscore:1.0000
************************
Model performance for test set
Accuracy:0.9264
f1score:0.9329
recallscore:0.9238




In [21]:
## Hyperparameter Training
rf_params = {"max_depth" : [5,8,15,None],
             "max_features":[5,7,"auto"],
             "min_samples_split":[2,8,15,20],
             "n_estimators":[100,200]
             }

In [22]:
## models list for hyperparameter tuning
randomcv_models=[
    ("RF",RandomForestClassifier(),rf_params)
]

In [23]:
# from sklearn.model_selection import RandomizedSearchCV
# model_param=