In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
df = pd.read_csv("Travel.csv")

In [4]:
df.columns

Index(['CustomerID', 'ProdTaken', 'Age', 'TypeofContact', 'CityTier',
       'DurationOfPitch', 'Occupation', 'Gender', 'NumberOfPersonVisiting',
       'NumberOfFollowups', 'ProductPitched', 'PreferredPropertyStar',
       'MaritalStatus', 'NumberOfTrips', 'Passport', 'PitchSatisfactionScore',
       'OwnCar', 'NumberOfChildrenVisiting', 'Designation', 'MonthlyIncome'],
      dtype='object')

In [5]:
df.isnull().sum()

CustomerID                    0
ProdTaken                     0
Age                         226
TypeofContact                25
CityTier                      0
DurationOfPitch             251
Occupation                    0
Gender                        0
NumberOfPersonVisiting        0
NumberOfFollowups            45
ProductPitched                0
PreferredPropertyStar        26
MaritalStatus                 0
NumberOfTrips               140
Passport                      0
PitchSatisfactionScore        0
OwnCar                        0
NumberOfChildrenVisiting     66
Designation                   0
MonthlyIncome               233
dtype: int64

In [7]:
df["Age"] = df["Age"].fillna(df["Age"].median())

In [8]:
df.isnull().sum()

CustomerID                    0
ProdTaken                     0
Age                           0
TypeofContact                25
CityTier                      0
DurationOfPitch             251
Occupation                    0
Gender                        0
NumberOfPersonVisiting        0
NumberOfFollowups            45
ProductPitched                0
PreferredPropertyStar        26
MaritalStatus                 0
NumberOfTrips               140
Passport                      0
PitchSatisfactionScore        0
OwnCar                        0
NumberOfChildrenVisiting     66
Designation                   0
MonthlyIncome               233
dtype: int64

In [9]:
df["TypeofContact"].unique()

array(['Self Enquiry', 'Company Invited', nan], dtype=object)

In [9]:
df["TypeofContact"].value_counts()

TypeofContact
Self Enquiry       3444
Company Invited    1419
Name: count, dtype: int64

In [26]:
df["TypeofContact"].mode()[0]

'Self Enquiry'

In [10]:
df["TypeofContact"] = df["TypeofContact"].fillna(df["TypeofContact"].mode()[0])

In [11]:
df.isnull().sum()

CustomerID                    0
ProdTaken                     0
Age                           0
TypeofContact                 0
CityTier                      0
DurationOfPitch             251
Occupation                    0
Gender                        0
NumberOfPersonVisiting        0
NumberOfFollowups            45
ProductPitched                0
PreferredPropertyStar        26
MaritalStatus                 0
NumberOfTrips               140
Passport                      0
PitchSatisfactionScore        0
OwnCar                        0
NumberOfChildrenVisiting     66
Designation                   0
MonthlyIncome               233
dtype: int64

In [12]:
df["MonthlyIncome"].isnull().mean()*100

4.766775777414075

In [13]:
df["MonthlyIncome"].fillna(df["MonthlyIncome"].median(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["MonthlyIncome"].fillna(df["MonthlyIncome"].median(), inplace=True)


In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4888 entries, 0 to 4887
Data columns (total 20 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   CustomerID                4888 non-null   int64  
 1   ProdTaken                 4888 non-null   int64  
 2   Age                       4888 non-null   float64
 3   TypeofContact             4888 non-null   object 
 4   CityTier                  4888 non-null   int64  
 5   DurationOfPitch           4637 non-null   float64
 6   Occupation                4888 non-null   object 
 7   Gender                    4888 non-null   object 
 8   NumberOfPersonVisiting    4888 non-null   int64  
 9   NumberOfFollowups         4843 non-null   float64
 10  ProductPitched            4888 non-null   object 
 11  PreferredPropertyStar     4862 non-null   float64
 12  MaritalStatus             4888 non-null   object 
 13  NumberOfTrips             4748 non-null   float64
 14  Passport

In [14]:
df["NumberOfTotalVisiting"] = df["NumberOfChildrenVisiting"] + df["NumberOfPersonVisiting"]

In [15]:
df.drop(columns=["NumberOfChildrenVisiting", "NumberOfPersonVisiting"], inplace=True)

In [16]:
df.head()

Unnamed: 0,CustomerID,ProdTaken,Age,TypeofContact,CityTier,DurationOfPitch,Occupation,Gender,NumberOfFollowups,ProductPitched,PreferredPropertyStar,MaritalStatus,NumberOfTrips,Passport,PitchSatisfactionScore,OwnCar,Designation,MonthlyIncome,NumberOfTotalVisiting
0,200000,1,41.0,Self Enquiry,3,6.0,Salaried,Female,3.0,Deluxe,3.0,Single,1.0,1,2,1,Manager,20993.0,3.0
1,200001,0,49.0,Company Invited,1,14.0,Salaried,Male,4.0,Deluxe,4.0,Divorced,2.0,0,3,1,Manager,20130.0,5.0
2,200002,1,37.0,Self Enquiry,1,8.0,Free Lancer,Male,4.0,Basic,3.0,Single,7.0,1,3,0,Executive,17090.0,3.0
3,200003,0,33.0,Company Invited,1,9.0,Salaried,Female,3.0,Basic,3.0,Divorced,2.0,1,5,1,Executive,17909.0,3.0
4,200004,0,36.0,Self Enquiry,1,8.0,Small Business,Male,3.0,Basic,4.0,Divorced,1.0,0,5,1,Executive,18468.0,2.0


In [17]:
df.isnull().sum()

CustomerID                  0
ProdTaken                   0
Age                         0
TypeofContact               0
CityTier                    0
DurationOfPitch           251
Occupation                  0
Gender                      0
NumberOfFollowups          45
ProductPitched              0
PreferredPropertyStar      26
MaritalStatus               0
NumberOfTrips             140
Passport                    0
PitchSatisfactionScore      0
OwnCar                      0
Designation                 0
MonthlyIncome               0
NumberOfTotalVisiting      66
dtype: int64

In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4888 entries, 0 to 4887
Data columns (total 20 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   CustomerID              4888 non-null   int64  
 1   ProdTaken               4888 non-null   int64  
 2   Age                     4888 non-null   float64
 3   TypeofContact           4888 non-null   object 
 4   CityTier                4888 non-null   int64  
 5   DurationOfPitch         4637 non-null   float64
 6   Occupation              4888 non-null   object 
 7   Gender                  4888 non-null   object 
 8   NumberOfPersonVisiting  4888 non-null   int64  
 9   NumberOfFollowups       4843 non-null   float64
 10  ProductPitched          4888 non-null   object 
 11  PreferredPropertyStar   4862 non-null   float64
 12  MaritalStatus           4888 non-null   object 
 13  NumberOfTrips           4748 non-null   float64
 14  Passport                4888 non-null   

In [18]:
df["NumberOfTotalVisiting"].fillna(df["NumberOfTotalVisiting"].median(), inplace= True)
df["NumberOfTrips"].fillna(df["NumberOfTrips"].median(), inplace= True)
df["PreferredPropertyStar"].fillna(df["PreferredPropertyStar"].median(), inplace= True)
df["DurationOfPitch"].fillna(df["DurationOfPitch"].median(), inplace= True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["NumberOfTotalVisiting"].fillna(df["NumberOfTotalVisiting"].median(), inplace= True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["NumberOfTrips"].fillna(df["NumberOfTrips"].median(), inplace= True)
The behavior will change in pandas 3.0. This inplace method will never w

In [19]:
df.head()

Unnamed: 0,CustomerID,ProdTaken,Age,TypeofContact,CityTier,DurationOfPitch,Occupation,Gender,NumberOfFollowups,ProductPitched,PreferredPropertyStar,MaritalStatus,NumberOfTrips,Passport,PitchSatisfactionScore,OwnCar,Designation,MonthlyIncome,NumberOfTotalVisiting
0,200000,1,41.0,Self Enquiry,3,6.0,Salaried,Female,3.0,Deluxe,3.0,Single,1.0,1,2,1,Manager,20993.0,3.0
1,200001,0,49.0,Company Invited,1,14.0,Salaried,Male,4.0,Deluxe,4.0,Divorced,2.0,0,3,1,Manager,20130.0,5.0
2,200002,1,37.0,Self Enquiry,1,8.0,Free Lancer,Male,4.0,Basic,3.0,Single,7.0,1,3,0,Executive,17090.0,3.0
3,200003,0,33.0,Company Invited,1,9.0,Salaried,Female,3.0,Basic,3.0,Divorced,2.0,1,5,1,Executive,17909.0,3.0
4,200004,0,36.0,Self Enquiry,1,8.0,Small Business,Male,3.0,Basic,4.0,Divorced,1.0,0,5,1,Executive,18468.0,2.0


In [20]:
df.drop(columns=["NumberOfPersonVisiting"], inplace=True)

KeyError: "['NumberOfPersonVisiting'] not found in axis"

In [None]:
df.head()

In [None]:
df["NumberOfTotalVisiting"] = df["NumberOfTotalVisiting"].astype(int)

In [None]:
df.head()

In [None]:
df.isnull().sum()

In [None]:
df["NumberOfFollowups"].fillna(df["NumberOfFollowups"].median(), inplace=True)

In [21]:
df.head()

Unnamed: 0,CustomerID,ProdTaken,Age,TypeofContact,CityTier,DurationOfPitch,Occupation,Gender,NumberOfFollowups,ProductPitched,PreferredPropertyStar,MaritalStatus,NumberOfTrips,Passport,PitchSatisfactionScore,OwnCar,Designation,MonthlyIncome,NumberOfTotalVisiting
0,200000,1,41.0,Self Enquiry,3,6.0,Salaried,Female,3.0,Deluxe,3.0,Single,1.0,1,2,1,Manager,20993.0,3.0
1,200001,0,49.0,Company Invited,1,14.0,Salaried,Male,4.0,Deluxe,4.0,Divorced,2.0,0,3,1,Manager,20130.0,5.0
2,200002,1,37.0,Self Enquiry,1,8.0,Free Lancer,Male,4.0,Basic,3.0,Single,7.0,1,3,0,Executive,17090.0,3.0
3,200003,0,33.0,Company Invited,1,9.0,Salaried,Female,3.0,Basic,3.0,Divorced,2.0,1,5,1,Executive,17909.0,3.0
4,200004,0,36.0,Self Enquiry,1,8.0,Small Business,Male,3.0,Basic,4.0,Divorced,1.0,0,5,1,Executive,18468.0,2.0


In [22]:
df.isnull().sum()

CustomerID                 0
ProdTaken                  0
Age                        0
TypeofContact              0
CityTier                   0
DurationOfPitch            0
Occupation                 0
Gender                     0
NumberOfFollowups         45
ProductPitched             0
PreferredPropertyStar      0
MaritalStatus              0
NumberOfTrips              0
Passport                   0
PitchSatisfactionScore     0
OwnCar                     0
Designation                0
MonthlyIncome              0
NumberOfTotalVisiting      0
dtype: int64

In [23]:
df["Gender"].value_counts()

Gender
Male       2916
Female     1817
Fe Male     155
Name: count, dtype: int64

In [24]:
df["Gender"] = df["Gender"].replace({"Fe Male": "Female"})

In [25]:
df["MaritalStatus"].value_counts()

MaritalStatus
Married      2340
Divorced      950
Single        916
Unmarried     682
Name: count, dtype: int64

In [26]:
df["MaritalStatus"] = df["MaritalStatus"].replace({"Unmarried": "Single"})

In [27]:
df["MaritalStatus"].value_counts()

MaritalStatus
Married     2340
Single      1598
Divorced     950
Name: count, dtype: int64

In [28]:
df["MonthlyIncome"].unique()

array([20993., 20130., 17090., ..., 22097., 22995., 21471.])

In [29]:
df["Occupation"].value_counts()

Occupation
Salaried          2368
Small Business    2084
Large Business     434
Free Lancer          2
Name: count, dtype: int64

In [30]:
df.describe()

Unnamed: 0,CustomerID,ProdTaken,Age,CityTier,DurationOfPitch,NumberOfFollowups,PreferredPropertyStar,NumberOfTrips,Passport,PitchSatisfactionScore,OwnCar,MonthlyIncome,NumberOfTotalVisiting
count,4888.0,4888.0,4888.0,4888.0,4888.0,4843.0,4888.0,4888.0,4888.0,4888.0,4888.0,4888.0,4888.0
mean,202443.5,0.188216,37.547259,1.654255,15.36293,3.708445,3.577946,3.229746,0.290917,3.078151,0.620295,23559.179419,4.088789
std,1411.188388,0.390925,9.104795,0.916583,8.316166,1.002509,0.797005,1.822769,0.454232,1.365792,0.485363,5257.862921,1.412727
min,200000.0,0.0,18.0,1.0,5.0,1.0,3.0,1.0,0.0,1.0,0.0,1000.0,1.0
25%,201221.75,0.0,31.0,1.0,9.0,3.0,3.0,2.0,0.0,2.0,0.0,20485.0,3.0
50%,202443.5,0.0,36.0,1.0,13.0,4.0,3.0,3.0,0.0,3.0,1.0,22347.0,4.0
75%,203665.25,0.0,43.0,3.0,19.0,4.0,4.0,4.0,1.0,4.0,1.0,25424.75,5.0
max,204887.0,1.0,61.0,3.0,127.0,6.0,5.0,22.0,1.0,5.0,1.0,98678.0,7.0


In [31]:
df.drop(columns="CustomerID", inplace=True)

In [32]:
df.head()

Unnamed: 0,ProdTaken,Age,TypeofContact,CityTier,DurationOfPitch,Occupation,Gender,NumberOfFollowups,ProductPitched,PreferredPropertyStar,MaritalStatus,NumberOfTrips,Passport,PitchSatisfactionScore,OwnCar,Designation,MonthlyIncome,NumberOfTotalVisiting
0,1,41.0,Self Enquiry,3,6.0,Salaried,Female,3.0,Deluxe,3.0,Single,1.0,1,2,1,Manager,20993.0,3.0
1,0,49.0,Company Invited,1,14.0,Salaried,Male,4.0,Deluxe,4.0,Divorced,2.0,0,3,1,Manager,20130.0,5.0
2,1,37.0,Self Enquiry,1,8.0,Free Lancer,Male,4.0,Basic,3.0,Single,7.0,1,3,0,Executive,17090.0,3.0
3,0,33.0,Company Invited,1,9.0,Salaried,Female,3.0,Basic,3.0,Divorced,2.0,1,5,1,Executive,17909.0,3.0
4,0,36.0,Self Enquiry,1,8.0,Small Business,Male,3.0,Basic,4.0,Divorced,1.0,0,5,1,Executive,18468.0,2.0


In [33]:
from sklearn.model_selection import train_test_split

In [34]:
X = df.drop(columns="ProdTaken")
Y = df["ProdTaken"]

In [35]:
X

Unnamed: 0,Age,TypeofContact,CityTier,DurationOfPitch,Occupation,Gender,NumberOfFollowups,ProductPitched,PreferredPropertyStar,MaritalStatus,NumberOfTrips,Passport,PitchSatisfactionScore,OwnCar,Designation,MonthlyIncome,NumberOfTotalVisiting
0,41.0,Self Enquiry,3,6.0,Salaried,Female,3.0,Deluxe,3.0,Single,1.0,1,2,1,Manager,20993.0,3.0
1,49.0,Company Invited,1,14.0,Salaried,Male,4.0,Deluxe,4.0,Divorced,2.0,0,3,1,Manager,20130.0,5.0
2,37.0,Self Enquiry,1,8.0,Free Lancer,Male,4.0,Basic,3.0,Single,7.0,1,3,0,Executive,17090.0,3.0
3,33.0,Company Invited,1,9.0,Salaried,Female,3.0,Basic,3.0,Divorced,2.0,1,5,1,Executive,17909.0,3.0
4,36.0,Self Enquiry,1,8.0,Small Business,Male,3.0,Basic,4.0,Divorced,1.0,0,5,1,Executive,18468.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4883,49.0,Self Enquiry,3,9.0,Small Business,Male,5.0,Deluxe,4.0,Single,2.0,1,1,1,Manager,26576.0,4.0
4884,28.0,Company Invited,1,31.0,Salaried,Male,5.0,Basic,3.0,Single,3.0,1,3,1,Executive,21212.0,6.0
4885,52.0,Self Enquiry,3,17.0,Salaried,Female,4.0,Standard,4.0,Married,7.0,0,1,1,Senior Manager,31820.0,7.0
4886,19.0,Self Enquiry,3,16.0,Small Business,Male,4.0,Basic,3.0,Single,3.0,0,5,0,Executive,20289.0,5.0


In [36]:
Y.value_counts()

ProdTaken
0    3968
1     920
Name: count, dtype: int64

In [37]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=42)

In [38]:
X_train.shape

(3421, 17)

In [39]:
X.shape

(4888, 17)

In [40]:
X_test.shape

(1467, 17)

In [69]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4888 entries, 0 to 4887
Data columns (total 18 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   ProdTaken               4888 non-null   int64  
 1   Age                     4888 non-null   float64
 2   TypeofContact           4888 non-null   object 
 3   CityTier                4888 non-null   int64  
 4   DurationOfPitch         4888 non-null   float64
 5   Occupation              4888 non-null   object 
 6   Gender                  4888 non-null   object 
 7   NumberOfFollowups       4888 non-null   float64
 8   ProductPitched          4888 non-null   object 
 9   PreferredPropertyStar   4888 non-null   float64
 10  MaritalStatus           4888 non-null   object 
 11  NumberOfTrips           4888 non-null   float64
 12  Passport                4888 non-null   int64  
 13  PitchSatisfactionScore  4888 non-null   int64  
 14  OwnCar                  4888 non-null   

In [41]:
cat_features = X.select_dtypes(include="object").columns

In [42]:
num_features = X.select_dtypes(exclude="object").columns

In [43]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer


scaler = StandardScaler()
encoder = OneHotEncoder(drop="first")

preprocessor = ColumnTransformer(
    [
        ("OneHotEncoder", encoder, cat_features),
        ("StandardScaler",scaler, num_features)
    ]
)


In [44]:
preprocessor

In [45]:
X_train = preprocessor.fit_transform(X_train)

In [75]:
X_test = preprocessor.transform(X_test)

ValueError: X has 26 features, but ColumnTransformer is expecting 17 features as input.

In [46]:
pd.DataFrame(X_train)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16,17,18,19,20,21,22,23,24,25
0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,-0.714031,-0.649789,0.281921,-0.727208,-1.217370,-0.632714,1.409081,0.781435,0.460318,-0.067144
1,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,-0.714031,-0.527737,-0.714787,1.768057,1.504605,-0.632714,1.409081,-1.279697,-1.005668,-0.778290
2,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,-0.714031,-0.161582,1.278628,1.768057,-0.672975,1.580492,-0.046813,0.781435,-1.209722,-0.067144
3,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,...,1.470853,-0.771841,0.281921,0.520424,-0.128580,1.580492,-0.046813,0.781435,-0.017750,0.644002
4,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.714031,-0.527737,1.278628,0.520424,2.593396,-0.632714,0.681134,-1.279697,-0.423411,-0.067144
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3416,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,-0.714031,-0.649789,1.278628,-0.727208,-0.672975,-0.632714,-1.502707,0.781435,-0.531926,0.644002
3417,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,1.470853,-0.893893,-0.714787,1.768057,-1.217370,-0.632714,1.409081,0.781435,1.512744,-0.067144
3418,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.470853,1.547143,0.281921,-0.727208,2.049001,-0.632714,-0.774760,0.781435,-0.357400,0.644002
3419,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,...,1.470853,1.791246,1.278628,-0.727208,-0.128580,-0.632714,-1.502707,0.781435,-0.250765,0.644002


In [47]:
X_test.shape

(1467, 17)

## training with random forest algorithm

In [48]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, f1_score, precision_score, recall_score, roc_auc_score


models = {
    "rf" : RandomForestClassifier(),
    "dt" : DecisionTreeClassifier(),
    "linear regression " : LogisticRegression()
    
}

for i in range(len(models)):
    model = list(models.values())[i]
    model.fit(X_train, Y_train)
    
    y_pred_test = model.predict(X_test)
    y_pred_train = model.predict(X_train)
    
    
    print(f"for {list(models.keys())[i]} model")
    
#     accuracy_score for train
    print("accuracy for train")
    print("accuracy : ",accuracy_score(Y_train, y_pred_train))
    print("f1 score : ", f1_score(Y_train, y_pred_train))
    print("precision score :",precision_score(Y_train, y_pred_train))
    print("recall score :",recall_score(Y_train, y_pred_train))
    print("roc auc score",roc_auc_score(Y_train, y_pred_train))
    
    
    print()
    print("-----------")
    
#     accuracy_score for test
    print("accuracy for test")
    print("accuracy : ",accuracy_score(Y_test, y_pred_test))
    print("f1 score :",f1_score(Y_test, y_pred_test))
    print("precision score :",precision_score(Y_test, y_pred_test))
    print("recall score :", recall_score(Y_test, y_pred_test))
    print("roc auc score",roc_auc_score(Y_test, y_pred_test))
    print()



ValueError: could not convert string to float: 'Company Invited'

## hyperparameter tuning

In [95]:
params = {
    "n_estimators" : [100, 400, 800, 1000],
    "criterion" : ["gini", "entropy", "log_loss"],
    "max_depth": [None, 5, 10, 15, 20],
    "min_samples_split" : [2,8, 15,20],
    "max_features" : ["sqrt", "log2", None]
}

In [100]:
randomcv_models = [
    ("RF", RandomForestClassifier(), params),
]

In [101]:
randomcv_models

[('RF',
  RandomForestClassifier(),
  {'n_estimators': [100, 400, 800, 1000],
   'criterion': ['gini', 'entropy', 'log_loss'],
   'max_depth': [None, 5, 10, 15, 20],
   'min_samples_split': [2, 8, 15, 20],
   'max_features': ['sqrt', 'log2', None]})]

In [102]:
from sklearn.model_selection import RandomizedSearchCV

model_params = {}

for name, model, params in randomcv_models:
    random = RandomizedSearchCV(estimator=model, param_distributions=params,n_jobs=-1)
    
    random.fit(X_train, Y_train)
    model_params[name] = random.best_params_
    
for model_name in model_params:
    print(f"------best param for {model_name} ------")
    print(model_params[model_name])
    
    

Fitting 3 folds for each of 100 candidates, totalling 300 fits
------best param for RF ------
{'n_estimators': 400, 'min_samples_split': 2, 'max_features': None, 'max_depth': None, 'criterion': 'gini'}


In [110]:
models = {
    "RF_with_hyperparameter" : RandomForestClassifier(n_estimators=400, min_samples_split=2, max_features=None, max_depth=None, criterion='gini'),
    "RF" : RandomForestClassifier(n_estimators=400, min_samples_split=2, max_features=None, max_depth=None, criterion='entropy')
}

for name, model in models.items():
    estimator = model
    estimator.fit(X_train, Y_train)
    
    y_pred_test = estimator.predict(X_test)
    y_pred_train = estimator.predict(X_train)
    
    
    #accuracy_score for train
    print("accuracy for train")
    print("accuracy : ",accuracy_score(Y_train, y_pred_train))
    print("f1 score : ", f1_score(Y_train, y_pred_train))
    print("precision score :",precision_score(Y_train, y_pred_train))
    print("recall score :",recall_score(Y_train, y_pred_train))
    print("roc auc score",roc_auc_score(Y_train, y_pred_train))
    
    
    print()
    print("-----------")
    
    # accuracy_score for test
    print("accuracy for test")
    print("accuracy : ",accuracy_score(Y_test, y_pred_test))
    print("f1 score :",f1_score(Y_test, y_pred_test))
    print("precision score :",precision_score(Y_test, y_pred_test))
    print("recall score :", recall_score(Y_test, y_pred_test))
    print("roc auc score",roc_auc_score(Y_test, y_pred_test))
    print()
    


accuracy for train
accuracy :  1.0
f1 score :  1.0
precision score : 1.0
recall score : 1.0
roc auc score 1.0

-----------
accuracy for test
accuracy :  0.9366053169734151
f1 score : 0.8082474226804124
precision score : 0.9289099526066351
recall score : 0.7153284671532847
roc auc score 0.8513775613218226

accuracy for train
accuracy :  1.0
f1 score :  1.0
precision score : 1.0
recall score : 1.0
roc auc score 1.0

-----------
accuracy for test
accuracy :  0.934560327198364
f1 score : 0.8016528925619835
precision score : 0.9238095238095239
recall score : 0.708029197080292
roc auc score 0.8473088148016715



## plot roc auc curve

In [None]:
from sklearn.metrics import roc_auc_score, roc_curve
plt.figure()


auc_models = [
    {
        'label' : "Random forest classifier",
        'model' : RandomForestClassifier(n_estimators=400, min_samples_split=2, max_features=None, max_depth=None, criterion='gini'),
        'auc' : 0.8473088148016715
    }
]


for algo in auc_models:
    model = algo['model']
    
    
    model.fit(X_train, Y_train)
    
    fpr, tpr, thresholds = roc_curve(y_test, model.predict_proba(X_test)[:, ])
