### Random Forest Classifier implementation with Pipelines and Hyperparameter tuning

In [1]:
import seaborn as sns

In [2]:
# Load the 'tips' example dataset through seaborn and preview the first rows.
df=sns.load_dataset('tips')
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [None]:
# In this problem we take `time` as the dependent feature (output) and all remaining columns as independent features (inputs).

In [4]:
df['time'].unique()  # only two distinct values, so this is a binary classification problem

['Dinner', 'Lunch']
Categories (2, object): ['Lunch', 'Dinner']

In [None]:
# What generic activities do we perform during EDA and feature engineering?
# We want to automate the steps below:
# handling missing values
# handling outliers
# handling categorical features
# feature scaling

EDA cannot be automated, because it depends on evaluating the data at hand. We can automate feature engineering, model training, model evaluation, etc.

In [5]:
# Preview the raw rows again before starting feature engineering.
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [6]:
# Inspect column dtypes and non-null counts to spot the categorical columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   total_bill  244 non-null    float64 
 1   tip         244 non-null    float64 
 2   sex         244 non-null    category
 3   smoker      244 non-null    category
 4   day         244 non-null    category
 5   time        244 non-null    category
 6   size        244 non-null    int64   
dtypes: category(4), float64(2), int64(1)
memory usage: 7.4 KB


In [None]:
# We have 4 categorical columns. Convert the dependent feature (`time`) into 0 and 1.

In [7]:
from sklearn.preprocessing import LabelEncoder  # encodes the target labels as integers

In [11]:
# Replace the target labels with integer codes.
# On this data 'Dinner' is encoded as 0 and 'Lunch' as 1.
encoder = LabelEncoder()
encoder.fit(df['time'])
df['time'] = encoder.transform(df['time'])

In [12]:
# Confirm that `time` now holds integer codes instead of text labels.
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,0,2
1,10.34,1.66,Male,No,Sun,0,3
2,21.01,3.5,Male,No,Sun,0,3
3,23.68,3.31,Male,No,Sun,0,2
4,24.59,3.61,Female,No,Sun,0,4


In [13]:
# Class balance of the encoded target (the two classes are not equally frequent).
df['time'].value_counts()

0    176
1     68
Name: time, dtype: int64

In [14]:
# Separate the target vector (y) from the feature matrix (X).
y = df['time']
X = df.drop(columns=['time'])

In [17]:
# Features only: the `time` column has been dropped.
X.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,size
0,16.99,1.01,Female,No,Sun,2
1,10.34,1.66,Male,No,Sun,3
2,21.01,3.5,Male,No,Sun,3
3,23.68,3.31,Male,No,Sun,2
4,24.59,3.61,Female,No,Sun,4


In [18]:
# perform train_test_split
from sklearn.model_selection import train_test_split

In [19]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.20,random_state=42)

In [20]:
# The features include 3 categorical columns (sex, smoker, day). They are
# treated as nominal here (no rank is assigned), so one-hot encoding is used.
# Ordinal (ranked) categories would call for ordinal encoding instead.
X_train.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,size
228,13.28,2.72,Male,No,Sat,2
208,24.27,2.03,Male,Yes,Sat,2
96,27.28,4.0,Male,Yes,Fri,2
167,31.71,4.5,Male,No,Sun,4
84,15.98,2.03,Male,No,Thur,2


In [21]:
from sklearn.impute import SimpleImputer # for handling missing values
from sklearn.preprocessing import OneHotEncoder # for handling categorical features
# we will not handle outliers in this project
from sklearn.preprocessing import StandardScaler # for feature scaling
from sklearn.pipeline import Pipeline # for chaining the preprocessing steps
from sklearn.compose import ColumnTransformer # applies each pipeline to its own group of columns

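Feature scaling is not required for a random forest: tree splits depend only on the ordering of a feature's values, not on their scale. For scale-sensitive algorithms such as logistic regression, scaling is needed, so the pipeline includes it.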


In [22]:
# Look at the columns once more to sort them into numerical and categorical groups.
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,0,2
1,10.34,1.66,Male,No,Sun,0,3
2,21.01,3.5,Male,No,Sun,0,3
3,23.68,3.31,Male,No,Sun,0,2
4,24.59,3.61,Female,No,Sun,0,4


In [23]:
# Column groups that are routed to the numerical and categorical pipelines.
numerical_columns = ["total_bill", "tip", "size"]
categorical_columns = ["sex", "smoker", "day"]

Once the model is deployed, new data keeps arriving, so training is not a one-time process: the model has to be retrained regularly (for example, every month) on the newer data.
In the numerical pipeline, we first handle missing values and then scale the features.

In [28]:
# Feature-engineering automation.
# Each pipeline bundles the preprocessing for one group of columns. Both are
# needed, so they are meant to be combined by a wrapper that routes every
# pipeline to its own columns.

# Numerical columns: fill missing values with the column median, then standardize.
num_pipelines=Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')), # handling missing values
        ('scaler', StandardScaler())   # feature scaling
    ]
)

# Categorical columns: fill missing values with the most frequent value, then
# one-hot encode. handle_unknown='ignore' makes transform() emit all zeros in
# that feature's one-hot columns for a category that was not present at fit
# time, instead of raising, so newly arriving data with an unseen level does
# not break prediction.
cat_pipelines=Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')), # handling missing values
        ('onehotencoder', OneHotEncoder(handle_unknown='ignore'))  # one-hot encoding
    ]
)

In [30]:
# Route each group of columns through its own pipeline and concatenate the results.
preprocessor=ColumnTransformer([
         ('num_pipelines',num_pipelines,numerical_columns),  # step name, pipeline object, numerical columns
         ('col_pipelines',cat_pipelines,categorical_columns),  # step name, pipeline object, categorical columns
])

In [33]:
# Learn the imputation / scaling / encoding parameters from the training split
# only, then apply the already-fitted preprocessor to the test split.
# NOTE(review): both names are overwritten with the transformed output, so
# re-running this cell without first re-running the train/test split feeds
# already-transformed data back into the preprocessor.
X_train=preprocessor.fit_transform(X_train)
X_test=preprocessor.transform(X_test)

In [51]:
# apply our model
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

In [52]:
## Automate the model-training process.
# Maps a display name to the estimator instance to be trained and compared.
# The tree-based estimators are randomized, so they are seeded (with the same
# value as the train/test split) to make the comparison repeatable.
models={
    'Random Forest':RandomForestClassifier(random_state=42),
    'Decision Tree':DecisionTreeClassifier(random_state=42),
    'SVC':SVC()
}

In [36]:
from sklearn.metrics import accuracy_score

In [45]:
def evaluate_model(X_train,X_test,y_train,y_test,models):
    """Fit every model on the training split and score it on the test split.

    Parameters
    ----------
    X_train, y_train
        Features and labels passed to each model's ``fit``.
    X_test, y_test
        Features passed to each model's ``predict`` and the labels the
        predictions are compared against.
    models : dict
        Maps a display name to an estimator object exposing ``fit`` and
        ``predict``. The estimators are fitted in place.

    Returns
    -------
    dict
        ``{name: accuracy}`` holding the test-set accuracy of each model, in
        the same order as ``models``.
    """
    report={}
    # Walk the name/estimator pairs directly instead of rebuilding
    # list(models.keys()) and list(models.values()) on every iteration.
    for name,model in models.items():
        # train on the training split
        model.fit(X_train,y_train)
        # predict the held-out split
        y_pred=model.predict(X_test)
        # accuracy of the test-set predictions
        report[name]=accuracy_score(y_test,y_pred)
    return report

In [53]:
# Compare the candidate models by their accuracy on the held-out test split.
evaluate_model(X_train,X_test,y_train,y_test,models)

{'Random Forest': 0.9591836734693877,
 'Decision Tree': 0.9387755102040817,
 'SVC': 0.9591836734693877}

In [54]:
# Random forest is among the best performers in the comparison above, so we
# tune its hyperparameters next. The estimator is seeded so that repeated
# runs of the search fit identical forests.
classifier=RandomForestClassifier(random_state=42)

In [55]:
# Search space for the random forest: depth limit, number of trees and split criterion.
parameters = dict(
    max_depth=[3, 5, 10, None],
    n_estimators=[100, 200, 300],
    criterion=['gini', 'entropy'],
)

In [56]:
from sklearn.model_selection import RandomizedSearchCV

In [58]:
# Randomized search with 5-fold cross-validation, scored by accuracy.
# Only a random subset of the parameter combinations is tried (10 candidates
# by default), so random_state is fixed to make the sampled candidates, and
# therefore best_params_, repeatable across runs.
cv=RandomizedSearchCV(classifier,param_distributions=parameters,cv=5,scoring='accuracy',verbose=3,random_state=42)
cv.fit(X_train,y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5] END criterion=gini, max_depth=3, n_estimators=200;, score=0.974 total time=   0.4s
[CV 2/5] END criterion=gini, max_depth=3, n_estimators=200;, score=0.949 total time=   0.4s
[CV 3/5] END criterion=gini, max_depth=3, n_estimators=200;, score=0.974 total time=   0.4s
[CV 4/5] END criterion=gini, max_depth=3, n_estimators=200;, score=0.923 total time=   0.4s
[CV 5/5] END criterion=gini, max_depth=3, n_estimators=200;, score=0.923 total time=   0.4s
[CV 1/5] END criterion=entropy, max_depth=5, n_estimators=100;, score=0.974 total time=   0.2s
[CV 2/5] END criterion=entropy, max_depth=5, n_estimators=100;, score=0.923 total time=   0.2s
[CV 3/5] END criterion=entropy, max_depth=5, n_estimators=100;, score=0.974 total time=   0.2s
[CV 4/5] END criterion=entropy, max_depth=5, n_estimators=100;, score=0.923 total time=   0.2s
[CV 5/5] END criterion=entropy, max_depth=5, n_estimators=100;, score=0.923 total time=   0.2s
[CV 

In [59]:
# Hyperparameter combination with the best mean cross-validated accuracy.
cv.best_params_

{'n_estimators': 300, 'max_depth': 10, 'criterion': 'entropy'}

In [60]:
# Build the final model straight from the search result rather than copying
# the values by hand, so it cannot drift from what the search found on a re-run.
classifier=RandomForestClassifier(**cv.best_params_)

In [None]:
# Train the tuned forest on the preprocessed training split.
classifier.fit(X_train,y_train)