In [58]:
# Import necessary libraries
import pandas as pd
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.linear_model import LogisticRegression 
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RandomizedSearchCV
# Load the seaborn "tips" example dataset.
df = sns.load_dataset("tips")

# Column groups, each routed to its own preprocessing pipeline below.
numerical_cols = ['total_bill', 'tip', 'size']
categorical_cols = ['sex', 'smoker', 'day']

# Independent features are every column except 'time'; 'time' is the target.
X = df.drop(columns=['time'])
y = df['time']

# Hold out 20% of the rows for testing; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42
)

# Numerical pipeline: fill missing values with the column mean, then
# standardize the features.
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical pipeline: fill missing values with the most frequent value,
# then one-hot encode the categories into numeric columns.
cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehotencoder', OneHotEncoder()),
])

# Combine both pipelines, applying each one to its own column group.
preprocessor = ColumnTransformer(
    transformers=[
        ('num_pipeline', num_pipeline, numerical_cols),
        ('cat_pipeline', cat_pipeline, categorical_cols),
    ]
)

# Fit the preprocessing on the training split only, then apply the fitted
# transformation to the test split.
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

# Candidate models for the automated training/evaluation step.
models = {
    'Random Forest': RandomForestClassifier(),
    'Logistic Regression': LogisticRegression(),
}

def evaluate_models(X_train,X_test,y_train,y_test,models):
    """Fit every model in *models* and report its accuracy on the test split.

    Args:
        X_train: Training features passed to each model's ``fit``.
        X_test: Test features passed to each model's ``predict``.
        y_train: Training labels passed to each model's ``fit``.
        y_test: True test labels the predictions are scored against.
        models: Mapping of display name to an estimator exposing ``fit``
            and ``predict``. Each estimator is fitted in place.

    Returns:
        A dict mapping each model name to its ``accuracy_score`` on the
        test split, with one entry per model in *models*.
    """
    report = {}
    for name, model in models.items():
        # Train the model.
        model.fit(X_train, y_train)
        # Predict the testing data.
        y_test_pred = model.predict(X_test)
        # Record the accuracy score under the model's name.
        report[name] = accuracy_score(y_test, y_test_pred)
    # Return only after the loop so that every model is evaluated, not
    # just the first one.
    return report
# Baseline comparison of the candidate models (the returned report is not stored).
evaluate_models(X_train,X_test,y_train,y_test,models)
classifier=RandomForestClassifier()
# Hyperparameter tuning: randomized search over the random-forest settings,
# scored by 5-fold cross-validated accuracy.
params={'max_depth':[3,5,10,None],
       'n_estimators':[100,200,300],
       'criterion':['gini','entropy']}
cv=RandomizedSearchCV(classifier,param_distributions=params,scoring='accuracy',cv=5,verbose=3)
cv.fit(X_train,y_train)
cv.best_params_  # a recorded run gave {'n_estimators': 200, 'max_depth': 3, 'criterion': 'gini'}
# Cross-validate the single configuration chosen from that search.
params1={'max_depth':[3],
       'n_estimators':[200],
       'criterion':['gini']}
# n_iter=1 because params1 describes exactly one combination; the default
# n_iter=10 would exceed the size of the search space and trigger a warning.
cv=RandomizedSearchCV(classifier,param_distributions=params1,n_iter=1,scoring='accuracy',cv=5,verbose=3)
cv.fit(X_train,y_train)
# Build the final model from the tuned parameters instead of the defaults,
# so the tuning above actually affects the reported score.
classifier=RandomForestClassifier(**cv.best_params_)
classifier.fit(X_train,y_train)
y_pred=classifier.predict(X_test)
# accuracy_score takes (y_true, y_pred) in that order.
score=accuracy_score(y_test,y_pred)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5] END criterion=gini, max_depth=10, n_estimators=100;, score=0.974 total time=   0.2s
[CV 2/5] END criterion=gini, max_depth=10, n_estimators=100;, score=0.923 total time=   0.2s
[CV 3/5] END criterion=gini, max_depth=10, n_estimators=100;, score=1.000 total time=   0.2s
[CV 4/5] END criterion=gini, max_depth=10, n_estimators=100;, score=0.897 total time=   0.2s
[CV 5/5] END criterion=gini, max_depth=10, n_estimators=100;, score=0.923 total time=   0.2s
[CV 1/5] END criterion=gini, max_depth=3, n_estimators=300;, score=0.974 total time=   0.5s
[CV 2/5] END criterion=gini, max_depth=3, n_estimators=300;, score=0.949 total time=   0.5s
[CV 3/5] END criterion=gini, max_depth=3, n_estimators=300;, score=0.974 total time=   0.5s
[CV 4/5] END criterion=gini, max_depth=3, n_estimators=300;, score=0.923 total time=   1.0s
[CV 5/5] END criterion=gini, max_depth=3, n_estimators=300;, score=0.923 total time=   0.5s
[CV 1/5] END c



[CV 1/5] END criterion=gini, max_depth=3, n_estimators=200;, score=0.974 total time=   0.4s
[CV 2/5] END criterion=gini, max_depth=3, n_estimators=200;, score=0.949 total time=   0.3s
[CV 3/5] END criterion=gini, max_depth=3, n_estimators=200;, score=0.974 total time=   0.3s
[CV 4/5] END criterion=gini, max_depth=3, n_estimators=200;, score=0.923 total time=   0.3s
[CV 5/5] END criterion=gini, max_depth=3, n_estimators=200;, score=0.923 total time=   0.3s
