In [1]:
import seaborn as sns

In [3]:
# Load the 'tips' dataset through seaborn's load_dataset helper.
df=sns.load_dataset('tips')

In [4]:
# Count missing values per column.
df.isnull().sum()

total_bill    0
tip           0
sex           0
smoker        0
day           0
time          0
size          0
dtype: int64

In [5]:
from sklearn.preprocessing import LabelEncoder

In [6]:
# Encoder used to turn a label column into integer codes.
enc=LabelEncoder()

In [7]:
# List the column names.
df.columns

Index(['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size'], dtype='object')

In [9]:
# Overwrite the `time` column with integer codes produced by the LabelEncoder.
# NOTE(review): presumably Dinner -> 0 and Lunch -> 1, since LabelEncoder sorts
# its classes — confirm with enc.classes_.
df['time']=enc.fit_transform(df['time'])

In [11]:
# Check the distinct values of `time` after encoding.
df.time.unique()

array([0, 1])

In [12]:
# Target is the encoded `time` column; every remaining column is a feature.
y = df['time']
X = df.drop(columns=['time'])

In [14]:
from sklearn.model_selection import train_test_split as tts

In [35]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
Xtr, Xte, ytr, yte = tts(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [36]:
# Inspect the distinct values of the categorical `day` column.
df.day.unique()

['Sun', 'Sat', 'Thur', 'Fri']
Categories (4, object): ['Thur', 'Fri', 'Sat', 'Sun']

In [37]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer #Handle missing
from sklearn.preprocessing import StandardScaler #Feature Scaling
from sklearn.preprocessing import OneHotEncoder #Categorical to numerical
from sklearn.compose import ColumnTransformer

In [38]:
# Columns routed through the categorical pipeline.
cat_col=['sex','smoker','day']

In [39]:
# Columns routed through the numerical pipeline.
num_col=['total_bill','tip','size']

In [40]:
# Numerical pipeline

In [41]:
# Numeric features: fill missing values with the column median, then standardize.
num_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

In [42]:
# Categorical features: fill missing values with the most frequent value,
# then one-hot encode.
cat_pipe=Pipeline(
    steps=[
        ('imputer',SimpleImputer(strategy='most_frequent')),
        # handle_unknown='ignore': a category that was absent when the encoder
        # was fitted is encoded as all zeros at transform time instead of
        # raising an error.
        ('encoder',OneHotEncoder(handle_unknown='ignore'))
    ]
)

In [43]:
# Route each column group through its own pipeline; columns not listed in
# num_col or cat_col are not passed to either pipeline.
preprocessor = ColumnTransformer(
    transformers=[
        ('num_pipe', num_pipe, num_col),
        ('cat_pipe', cat_pipe, cat_col),
    ]
)

In [44]:
# Fit the preprocessing on the training split only, then apply the fitted
# transformers to the test split.
# NOTE: this rebinds Xtr/Xte to the transformed outputs, so re-running this cell
# without re-running the split would feed already-transformed data back into the
# preprocessor (likely an error, since columns are selected by name).
Xtr, Xte = preprocessor.fit_transform(Xtr), preprocessor.transform(Xte)

In [45]:
# Model training automation

In [59]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

In [61]:
# Candidate classifiers to compare, keyed by a display name.
# random_state is fixed on the randomized estimators so their scores are
# repeatable from run to run.
models={
    'random_forest':RandomForestClassifier(random_state=42),
    'logistic_reg':LogisticRegression(),
    'decision_tree':DecisionTreeClassifier(random_state=42)
}

In [48]:
from sklearn.metrics import accuracy_score as acs

In [54]:
def eval_models(Xtr,ytr,Xte,yte,models,scorer=None):
    """Fit each model on the training split and score it on the test split.

    Parameters
    ----------
    Xtr, ytr
        Training features and labels, passed to ``model.fit``.
    Xte, yte
        Test features (passed to ``model.predict``) and the true test labels.
    models : dict
        Maps a display name to an estimator exposing ``fit`` and ``predict``.
        Each estimator is fitted in place.
    scorer : callable, optional
        Called as ``scorer(y_true, y_pred)`` and expected to return the score.
        When omitted, the accuracy metric imported as ``acs`` is used.

    Returns
    -------
    dict
        ``{name: score}`` with the same keys, in the same order, as ``models``.
    """
    if scorer is None:
        # Resolve the default only when needed so a custom scorer can be used
        # without the accuracy import being in scope.
        scorer = acs
    report = {}
    for name, model in models.items():
        model.fit(Xtr, ytr)
        # True labels first, predictions second: the argument order metric
        # functions conventionally expect.
        report[name] = scorer(yte, model.predict(Xte))
    return report

In [62]:
# Fit and score every candidate model on the preprocessed train/test splits.
eval_models(Xtr,ytr,Xte,yte,models)

{'random_forest': 0.9591836734693877,
 'logistic_reg': 1.0,
 'decision_tree': 0.9387755102040817}

In [63]:
# Base estimator for the hyperparameter search; despite the name it is a random
# forest. The fixed seed makes each candidate's fit repeatable.
tree=RandomForestClassifier(random_state=42)

In [64]:
# Search space for the random forest: tree depth cap, number of trees, and
# split-quality criterion.
params = dict(
    max_depth=[3, 5, 10, None],
    n_estimators=[100, 200, 300],
    criterion=['gini', 'entropy'],
)

In [65]:
from sklearn.model_selection import RandomizedSearchCV

In [66]:
# Randomized search over `params`, scored by 5-fold cross-validated accuracy.
# n_iter=10 spells out the default number of sampled candidates; random_state
# fixes which candidates are sampled so the search is repeatable.
rscv=RandomizedSearchCV(
    estimator=tree,
    param_distributions=params,
    n_iter=10,
    scoring='accuracy',
    cv=5,
    verbose=3,
    random_state=42,
)

In [67]:
# Run the randomized search on the training split.
rscv.fit(Xtr,ytr)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5] END criterion=gini, max_depth=3, n_estimators=200;, score=0.974 total time=   0.1s
[CV 2/5] END criterion=gini, max_depth=3, n_estimators=200;, score=0.949 total time=   0.1s
[CV 3/5] END criterion=gini, max_depth=3, n_estimators=200;, score=0.974 total time=   0.1s
[CV 4/5] END criterion=gini, max_depth=3, n_estimators=200;, score=0.923 total time=   0.1s
[CV 5/5] END criterion=gini, max_depth=3, n_estimators=200;, score=0.949 total time=   0.1s
[CV 1/5] END criterion=entropy, max_depth=5, n_estimators=200;, score=0.974 total time=   0.1s
[CV 2/5] END criterion=entropy, max_depth=5, n_estimators=200;, score=0.923 total time=   0.1s
[CV 3/5] END criterion=entropy, max_depth=5, n_estimators=200;, score=0.974 total time=   0.1s
[CV 4/5] END criterion=entropy, max_depth=5, n_estimators=200;, score=0.923 total time=   0.1s
[CV 5/5] END criterion=entropy, max_depth=5, n_estimators=200;, score=0.923 total time=   0.1s
[CV 

In [68]:
# Parameter combination selected by the search.
rscv.best_params_

{'n_estimators': 300, 'max_depth': 3, 'criterion': 'gini'}