In [2]:
import seaborn as sns 

In [7]:
df=sns.load_dataset("tips")
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [9]:
df.isnull().sum()

total_bill    0
tip           0
sex           0
smoker        0
day           0
time          0
size          0
dtype: int64

In [10]:
df["day"].unique()

['Sun', 'Sat', 'Thur', 'Fri']
Categories (4, object): ['Thur', 'Fri', 'Sat', 'Sun']

In [85]:
df["time"].unique()

array([0, 1])

In [84]:
df.time.unique()

array([0, 1])

In [13]:
from sklearn.preprocessing import LabelEncoder
encoder=LabelEncoder()
df["time"]=encoder.fit_transform(df["time"])

In [15]:
df.time.unique()

array([0, 1])

In [22]:
X=df.drop(labels=["time"],axis=1)
y=df.time

In [23]:
X,y

(     total_bill   tip     sex smoker   day  size
 0         16.99  1.01  Female     No   Sun     2
 1         10.34  1.66    Male     No   Sun     3
 2         21.01  3.50    Male     No   Sun     3
 3         23.68  3.31    Male     No   Sun     2
 4         24.59  3.61  Female     No   Sun     4
 ..          ...   ...     ...    ...   ...   ...
 239       29.03  5.92    Male     No   Sat     3
 240       27.18  2.00  Female    Yes   Sat     2
 241       22.67  2.00    Male    Yes   Sat     2
 242       17.82  1.75    Male     No   Sat     2
 243       18.78  3.00  Female     No  Thur     2
 
 [244 rows x 6 columns],
 0      0
 1      0
 2      0
 3      0
 4      0
       ..
 239    0
 240    0
 241    0
 242    0
 243    0
 Name: time, Length: 244, dtype: int64)

In [25]:
## Split the dataset into train and test
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.20,random_state=42)

In [32]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [86]:
categorical_cols=["sex","smoker","day"]
numerical_cols=["total_bill","tip","size"]
print(categorical_cols)

['sex', 'smoker', 'day']


In [38]:
num_pipeline=Pipeline(
    steps=[
        ("imputer",SimpleImputer(strategy="median")),
        ("scaler",StandardScaler())
    ]
)

cat_pipeline=Pipeline(
    steps=[
        ("imputer",SimpleImputer(strategy="most_frequent")),
        ("onehotencoder",OneHotEncoder())
    ]
)

In [40]:
preprocessing=ColumnTransformer( [
    ("cat_pipeline",cat_pipeline,categorical_cols),
    ("num_pipeline",num_pipeline,numerical_cols)
]
)

In [43]:
X_train=preprocessing.fit_transform(X_train)
X_test=preprocessing.transform(X_test)

In [44]:
X_train

array([[ 0.        ,  1.        ,  1.        , ..., -0.79306155,
        -0.2580329 , -0.61214068],
       [ 0.        ,  1.        ,  0.        , ...,  0.46322744,
        -0.74211442, -0.61214068],
       [ 0.        ,  1.        ,  0.        , ...,  0.80730659,
         0.6399734 , -0.61214068],
       ...,
       [ 1.        ,  0.        ,  0.        , ..., -1.65383098,
        -1.46472887, -0.61214068],
       [ 0.        ,  1.        ,  0.        , ...,  1.64749986,
         0.32426806, -0.61214068],
       [ 1.        ,  0.        ,  0.        , ...,  2.75289699,
        -0.41237773,  0.45363997]], shape=(195, 11))

In [62]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

In [64]:
models={
    "Random Forest":RandomForestClassifier(),
    "Logistic Regression":LogisticRegression(),
    "DecisionTree Classifier":DecisionTreeClassifier()
}

In [50]:
from sklearn.metrics import accuracy_score

In [57]:
def evaluate_model(X_train, y_train, X_test, y_test, models):
    report = {}
    for i in range(len(models)):
        model = list(models.values())[i]   # Get model
        model.fit(X_train, y_train)         # Train model
        y_test_pred = model.predict(X_test) # Predict
        test_model_score = accuracy_score(y_test, y_test_pred) # Accuracy
        report[list(models.keys())[i]] = test_model_score       # Store result
    return report

In [65]:
evaluate_model(X_train, y_train, X_test, y_test, models)

{'Random Forest': 0.9591836734693877,
 'Logistic Regression': 1.0,
 'DecisionTree Classifier': 0.9387755102040817}

In [66]:
classifier=RandomForestClassifier()

In [77]:
params={
    "max_depth":[3,5,10,None],
    "n_estimators":[100,200,300],
    "criterion":["gini","entropy"]
}

In [78]:
from sklearn.model_selection import RandomizedSearchCV

In [89]:
cv = RandomizedSearchCV( classifier, param_distributions = params, scoring="accuracy",cv=5,verbose=3)
cv.fit(X_train,y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5] END criterion=gini, max_depth=10, n_estimators=100;, score=0.974 total time=   0.5s
[CV 2/5] END criterion=gini, max_depth=10, n_estimators=100;, score=0.923 total time=   0.3s
[CV 3/5] END criterion=gini, max_depth=10, n_estimators=100;, score=1.000 total time=   0.2s
[CV 4/5] END criterion=gini, max_depth=10, n_estimators=100;, score=0.897 total time=   0.2s
[CV 5/5] END criterion=gini, max_depth=10, n_estimators=100;, score=0.923 total time=   0.3s
[CV 1/5] END criterion=entropy, max_depth=10, n_estimators=200;, score=0.974 total time=   0.5s
[CV 2/5] END criterion=entropy, max_depth=10, n_estimators=200;, score=0.923 total time=   0.6s
[CV 3/5] END criterion=entropy, max_depth=10, n_estimators=200;, score=1.000 total time=   0.5s
[CV 4/5] END criterion=entropy, max_depth=10, n_estimators=200;, score=0.923 total time=   0.8s
[CV 5/5] END criterion=entropy, max_depth=10, n_estimators=200;, score=0.923 total time=  

0,1,2
,estimator,RandomForestClassifier()
,param_distributions,"{'criterion': ['gini', 'entropy'], 'max_depth': [3, 5, ...], 'n_estimators': [100, 200, ...]}"
,n_iter,10
,scoring,'accuracy'
,n_jobs,
,refit,True
,cv,5
,verbose,3
,pre_dispatch,'2*n_jobs'
,random_state,

0,1,2
,n_estimators,200
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [88]:
cv.best_params_

{'n_estimators': 100, 'max_depth': None, 'criterion': 'gini'}