In [70]:
import pandas as pd
from sklearn.pipeline import Pipeline

In [71]:
import seaborn as sns
# Load the 'tips' dataset through seaborn's load_dataset helper and show it as
# the cell output.
# NOTE(review): the name `df` is rebound to a different dataset further down,
# so cells that read `df` depend on execution order.
df = sns.load_dataset('tips')
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


In [72]:
# 'day' is the label to predict; every remaining column is a feature.
y = df['day']
X = df.drop(columns='day')


In [73]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=45
)
# Display the shape of each of the four pieces.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((195, 6), (49, 6), (195,), (49,))

In [74]:
# Split the feature columns by dtype so each group can get its own preprocessing.
# select_dtypes is used instead of comparing each column's dtype against a short
# hand-written list (int64 / int32 / float64), which silently skipped any other
# numeric width such as int16 or float32.
num_cols = X.select_dtypes(include='number').columns.tolist()
cate_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
print(num_cols)
print(cate_cols)

['total_bill', 'tip', 'size']
['sex', 'smoker', 'time']


In [75]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler,OneHotEncoder

In [76]:
# Numerical pipeline: replace missing values with the column mean, then
# standardize the result.
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('Standardization', StandardScaler()),
])

In [77]:
# Categorical pipeline: replace missing values with the most frequent category,
# then one-hot encode.
# handle_unknown='ignore' makes transform() encode a category that was not seen
# during fit as all zeros instead of raising, so a rare level that only lands in
# the test split does not crash the notebook.
cate_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoding', OneHotEncoder(handle_unknown='ignore')),
])

In [78]:
from sklearn.compose import ColumnTransformer

In [79]:
# Combine the two pipelines: numeric columns go through num_pipeline,
# categorical columns through cate_pipeline.
preprocess = ColumnTransformer(
    transformers=[
        ('num_pipeline', num_pipeline, num_cols),
        ('cate_pipeline', cate_pipeline, cate_cols),
    ]
)

In [80]:
# Fit the preprocessing on the training split only, then apply that same fitted
# transformation to the test split (the test split is never used for fitting).
# NOTE(review): both names are overwritten with the transformed output, so the
# untransformed splits are no longer available after this cell and running the
# cell a second time would feed already-transformed data back in.
X_train = preprocess.fit_transform(X_train)
X_test = preprocess.transform(X_test)

In [81]:
from sklearn.ensemble import RandomForestClassifier

In [83]:
# Seed the forest so it is rebuilt identically on every run; unseeded, the
# reported confusion matrix and accuracy change from one execution to the next.
# 45 matches the seed already used for the train/test split.
model = RandomForestClassifier(random_state=45)

In [84]:
# Train the classifier on the preprocessed training split. fit() returns the
# estimator itself, which is what the cell displays.
model.fit(X_train,y_train)

RandomForestClassifier()

In [85]:
# Predict the 'day' label for every row of the preprocessed test split.
y_pred = model.predict(X_test)

In [89]:
from sklearn.metrics import confusion_matrix,accuracy_score
# Print the confusion matrix for the test predictions, then the overall
# accuracy on its own line.
print(
    confusion_matrix(y_test, y_pred),
    accuracy_score(y_test, y_pred),
    sep='\n',
)

[[ 0  1  1  5]
 [ 1 11  8  0]
 [ 0  9  4  0]
 [ 0  1  0  8]]
0.46938775510204084


In [90]:
# Q2: load the 'iris' dataset through seaborn's load_dataset helper and show it
# as the cell output.
# NOTE(review): this rebinds `df`, replacing the dataset loaded earlier in the
# notebook; earlier cells that read `df` will see iris if re-run after this one.
df = sns.load_dataset('iris')
df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


In [91]:
# 'species' is the label to predict; the remaining columns are the features.
y = df['species']
X = df.drop(columns='species')

In [92]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=45
)
# Display the shape of each of the four pieces.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((120, 4), (30, 4), (120,), (30,))

In [93]:
from sklearn.linear_model import LogisticRegression
# Candidate classifiers to compare, keyed by the label that appears in the
# score report. The forest is seeded (same seed as the train/test split) so that
# repeated runs report the same score for it.
models = {
    'Rando_forest': RandomForestClassifier(random_state=45),
    'logistic': LogisticRegression(),
}

In [102]:
def evaluate(X_train, y_train, X_test, y_test, models, metric=None):
    """Fit every model on the training split and score it on the test split.

    Parameters
    ----------
    X_train, y_train
        Training features and labels, passed unchanged to each model's ``fit``.
    X_test, y_test
        Held-out features and labels; each model's predictions for ``X_test``
        are compared against ``y_test``.
    models : dict
        Maps a display name to an object exposing ``fit(X, y)`` and
        ``predict(X)``. Each object is fitted in place.
    metric : callable, optional
        Called as ``metric(y_test, y_pred)`` to produce the score. When omitted,
        ``accuracy_score`` is used, so existing callers see the same scores.

    Returns
    -------
    dict
        ``{name: score}`` with one entry per model, in the iteration order of
        ``models``. An empty ``models`` mapping yields an empty dict.
    """
    if metric is None:
        # Looked up only when needed, from the surrounding notebook namespace.
        metric = accuracy_score

    report = {}
    # Iterate name/estimator pairs directly rather than rebuilding
    # list(models.keys()) / list(models.values()) on every pass.
    for name, classifier in models.items():
        classifier.fit(X_train, y_train)
        y_pred = classifier.predict(X_test)
        report[name] = metric(y_test, y_pred)
    return report
        
# Fit and score every candidate model on the current split; the bare name on
# the last line displays the resulting report.
output = evaluate(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    models=models,
)
output

{'Rando_forest': 0.9333333333333333, 'logistic': 0.9666666666666667}