# Random Forest Classifier with Pipeline and Hyperparameter Tuning

In [1]:
import seaborn as sns

In [2]:
df = sns.load_dataset('tips')

In [3]:
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [4]:
df.isnull().sum()

total_bill    0
tip           0
sex           0
smoker        0
day           0
time          0
size          0
dtype: int64

In [5]:
df.describe()

Unnamed: 0,total_bill,tip,size
count,244.0,244.0,244.0
mean,19.785943,2.998279,2.569672
std,8.902412,1.383638,0.9511
min,3.07,1.0,1.0
25%,13.3475,2.0,2.0
50%,17.795,2.9,2.0
75%,24.1275,3.5625,3.0
max,50.81,10.0,6.0


In [6]:
df['time'].unique()

['Dinner', 'Lunch']
Categories (2, object): ['Lunch', 'Dinner']

In [7]:
df['time']

0      Dinner
1      Dinner
2      Dinner
3      Dinner
4      Dinner
        ...  
239    Dinner
240    Dinner
241    Dinner
242    Dinner
243    Dinner
Name: time, Length: 244, dtype: category
Categories (2, object): ['Lunch', 'Dinner']

In [8]:
from sklearn.preprocessing import LabelEncoder

In [9]:
Encoder = LabelEncoder()

In [10]:
# Swap the categorical 'time' column for integer labels: learn the classes
# first, then map every row through the fitted encoder.
df['time'] = Encoder.fit(df['time']).transform(df['time'])

In [11]:
# Target is the encoded 'time' column; every other column is a feature.
y = df['time']
x = df.drop(columns='time')

In [12]:
x.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,size
0,16.99,1.01,Female,No,Sun,2
1,10.34,1.66,Male,No,Sun,3
2,21.01,3.5,Male,No,Sun,3
3,23.68,3.31,Male,No,Sun,2
4,24.59,3.61,Female,No,Sun,4


In [13]:
from sklearn.model_selection import train_test_split

In [14]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=2,
)

In [15]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer #handle missing values
from sklearn.preprocessing import StandardScaler #feature scaling
from sklearn.preprocessing import OneHotEncoder #encoding categorical variables
from sklearn.compose import ColumnTransformer


In [16]:
# Column groups for the classifier's features: the first list is routed to the
# categorical pipeline, the second to the numerical pipeline.
Catogorical = ['sex','smoker','day']
numerical = ['total_bill','tip','size']

In [17]:
#Feature Engineering automation
#Numerical pipeline: median-impute missing values, then scale the features
num_pipeline = Pipeline(
    steps=[
        ('imputer',SimpleImputer(strategy='median')), #handling missing values
        ('scaler',StandardScaler()), #feature scaling
    ]
)
#Categorical pipeline: impute with the most frequent value, then one-hot encode
cat_pipeline = Pipeline(
    steps=[
        ('imputer',SimpleImputer(strategy='most_frequent')), #handling missing values
        # handle_unknown='ignore' keeps transform() from raising when the test
        # split (or new data) contains a category that was absent during fit.
        ('onehot',OneHotEncoder(handle_unknown='ignore')) #categorical features to numerical
    ]
)

In [18]:
# Send each column group through its matching pipeline.
preprocessor = ColumnTransformer(
    transformers=[
        ('num_pipeline', num_pipeline, numerical),
        ('cat_pipeline', cat_pipeline, Catogorical),
    ]
)

In [19]:
preprocessor

In [20]:
# Fit the preprocessor on the training split only, then apply the fitted
# transformer to the test split.
# NOTE(review): both names are rebound to the transformed output, so the raw
# frames are no longer reachable afterwards and re-running this cell would feed
# already-transformed data back into the preprocessor. Re-run the split cell first.
x_train = preprocessor.fit_transform(x_train)
x_test = preprocessor.transform(x_test)

In [21]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

In [22]:
##Model Training automation
# Candidate classifiers, keyed by the name used in the score report.
# The tree-based models are seeded so repeated runs produce the same scores.
models = {
    'Random_forest' : RandomForestClassifier(random_state=42),
    'DT': DecisionTreeClassifier(random_state=42),
    'Logistic' : LogisticRegression(),
    'SVM': SVC(),
    'Naive': GaussianNB()
    }

In [23]:
from sklearn.metrics import accuracy_score

In [24]:
def evaluate_model(x_train,y_train,x_test,y_test,models,metric=None):
    """Fit every candidate model and score it on the held-out split.

    Parameters
    ----------
    x_train, y_train
        Training features and targets, passed to ``model.fit``.
    x_test, y_test
        Held-out features and targets used for scoring.
    models : dict
        Maps a display name to an estimator exposing ``fit`` and ``predict``.
        Each estimator is fitted in place.
    metric : callable, optional
        ``metric(y_true, y_pred)`` scoring function. Defaults to
        ``accuracy_score`` so existing five-argument calls behave as before.

    Returns
    -------
    dict
        ``{model name: score}`` in the iteration order of ``models``.
    """
    if metric is None:
        metric = accuracy_score
    report = {}
    # Walk name/estimator pairs directly instead of rebuilding
    # list(models.keys()) and list(models.values()) on every iteration.
    for name, model in models.items():
        model.fit(x_train, y_train)
        y_pred = model.predict(x_test)
        report[name] = metric(y_test, y_pred)
    return report


In [25]:
evaluate_model(x_train,y_train,x_test,y_test,models)

{'Random_forest': 0.9387755102040817,
 'DT': 0.9387755102040817,
 'Logistic': 0.9591836734693877,
 'SVM': 0.9591836734693877,
 'Naive': 0.9387755102040817}

In [26]:
# Base estimator handed to the hyperparameter search; seeded so each candidate
# is evaluated with the same forest randomness on every run.
classifier = RandomForestClassifier(random_state=42)

In [27]:
#Hyperparameter tuning
# Search space for the random forest: tree depth, forest size and split criterion.
paramaters = {
    'max_depth': [3, 5, 10, None],
    'n_estimators': [100, 200, 300],
    'criterion': ['gini', 'entropy'],
}

In [28]:
from sklearn.model_selection import RandomizedSearchCV

In [29]:
# Randomised search over `paramaters` with 5-fold cross-validation, scored by
# accuracy. random_state fixes which parameter combinations are sampled, so the
# search is repeatable after a kernel restart.
cv = RandomizedSearchCV(
    classifier,
    paramaters,
    scoring='accuracy',
    cv=5,
    verbose=3,
    n_jobs=-1,
    random_state=42,
)

In [30]:
cv.fit(x_train,y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [31]:
cv.best_params_

{'n_estimators': 200, 'max_depth': 5, 'criterion': 'gini'}

In [32]:
# Build the final forest from the search result instead of retyping the winning
# values by hand, so this cell stays in sync whenever the search is re-run.
classifier = RandomForestClassifier(**cv.best_params_)

In [33]:
classifier.fit(x_train,y_train)

In [34]:
ypred = classifier.predict(x_test)

In [35]:
accuracy_score(y_test,ypred)

0.9591836734693877

In [36]:
ypred

array([1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,
       1, 0, 0, 0, 0])

# Random Forest Regressor

In [41]:
import seaborn as sns

In [42]:
df = sns.load_dataset('tips')

In [43]:
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [44]:
# Dependent and independent features: 'total_bill' is the target,
# every other column is a feature.
y = df['total_bill']
x = df.drop(columns='total_bill')

In [45]:
# Feature columns for the regression task, grouped by the pipeline that
# preprocesses them (categorical vs. numerical).
cat = ['sex','smoker','day','time']
num = ['tip','size']

In [46]:
# Numerical pipeline: median-impute missing values, then scale the features.
num_pipe = Pipeline(steps=[
    ("handling missing value",SimpleImputer(strategy='median')),
    ('feature scaling',StandardScaler())
])
# Categorical pipeline: impute with the most frequent value, then one-hot encode.
# handle_unknown='ignore' keeps transform() from raising when the test split
# (or new data) contains a category that was absent during fit.
cat_pipe = Pipeline(steps=[
    ("handling missing value",SimpleImputer(strategy='most_frequent')),
    ('onehot encoding',OneHotEncoder(handle_unknown='ignore'))
])

In [48]:
# Combine the two pipelines: `num` columns go through num_pipe and `cat`
# columns through cat_pipe. The trailing expression displays the transformer.
process= ColumnTransformer(
    transformers=[
        ('numerical',num_pipe,num),
        ('categorical',cat_pipe,cat)
    ]
)
process

In [49]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=2,
)

In [50]:
# Fit the transformer on the training split only, then apply it to the test split.
# NOTE(review): both names are rebound to the transformed output, so re-running
# this cell without re-running the split cell would feed already-transformed
# data back into the transformer.
x_train = process.fit_transform(x_train)
x_test = process.transform(x_test)

In [51]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.linear_model import ElasticNet


In [52]:
# Candidate regressors, keyed by the name used in the score report.
# The tree-based models are seeded so repeated runs produce the same scores.
Reg_models = {'LinearRegression':LinearRegression(),
         'DecisionTree':DecisionTreeRegressor(random_state=42),
         'RandomForest':RandomForestRegressor(random_state=42),
         'SVM':SVR(),
         'Lasso':Lasso(),
        'Ridge':Ridge(),
        'ElasticNet':ElasticNet()
        }

In [53]:
from sklearn.metrics import r2_score

In [54]:
def reg_evaluate_model(x_train,y_train,x_test,y_test,Reg_models,metric=None):
    """Fit every candidate regressor and score it on the held-out split.

    Parameters
    ----------
    x_train, y_train
        Training features and targets, passed to ``model.fit``.
    x_test, y_test
        Held-out features and targets used for scoring.
    Reg_models : dict
        Maps a display name to an estimator exposing ``fit`` and ``predict``.
        Each estimator is fitted in place.
    metric : callable, optional
        ``metric(y_true, y_pred)`` scoring function. Defaults to ``r2_score``
        so existing five-argument calls behave as before.

    Returns
    -------
    dict
        ``{model name: score}`` in the iteration order of ``Reg_models``.
    """
    if metric is None:
        metric = r2_score
    report = {}
    # Walk name/estimator pairs directly instead of rebuilding the key and
    # value lists on every iteration.
    for name, model in Reg_models.items():
        model.fit(x_train, y_train)
        y_pred = model.predict(x_test)
        report[name] = metric(y_test, y_pred)
    return report


In [55]:
reg_evaluate_model(x_train,y_train,x_test,y_test,Reg_models)

{'LinearRegression': 0.634423637586069,
 'DecisionTree': 0.16377751283407116,
 'RandomForest': 0.6525560152941874,
 'SVM': 0.4083152500841183,
 'Lasso': 0.6365181753220512,
 'Ridge': 0.6403304169707702,
 'ElasticNet': 0.5873738780057987}