# Machine Learning           

## Hyperparameter Tuning

In [25]:
# Importing Libraries
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split , GridSearchCV, RandomizedSearchCV
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [26]:
# Fetch the "penguins" example dataset via seaborn and preview its first five rows
df = sns.load_dataset("penguins")
print(df.head(n=5))

  species     island  bill_length_mm  bill_depth_mm  flipper_length_mm  \
0  Adelie  Torgersen            39.1           18.7              181.0   
1  Adelie  Torgersen            39.5           17.4              186.0   
2  Adelie  Torgersen            40.3           18.0              195.0   
3  Adelie  Torgersen             NaN            NaN                NaN   
4  Adelie  Torgersen            36.7           19.3              193.0   

   body_mass_g     sex  
0       3750.0    Male  
1       3800.0  Female  
2       3250.0  Female  
3          NaN     NaN  
4       3450.0  Female  


In [27]:
# Summarize the frame: per-column dtype and non-null count (shows which columns have gaps)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            344 non-null    object 
 1   island             344 non-null    object 
 2   bill_length_mm     342 non-null    float64
 3   bill_depth_mm      342 non-null    float64
 4   flipper_length_mm  342 non-null    float64
 5   body_mass_g        342 non-null    float64
 6   sex                333 non-null    object 
dtypes: float64(4), object(3)
memory usage: 18.9+ KB


In [28]:
# Count the missing entries per column, largest count first
print(
    df.isna()
      .sum()
      .sort_values(ascending=False)
)

sex                  11
bill_depth_mm         2
bill_length_mm        2
flipper_length_mm     2
body_mass_g           2
island                0
species               0
dtype: int64


In [29]:
# One SimpleImputer per column: numeric measurements are filled with the column mean,
# the categorical 'sex' column with its most frequent value.
# NOTE(review): the imputers are fitted on the whole frame, i.e. before the train/test
# split further down, so the held-out rows contribute to the fill values.
imputers = {
    name: SimpleImputer(strategy='mean')
    for name in ('bill_depth_mm', 'bill_length_mm', 'flipper_length_mm', 'body_mass_g')
}
imputers['sex'] = SimpleImputer(strategy='most_frequent')

# fit_transform expects 2-D input and returns a 2-D array, hence the double brackets
# on the way in and ravel() on the way out.
for column, imputer in imputers.items():
    df[column] = imputer.fit_transform(df[[column]]).ravel()


In [30]:
# Re-count the missing entries per column to confirm that imputation left none behind
print(
    df.isna()
      .sum()
      .sort_values(ascending=False)
)

species              0
island               0
bill_length_mm       0
bill_depth_mm        0
flipper_length_mm    0
body_mass_g          0
sex                  0
dtype: int64


In [31]:
# Encode every object (string) column as integer codes.
# Each column gets its OWN fitted LabelEncoder, kept in `label_encoders`, so the codes
# can be translated back to the original labels later, e.g.
#     label_encoders['species'].inverse_transform(y_pred)
# A single encoder reused across columns only remembers the classes of the last column
# it was fitted on, which makes the other columns' codes impossible to decode.
label_encoders = {}

# Loop through categorical columns and encode them
for column in df.select_dtypes(include=['object']).columns:
    label_encoder = LabelEncoder()
    df[column] = label_encoder.fit_transform(df[column])
    label_encoders[column] = label_encoder


In [32]:
# Separate the target (species) from the features, then hold out 20% of the rows for testing
y = df['species']
X = df.drop(columns='species')
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,  # fixed seed so the same rows land in train/test on every run
)

## Grid Search

In [None]:
# Create a Random Forest model.
# random_state is fixed so the selected hyperparameters and the reported accuracy are
# reproducible across runs (the train/test split above is seeded with 42 as well).
model = RandomForestClassifier(random_state=42)

# Define the hyperparameter grid for tuning
# (3 * 4 * 3 * 3 * 2 * 2 = 432 combinations, each evaluated on 5 folds = 2160 fits)
param_grid = {
    'n_estimators': [10, 50, 100],         # Number of trees in the forest
    'max_depth': [None, 10, 20, 30],       # Maximum depth of each tree
    'min_samples_split': [2, 5, 10],       # Minimum samples required to split an internal node
    'min_samples_leaf': [1, 2, 4],         # Minimum samples required to be at a leaf node
    'max_features': ['sqrt', 'log2'],      # Number of features to consider when looking for the best split
    'bootstrap': [True, False]             # Whether bootstrap samples are used when building trees
}

# Setup GridSearchCV for hyperparameter tuning:
# exhaustive search over param_grid, 5-fold cross-validation, scored by accuracy.
# n_jobs=-1 spreads the fits over all available CPU cores.
grid_search = GridSearchCV(estimator=model, param_grid=param_grid,
                           scoring='accuracy', cv=5, n_jobs=-1)

# Fit the model using the training data
grid_search.fit(X_train, y_train)

# Get the best model from the grid search
# (with the default refit=True it has been refitted on the whole training set)
best_model = grid_search.best_estimator_

# Make predictions using the test data
y_pred = best_model.predict(X_test)

# Calculate accuracy of the best model on the held-out test set
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy of best model: {accuracy:.2f}')

# Print the best model parameters found during tuning
print("Best Model Parameters:")
print(grid_search.best_params_)

# Create a DataFrame to compare actual and predicted values
# (y_test is a Series, so the frame keeps the original row index of each test sample)
results = pd.DataFrame({"Actual": y_test, "Predicted": y_pred})
print(results.head(5))  # Display the first 5 rows of the results

Accuracy of best model: 0.99
Best Model Parameters:
{'bootstrap': True, 'max_depth': None, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
     Actual  Predicted
194       1          1
157       1          1
225       2          2
208       1          1
318       2          2


## Randomized Search

In [None]:
# Create a Random Forest model.
# random_state is fixed so the fitted forests, and therefore the reported accuracy,
# are reproducible across runs.
model = RandomForestClassifier(random_state=42)

# Define the hyperparameter distribution for tuning
# (the lists span 432 combinations; only n_iter of them are sampled below)
param_dist = {
    'n_estimators': [10, 50, 100],         # Number of trees in the forest
    'max_depth': [None, 10, 20, 30],       # Maximum depth of each tree
    'min_samples_split': [2, 5, 10],       # Minimum samples required to split an internal node
    'min_samples_leaf': [1, 2, 4],         # Minimum samples required to be at a leaf node
    'max_features': ['sqrt', 'log2'],      # Number of features to consider when looking for the best split
    'bootstrap': [True, False]             # Whether bootstrap samples are used when building trees
}

# Setup RandomizedSearchCV for hyperparameter tuning:
# 100 sampled candidates, 5-fold cross-validation, scored by accuracy.
# random_state seeds the candidate sampling so the same 100 combinations are tried on
# every run; n_jobs=-1 spreads the fits over all available CPU cores.
rand_search = RandomizedSearchCV(estimator=model, param_distributions=param_dist,
                                 scoring='accuracy', cv=5, n_iter=100, verbose=1,
                                 random_state=42, n_jobs=-1)

# Fit the model using the training data
rand_search.fit(X_train, y_train)

# Get the best model from the random search
# (with the default refit=True it has been refitted on the whole training set)
best_model = rand_search.best_estimator_

# Make predictions using the test data
y_pred = best_model.predict(X_test)

# Calculate accuracy of the best model on the held-out test set
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy of best model: {accuracy:.2f}')

# Print the best model parameters found during tuning
print("Best Model Parameters:")
print(rand_search.best_params_)

# Create a DataFrame to compare actual and predicted values
# (y_test is a Series, so the frame keeps the original row index of each test sample)
results = pd.DataFrame({"Actual": y_test, "Predicted": y_pred})
print(results.head(5))  # Display the first 5 rows of the results

Fitting 5 folds for each of 100 candidates, totalling 500 fits
Accuracy of best model: 1.00
Best Model Parameters:
{'n_estimators': 10, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'log2', 'max_depth': 30, 'bootstrap': False}
     Actual  Predicted
194       1          1
157       1          1
225       2          2
208       1          1
318       2          2
