# Machine Learning

# Best Model with Best Hyperparameters

In [37]:
# Importing Libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# Importing Classification Models
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
# Importing Classification Metrics
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


In [38]:
# Ignoring Warnings
import warnings
warnings.filterwarnings("ignore")

In [39]:
# Loading the Dataset
df = sns.load_dataset("penguins")
print(df.head())

  species     island  bill_length_mm  bill_depth_mm  flipper_length_mm  \
0  Adelie  Torgersen            39.1           18.7              181.0   
1  Adelie  Torgersen            39.5           17.4              186.0   
2  Adelie  Torgersen            40.3           18.0              195.0   
3  Adelie  Torgersen             NaN            NaN                NaN   
4  Adelie  Torgersen            36.7           19.3              193.0   

   body_mass_g     sex  
0       3750.0    Male  
1       3800.0  Female  
2       3250.0  Female  
3          NaN     NaN  
4       3450.0  Female  


In [40]:
# To check missing values in our dataset
print(df.isnull().sum().sort_values(ascending=False))

sex                  11
bill_depth_mm         2
bill_length_mm        2
flipper_length_mm     2
body_mass_g           2
island                0
species               0
dtype: int64


In [41]:
# Let's fill the missing values in our dataset
df["sex"] = df["sex"].fillna(df["sex"].mode()[0])
df["bill_depth_mm"] = df["bill_depth_mm"].fillna(df["bill_depth_mm"].mean())
df["bill_length_mm"] = df["bill_length_mm"].fillna(df["bill_length_mm"].mean())
df["flipper_length_mm"] = df["flipper_length_mm"].fillna(df["flipper_length_mm"].mean())
df["body_mass_g"] = df["body_mass_g"].fillna(df["body_mass_g"].mean())
# To again check for missing values in our dataset
print(df.isnull().sum().sort_values(ascending=False))

species              0
island               0
bill_length_mm       0
bill_depth_mm        0
flipper_length_mm    0
body_mass_g          0
sex                  0
dtype: int64


In [42]:
# Let's getting info of our dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            344 non-null    object 
 1   island             344 non-null    object 
 2   bill_length_mm     344 non-null    float64
 3   bill_depth_mm      344 non-null    float64
 4   flipper_length_mm  344 non-null    float64
 5   body_mass_g        344 non-null    float64
 6   sex                344 non-null    object 
dtypes: float64(4), object(3)
memory usage: 18.9+ KB


In [43]:
# Create a LabelEncoder object
le = LabelEncoder()

# Encode all categorical columns in the DataFrame
for col in df.select_dtypes(include=['object', 'category']).columns:
    df[col] = le.fit_transform(df[col])

In [44]:
# Selecting Features and Target Variable
X = df.drop("island", axis=1)
y = df["island"]
# Splitting the Dataset using Train-Test-Split by 80/20 ratio
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [45]:
# Create a dictionary of models to evaluate
models = { 
    'XGB Classifier': XGBClassifier(),  # XGBoost classifier
    'Logistic Regression': LogisticRegression(),  # Logistic Regression
    'Random Forest Classifier': RandomForestClassifier(),  # Random Forest
    'Support Vector Classifier': SVC(),  # Support Vector Machine
    'K Neighbors Classifier': KNeighborsClassifier(),  # KNN Classifier
    'Gradient Boosting Classifier': GradientBoostingClassifier()  # Gradient Boosting
}

model_scores = []  # List to store model names and their accuracy

# Loop through each model
for name, model in models.items():
    model.fit(X_train, y_train)  # Train the model on training data
    y_pred = model.predict(X_test)  # Predict on test data
    acc = accuracy_score(y_test, y_pred)  # Calculate accuracy
    model_scores.append((name, acc))  # Append model name and accuracy to list

# Sort the models by accuracy in ascending order
model_scores_sorted = sorted(model_scores, key=lambda x: x[1])

# Create a DataFrame for results
results_df = pd.DataFrame(model_scores_sorted, columns=['Model', 'Accuracy'])

# Print the results table
print(results_df.to_string(index=False))  # Display as a table

# Get the best model
best_model = results_df.iloc[-1]  # Last row contains the best model
print(f"\nBest Model: {best_model['Model']} with Accuracy: {best_model['Accuracy']:.4f}")

                       Model  Accuracy
      K Neighbors Classifier  0.579710
    Random Forest Classifier  0.652174
   Support Vector Classifier  0.652174
              XGB Classifier  0.666667
         Logistic Regression  0.710145
Gradient Boosting Classifier  0.710145

Best Model: Gradient Boosting Classifier with Accuracy: 0.7101


## Let's do Hyperparameter Tunning 

In [47]:
from sklearn.model_selection import GridSearchCV

# Define parameter grids for each model
param_grids = {
    'XGB Classifier': {
        'n_estimators': [100, 200],
        'learning_rate': [0.01, 0.1],
        'max_depth': [3, 5, 7],
        'subsample': [0.8, 1.0]
    },
    'Logistic Regression': {
        'C': [0.1, 1.0, 10.0],
        'penalty': ['l1', 'l2']
    },
    'Random Forest Classifier': {
        'n_estimators': [100, 200],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5]
    },
    'Support Vector Classifier': {
        'C': [0.1, 1.0, 10.0],
        'kernel': ['linear', 'rbf']
    },
    'K Neighbors Classifier': {
        'n_neighbors': [3, 5, 7],
        'weights': ['uniform', 'distance']
    },
    'Gradient Boosting Classifier': {
        'n_estimators': [100, 200],
        'learning_rate': [0.01, 0.1],
        'max_depth': [3, 5]
    }
}

# Hyperparameter tuning and model training
best_models = {}
model_scores = []

for name, model in models.items():
    grid_search = GridSearchCV(estimator=model, param_grid=param_grids[name], cv=5, scoring='accuracy')
    grid_search.fit(X_train, y_train)
    best_models[name] = grid_search.best_estimator_

    # Evaluate the model on test data
    y_pred = best_models[name].predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    
    # Append scores only if the model name is unique
    if (name, acc) not in model_scores:
        model_scores.append((name, acc))

# Sort the models by accuracy in ascending order
model_scores_sorted = sorted(model_scores, key=lambda x: x[1])

# Create a DataFrame for results
results_df = pd.DataFrame(model_scores_sorted, columns=['Model', 'Accuracy'])

# Print the results table
print(results_df.to_string(index=False))

# Get the best model
best_model = results_df.iloc[-1]
print(f"\nBest Model: {best_model['Model']} with Accuracy: {best_model['Accuracy']:.4f}")

                       Model  Accuracy
      K Neighbors Classifier  0.579710
              XGB Classifier  0.652174
   Support Vector Classifier  0.652174
    Random Forest Classifier  0.666667
Gradient Boosting Classifier  0.666667
         Logistic Regression  0.710145

Best Model: Logistic Regression with Accuracy: 0.7101
