<a href="https://colab.research.google.com/github/Aatiqa-bano/Internship-Projects/blob/main/Hyperparameter_Tuning_of_ML_Models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#hyperparameter tunning using Grid Search

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

# Load the dataset
data = pd.read_csv('emails.csv')
print(f"Shape of data after loading: {data.shape}")

# Assuming 'Email No.' is an index and should be dropped
if 'Email No.' in data.columns:
    data.drop(columns=['Email No.'], inplace=True)
print(f"Shape of data after dropping 'Email No.': {data.shape}")

# Check for missing values
if data.isnull().sum().sum() > 0:
    data.fillna(data.mean(), inplace=True)  # Filling missing values with the mean of each column
print(f"Shape of data after filling missing values: {data.shape}")

# Separate features and target variable
if 'Prediction' not in data.columns:
    raise ValueError("'Prediction' column is missing from the dataset")

X = data.drop(columns=['Prediction'])
y = data['Prediction']
print(f"Shape of X: {X.shape}, Shape of y: {y.shape}")

# Convert the features to numeric if they are not
X = X.apply(pd.to_numeric, errors='coerce')
y = pd.to_numeric(y, errors='coerce')
print(f"Shape of X after conversion to numeric: {X.shape}, Shape of y: {y.shape}")

# Convert target variable to integers
y = y.astype(int)

# Handle any remaining missing values in features
X.fillna(X.mean(), inplace=True)
print(f"Shape of X after handling remaining missing values: {X.shape}, Shape of y: {y.shape}")

# Feature Scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Model Training
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)
print(f"Shape of X_train: {X_train.shape}, Shape of X_test: {X_test.shape}")
print(f"Shape of y_train: {y_train.shape}, Shape of y_test: {y_test.shape}")

# Define the parameter grid for SVC
param_grid = {
    'C': [0.1, 1, 10, 100],  # Regularization parameter
    'gamma': [0.01, 0.1, 1, 10],  # Kernel coefficient for ‘rbf’, ‘poly’ and ‘sigmoid’
    'kernel': ['linear', 'rbf']  # Type of kernel function
}

# Initialize and fit GridSearchCV
grid_search = GridSearchCV(SVC(), param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

# Get the best hyperparameters and best model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_
print(f"Best Hyperparameters: {best_params}")

# Model Evaluation
# Evaluate the tuned model
y_pred_best = best_model.predict(X_test)
accuracy_best = accuracy_score(y_test, y_pred_best)
precision_best = precision_score(y_test, y_pred_best, average='macro')
recall_best = recall_score(y_test, y_pred_best, average='macro')
f1_best = f1_score(y_test, y_pred_best, average='macro')

print("Tuned SVM Model Performance:")
print(f"Accuracy: {accuracy_best}")
print(f"Precision: {precision_best}")
print(f"Recall: {recall_best}")
print(f"F1-Score: {f1_best}")

# Confusion Matrix
conf_matrix_best = confusion_matrix(y_test, y_pred_best)
print("\nConfusion Matrix (Tuned Model):")
print(conf_matrix_best)

# Classification Report
class_report_best = classification_report(y_test, y_pred_best)
print("\nClassification Report (Tuned Model):")
print(class_report_best)


Shape of data after loading: (5172, 3002)
Shape of data after dropping 'Email No.': (5172, 3001)
Shape of data after filling missing values: (5172, 3001)
Shape of X: (5172, 3000), Shape of y: (5172,)
Shape of X after conversion to numeric: (5172, 3000), Shape of y: (5172,)
Shape of X after handling remaining missing values: (5172, 3000), Shape of y: (5172,)
Shape of X_train: (4137, 3000), Shape of X_test: (1035, 3000)
Shape of y_train: (4137,), Shape of y_test: (1035,)
Fitting 5 folds for each of 32 candidates, totalling 160 fits
Best Hyperparameters: {'C': 0.1, 'gamma': 0.01, 'kernel': 'linear'}
Tuned SVM Model Performance:
Accuracy: 0.9555555555555556
Precision: 0.9399470688371103
Recall: 0.953687872581648
F1-Score: 0.946439178486572

Confusion Matrix (Tuned Model):
[[708  31]
 [ 15 281]]

Classification Report (Tuned Model):
              precision    recall  f1-score   support

           0       0.98      0.96      0.97       739
           1       0.90      0.95      0.92       2

#hyperparameter tunning using Random Search

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

# Load the dataset
data = pd.read_csv('emails.csv')

# Assuming 'Email No.' is an index and should be dropped
if 'Email No.' in data.columns:
    data.drop(columns=['Email No.'], inplace=True)

# Check for missing values
if data.isnull().sum().sum() > 0:
    data.fillna(data.mean(), inplace=True)  # Filling missing values with the mean of each column

# Separate features and target variable
if 'Prediction' not in data.columns:
    raise ValueError("'Prediction' column is missing from the dataset")

X = data.drop(columns=['Prediction'])
y = data['Prediction']

# Convert the features to numeric if they are not
X = X.apply(pd.to_numeric, errors='coerce')
y = pd.to_numeric(y, errors='coerce')

# Convert target variable to integers
y = y.astype(int)

# Handle any remaining missing values in features
X.fillna(X.mean(), inplace=True)

# Feature Scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Define the parameter grid for SVC
param_dist = {
    'C': [0.1, 1, 10, 100, 1000],
    'gamma': [0.001, 0.01, 0.1, 1, 10],
    'kernel': ['linear', 'rbf'],
    'max_iter': [5000, 10000, 20000]  # Increasing iterations significantly
}

# Initialize and fit RandomizedSearchCV
random_search = RandomizedSearchCV(SVC(tol=1e-5), param_distributions=param_dist, n_iter=50, cv=5, n_jobs=-1, verbose=2, random_state=42)
random_search.fit(X_train, y_train)

# Get the best hyperparameters and best model
best_params = random_search.best_params_
best_model = random_search.best_estimator_
print(f"Best Hyperparameters: {best_params}")

# Evaluate the tuned model
y_pred_best = best_model.predict(X_test)
accuracy_best = accuracy_score(y_test, y_pred_best)
precision_best = precision_score(y_test, y_pred_best, average='macro')
recall_best = recall_score(y_test, y_pred_best, average='macro')
f1_best = f1_score(y_test, y_pred_best, average='macro')

print("Tuned SVM Model Performance:")
print(f"Accuracy: {accuracy_best}")
print(f"Precision: {precision_best}")
print(f"Recall: {recall_best}")
print(f"F1-Score: {f1_best}")

# Confusion Matrix
conf_matrix_best = confusion_matrix(y_test, y_pred_best)
print("\nConfusion Matrix (Tuned Model):")
print(conf_matrix_best)

# Classification Report
class_report_best = classification_report(y_test, y_pred_best)
print("\nClassification Report (Tuned Model):")
print(class_report_best)


Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best Hyperparameters: {'max_iter': 5000, 'kernel': 'linear', 'gamma': 1, 'C': 0.1}
Tuned SVM Model Performance:
Accuracy: 0.9465020576131687
Precision: 0.9387494002878618
Recall: 0.9354761904761905
F1-Score: 0.9370880050982813

Confusion Matrix (Tuned Model):
[[162   6]
 [  7  68]]

Classification Report (Tuned Model):
              precision    recall  f1-score   support

           0       0.96      0.96      0.96       168
           1       0.92      0.91      0.91        75

    accuracy                           0.95       243
   macro avg       0.94      0.94      0.94       243
weighted avg       0.95      0.95      0.95       243

