In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import (
    classification_report,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix
)
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE


In [None]:
# Load the dataset and build leak-free train/test matrices.
#
# Order of operations matters here:
#   1. split the raw rows into train/test,
#   2. fit the scaler/encoder on the training rows only,
#   3. oversample (SMOTE) the training rows only.
# Fitting the preprocessor or SMOTE on the full dataset before splitting lets
# information from the held-out rows (and synthetic points interpolated from
# them) reach the training set, which makes test metrics look better than
# they are.
filename = "ML_data_set.csv"
df = pd.read_csv(filename)
print("Columns in dataset:", df.columns)

target_col = "label"  # Change if needed
if target_col not in df.columns:
    # Raise so the failure shows up as a normal traceback and stops
    # "Run All", rather than calling exit() from inside a notebook cell.
    raise ValueError(f"ERROR: Target column '{target_col}' not found!")

# Encode the target whenever it is not already numeric. Checking only for
# dtype == 'object' would miss categorical and dedicated string dtypes.
if not pd.api.types.is_numeric_dtype(df[target_col]):
    le_y = LabelEncoder()
    df[target_col] = le_y.fit_transform(df[target_col])

X = df.drop(target_col, axis=1)
y = df[target_col]

# Identify column types
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
numerical_cols = X.select_dtypes(include=[np.number]).columns.tolist()

# ColumnTransformer drops any column not listed in a transformer (its default
# remainder behaviour), so report columns that match neither selection
# instead of losing them silently.
ignored_cols = [
    col for col in X.columns
    if col not in categorical_cols and col not in numerical_cols
]
if ignored_cols:
    print("WARNING: columns not used by the preprocessor:", ignored_cols)

# Preprocessing: scale numeric + encode categorical.
# handle_unknown='ignore' lets the encoder cope with categories that appear
# only in the test rows.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', 'passthrough' if not categorical_cols else OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ]
)

pipeline = Pipeline([
    ('pre', preprocessor),
])

# 1. Split the raw data first; the test rows keep their natural class balance.
#    NOTE: stratify=y needs at least two rows per class.
X_train_raw, X_test_raw, y_train_raw, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 2. Learn scaling statistics / category vocabularies from the training rows
#    only, then apply the fitted transform to the test rows.
X_train_processed = pipeline.fit_transform(X_train_raw)
X_test = pipeline.transform(X_test_raw)

# 3. Handle imbalance on the training rows only, so no synthetic sample is
#    ever evaluated on or derived from a test row.
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X_train_processed, y_train_raw)
X_train, y_train = X_res, y_res

# Function to display performance metrics
def print_metrics(model_name, y_true, y_pred):
    """Print a classification summary for one model's predictions.

    Emits, in order: a header line, accuracy, weighted-average precision,
    recall and F1 (with ``zero_division=0``), the confusion matrix, and the
    per-class classification report.

    Args:
        model_name: Label used in the header line.
        y_true: Ground-truth labels.
        y_pred: Labels predicted by the model.

    Returns:
        None. All results are written to stdout.
    """
    weighted_kwargs = {'average': 'weighted', 'zero_division': 0}
    weighted_scorers = (
        ("Precision:", precision_score),
        ("Recall   :", recall_score),
        ("F1-score :", f1_score),
    )

    print(f"\n{model_name} Performance Metrics")
    print("Accuracy :", accuracy_score(y_true, y_pred))
    for caption, scorer in weighted_scorers:
        print(caption, scorer(y_true, y_pred, **weighted_kwargs))

    matrix = confusion_matrix(y_true, y_pred)
    print("\nConfusion Matrix:\n", matrix)

    report = classification_report(y_true, y_pred, zero_division=0)
    print("\nClassification Report:\n", report)

In [None]:
# Random Forest
print("\nTraining Random Forest...")
# Baseline forest: 200 trees, fixed seed, balanced class weights.
# fit() returns the estimator itself, so construction and training are chained.
rf = RandomForestClassifier(
    n_estimators=200,
    class_weight='balanced',
    random_state=42,
).fit(X_train, y_train)
# Score on the held-out split and print the standard metric summary.
rf_pred = rf.predict(X_test)
print_metrics("Random Forest", y_test, rf_pred)

In [None]:
# XGBoost
print("\nTraining XGBoost...")
# Gradient-boosted trees: 200 boosting rounds, fixed seed.
# NOTE(review): use_label_encoder is reported as deprecated by recent XGBoost
# releases -- confirm against the installed version before removing it.
xgb = XGBClassifier(
    n_estimators=200,
    eval_metric='mlogloss',
    use_label_encoder=False,
    random_state=42,
).fit(X_train, y_train)
# Score on the held-out split and print the standard metric summary.
xgb_pred = xgb.predict(X_test)
print_metrics("XGBoost", y_test, xgb_pred)


In [None]:
# SVM
print("\nTraining SVM...")
# RBF-kernel support vector classifier.
# probability=True is intentionally not set: nothing in this notebook calls
# predict_proba, and enabling it makes fit() run an extra internal
# cross-validation to calibrate probabilities. predict() is based on the
# decision function, so the predicted labels are unaffected.
svm = SVC(kernel='rbf', random_state=42)
svm.fit(X_train, y_train)
# Score on the held-out split and print the standard metric summary.
svm_pred = svm.predict(X_test)
print_metrics("Support Vector Machine (SVM)", y_test, svm_pred)

In [None]:
# KNN
print("\nTraining K-Nearest Neighbors...")
# Vote among the 5 closest training points.
# fit() returns the estimator itself, so construction and training are chained.
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
# Score on the held-out split and print the standard metric summary.
knn_pred = knn.predict(X_test)
print_metrics("K-Nearest Neighbors (KNN)", y_test, knn_pred)

In [None]:
# Hyperparameter tuning for Random Forest
print("\nPerforming Hyperparameter Tuning for Random Forest...")
# Search space for the forest: 2 * 3 * 2 * 2 = 24 candidate combinations.
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
)

In [None]:
# Exhaustive 3-fold grid search over param_grid, using all CPU cores.
# scoring='accuracy' is spelled out so the metric matches the label printed
# below (it is also what a classifier's default scorer reports).
# NOTE: X_train already contains SMOTE-generated samples, so the CV folds
# share synthetic neighbours; treat best_score_ as an optimistic estimate.
grid_rf = GridSearchCV(
    RandomForestClassifier(random_state=42, class_weight='balanced'),
    param_grid, scoring='accuracy', cv=3, verbose=1, n_jobs=-1
)
grid_rf.fit(X_train, y_train)

print("\nBest Random Forest parameters:", grid_rf.best_params_)
print("Grid Search Best Cross-Validation Accuracy:", grid_rf.best_score_)

# Test tuned Random Forest: best_estimator_ is the winning configuration
# refitted on the full training set by GridSearchCV.
best_rf = grid_rf.best_estimator_
best_rf_pred = best_rf.predict(X_test)
print_metrics("Best Tuned Random Forest", y_test, best_rf_pred)

# No trailing input() prompt: inside a notebook it stalls "Run All" waiting
# for stdin and fails outright under non-interactive execution.