# Loading the Dataset & Splitting the Dataset

In [None]:
# Importing the libraries
import pandas as pd
from collections import Counter
from sklearn.model_selection import train_test_split

In [None]:
# Read the cleaned dataset from its notebook-relative CSV location into `df`.
csv_path = '../NoteBooks/cleaned_data.csv'
df = pd.read_csv(csv_path)

# The trailing expression makes the first rows the cell's displayed output.
df.head()

In [None]:
# Separate the label column from the predictor columns.
target_column = 'IncidentGrade'
y = df[target_column]                   # Target variable
X = df.drop(columns=[target_column])    # Features

# Hold out 30% of the rows. The split is stratified on y and seeded so that it
# is repeatable from one run to the next.
split_parts = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)
X_train, X_test, y_train, y_test = split_parts

# Report how many rows landed in each partition.
# NOTE(review): the messages say "Validation" but the variables are the *_test
# partition — confirm which term is intended.
print(f"Training set size: {X_train.shape[0]}")
print(f"Validation set size: {X_test.shape[0]}")

# Optional sanity check on the stratification: relative class frequencies of
# each partition, printed one after the other.
train_class_share = y_train.value_counts(normalize=True)
holdout_class_share = y_test.value_counts(normalize=True)
print("\nClass distribution in training set:")
print(train_class_share)
print("\nClass distribution in validation set:")
print(holdout_class_share)

In [None]:
from imblearn.over_sampling import KMeansSMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from sklearn.decomposition import PCA

# Step 1: Pre-sampling with reduced ratio for majority class.
# The label tally and its minimum are computed once up front; previously both
# were rebuilt inside the comprehension for every class.
train_class_counts = Counter(y_train)
smallest_class_count = min(train_class_counts.values())

# Every class larger than the smallest one is cut to 20% of its size; classes
# tied for smallest keep all of their samples.
sampling_strategy = {
    class_label: int(0.2 * count) if count > smallest_class_count else count
    for class_label, count in train_class_counts.items()
}

undersampler = RandomUnderSampler(sampling_strategy=sampling_strategy, random_state=42)
X_train_under, y_train_under = undersampler.fit_resample(X_train, y_train)
print(f"After pre-undersampling: {Counter(y_train_under)}")

# Reduce to 20 principal components (adjust n_components based on variance retained)
# NOTE(review): X_train_reduced is not consumed by the KMeansSMOTE call below,
# which resamples the unreduced X_train_under. Confirm whether the oversampler
# was meant to run on the PCA output; if so, the fitted `pca` would also have to
# be applied to X_test before evaluation.
pca = PCA(n_components=20, random_state=42)
X_train_reduced = pca.fit_transform(X_train_under)
print(f"Data shape after PCA: {X_train_reduced.shape}")

# Plain-SMOTE alternative, kept disabled for reference:
# smote = SMOTE(sampling_strategy="auto", k_neighbors=5, random_state=42, n_jobs=-1)
# X_resampled, y_resampled = smote.fit_resample(X_train_under, y_train_under)

# Step 2: Oversample the undersampled training data with KMeansSMOTE.
kmeans_smote = KMeansSMOTE(sampling_strategy="auto", random_state=42, n_jobs=-1)
X_resampled, y_resampled = kmeans_smote.fit_resample(X_train_under, y_train_under)
print(f"After SMOTE : {Counter(y_resampled)}")

In [None]:
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE

# Show the class counts of the training labels before any resampling.
print(f"Before SMOTE + RandomUnderSampler: {Counter(y_train)}")

# Pre-sampling to reduce dataset size: every class is cut to 20% of its
# training count. The same factor is applied to all classes, so this shrinks
# the data rather than changing the balance between classes.
reduced_class_sizes = {
    class_label: int(0.20 * count)
    for class_label, count in Counter(y_train).items()
}
undersampler = RandomUnderSampler(
    sampling_strategy=reduced_class_sizes,
    random_state=42,
)
X_train_under, y_train_under = undersampler.fit_resample(X_train, y_train)

# Show the class counts after the reduction.
print(f"After pre-sampling: {Counter(y_train_under)}")

In [None]:
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from collections import Counter

# Needed for the neighbours estimator handed to SMOTE in step 2.
from sklearn.neighbors import NearestNeighbors

print(f"Before SMOTE + RandomUnderSampler: {Counter(y_train)}")

# Step 1: Pre-sampling with reduced ratio for majority class.
# The label tally and its minimum are computed once up front; previously both
# were rebuilt inside the comprehension for every class.
train_class_counts = Counter(y_train)
smallest_class_count = min(train_class_counts.values())

# Every class larger than the smallest one is cut to 30% of its size; classes
# tied for smallest keep all of their samples.
sampling_strategy = {
    class_label: int(0.3 * count) if count > smallest_class_count else count
    for class_label, count in train_class_counts.items()
}

undersampler = RandomUnderSampler(sampling_strategy=sampling_strategy, random_state=42)
X_train_under, y_train_under = undersampler.fit_resample(X_train, y_train)
print(f"After pre-sampling: {Counter(y_train_under)}")

# Step 2: SMOTE on reduced dataset.
# SMOTE's own `n_jobs` argument was deprecated by imbalanced-learn (0.10) in
# favour of passing a neighbours estimator that carries `n_jobs` itself, so the
# parallelism is requested on the estimator instead. An integer k_neighbors=k
# is expanded by SMOTE to a k + 1 neighbour search (each sample is its own
# nearest neighbour), hence n_neighbors=3 reproduces the former k_neighbors=2.
smote_neighbors = NearestNeighbors(n_neighbors=3, n_jobs=-1)
smote = SMOTE(sampling_strategy="minority", k_neighbors=smote_neighbors, random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train_under, y_train_under)
print(f"After SMOTE: {Counter(y_resampled)}")

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import xgboost as xgb
import lightgbm as lgb

# Load dataset (replace with your dataset)
# Example: scikit-learn's bundled breast cancer dataset.
# NOTE(review): the assignments below rebind X, y, X_train, X_test, y_train and
# y_test, so any values previously held under those names are replaced by this
# demo data from here on — confirm that is intended before trusting the scores.
from sklearn.datasets import load_breast_cancer
data = load_breast_cancer()
X = pd.DataFrame(data=data.data, columns=data.feature_names)
y = pd.Series(data=data.target)

# Split data into train and test sets (80/20, seeded for repeatability).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Logistic Regression: the baseline, reported with accuracy plus the full
# per-class report.
print("=== Baseline Model: Logistic Regression ===")
lr = LogisticRegression(max_iter=1000, random_state=42)
y_pred_baseline = lr.fit(X_train, y_train).predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred_baseline):.4f}")
print(classification_report(y_test, y_pred_baseline))

# Decision Tree with default settings.
print("\n=== Advanced Models: Decision Tree ===")
dt_model = DecisionTreeClassifier(random_state=42)
y_pred_dt = dt_model.fit(X_train, y_train).predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred_dt):.4f}")

# Random Forest with hyperparameter tuning: 5-fold grid search over the number
# of trees and the maximum depth, scored by accuracy.
print("\n=== Advanced Models: Random Forest (Grid Search) ===")
rf_model = RandomForestClassifier(random_state=42)
param_grid_rf = dict(n_estimators=[50, 100], max_depth=[None, 10, 20])
grid_search_rf = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid_rf,
    cv=5,
    scoring='accuracy',
)
grid_search_rf.fit(X_train, y_train)
best_rf = grid_search_rf.best_estimator_
y_pred_rf = best_rf.predict(X_test)
print(f"Best Params: {grid_search_rf.best_params_}")
print(f"Accuracy: {accuracy_score(y_test, y_pred_rf):.4f}")

# XGBoost
# NOTE(review): `use_label_encoder` appears to be deprecated/ignored by recent
# XGBoost releases — confirm against the installed version.
print("\n=== Advanced Models: XGBoost ===")
xgb_model = xgb.XGBClassifier(
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42,
)
y_pred_xgb = xgb_model.fit(X_train, y_train).predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred_xgb):.4f}")

# LightGBM
print("\n=== Advanced Models: LightGBM ===")
lgb_model = lgb.LGBMClassifier(random_state=42)
y_pred_lgb = lgb_model.fit(X_train, y_train).predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred_lgb):.4f}")

# 3. Cross-Validation: 5-fold accuracy of the logistic-regression estimator on
# the full X / y (not only the training partition).
print("\n=== Cross-Validation: Logistic Regression ===")
cv_scores = cross_val_score(estimator=lr, X=X, y=y, cv=5, scoring='accuracy')
print(f"Cross-validation Accuracy: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")

In [None]:

# Define models: display name -> unfitted estimator, in reporting order.
# NOTE(review): `use_label_encoder` appears to be deprecated/ignored by recent
# XGBoost releases — confirm against the installed version.
models = dict([
    ("Logistic Regression", LogisticRegression(max_iter=1000, random_state=42)),
    ("Decision Tree", DecisionTreeClassifier(random_state=42)),
    ("Random Forest", RandomForestClassifier(n_estimators=100, max_depth=None, random_state=42)),
    ("XGBoost", xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)),
    ("LightGBM", lgb.LGBMClassifier(random_state=42)),
])

# Perform Cross-Validation for All Models: 5-fold accuracy on the full X / y,
# summarised as mean and standard deviation per model.
print("=== Cross-Validation Results ===")
for name in models:
    model = models[name]
    cv_scores = cross_val_score(estimator=model, X=X, y=y, cv=5, scoring='accuracy')
    print(f"{name}:")
    print(f"  Mean Accuracy: {cv_scores.mean():.4f}")
    print(f"  Std Deviation: {cv_scores.std():.4f}\n")

# Train models on the training set and evaluate on the test set, reporting
# accuracy and the per-class classification report for each one.
print("=== Test Set Results ===")
for name in models:
    model = models[name]
    y_pred = model.fit(X_train, y_train).predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{name}:")
    print(f"  Test Set Accuracy: {accuracy:.4f}")
    print(f"  Classification Report:\n{classification_report(y_test, y_pred)}")