# Loading Dataset & Splitting the Dataset

In [None]:
import pandas as pd

# Read the cleaned dataset from disk and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='../NoteBooks/cleaned_data.csv')
df.head(n=5)

In [None]:
from sklearn.model_selection import train_test_split

# The label column is the target; every other column is a feature.
y = df['IncidentGrade']
X = df.drop(columns=['IncidentGrade'])

# Stratified 70/30 hold-out split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

# Report the number of rows in each split.
# NOTE(review): the second label says "Validation" but the value printed is
# the size of the test split (X_test).
print(f"Training set size: {X_train.shape[0]}")
print(f"Validation set size: {X_test.shape[0]}")

# The split is stratified on y, so both distributions should be very close.
for split_title, split_labels in (
    ("\nClass distribution in training set:", y_train),
    ("\nClass distribution in validation set:", y_test),
):
    print(split_title)
    print(split_labels.value_counts(normalize=True))

In [None]:
# Shrink the dataset to a stratified subsample.
sample_fraction = 0.05  # Share of rows to keep (0.05 = 5% of the original dataset)

# train_test_split doubles as a stratified sampler here: the "train" part is
# the sample we keep, the complementary "test" part is thrown away.
df_sampled, _ = train_test_split(
    df,
    test_size=1 - sample_fraction,
    stratify=df['IncidentGrade'],
    random_state=42,
)

# The label column is the target; every other column is a feature.
y = df_sampled['IncidentGrade']
X = df_sampled.drop(columns=['IncidentGrade'])

# Stratified 70/30 hold-out split of the subsample, seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

# Report the number of rows in each split.
# NOTE(review): the second label says "Validation" but the value printed is
# the size of the test split (X_test).
print(f"Training set size: {X_train.shape[0]}")
print(f"Validation set size: {X_test.shape[0]}")

# The split is stratified on y, so both distributions should be very close.
for split_title, split_labels in (
    ("\nClass distribution in training set:", y_train),
    ("\nClass distribution in validation set:", y_test),
):
    print(split_title)
    print(split_labels.value_counts(normalize=True))

# Balancing Target Class

In [None]:
from imblearn.combine import SMOTEENN
from sklearn.utils import resample
from joblib import Parallel, delayed
from collections import Counter
from imblearn.over_sampling import SMOTE

def undersample_data(X, y, target_majority_fraction=0.7, random_state=42):
    """
    Randomly undersample the most frequent class.

    The majority class is reduced (never enlarged) so that it makes up about
    ``target_majority_fraction`` of the returned rows. Rows of every other
    class are kept as they are.

    Args:
        X: Feature table (pandas DataFrame) whose rows line up with ``y``.
        y: Class labels (pandas Series).
        target_majority_fraction: Desired share of the majority class in the
            result; must be strictly between 0 and 1.
        random_state: Seed passed to ``resample`` for a reproducible draw.

    Returns:
        ``(X_resampled, y_resampled)``: the sampled majority rows followed by
        all remaining rows. Inputs with fewer than two classes are returned
        unchanged.

    Raises:
        ValueError: If ``target_majority_fraction`` is not in ``(0, 1)``.
    """
    if not 0 < target_majority_fraction < 1:
        raise ValueError(
            "target_majority_fraction must be strictly between 0 and 1, "
            f"got {target_majority_fraction!r}"
        )

    class_counts = Counter(y)
    if len(class_counts) < 2:
        # Empty input or a single class: there is nothing to balance against.
        return X, y

    # The class with the most occurrences is the one that gets reduced.
    majority_class = max(class_counts, key=class_counts.get)

    # Separate majority and non-majority rows with a single mask.
    is_majority = y == majority_class
    X_majority, y_majority = X[is_majority], y[is_majority]
    X_minority, y_minority = X[~is_majority], y[~is_majority]

    # Solve  majority / (majority + minority) == fraction  for the majority
    # count:  majority = minority * fraction / (1 - fraction).
    # (Dividing the minority count by (1 - fraction) alone would give the size
    # of the whole dataset, not of the majority class.)
    target_majority_size = round(
        len(y_minority) * target_majority_fraction
        / (1 - target_majority_fraction)
    )
    # This is undersampling only: never request more rows than are available.
    target_majority_size = min(target_majority_size, len(y_majority))

    # Draw the majority rows without replacement; X and y stay row-aligned
    # because they are resampled together.
    X_majority_resampled, y_majority_resampled = resample(
        X_majority,
        y_majority,
        replace=False,
        n_samples=target_majority_size,
        random_state=random_state,
    )

    # Combine the reduced majority class with the untouched other classes.
    X_resampled = pd.concat([X_majority_resampled, X_minority])
    y_resampled = pd.concat([y_majority_resampled, y_minority])

    return X_resampled, y_resampled

# Balance the training data chunk by chunk: undersample the majority class,
# oversample with SMOTE, then clean the result with SMOTE-ENN.
chunk_size = 50000
balanced_chunks = []
# NOTE(review): recent imbalanced-learn releases deprecate `n_jobs` on SMOTE;
# confirm against the installed version.
smote = SMOTE(random_state=42, n_jobs=-1)               # Initialize the SMOTE object
smote_enn = SMOTEENN(random_state=42, n_jobs=-1)        # Initialize the SMOTE-ENN object

# NOTE(review): the last chunk can be much smaller than chunk_size; SMOTE may
# not have enough neighbours per class there. Verify on the real data.
for i in range(0, len(X_train), chunk_size):            # Process the data in chunks
    # .iloc makes it explicit that rows are selected by position, not by the
    # (shuffled) index labels left over from train_test_split.
    X_chunk = X_train.iloc[i:i + chunk_size]
    y_chunk = y_train.iloc[i:i + chunk_size]

    # Step 1: Undersample the majority class
    X_chunk_undersampled, y_chunk_undersampled = undersample_data(X_chunk, y_chunk)

    # Step 2: Apply SMOTE to oversample minority classes
    X_smote, y_smote = smote.fit_resample(X_chunk_undersampled, y_chunk_undersampled)

    # Step 3: Apply SMOTEENN to clean the resampled data
    X_res, y_res = smote_enn.fit_resample(X_smote, y_smote)

    # Store the balanced chunk
    balanced_chunks.append((X_res, y_res))

# Combine all balanced chunks. ignore_index gives the result one fresh
# 0..n-1 index so row labels coming from different chunks cannot repeat.
X_resampled = pd.concat([chunk[0] for chunk in balanced_chunks], ignore_index=True)
y_resampled = pd.concat([chunk[1] for chunk in balanced_chunks], ignore_index=True)

In [None]:
# Compare label counts of the raw training split with the rebalanced data.
counts_before = Counter(y_train)
print("Original class distribution:", counts_before)
counts_after = Counter(y_resampled)
print("Class distribution after undersampling and SMOTEENN:", counts_after)

In [None]:
import pandas as pd
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import SMOTE
from sklearn.utils import resample
from collections import Counter

def undersample_data(X, y, target_majority_fraction=0.7, random_state=42):
    """
    Perform undersampling on the majority class to balance with minority classes.

    The majority class is reduced (never enlarged) so that it makes up about
    ``target_majority_fraction`` of the returned rows. Rows of every other
    class are kept as they are.

    Args:
        X: Feature table (pandas DataFrame) whose rows line up with ``y``.
        y: Class labels (pandas Series).
        target_majority_fraction: Desired share of the majority class in the
            result; must be strictly between 0 and 1.
        random_state: Seed passed to ``resample`` for a reproducible draw.

    Returns:
        ``(X_resampled, y_resampled)``: the sampled majority rows followed by
        all remaining rows. Inputs with fewer than two classes are returned
        unchanged.

    Raises:
        ValueError: If ``target_majority_fraction`` is not in ``(0, 1)``.
    """
    if not 0 < target_majority_fraction < 1:
        raise ValueError(
            "target_majority_fraction must be strictly between 0 and 1, "
            f"got {target_majority_fraction!r}"
        )

    class_counts = Counter(y)
    if len(class_counts) < 2:
        # Empty input or a single class: there is nothing to balance against.
        return X, y

    # The class with the most occurrences is the one that gets reduced.
    majority_class = max(class_counts, key=class_counts.get)

    # Separate majority and non-majority rows with a single mask.
    is_majority = y == majority_class
    X_majority, y_majority = X[is_majority], y[is_majority]
    X_minority, y_minority = X[~is_majority], y[~is_majority]

    # Solve  majority / (majority + minority) == fraction  for the majority
    # count:  majority = minority * fraction / (1 - fraction).
    # (Dividing the minority count by (1 - fraction) alone would give the size
    # of the whole dataset, not of the majority class.)
    target_majority_size = round(
        len(y_minority) * target_majority_fraction
        / (1 - target_majority_fraction)
    )
    # This is undersampling only: never request more rows than are available.
    target_majority_size = min(target_majority_size, len(y_majority))

    # Draw the majority rows without replacement; X and y stay row-aligned
    # because they are resampled together.
    X_majority_resampled, y_majority_resampled = resample(
        X_majority,
        y_majority,
        replace=False,
        n_samples=target_majority_size,
        random_state=random_state,
    )

    # Combine the reduced majority class with the untouched other classes.
    X_resampled = pd.concat([X_majority_resampled, X_minority])
    y_resampled = pd.concat([y_majority_resampled, y_minority])

    return X_resampled, y_resampled


def balance_data(X, y, random_state=42):
    """
    Rebalance a dataset in three stages: undersample, SMOTE, SMOTE-ENN.

    ``random_state`` is forwarded to every stage. Returns the resampled
    ``(X, y)`` pair produced by the final SMOTE-ENN stage.
    """
    # Build both imbalanced-learn samplers up front with the shared seed.
    resampling_stages = (
        SMOTE(random_state=random_state),      # oversample the minority classes
        SMOTEENN(random_state=random_state),   # clean the oversampled data
    )

    # Stage 1: shrink the majority class.
    X_current, y_current = undersample_data(X, y, random_state=random_state)

    # Stages 2 and 3: feed the output of each sampler into the next one.
    for sampler in resampling_stages:
        X_current, y_current = sampler.fit_resample(X_current, y_current)

    return X_current, y_current


# Demo: rebalance the current training split and compare label counts.
# X_train and y_train are expected to be a pandas DataFrame / Series here.
counts_before = Counter(y_train)
print("Original class distribution:", counts_before)

X_balanced, y_balanced = balance_data(X_train, y_train)

counts_after = Counter(y_balanced)
print("Class distribution after undersampling and SMOTEENN:", counts_after)


# Scaling Numerical Columns

In [None]:
# Import the feature scalers
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# One scaler per strategy: standardisation and min-max scaling.
scaler_standard = StandardScaler()
scaler_minmax = MinMaxScaler()

# NOTE(review): the scalers are fitted on X_resampled (output of the chunked
# balancing cell); X_balanced from the later balancing cell is not used here,
# and the model cell shown below trains on X_train. Confirm which is intended.

# fit_transform learns the scaling statistics from X_resampled and applies
# them in one call; each scaler object stays fitted for later reuse.
X_standard = scaler_standard.fit_transform(X_resampled)
X_minmax = scaler_minmax.fit_transform(X_resampled)

# Model Selection & Training

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import xgboost as xgb
import lightgbm as lgb

# NOTE(review): every model below is trained on X_train / y_train as produced
# by the split cell, not on the balanced or scaled data. Confirm this is intended.

# --- Baseline: logistic regression ---
print("=== Baseline Model: Logistic Regression ===")
lr = LogisticRegression(random_state=42, max_iter=1000)
y_pred_baseline = lr.fit(X_train, y_train).predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred_baseline):.4f}")
print(classification_report(y_test, y_pred_baseline))

# --- Decision tree with default hyperparameters ---
print("\n=== Advanced Models: Decision Tree ===")
dt_model = DecisionTreeClassifier(random_state=42)
y_pred_dt = dt_model.fit(X_train, y_train).predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred_dt):.4f}")

# --- Random forest, tuned with a small 5-fold grid search ---
print("\n=== Advanced Models: Random Forest (Grid Search) ===")
rf_model = RandomForestClassifier(random_state=42)
param_grid_rf = {
    'n_estimators': [50, 100],
    'max_depth': [None, 10, 20],
}
grid_search_rf = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid_rf,
    scoring='accuracy',
    cv=5,
)
grid_search_rf.fit(X_train, y_train)
# The best estimator found by the search is the one scored on the test split.
best_rf = grid_search_rf.best_estimator_
y_pred_rf = best_rf.predict(X_test)
print(f"Best Params: {grid_search_rf.best_params_}")
print(f"Accuracy: {accuracy_score(y_test, y_pred_rf):.4f}")

# --- XGBoost ---
print("\n=== Advanced Models: XGBoost ===")
xgb_model = xgb.XGBClassifier(
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42,
)
y_pred_xgb = xgb_model.fit(X_train, y_train).predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred_xgb):.4f}")

# --- LightGBM ---
print("\n=== Advanced Models: LightGBM ===")
lgb_model = lgb.LGBMClassifier(random_state=42)
y_pred_lgb = lgb_model.fit(X_train, y_train).predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred_lgb):.4f}")

# --- 5-fold cross-validation of the logistic-regression baseline ---
# NOTE(review): this runs on the full X / y, i.e. it also includes the rows
# held out as X_test. Confirm whether X_train / y_train was intended.
print("\n=== Cross-Validation: Logistic Regression ===")
cv_scores = cross_val_score(lr, X, y, scoring='accuracy', cv=5)
print(f"Cross-validation Accuracy: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")

In [None]:

# Candidate classifiers keyed by the display name used in the printed reports.
models = {
    "Logistic Regression": LogisticRegression(random_state=42, max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(
        n_estimators=100,
        max_depth=None,
        random_state=42,
    ),
    "XGBoost": xgb.XGBClassifier(
        use_label_encoder=False,
        eval_metric='logloss',
        random_state=42,
    ),
    "LightGBM": lgb.LGBMClassifier(random_state=42),
}

# Pass 1: 5-fold cross-validated accuracy for every candidate.
# NOTE(review): this runs on the full X / y, which includes the rows held out
# as X_test. Confirm whether X_train / y_train was intended.
print("=== Cross-Validation Results ===")
for name in models:
    model = models[name]
    cv_scores = cross_val_score(model, X, y, scoring='accuracy', cv=5)
    print(f"{name}:")
    print(f"  Mean Accuracy: {cv_scores.mean():.4f}")
    print(f"  Std Deviation: {cv_scores.std():.4f}\n")

# Pass 2: fit each candidate on the training split and score it on the test split.
print("=== Test Set Results ===")
for name in models:
    model = models[name]
    y_pred = model.fit(X_train, y_train).predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{name}:")
    print(f"  Test Set Accuracy: {accuracy:.4f}")
    print(f"  Classification Report:\n{classification_report(y_test, y_pred)}")

#  5. Model Evaluation and Tuning