In [None]:
# Import required libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score , roc_curve, auc
import matplotlib.pyplot as plt
import joblib


# Read the training CSV through the project's train_to_df helper
from data_reprocessing import train_to_df
train_file = 'new_train.csv'
dataset = train_to_df(train_file)

# Preview the top rows as a quick sanity check on the loaded data
display(dataset.head())

# Separate the label column from the predictors
y = dataset['is_attributed']  # Target variable
X = dataset.drop(columns='is_attributed')  # Features: every remaining column


In [None]:
# Outer split: hold out 20% of the rows for validation. Stratifying on y and
# fixing the seed makes the split class-balanced and repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Report the split and the resulting shapes (one line per message)
print(
    "Data split into 80% training and 20% validation.",
    f"Training set shape: {X_train.shape}",
    f"Validation set shape: {X_val.shape}",
    sep="\n",
)


In [None]:
# Fold count, chosen from the ROC cross-validation results on the 80% training data
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# SVM hyperparameters, gathered in one place before the classifier is built
svm_params = {
    "C": 1.0,  # Regularization parameter
    "kernel": "rbf",  # Radial Basis Function kernel
    "gamma": "scale",  # Kernel coefficient
    "probability": True,  # Enables probability estimates for ROC AUC calculation
    "random_state": 42,
}
svm_model = SVC(**svm_params)

print("SVM model initialized.")


In [None]:
# Accumulator for the per-fold AUC-ROC values produced by the cross-validation loop
roc_auc_scores = list()

# Confirm the fold configuration that the loop will use
print(f"Stratified K-Fold Cross-Validation set up with {n_splits} folds.")


In [None]:
# Start cross-validation
print("Starting Stratified K-Fold Cross-Validation...")
for fold, (train_index, val_index) in enumerate(skf.split(X, y), 1):
    # Split the data into training and validation sets for the current fold
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]

    # Train the SVM model
    svm_model.fit(X_train, y_train)

    # Make predictions on the validation set
    y_pred = svm_model.predict(X_val)
    y_pred_proba = svm_model.predict_proba(X_val)[:, 1]  # Probability estimates for ROC AUC

    # Calculate ROC-AUC for the current fold
    roc_auc = roc_auc_score(y_val, y_pred_proba)
    roc_auc_scores.append(roc_auc)

    # Print performance for the current fold
    print(f"\nFold {fold} - ROC AUC Score: {roc_auc:.2f}")
    print("Confusion Matrix:")
    print(confusion_matrix(y_val, y_pred))
    print("\nClassification Report:")
    print(classification_report(y_val, y_pred))

print("\nCross-Validation completed.")


In [None]:
# Calculate the average ROC-AUC across folds
mean_roc_auc = np.mean(roc_auc_scores)
std_roc_auc = np.std(roc_auc_scores)

print(f"\nAverage ROC AUC Score across folds: {mean_roc_auc:.2f} ± {std_roc_auc:.2f}")


In [None]:
# Save the final model after cross-validation (trained on the last fold)
joblib.dump(svm_model, 'svm_model.joblib')
print("Trained model saved as 'svm_model.joblib'.")


In [None]:
# Calculate ROC curve for the last fold
fpr, tpr, thresholds = roc_curve(y_val, y_pred_proba)
roc_auc = auc(fpr, tpr)

# Plot the ROC curve
plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) for SVM (Last Fold)')
plt.legend(loc='lower right')
plt.show()
