In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.metrics import roc_curve, auc
import joblib # Import joblib for saving the model


# Import the project helper that loads the training file into a preprocessed dataset
from data_reprocessing import train_to_df

# Name of the training file handed to the train_to_df loader
train_file = 'new_train.csv'
dataset = train_to_df(train_file)

# Preview the leading rows as a quick sanity check on what was loaded
preview_rows = dataset.head()
display(preview_rows)


In [None]:
# Separate the label column from the predictor columns
target_column = 'is_attributed'
y = dataset[target_column]  # Target variable
X = dataset.drop(columns=[target_column])  # Features

# Hold out 20% of the rows for validation. Stratifying on y keeps the class
# proportions matched across both partitions; the fixed seed makes the split repeatable.
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
X_train, X_val, y_train, y_val = split_parts

# Report the size of each partition
print("Data split into training and validation sets.")
print(f"Training set shape: {X_train.shape}")
print(f"Validation set shape: {X_val.shape}")


In [None]:
# Build the SVM classifier as a scaling + SVC pipeline.
#
# An RBF kernel is distance-based and SVMs are not scale invariant, so columns with
# a large numeric range would otherwise dominate the kernel (and the value picked by
# gamma='scale'). Standardizing inside a pipeline means the scaling statistics are
# learned from X_train only, and the same transform is re-applied automatically by
# predict / predict_proba and travels with the model when it is saved.
#
# NOTE(review): if train_to_df already standardizes the features, the scaler has
# little effect — confirm against the preprocessing module.
# NOTE(review): svm_model is now a Pipeline; the underlying SVC is svm_model[-1].
svm_model = make_pipeline(
    StandardScaler(),
    SVC(
        C=1.0,  # Regularization parameter
        kernel='rbf',  # Radial Basis Function kernel
        gamma='scale',  # Kernel coefficient derived from the (scaled) training data
        probability=True,  # Enables predict_proba, used for the ROC AUC calculation
        random_state=42,
    ),
)

print("Starting model training...")

# Fit the scaler and the SVM on the training partition only
svm_model.fit(X_train, y_train)

print("Model training completed.")


In [None]:
# Hard class labels for the validation rows
y_pred = svm_model.predict(X_val)

# Column 1 of the predict_proba output is used as the score for ROC AUC
val_probabilities = svm_model.predict_proba(X_val)
y_pred_proba = val_probabilities[:, 1]

# Label-based metrics: confusion matrix, then the per-class report
print("\nConfusion Matrix:")
val_confusion = confusion_matrix(y_val, y_pred)
print(val_confusion)

print("\nClassification Report:")
val_report = classification_report(y_val, y_pred)
print(val_report)

# Score-based metric: area under the ROC curve
roc_auc = roc_auc_score(y_val, y_pred_proba)
print(f"\nROC AUC Score: {roc_auc:.2f}")


In [None]:
# Persist the fitted model to disk with joblib so it can be reloaded later
model_path = 'svm_model.joblib'
joblib.dump(svm_model, model_path)
print("Trained model saved as 'svm_model.joblib'.")

In [None]:
# ROC curve for the validation set, built from the scores in y_pred_proba
fpr, tpr, thresholds = roc_curve(y_val, y_pred_proba)  # False/true positive rate per threshold
roc_auc = auc(fpr, tpr)  # Area under the (fpr, tpr) curve

# Draw on an explicit figure/axes pair rather than the implicit pyplot state,
# so this cell cannot accidentally draw onto a figure left over from another cell.
fig, ax = plt.subplots()
ax.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
# Diagonal reference: the curve a classifier with no discriminative power would trace
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--', label='Chance')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic (ROC) for SVM')
ax.legend(loc='lower right')
plt.show()
