<a href="https://colab.research.google.com/github/EricSiq/Supervised-Machine-Learning-Lab/blob/main/SML_SupportVectorMachine.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [17]:
import pandas as pd  # For data manipulation and analysis (e.g., loading CSV files, DataFrame operations)
import numpy as np  # For numerical computations and array operations
from sklearn.svm import SVC  # To implement the Support Vector Machine algorithm for classification tasks
from sklearn.preprocessing import StandardScaler  # For scaling features to improve model performance
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix  # To evaluate model performance with various metrics


In [24]:
# Define Data Cleaning Functions
def clean_loan_id(df):
    """Return a copy of ``df`` with the leading 'LP' stripped from 'Loan_ID'.

    Only a prefix at the very start of each value is removed; an 'LP'
    appearing later in the identifier is left untouched. Missing values
    stay missing. The input frame is never modified, so the calling cell
    can be re-run safely.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that may contain a string-valued 'Loan_ID' column.

    Returns
    -------
    pandas.DataFrame
        A new frame; identical to ``df`` apart from the cleaned 'Loan_ID'
        column (unchanged content when that column is absent).
    """
    cleaned = df.copy()
    if 'Loan_ID' in cleaned.columns:
        # Anchor at the start so only a genuine prefix is stripped.
        cleaned['Loan_ID'] = cleaned['Loan_ID'].str.replace(r'^LP', '', regex=True)
    return cleaned

def handle_missing_values(df, categorical_cols):
    """Return a copy of ``df`` with missing values imputed.

    - Each column named in ``categorical_cols`` (and present in ``df``) is
      filled with its mode (most frequent value).
    - Every numeric column is then filled with its median.

    A categorical column that contains no observed values has no mode and
    is left as-is instead of raising. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute.
    categorical_cols : iterable of str
        Names of columns to treat as categorical; names that are not in
        ``df`` are ignored.

    Returns
    -------
    pandas.DataFrame
        A new frame with the imputed values.
    """
    filled = df.copy()

    # Fill missing categorical values with the most frequent level
    for col in categorical_cols:
        if col not in filled.columns:
            continue
        modes = filled[col].mode()
        # mode() is empty when every value is missing; nothing to fill with
        if not modes.empty:
            filled[col] = filled[col].fillna(modes.iloc[0])

    # Fill missing values for all numeric columns (any int/float width)
    num_cols = filled.select_dtypes(include='number').columns
    for col in num_cols:
        filled[col] = filled[col].fillna(filled[col].median())

    return filled

In [25]:
# Categorical feature names, shared by the imputation and one-hot encoding steps
categorical_cols = [
    'Gender',
    'Married',
    'Dependents',
    'Self_Employed',
    'Education',
    'Property_Area',
]

# Read the training CSV, strip the Loan_ID prefix, then impute missing values
train = handle_missing_values(
    clean_loan_id(pd.read_csv('train.csv')),
    categorical_cols,
)

# Encode the target as integers: 'Y' -> 1, any other value -> 0
train['Loan_Status'] = (train['Loan_Status'] == 'Y').astype('int64')

In [26]:

# Preprocess the Training Data
# Expand each categorical column into indicator columns (first level dropped)
train_encoded = pd.get_dummies(train, columns=categorical_cols, drop_first=True)

# Target vector and feature matrix
# NOTE(review): only 'Loan_Status' is removed here, so any 'Loan_ID' column
# remains among the features — confirm that is intended.
y_train = train_encoded['Loan_Status']
X_train = train_encoded.drop(columns=['Loan_Status'])

# Standardize the features; the fitted scaler is kept for later reuse
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)

# Fit an RBF-kernel SVM on the scaled features
svm_model = SVC(kernel='rbf', random_state=42).fit(X_train_scaled, y_train)

# Report metrics on the training data itself (for reference)
train_preds = svm_model.predict(X_train_scaled)
training_reports = (
    ("Training Accuracy:", accuracy_score),
    ("Training Classification Report:\n", classification_report),
    ("Training Confusion Matrix:\n", confusion_matrix),
)
for heading, metric in training_reports:
    print(heading, metric(y_train, train_preds))

Training Accuracy: 0.8273615635179153
Training Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.47      0.63       192
           1       0.81      0.99      0.89       422

    accuracy                           0.83       614
   macro avg       0.88      0.73      0.76       614
weighted avg       0.85      0.83      0.81       614

Training Confusion Matrix:
 [[ 91 101]
 [  5 417]]


In [27]:

# Load and Preprocess the Test Data
# Apply the same cleaning steps that were applied to the training set.
test = pd.read_csv('test.csv')
test = clean_loan_id(test)
test = handle_missing_values(test, categorical_cols)

# The test file may or may not carry labels; encode them as 1/0 when present.
has_target = 'Loan_Status' in test.columns
if has_target:
    test['Loan_Status'] = test['Loan_Status'].apply(lambda x: 1 if x == 'Y' else 0)

# One-hot encode WITHOUT drop_first. With drop_first=True pandas drops the
# first level that happens to occur in *this* frame; if the test set lacks the
# training baseline level, a different level is dropped and then zero-filled
# by reindex below, silently encoding those rows as the wrong category.
# Keeping every level here and selecting the training columns afterwards
# reproduces the training encoding exactly.
test_encoded = pd.get_dummies(test, columns=categorical_cols, drop_first=False)

# Align with the training feature matrix: remove the target, keep only the
# columns the model was trained on (in the same order), and add any indicator
# column missing from the test set as all zeros.
X_test = test_encoded.drop(columns=['Loan_Status'], errors='ignore')
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

# Scale test data using the scaler fitted on the training data
X_test_scaled = scaler.transform(X_test)

In [28]:
#  Make Predictions on the Test Data
test_preds = svm_model.predict(X_test_scaled)

# Attach the predictions before branching so the CSV written below always
# contains them, whether or not the test set carries true labels.
test['Predicted_Loan_Status'] = np.where(test_preds == 1, 'Y', 'N')

if has_target:
    # Labels are available: report the same metrics as for the training set
    y_test = test_encoded['Loan_Status']
    print("Test Accuracy:", accuracy_score(y_test, test_preds))
    print("Test Classification Report:\n", classification_report(y_test, test_preds))
    print("Test Confusion Matrix:\n", confusion_matrix(y_test, test_preds))
else:
    # If no target column is present, show a preview of the predictions
    print("Predictions on test data:")
    print(test[['Loan_ID', 'Predicted_Loan_Status']].head())

# Save the cleaned test frame together with its predictions
test.to_csv('test_predictions.csv', index=False)


Predictions on test data:
  Loan_ID Predicted_Loan_Status
0  001015                     Y
1  001022                     Y
2  001031                     Y
3  001035                     Y
4  001051                     Y
