# Step 1: Install Libraries

In [None]:
!pip install matplotlib seaborn



# Step 2: Import Libraries

In [None]:
import math

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    mean_squared_error,
    r2_score,
)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# Step 3: Load the Dataset into Pandas DataFrames

In [None]:
# Raw-CSV locations of the train and test splits in the project's GitHub repository.
train_set_url = "https://raw.githubusercontent.com/AsadiAhmad/Loan-Prediction-SVM/refs/heads/main/Dataset/train.csv"
test_set_url = "https://raw.githubusercontent.com/AsadiAhmad/Loan-Prediction-SVM/refs/heads/main/Dataset/test.csv"

# Lift pandas' row limit so displayed frames are never truncated by row count.
pd.set_option('display.max_rows', None)

# Download both splits into DataFrames (needs network access).
train_set = pd.read_csv(train_set_url)
test_set = pd.read_csv(test_set_url)

In [None]:
train_set.head(10)

Unnamed: 0,Id,Income,Age,Experience,Married/Single,House_Ownership,Car_Ownership,Profession,CITY,STATE,CURRENT_JOB_YRS,CURRENT_HOUSE_YRS,Risk_Flag
0,75256,4260981,47,18,single,rented,no,Politician,Hindupur,Andhra_Pradesh,7,13,0
1,192436,8529345,44,2,single,rented,no,Computer_hardware_engineer,Narasaraopet,Andhra_Pradesh,2,10,0
2,154840,7848654,55,9,single,rented,no,Software_Developer,Patna,Bihar,9,13,0
3,59775,8491491,61,20,single,rented,no,Comedian,Haldia,West_Bengal,8,11,0
4,63789,1537266,78,13,single,rented,no,Web_designer,Coimbatore,Tamil_Nadu,9,12,0
5,239303,4716686,74,11,single,norent_noown,no,Analyst,Firozabad,Uttar_Pradesh,9,10,0
6,235589,8631544,69,13,married,rented,no,Graphic_Designer,Solapur,Maharashtra,13,12,0
7,102872,6947233,62,10,single,rented,no,Technical_writer,Bidar,Karnataka,10,12,0
8,235463,3359719,49,13,single,rented,no,Fashion_Designer,Korba,Chhattisgarh,13,14,0
9,170788,961937,57,13,single,rented,no,Graphic_Designer,Avadi,Tamil_Nadu,5,10,0


# Step 4: Preprocessing

## Transform discrete columns to numeric data

In [None]:
def transform_binary_columns(dataset):
    """Encode the three low-cardinality text columns as numbers.

    'Married/Single', 'House_Ownership' and 'Car_Ownership' are replaced by
    numeric codes (see the mapping dicts below). A value that is not a key of
    the corresponding mapping becomes NaN, which is how ``Series.map`` treats
    unmapped values.

    Args:
        dataset: DataFrame containing the three columns above.

    Returns:
        A new DataFrame with the three columns encoded. ``dataset`` itself is
        left untouched, so calling this function again on the same raw frame
        (e.g. when re-running the cell) gives the same result instead of
        mapping already-encoded numbers to NaN.
    """
    married_mapping = {'married': 1, 'single': 0}
    house_mapping = {'owned': 1, 'rented': 0.5, 'norent_noown': 0}
    car_mapping = {'yes': 1, 'no': 0}

    # Work on a copy: mutating the caller's frame made this function non-idempotent.
    transformed = dataset.copy()
    transformed['Married/Single'] = transformed['Married/Single'].map(married_mapping)
    transformed['House_Ownership'] = transformed['House_Ownership'].map(house_mapping)
    transformed['Car_Ownership'] = transformed['Car_Ownership'].map(car_mapping)

    return transformed

In [None]:
# Apply the same text-to-number encoding to both splits.
train_set_transformed = transform_binary_columns(train_set)
test_set_transformed = transform_binary_columns(test_set)

In [None]:
train_set_transformed.head(10)

Unnamed: 0,Id,Income,Age,Experience,Married/Single,House_Ownership,Car_Ownership,Profession,CITY,STATE,CURRENT_JOB_YRS,CURRENT_HOUSE_YRS,Risk_Flag
0,75256,4260981,47,18,0,0.5,0,Politician,Hindupur,Andhra_Pradesh,7,13,0
1,192436,8529345,44,2,0,0.5,0,Computer_hardware_engineer,Narasaraopet,Andhra_Pradesh,2,10,0
2,154840,7848654,55,9,0,0.5,0,Software_Developer,Patna,Bihar,9,13,0
3,59775,8491491,61,20,0,0.5,0,Comedian,Haldia,West_Bengal,8,11,0
4,63789,1537266,78,13,0,0.5,0,Web_designer,Coimbatore,Tamil_Nadu,9,12,0
5,239303,4716686,74,11,0,0.0,0,Analyst,Firozabad,Uttar_Pradesh,9,10,0
6,235589,8631544,69,13,1,0.5,0,Graphic_Designer,Solapur,Maharashtra,13,12,0
7,102872,6947233,62,10,0,0.5,0,Technical_writer,Bidar,Karnataka,10,12,0
8,235463,3359719,49,13,0,0.5,0,Fashion_Designer,Korba,Chhattisgarh,13,14,0
9,170788,961937,57,13,0,0.5,0,Graphic_Designer,Avadi,Tamil_Nadu,5,10,0


## One-Hot Encoding

In [None]:
def one_hot_encoding(dataset, columns):
    """One-hot encode the given columns of a DataFrame.

    For every distinct value ``v`` found in column ``c`` a 0/1 integer column
    named ``"c-v"`` is created. The encoded source columns are removed and the
    indicator columns are appended after the remaining ones, in order of first
    appearance of each value.

    Categories are discovered from ``dataset`` itself, so two frames encoded
    separately only share the indicator columns for values present in both.

    Args:
        dataset: DataFrame to encode. It is not modified.
        columns: List of column names to encode.

    Returns:
        A new DataFrame without ``columns`` and with the indicator columns added.
    """
    # Collect every indicator first; a dict keeps the original "last write wins"
    # behaviour if two values happen to format to the same column name.
    indicator_columns = {}
    for col in columns:
        for value in dataset[col].unique():
            indicator_columns[f"{col}-{value}"] = (dataset[col] == value).astype(int)

    # Attach all indicators with a single concat. Inserting hundreds of columns
    # one at a time fragments the frame (pandas warns about it on every insert)
    # and also mutated the caller's DataFrame.
    pieces = [dataset.drop(columns, axis=1)]
    pieces.extend(series.rename(name) for name, series in indicator_columns.items())
    return pd.concat(pieces, axis=1)

In [None]:
# One-hot encode the three high-cardinality text columns of each split.
# NOTE(review): each split is encoded from its own values, so the test frame can lack
# (or add) indicator columns relative to the train frame.
train_set_one_hut = one_hot_encoding(train_set_transformed, ['Profession', 'CITY', 'STATE'])
test_set_one_hut = one_hot_encoding(test_set_transformed, ['Profession', 'CITY', 'STATE'])

  dataset[new_col_name] = (dataset[col] == value).astype(int)
  dataset[new_col_name] = (dataset[col] == value).astype(int)
  dataset[new_col_name] = (dataset[col] == value).astype(int)
  dataset[new_col_name] = (dataset[col] == value).astype(int)
  dataset[new_col_name] = (dataset[col] == value).astype(int)
  dataset[new_col_name] = (dataset[col] == value).astype(int)
  dataset[new_col_name] = (dataset[col] == value).astype(int)
  dataset[new_col_name] = (dataset[col] == value).astype(int)
  dataset[new_col_name] = (dataset[col] == value).astype(int)
  dataset[new_col_name] = (dataset[col] == value).astype(int)
  dataset[new_col_name] = (dataset[col] == value).astype(int)
  dataset[new_col_name] = (dataset[col] == value).astype(int)
  dataset[new_col_name] = (dataset[col] == value).astype(int)
  dataset[new_col_name] = (dataset[col] == value).astype(int)
  dataset[new_col_name] = (dataset[col] == value).astype(int)
  dataset[new_col_name] = (dataset[col] == value).astype(int)
  datase

In [None]:
train_set_one_hut.head(10)

Unnamed: 0,Id,Income,Age,Experience,Married/Single,House_Ownership,Car_Ownership,CURRENT_JOB_YRS,CURRENT_HOUSE_YRS,Risk_Flag,...,STATE-Delhi,STATE-Tripura,STATE-Jammu_and_Kashmir,STATE-Manipur,STATE-Uttarakhand,STATE-Uttar_Pradesh[5],STATE-Puducherry,STATE-Himachal_Pradesh,STATE-Chandigarh,STATE-Sikkim
0,75256,4260981,47,18,0,0.5,0,7,13,0,...,0,0,0,0,0,0,0,0,0,0
1,192436,8529345,44,2,0,0.5,0,2,10,0,...,0,0,0,0,0,0,0,0,0,0
2,154840,7848654,55,9,0,0.5,0,9,13,0,...,0,0,0,0,0,0,0,0,0,0
3,59775,8491491,61,20,0,0.5,0,8,11,0,...,0,0,0,0,0,0,0,0,0,0
4,63789,1537266,78,13,0,0.5,0,9,12,0,...,0,0,0,0,0,0,0,0,0,0
5,239303,4716686,74,11,0,0.0,0,9,10,0,...,0,0,0,0,0,0,0,0,0,0
6,235589,8631544,69,13,1,0.5,0,13,12,0,...,0,0,0,0,0,0,0,0,0,0
7,102872,6947233,62,10,0,0.5,0,10,12,0,...,0,0,0,0,0,0,0,0,0,0
8,235463,3359719,49,13,0,0.5,0,13,14,0,...,0,0,0,0,0,0,0,0,0,0
9,170788,961937,57,13,0,0.5,0,5,10,0,...,0,0,0,0,0,0,0,0,0,0


## Min-Max Normalization

In [None]:
def min_max_normalize(train_set, test_set, columns):
    """Min-max scale ``columns`` using statistics of the training set only.

    Each listed column is transformed as ``(x - min) / (max - min)`` where
    ``min`` and ``max`` come from ``train_set``. The same statistics are applied
    to ``test_set``, so test values outside the training range fall outside
    [0, 1].

    A column that is constant in ``train_set`` (``max == min``) is scaled with a
    range of 1 instead of 0, so it becomes 0 in the training set rather than
    NaN/inf from a division by zero.

    Args:
        train_set: DataFrame the min/max statistics are computed from.
        test_set: DataFrame scaled with the training statistics.
        columns: List of numeric column names to scale.

    Returns:
        Tuple ``(train_set_normalized, test_set_normalized)`` of new DataFrames;
        the inputs are not modified.
    """
    min_values = train_set[columns].min()
    value_range = train_set[columns].max() - min_values
    # Guard against division by zero for columns that never vary in the training data.
    value_range = value_range.where(value_range != 0, 1)

    train_set_normalized = train_set.copy()
    train_set_normalized[columns] = (train_set[columns] - min_values) / value_range

    test_set_normalized = test_set.copy()
    test_set_normalized[columns] = (test_set[columns] - min_values) / value_range

    return train_set_normalized, test_set_normalized

In [None]:
# Only these numeric columns are rescaled; every other column is passed through unchanged.
columns_to_normalize = ["Income", "Age", "Experience", "CURRENT_JOB_YRS", "CURRENT_HOUSE_YRS"]
train_set_normal, test_set_normal = min_max_normalize(train_set_one_hut, test_set_one_hut, columns_to_normalize)

In [None]:
train_set_normal.head(10)

Unnamed: 0,Id,Income,Age,Experience,Married/Single,House_Ownership,Car_Ownership,CURRENT_JOB_YRS,CURRENT_HOUSE_YRS,Risk_Flag,...,STATE-Delhi,STATE-Tripura,STATE-Jammu_and_Kashmir,STATE-Manipur,STATE-Uttarakhand,STATE-Uttar_Pradesh[5],STATE-Puducherry,STATE-Himachal_Pradesh,STATE-Chandigarh,STATE-Sikkim
0,75256,0.425508,0.448276,0.9,0,0.5,0,0.5,0.75,0,...,0,0,0,0,0,0,0,0,0,0
1,192436,0.852788,0.396552,0.1,0,0.5,0,0.142857,0.0,0,...,0,0,0,0,0,0,0,0,0,0
2,154840,0.784648,0.586207,0.45,0,0.5,0,0.642857,0.75,0,...,0,0,0,0,0,0,0,0,0,0
3,59775,0.848999,0.689655,1.0,0,0.5,0,0.571429,0.25,0,...,0,0,0,0,0,0,0,0,0,0
4,63789,0.152854,0.982759,0.65,0,0.5,0,0.642857,0.5,0,...,0,0,0,0,0,0,0,0,0,0
5,239303,0.471126,0.913793,0.55,0,0.0,0,0.642857,0.0,0,...,0,0,0,0,0,0,0,0,0,0
6,235589,0.863019,0.827586,0.65,1,0.5,0,0.928571,0.5,0,...,0,0,0,0,0,0,0,0,0,0
7,102872,0.694413,0.706897,0.5,0,0.5,0,0.714286,0.5,0,...,0,0,0,0,0,0,0,0,0,0
8,235463,0.335289,0.482759,0.65,0,0.5,0,0.928571,1.0,0,...,0,0,0,0,0,0,0,0,0,0
9,170788,0.095262,0.62069,0.65,0,0.5,0,0.357143,0.0,0,...,0,0,0,0,0,0,0,0,0,0


## Remove unrelated features

In [None]:
# Drop the "Id" column (presumably a row identifier with no predictive meaning — confirm).
# NOTE: this rebinds train_set / test_set, which until here referred to the frames loaded in Step 3.
train_set = train_set_normal.drop(["Id"], axis=1)
test_set = test_set_normal.drop(["Id"], axis=1)

## Move Target col to the End

In [None]:
# Reorder columns so "Risk_Flag" is last; the correlation and feature-selection cells
# below treat the last column as the target and everything before it as features.
train_set = train_set[[col for col in train_set.columns if col != "Risk_Flag"] + ["Risk_Flag"]]

# Step 5: Calculate Pearson Correlation

In [None]:
def calculate_pearson_correlation(feature_list, target_list):
    """Compute the Pearson correlation coefficient of two equal-length sequences.

    r = sum((x - mean_x) * (y - mean_y))
        / sqrt(sum((x - mean_x)^2) * sum((y - mean_y)^2))

    Args:
        feature_list: Sequence of numbers (the feature values).
        target_list: Sequence of numbers (the target values), same length.

    Returns:
        The correlation as a float in [-1, 1] (up to rounding). ``nan`` is
        returned when the coefficient is undefined: empty input, or either
        sequence has zero variance (e.g. a constant column).

    Raises:
        ValueError: If the two sequences differ in length.
    """
    if len(feature_list) != len(target_list):
        raise ValueError("feature_list and target_list must have the same length")
    if len(feature_list) == 0:
        return math.nan

    feature_mean = sum(feature_list) / len(feature_list)
    target_mean = sum(target_list) / len(target_list)

    # Deviations of every observation from its own mean.
    feature_diff = [item - feature_mean for item in feature_list]
    target_diff = [item - target_mean for item in target_list]

    sum_multiply_diff = sum(f * t for f, t in zip(feature_diff, target_diff))
    sum_square_feature_diff = sum(f * f for f in feature_diff)
    sum_square_target_diff = sum(t * t for t in target_diff)

    denominator = math.sqrt(sum_square_feature_diff * sum_square_target_diff)
    # A constant sequence has zero variance; the correlation is undefined rather
    # than a ZeroDivisionError.
    if denominator == 0:
        return math.nan
    return sum_multiply_diff / denominator

In [None]:
# Column-wise Python lists; Step 4 moved Risk_Flag to the last column, so all_cols[-1] is the target.
all_cols = [train_set[col].tolist() for col in train_set.columns]
pearson_correlation = []
# Correlate every feature column (all but the last) with the target.
# NOTE(review): this runs on the full training frame, before the train/validation split in Step 7,
# so validation rows also influence which features get selected.
for anyList in all_cols[0:-1]:
    pearson_correlation.append(calculate_pearson_correlation(anyList, all_cols[-1]))

# Step 6: Feature Selection

In [None]:
# Minimum absolute Pearson correlation with Risk_Flag for a feature to be kept.
CORRELATION_THRESHOLD = 0.02

# Feature names in the same order as pearson_correlation (every column except the trailing target).
cols = list(train_set.columns)[0:-1]
selected_features = [
    name
    for name, correlation in zip(cols, pearson_correlation)
    if abs(correlation) > CORRELATION_THRESHOLD
]

In [None]:
selected_features

['Age',
 'Experience',
 'Married/Single',
 'Car_Ownership',
 'CITY-Bhubaneswar',
 'CITY-Raiganj',
 'CITY-Barasat',
 'CITY-Satna',
 'CITY-Kochi',
 'CITY-Gwalior',
 'STATE-Kerala',
 'STATE-Madhya_Pradesh']

# Step 7: Split Dataset into Train and validation

In [None]:
# Hold out 20% of the rows for validation; random_state fixes the shuffle for reproducibility.
# NOTE(review): train_set is rebound to the 80% part, so re-running this cell without
# re-running the cells above splits the already-reduced training set again.
train_set, validation_set = train_test_split(train_set, test_size=0.2, random_state=42)

# Step 8: Train SVM model with Selected Features

In [None]:
# Restrict both splits to the features chosen in Step 6; Risk_Flag is the label.
features_train = train_set[selected_features]
target_train = train_set['Risk_Flag']

features_validation = validation_set[selected_features]
target_validation = validation_set['Risk_Flag']

## Linear Kernel

In [None]:
# Fit an SVC with a linear kernel; only the kernel is set, every other hyperparameter keeps its default.
model_linear = SVC(kernel='linear')
model_linear.fit(features_train, target_train)

In [None]:
# Predicted Risk_Flag for the validation rows (linear kernel).
linear_predicted = model_linear.predict(features_validation)

## Poly Kernel

In [None]:
# Fit an SVC with a polynomial kernel; only the kernel is set, every other hyperparameter keeps its default.
model_poly = SVC(kernel='poly')
model_poly.fit(features_train, target_train)

In [None]:
# Predicted Risk_Flag for the validation rows (polynomial kernel).
poly_predicted = model_poly.predict(features_validation)

## RBF Kernel

In [None]:
# Fit an SVC with an RBF kernel; only the kernel is set, every other hyperparameter keeps its default.
model_rbf = SVC(kernel='rbf')
model_rbf.fit(features_train, target_train)

In [None]:
# Predicted Risk_Flag for the validation rows (RBF kernel).
rbf_predicted = model_rbf.predict(features_validation)

## Sigmoid Kernel

In [None]:
# Fit an SVC with a sigmoid kernel; only the kernel is set, every other hyperparameter keeps its default.
model_sigmoid = SVC(kernel='sigmoid')
model_sigmoid.fit(features_train, target_train)

In [None]:
# Predicted Risk_Flag for the validation rows (sigmoid kernel).
sigmoid_predicted = model_sigmoid.predict(features_validation)

# Step 11: Calculate measures for each kernel

In [None]:
# Validation predictions produced by each kernel in Step 8.
predictions_by_kernel = {
    'linear': linear_predicted,
    'poly': poly_predicted,
    'rbf': rbf_predicted,
    'sigmoid': sigmoid_predicted,
}

# One row per kernel, one column per measure, all computed against the validation labels.
kernel_measures = pd.DataFrame.from_dict(
    {
        kernel: {
            'mse': mean_squared_error(target_validation, predicted),
            'r2': r2_score(target_validation, predicted),
            'accuracy': accuracy_score(target_validation, predicted),
        }
        for kernel, predicted in predictions_by_kernel.items()
    },
    orient='index',
)

# The cells below display `mse`, `r2` and `accuracy`; they report the kernel with the
# highest validation accuracy (first one wins on ties).
best_kernel = kernel_measures['accuracy'].idxmax()
target_validation_predicted = predictions_by_kernel[best_kernel]
mse = kernel_measures.loc[best_kernel, 'mse']
r2 = kernel_measures.loc[best_kernel, 'r2']
accuracy = kernel_measures.loc[best_kernel, 'accuracy']

kernel_measures

In [None]:
mse

0.1234375

In [None]:
r2

-0.14081996434937638

In [None]:
accuracy

0.8765625

# Step 12: Predict the Target for the Test Set

In [None]:
# Fitted models from Step 8, paired with their validation predictions.
fitted_models = {
    'linear': (model_linear, linear_predicted),
    'poly': (model_poly, poly_predicted),
    'rbf': (model_rbf, rbf_predicted),
    'sigmoid': (model_sigmoid, sigmoid_predicted),
}

# Predict with the kernel that scored the highest validation accuracy (first one wins on ties).
best_kernel = max(
    fitted_models,
    key=lambda kernel: accuracy_score(target_validation, fitted_models[kernel][1]),
)
final_model = fitted_models[best_kernel][0]

# The test frame was one-hot encoded from its own values, so an indicator column selected
# on the training data can be missing here; a missing indicator means "never that value",
# i.e. all zeros. NOTE(review): this also zero-fills a missing non-indicator feature —
# the numeric features are expected to be present in the test CSV; confirm.
features_test = test_set.reindex(columns=selected_features, fill_value=0)
test_set['Risk_Flag_Predicted'] = final_model.predict(features_test)

In [None]:
test_set.head(10)

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseValPredicted
0,2.6667,52.0,5.652174,1.038647,604.0,2.917874,37.34,-121.87,2.000043
1,6.6345,52.0,5.603093,0.85567,650.0,3.350515,37.34,-121.87,3.47001
2,2.4306,39.0,4.899209,1.06917,1990.0,3.932806,37.34,-121.87,1.868526
3,2.6312,52.0,3.819383,1.022026,1264.0,5.568282,37.34,-121.88,1.971099
4,2.9222,27.0,3.664921,1.084817,3019.0,3.161257,37.35,-121.87,2.013218
5,2.4968,46.0,4.374622,0.996979,1094.0,3.305136,37.35,-121.86,1.911364
6,2.3277,52.0,3.587558,0.976959,1580.0,3.640553,37.35,-121.87,1.838395
7,2.5947,29.0,3.217816,0.991458,6234.0,3.803539,37.34,-121.86,1.789614
8,3.0089,40.0,5.246544,1.170507,1718.0,3.958525,37.34,-121.86,2.086455
9,2.375,41.0,2.146739,0.891304,549.0,2.983696,37.33,-121.88,1.903743


# Step 13: Export the Predictions to Excel

In [None]:
# Write the test frame, including the prediction column, to an Excel workbook without the index.
# NOTE(review): the file name mentions mutual information, which this notebook does not compute —
# consider renaming. Writing .xlsx presumably needs an Excel engine such as openpyxl installed — verify.
test_set.to_excel('MutualInformationResult.xlsx', index=False)