# Name, etc

# Introduction

In [14]:
# Import libraries needed

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [2]:
# Load the bank records dataset.
# The absolute path below only exists on the original author's machine, so it is
# tried first (unchanged behaviour there) and a copy of the CSV in the current
# working directory is used as a portable fallback everywhere else.
DATA_PATH_CANDIDATES = [
    "C:/Users/henri/Documents/GitHub/CA-Machine-Learning/BankRecords.csv",
    "BankRecords.csv",
]

for data_path in DATA_PATH_CANDIDATES:
    try:
        data = pd.read_csv(data_path)
        break
    except FileNotFoundError:
        # Try the next candidate location.
        continue
else:
    # No candidate could be opened: fail loudly with every location that was tried.
    raise FileNotFoundError(
        f"BankRecords.csv not found in any of: {DATA_PATH_CANDIDATES}"
    )

# Preview the first rows as the cell output.
data.head()

Unnamed: 0,ID,Age,Experience(Years),Income(Thousands's),Sort Code,Family,Credit Score,Education,Mortgage(Thousands's),Personal Loan,Securities Account,CD Account,Online Banking,CreditCard
0,1,25,1,49,91107,4,1.6,Diploma,0,No,Yes,No,No,No
1,2,45,19,34,90089,3,1.5,Diploma,0,No,Yes,No,No,No
2,3,39,15,11,94720,1,1.0,Diploma,0,No,No,No,No,No
3,4,35,9,100,94112,1,2.7,Degree,0,No,No,No,No,No
4,5,35,8,45,91330,4,1.0,Degree,0,No,No,No,No,Yes


In [3]:
# Per-column count of missing entries
missing_values = data.isna().sum()
print(missing_values)

# Count the rows flagged by DataFrame.duplicated() (full-row repeats)
duplicates = data.duplicated().sum()
print(f"Number of duplicate rows: {duplicates}")

ID                       0
Age                      0
Experience(Years)        0
Income(Thousands's)      0
Sort Code                0
Family                   0
Credit Score             0
Education                0
Mortgage(Thousands's)    0
Personal Loan            0
Securities Account       0
CD Account               0
Online Banking           0
CreditCard               0
dtype: int64
Number of duplicate rows: 0


In [4]:
# Create correlation matrix
# The frame still contains text columns at this point (e.g. Education and the
# Yes/No flags are only encoded in a later cell). pandas >= 2.0 no longer drops
# such columns silently and DataFrame.corr() raises on them, so restrict the
# input to numeric columns explicitly; this works on every pandas version.
numeric_data = data.select_dtypes(include="number")
matrix = numeric_data.corr()
print(matrix)

                             ID       Age  Experience(Years)  \
ID                     1.000000 -0.008473          -0.008326   
Age                   -0.008473  1.000000           0.994215   
Experience(Years)     -0.008326  0.994215           1.000000   
Income(Thousands's)   -0.017695 -0.055269          -0.046574   
Sort Code              0.013432 -0.029216          -0.028626   
Family                -0.016797 -0.046418          -0.052563   
Credit Score          -0.024672 -0.052030          -0.050089   
Mortgage(Thousands's) -0.013920 -0.012539          -0.010582   

                       Income(Thousands's)  Sort Code    Family  Credit Score  \
ID                               -0.017695   0.013432 -0.016797     -0.024672   
Age                              -0.055269  -0.029216 -0.046418     -0.052030   
Experience(Years)                -0.046574  -0.028626 -0.052563     -0.050089   
Income(Thousands's)               1.000000  -0.016410 -0.157501      0.645993   
Sort Code         

In [9]:
# Replace each categorical column with integer codes, keeping the fitted encoder
# for every column in `label_encoders` so the codes can be mapped back later.
categorical_features = ['Education', 'Personal Loan', 'Securities Account', 'CD Account', 'Online Banking', 'CreditCard']
label_encoders = {}
for col in categorical_features:
    le = LabelEncoder().fit(data[col])
    label_encoders[col] = le
    data[col] = le.transform(data[col])

# Rescale the selected numeric columns with StandardScaler.
# NOTE(review): the scaler is fitted on every row of `data`, before the
# train/test splits performed in later cells, so held-out rows contribute to the
# scaling statistics. Consider fitting on the training rows only.
numerical_features = ["Age", "Experience(Years)" , "Credit Score", "Mortgage(Thousands's)"]
scaler = StandardScaler().fit(data[numerical_features])
data[numerical_features] = scaler.transform(data[numerical_features])

In [13]:
# Build the feature matrix and the income target, then hold out 20% of the rows
# for testing (fixed seed so the split is repeatable).
y = data["Income(Thousands's)"]
X = data.drop(columns=["ID", "Income(Thousands's)", "Sort Code"])

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [16]:
# Binarise the income target: 1 when income is above the cut-off, otherwise 0
threshold = 70
y_binary = y.gt(threshold).astype(int)

# Hold out 20% of the rows for testing (fixed seed so the split is repeatable)
(
    X_train_binary,
    X_test_binary,
    y_train_binary,
    y_test_binary,
) = train_test_split(X, y_binary, test_size=0.2, random_state=42)

# Fit a logistic regression classifier with its default settings
logistic_model = LogisticRegression()
logistic_model.fit(X_train_binary, y_train_binary)

# Predict the class of every held-out row
y_pred_binary = logistic_model.predict(X_test_binary)

# Score the predictions and report each metric in turn
accuracy = accuracy_score(y_test_binary, y_pred_binary)
print(f"Logistic Regression Accuracy: {accuracy}")

conf_matrix = confusion_matrix(y_test_binary, y_pred_binary)
print("Confusion Matrix:")
print(conf_matrix)

class_report = classification_report(y_test_binary, y_pred_binary)
print("Classification Report:")
print(class_report)

Logistic Regression Accuracy: 0.775
Confusion Matrix:
[[499  59]
 [166 276]]
Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.89      0.82       558
           1       0.82      0.62      0.71       442

    accuracy                           0.78      1000
   macro avg       0.79      0.76      0.76      1000
weighted avg       0.78      0.78      0.77      1000



# References

GeeksforGeeks, "Create a correlation matrix using Python": https://www.geeksforgeeks.org/create-a-correlation-matrix-using-python/ (accessed 28/05)