## Assignment 1
### Name:
### Roll Number:

In [31]:
# K-fold cross-validation is a technique used to assess and optimize the performance of machine
# learning models. The dataset is divided into K subsets, or ”folds.” The model is trained on K-1 folds
# and tested on the remaining one. This process is repeated K times, and the average performance
# is used to gauge the model’s generalization ability.

In [32]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load the dataset
dataset = pd.read_csv('../../dataset/cross-validation.csv')
# randomize the dataset
dataset = dataset.sample(frac=1).reset_index(drop=True)
dataset.head()


# drop the Loan_ID column
dataset = dataset.drop(columns=['Loan_ID'], axis=1)


# check null values
dataset.isnull().sum()

# drop null values
dataset = dataset.dropna()

# shape of the dataset
dataset.shape



# split the dataset into train and test
train = dataset[:int(0.8*len(dataset))]
test = dataset[int(0.8*len(dataset)):]
print(train.shape, test.shape)

# split the train and test into X and Y
X_train = train.drop(columns=['Loan_Status'])
y_train = train['Loan_Status']
X_test = test.drop(columns=['Loan_Status'])
y_test = test['Loan_Status']

# encode the categorical features
le = LabelEncoder()


def encode(data):
    for i in data.columns:
        if data[i].dtype == 'object':
            le.fit(data[i].astype(str))
            data[i] = le.transform(data[i].astype(str))
    return data

scaler = StandardScaler()
X_train = encode(X_train)
X_test = encode(X_test)

# scale the data
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# create the model with saga solver
model = LogisticRegression(solver='saga', max_iter=2000)

# fit the model
model.fit(X_train, y_train)

# predict the model
y_pred = model.predict(X_test)

# print the accuracy score
print("Accuracy Score: ", accuracy_score(y_test, y_pred))

# print the classification report
print("Classification Report: \n", classification_report(y_test, y_pred))

# print the confusion matrix
print("Confusion Matrix: \n", confusion_matrix(y_test, y_pred))

# make the 5-fold cross validation

k = 5
size = len(dataset) // k
acc = []
for i in range(k):
    # create 5 folds of the train data and make validation set
    val = dataset[i*size: (i+1)*size]
    train = dataset.drop(val.index)
    # split the train and test into X and Y
    X_train = train.drop(columns=['Loan_Status'])
    y_train = train['Loan_Status']
    X_test = val.drop(columns=['Loan_Status'])
    y_test = val['Loan_Status']
    
    X_train = encode(X_train)
    X_test = encode(X_test)
    
    # scale the data
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    
    # create the model with saga solver
    model = LogisticRegression(solver='saga', max_iter=2000)
    
    # fit the model
    model.fit(X_train, y_train)
    
    # predict the model
    y_pred = model.predict(X_test)
    
    # print the accuracy score
    print("Accuracy Score: ", accuracy_score(y_test, y_pred))
    acc.append(accuracy_score(y_test, y_pred))
    
print("Average Accuracy Score: ", np.mean(acc))






(384, 12) (96, 12)
Accuracy Score:  0.7916666666666666
Classification Report: 
               precision    recall  f1-score   support

           N       0.79      0.39      0.52        28
           Y       0.79      0.96      0.87        68

    accuracy                           0.79        96
   macro avg       0.79      0.67      0.70        96
weighted avg       0.79      0.79      0.77        96

Confusion Matrix: 
 [[11 17]
 [ 3 65]]
Accuracy Score:  0.8020833333333334
Accuracy Score:  0.8125
Accuracy Score:  0.78125
Accuracy Score:  0.84375
Accuracy Score:  0.7916666666666666
Average Accuracy Score:  0.80625
