# Implementation of Logistic Regression
## part 1 - Loading Data

In [1]:
#import Libraries

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

In [2]:
#load data files

train=pd.read_csv("train_u6lujuX_CVtuZ9i.csv")
test=pd.read_csv("test_Y3wMUE5_7gLdaTN.csv")

In [3]:
#List of column names

list(train)

['Loan_ID',
 'Gender',
 'Married',
 'Dependents',
 'Education',
 'Self_Employed',
 'ApplicantIncome',
 'CoapplicantIncome',
 'LoanAmount',
 'Loan_Amount_Term',
 'Credit_History',
 'Property_Area',
 'Loan_Status']

In [4]:
#Sample of data

train.head(10)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
5,LP001011,Male,Yes,2,Graduate,Yes,5417,4196.0,267.0,360.0,1.0,Urban,Y
6,LP001013,Male,Yes,0,Not Graduate,No,2333,1516.0,95.0,360.0,1.0,Urban,Y
7,LP001014,Male,Yes,3+,Graduate,No,3036,2504.0,158.0,360.0,0.0,Semiurban,N
8,LP001018,Male,Yes,2,Graduate,No,4006,1526.0,168.0,360.0,1.0,Urban,Y
9,LP001020,Male,Yes,1,Graduate,No,12841,10968.0,349.0,360.0,1.0,Semiurban,N


In [5]:
#Types of data columns

train.dtypes

Loan_ID               object
Gender                object
Married               object
Dependents            object
Education             object
Self_Employed         object
ApplicantIncome        int64
CoapplicantIncome    float64
LoanAmount           float64
Loan_Amount_Term     float64
Credit_History       float64
Property_Area         object
Loan_Status           object
dtype: object

In [6]:
#Summary statistics

train.describe()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
count,614.0,614.0,592.0,600.0,564.0
mean,5403.459283,1621.245798,146.412162,342.0,0.842199
std,6109.041673,2926.248369,85.587325,65.12041,0.364878
min,150.0,0.0,9.0,12.0,0.0
25%,2877.5,0.0,100.0,360.0,1.0
50%,3812.5,1188.5,128.0,360.0,1.0
75%,5795.0,2297.25,168.0,360.0,1.0
max,81000.0,41667.0,700.0,480.0,1.0


## Part 2 - Data Cleaning and Preprocessing

In [7]:
#Find missing values
train.isnull().sum()
test.isnull().sum()

#Impute missing values with mean (numerical variables)
train.fillna(train.mean(),inplace=True) 
train.isnull().sum() 

#Test data
test.fillna(test.mean(),inplace=True) 
test.isnull().sum()

#Impute missing values with mode (categorical variables)
train.Gender.fillna(train.Gender.mode()[0],inplace=True)
train.Married.fillna(train.Married.mode()[0],inplace=True)
train.Dependents.fillna(train.Dependents.mode()[0],inplace=True) 
train.Self_Employed.fillna(train.Self_Employed.mode()[0],inplace=True)  
train.isnull().sum() 

#Test data
test.Gender.fillna(test.Gender.mode()[0],inplace=True)
test.Dependents.fillna(test.Dependents.mode()[0],inplace=True) 
test.Self_Employed.fillna(test.Self_Employed.mode()[0],inplace=True)  
test.isnull().sum() 

#Treatment of outliers
train.Loan_Amount_Term=np.log(train.Loan_Amount_Term)

In [8]:
mapping = {'Y': 1, 'N': 0}

# Use the replace method to change the values in the 'col1' column
train['Loan_Status'] = train['Loan_Status'].replace(mapping)

train.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,146.412162,5.886104,1.0,Urban,1
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,5.886104,1.0,Rural,0
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,5.886104,1.0,Urban,1
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,5.886104,1.0,Urban,1
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,5.886104,1.0,Urban,1


## Part 3 - Predictive Modelling

In [9]:
#Remove Loan_ID variable - Irrelevant
train = train.drop('Loan_ID', axis=1)
test = test.drop('Loan_ID', axis=1)

#Create target variable
X=train.drop('Loan_Status', axis=1)
y=train.Loan_Status

#Build dummy variables for categorical variables
X=pd.get_dummies(X)
# y=pd.get_dummies(y)
test=pd.get_dummies(test)
X = X.to_numpy()
y = y.to_numpy()

#Split train data for cross validation
from sklearn.model_selection import train_test_split
X_train,X_cv,y_train,y_cv = train_test_split(X,y,test_size=0.2)

In [10]:
len(X_train[0])

20

In [11]:
# X_train = x_train.to_numpy()
# y_train = y_train.to_numpy()
# X_cv = x_cv.to_numpy()
# y_cv = y_cv.to_numpy()

In [12]:
print("x train: ",X_train.shape)
print("y train: ",y_train.shape)

x train:  (491, 20)
y train:  (491,)


## Part 3 - Logistic Regression Algorithm

### Steps to write logistic regression algorithm

1. Import necessary libraries such as numpy and matplotlib
2. Prepare the data by loading it into a numpy array and splitting it into a training and testing set.
3. Preprocess the data by cleaning it and handling missing values if necessary.
4. Define a sigmoid function to map the input to a probability value between 0 and 1.
5. Initialize the model parameters, such as the weight and bias, with random values.
6. Implement the cost function, which calculates the error between the predicted and actual values.
7. Implement the gradient descent algorithm to update the parameters and minimize the cost function.
8. Train the model by iterating over the training data and updating the parameters with each iteration.
9. Measure the performance of the model on the testing data.
10. Use the model to make predictions on new data.

In [13]:
class LogisticRegression:
    def __init__(self, learning_rate=0.001, n_iters=1000):
        self.lr = learning_rate
        self.n_iters = n_iters
        self.weights = None
        self.bias = None
        
    def cost_function(self,X, y, theta):
        m = len(y)
        y_pred = self._sigmoid(np.dot(X, theta))
        cost = -1/m * np.sum(y * np.log(y_pred) + (1-y) * np.log(1-y_pred))
        return cost

    def fit(self, X, y):
        n_samples, n_features = X.shape

        # init parameters
        self.weights = np.zeros(n_features)
        self.bias = 0
        costs = []

        # gradient descent
        for i in range(self.n_iters):
            # approximate y with linear combination of weights and x, plus bias
            linear_model = np.dot(X, self.weights) + self.bias
            # apply sigmoid function
#             print(linear_model)
            y_predicted = self._sigmoid(linear_model)
#             print(X.T.shape)
#             print(y_predicted.shape)
#             print(y.shape)
#             break
            # compute gradients
            dw = (1 / n_samples) * np.dot(X.T, (y_predicted - y))
            db = (1 / n_samples) * np.sum(y_predicted - y)
            # update parameters
            self.weights -= self.lr * dw
            self.bias -= self.lr * db
            
            cost = self.cost_function(X, y, self.weights)
            costs.append(cost)
            print("Cost after iteration {}: {}".format(i, cost))

    def predict(self, X):
        linear_model = np.dot(X, self.weights) + self.bias
        y_predicted = self._sigmoid(linear_model)
        y_predicted_cls = [1 if i > 0.5 else 0 for i in y_predicted]
        return np.array(y_predicted_cls)

    def _sigmoid(self, x):
        x_scaled = x/1000
        return 1 / (1 + np.exp(-x_scaled))

In [14]:
#Method to evaluate accuracy of model
def accuracy(y_true, y_pred):
        accuracy = np.sum(y_true == y_pred) / len(y_true)
        return accuracy

In [15]:
model = LogisticRegression(learning_rate=0.0001, n_iters=20)
model.fit(X_train, y_train)
predictions = model.predict(X_cv)

print("LR classification accuracy:", accuracy(y_cv, predictions))

Cost after iteration 0: 0.6596087222859441
Cost after iteration 1: 0.6557499072141979
Cost after iteration 2: 0.653994663621662
Cost after iteration 3: 0.6526653573838261
Cost after iteration 4: 0.6515954053492724
Cost after iteration 5: 0.6507281184095234
Cost after iteration 6: 0.650022052170019
Cost after iteration 7: 0.6494450377824399
Cost after iteration 8: 0.6489718796354946
Cost after iteration 9: 0.6485827041290767
Cost after iteration 10: 0.6482617323214813
Cost after iteration 11: 0.6479963608221309
Cost after iteration 12: 0.6477764680558497
Cost after iteration 13: 0.6475938861144636
Cost after iteration 14: 0.6474419951808414
Cost after iteration 15: 0.6473154094886228
Cost after iteration 16: 0.6472097322744605
Cost after iteration 17: 0.6471213632068825
Cost after iteration 18: 0.6470473460767077
Cost after iteration 19: 0.6469852476277599
LR classification accuracy: 0.6910569105691057


In [16]:
# Plotting decision regions
# X_train = X_train[:10]
# y_train = y_train[:10]

# Create a grid to evaluate the model
# xx, yy = np.meshgrid(np.linspace(*plt.xlim(), num=100),
#                      np.linspace(*plt.ylim(), num=100))
# print(yy.shape)
# Z = model.predict(np.c_[xx, yy])
# Z = Z.reshape(xx.shape)

# Plot the decision boundary and the data points
# plt.contourf(xx, yy, Z, alpha=0.5)
plt.scatter(X_train[:100,1], X_train[:100,1], c=y_train[:100,1], s=50, cmap='RdBu')
def f(x):
    m=model.weights
    b=model.bias
    return np.dot(x,m)+b
plt.plot(X_train[:100],f(X_train[:100]))

# Show the plot
plt.show()

IndexError: too many indices for array: array is 1-dimensional, but 2 were indexed