# Logistic Regression

In [28]:
import warnings
warnings.filterwarnings('ignore')

In [29]:
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
X, y = load_iris(return_X_y=True)
clf = LogisticRegression(random_state=0).fit(X, y)

In [30]:
clf.predict(X[:2, :])

array([0, 0])

In [31]:
clf.predict_proba(X[:2, :])

array([[9.81799057e-01, 1.82009288e-02, 1.43502503e-08],
       [9.71722113e-01, 2.82778573e-02, 3.00201040e-08]])

In [32]:
clf.score(X, y)

0.9733333333333334

# Confusion Matrix

In [33]:
from sklearn.metrics import confusion_matrix

In [34]:
confusion_matrix(y, clf.predict(X))

array([[50,  0,  0],
       [ 0, 47,  3],
       [ 0,  1, 49]], dtype=int64)

In [35]:
confusion_matrix(y, clf.predict(X), normalize='true')

array([[1.  , 0.  , 0.  ],
       [0.  , 0.94, 0.06],
       [0.  , 0.02, 0.98]])

# 1. Solve classification problem using 'classification.csv' dataset

##### target variable is 'default'. Apply feature selection, feature scaling, cross validation etc. (anything you think is needed)

In [36]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import linear_model

In [37]:
dt = pd.read_csv('classification.csv')
dt.head(5)

Unnamed: 0,age,ed,employ,address,income,debtinc,creddebt,othdebt,default
0,41,college degree,17,12,176,9.3,11.359392,5.008608,1
1,27,no high school,10,6,31,17.3,1.362202,4.000798,0
2,40,no high school,15,14,55,5.5,0.856075,2.168925,0
3,41,no high school,15,14,120,2.9,2.65872,0.82128,0
4,24,high school,2,0,28,17.3,1.787436,3.056564,1


In [38]:
dt.corr()

Unnamed: 0,age,employ,address,income,debtinc,creddebt,othdebt,default
age,1.0,0.536497,0.597591,0.47871,0.016398,0.295207,0.340217,-0.137657
employ,0.536497,1.0,0.322334,0.619681,-0.031182,0.403694,0.406091,-0.282978
address,0.597591,0.322334,1.0,0.316245,0.011323,0.208435,0.226514,-0.164451
income,0.47871,0.619681,0.316245,1.0,-0.026777,0.570199,0.610659,-0.07097
debtinc,0.016398,-0.031182,0.011323,-0.026777,1.0,0.501767,0.58487,0.389575
creddebt,0.295207,0.403694,0.208435,0.570199,0.501767,1.0,0.633104,0.24474
othdebt,0.340217,0.406091,0.226514,0.610659,0.58487,0.633104,1.0,0.145713
default,-0.137657,-0.282978,-0.164451,-0.07097,0.389575,0.24474,0.145713,1.0


In [39]:
dt = pd.get_dummies(dt)
dt

Unnamed: 0,age,employ,address,income,debtinc,creddebt,othdebt,default,ed_college degree,ed_high school,ed_no high school,ed_postgraduate,ed_undergraduate
0,41,17,12,176,9.3,11.359392,5.008608,1,1,0,0,0,0
1,27,10,6,31,17.3,1.362202,4.000798,0,0,0,1,0,0
2,40,15,14,55,5.5,0.856075,2.168925,0,0,0,1,0,0
3,41,15,14,120,2.9,2.658720,0.821280,0,0,0,1,0,0
4,24,2,0,28,17.3,1.787436,3.056564,1,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,36,6,15,27,4.6,0.262062,0.979938,1,0,1,0,0,0
696,29,6,4,21,11.5,0.369495,2.045505,0,0,1,0,0,0
697,33,15,3,32,7.6,0.491264,1.940736,0,0,0,1,0,0
698,45,19,22,77,8.4,2.302608,4.165392,0,0,0,1,0,0


In [40]:
dt.corr()

Unnamed: 0,age,employ,address,income,debtinc,creddebt,othdebt,default,ed_college degree,ed_high school,ed_no high school,ed_postgraduate,ed_undergraduate
age,1.0,0.536497,0.597591,0.47871,0.016398,0.295207,0.340217,-0.137657,0.051031,-0.059646,0.015432,0.082148,-0.02026
employ,0.536497,1.0,0.322334,0.619681,-0.031182,0.403694,0.406091,-0.282978,-0.029161,-0.073388,0.146061,0.005245,-0.13529
address,0.597591,0.322334,1.0,0.316245,0.011323,0.208435,0.226514,-0.164451,0.058899,0.001322,-0.047697,0.023896,0.007779
income,0.47871,0.619681,0.316245,1.0,-0.026777,0.570199,0.610659,-0.07097,0.149451,0.014392,-0.178107,0.163696,0.085204
debtinc,0.016398,-0.031182,0.011323,-0.026777,1.0,0.501767,0.58487,0.389575,0.032199,-0.034308,0.005693,-0.042278,0.024489
creddebt,0.295207,0.403694,0.208435,0.570199,0.501767,1.0,0.633104,0.24474,0.110548,-0.038828,-0.056229,-0.009532,0.043597
othdebt,0.340217,0.406091,0.226514,0.610659,0.58487,0.633104,1.0,0.145713,0.16963,-0.012977,-0.127588,0.080032,0.030057
default,-0.137657,-0.282978,-0.164451,-0.07097,0.389575,0.24474,0.145713,1.0,0.071502,0.052241,-0.118909,-0.011857,0.058337
ed_college degree,0.051031,-0.029161,0.058899,0.149451,0.032199,0.110548,0.16963,0.071502,1.0,-0.236597,-0.401203,-0.031954,-0.090259
ed_high school,-0.059646,-0.073388,0.001322,0.014392,-0.034308,-0.038828,-0.012977,0.052241,-0.236597,1.0,-0.668829,-0.053269,-0.150468


In [41]:
from sklearn.model_selection import train_test_split
xTrain, xTest, yTrain, yTest = train_test_split(dt[dt.columns.difference(['default'])], dt['default'], test_size = 0.2,random_state = 10)

In [42]:
xTrain

Unnamed: 0,address,age,creddebt,debtinc,ed_college degree,ed_high school,ed_no high school,ed_postgraduate,ed_undergraduate,employ,income,othdebt
16,6,36,2.918216,16.4,0,1,0,0,0,13,41,3.805784
280,6,31,0.345408,4.8,0,0,1,0,0,11,28,0.998592
353,13,37,1.185750,8.5,0,1,0,0,0,6,62,4.084250
374,17,39,0.340774,1.4,0,1,0,0,0,20,101,1.073226
300,18,47,20.561310,25.3,0,0,1,0,0,29,129,12.075690
...,...,...,...,...,...,...,...,...,...,...,...,...
369,4,46,0.826551,2.3,1,0,0,0,0,22,99,1.450449
320,4,24,0.381600,5.0,0,0,1,0,0,8,24,0.818400
527,10,42,0.666528,10.6,0,0,1,0,0,3,24,1.877472
125,11,41,0.694364,4.4,0,0,1,0,0,7,43,1.197636


In [43]:
regr = LogisticRegression(random_state = 10).fit(xTrain,yTrain)
y_predict = regr.predict(xTrain)
regr1 = LogisticRegression(random_state = 10).fit(xTest,yTest)
y_predict1 = regr1.predict(xTest)

In [44]:
regr.score(xTrain,yTrain)

0.8196428571428571

In [45]:
regr1.score(xTest,yTest)

0.8142857142857143

In [46]:
confusion_matrix(yTrain, y_predict)

array([[378,  33],
       [ 68,  81]], dtype=int64)

In [47]:
confusion_matrix(yTest, y_predict1)

array([[99,  7],
       [19, 15]], dtype=int64)

In [48]:
from sklearn.metrics import precision_score

In [49]:
precision_score(yTest, y_predict1,average= None)

array([0.83898305, 0.68181818])

In [50]:
recall = 99/(99+7)
presicion = 99/(99+19)
accuracy = (99+15)/(99+7+19+15)
print(recall,presicion,accuracy)

0.9339622641509434 0.8389830508474576 0.8142857142857143


# 2. Print accuracy, confusion matrix, precision, recall, sensitivity and specifity on train and test (and maybe validation) datasets.

##### do not use any libraries for metrics, implement yourself

In [121]:
yTrain = yTrain.reset_index(drop = True)

In [139]:
TP,FP,TN,FN = 0,0,0,0


for i in range(len(yTrain)):
    if (yTrain[i] != y_predict[i] and yTrain[i]==0):
        FP+=1
    elif (yTrain[i] != y_predict[i] and yTrain[i]==1):
        FN+=1
    elif (yTrain[i] == y_predict[i] and yTrain[i]==0):
        TP+=1
    elif (yTrain[i] == y_predict[i] and yTrain[i]==1):
        TN+=1
confusion_m = np.array([[TP,FP],[FN,TN]])


In [140]:
confusion_m

array([[378,  33],
       [ 68,  81]])

In [142]:
recall = TP/(TP+FN)
recall

0.8475336322869955

In [143]:
accuracy = (TP+TN)/(TP+FP+TN+FN)
accuracy 

0.8196428571428571

In [144]:
precision = TP/(TP+FP)
precision

0.9197080291970803

In [145]:
sensitivity = TP/(TP+FN)
sensitivity

0.8475336322869955

In [146]:
specifity = TN/(TN+FP)
specifity 

0.7105263157894737

In [155]:
yTest = yTest.reset_index(drop = True)
yTest

0      0
1      0
2      0
3      0
4      0
      ..
135    1
136    0
137    0
138    0
139    0
Name: default, Length: 140, dtype: int64

In [156]:
TPt,FPt,TNt,FNt = 0,0,0,0


for i in range(len(yTest)):
    if (yTest[i] != y_predict1[i] and yTest[i]==0):
        FPt+=1
    elif (yTest[i] != y_predict1[i] and yTest[i]==1):
        FNt+=1
    elif (yTest[i] == y_predict1[i] and yTest[i]==0):
        TPt+=1
    elif (yTest[i] == y_predict1[i] and yTest[i]==1):
        TNt+=1
confusion_mt = np.array([[TPt,FPt],[FNt,TNt]])


In [157]:
confusion_mt

array([[99,  7],
       [19, 15]])

In [158]:
recallt = TPt/(TPt+FNt)
recallt

0.8389830508474576

In [159]:
accuracyt = (TPt+TNt)/(TPt+FPt+TNt+FNt)
accuracyt

0.8142857142857143

In [160]:
precisiont = TPt/(TPt+FPt)
precisiont

0.9339622641509434

In [161]:
sensitivityt = TPt/(TPt+FNt)
sensitivityt

0.8389830508474576

In [162]:
specifityt = TNt/(TNt+FPt)
specifityt

0.6818181818181818