In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics

In [5]:
data=pd.read_csv("C:\\Users\\fasy\\Downloads\\diabetes2.csv")

In [6]:
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [37]:
data.Outcome.value_counts()

0    500
1    268
Name: Outcome, dtype: int64

In [38]:
x=data[["Pregnancies","Glucose","BloodPressure","Age","SkinThickness","Insulin","BMI","DiabetesPedigreeFunction"]].values
y=data["Outcome"].values

In [39]:
y.shape

(768,)

In [40]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.3,random_state=42)
x_valid,x_test,y_valid,y_test=train_test_split(x_test,y_test,test_size=0.50,random_state=42)


In [72]:
class LogisticRrgression:
    def __init__(self,lrate=0.01,iterations=1000):
        self.lrate=lrate
        self.iterations=iterations
        
    def scale(self,x):
        x_scaled=x-np.mean(x,axis=0)
        x_scaled=x_scaled/np.std(x_scaled,axis=0)
        return x_scaled
        
    def fit(self,x,y):
        self.losses=[]
        self.theta=np.zeros((1+x.shape[1]))
        n=x.shape[0]
        x=self.scale(x)
        
        for i in range(self.iterations):
            y_pred=self.theta[0]+np.dot(x,self.theta[1:])
            z=y_pred
            g_z=1/(1+np.e**(-z))
            cost=(-y*np.log(g_z)-(1-y)*np.log(1-g_z))/n
            self.losses.append(cost)
            
            dtheta1=(1/n)*np.dot(x.T,(g_z-y))
            dtheta0=(1/n)*np.sum(g_z-y)
            
            self.theta[1:]=self.theta[1:]-self.lrate*dtheta1
            self.theta[0]=self.theta[0]-self.lrate*dtheta0
        return self
    
    def predict(self,x):
        x=self.scale(x)
        y_pred=self.theta[0]+np.dot(x,self.theta[1:])
        z=y_pred
        g_z=1/(1+np.e**(-z))
        return [1 if i>0.4 else 0 for i in g_z]

In [73]:
model=LogisticRrgression()
model.fit(x_train,y_train)

<__main__.LogisticRrgression at 0x18ed9f2de50>

In [74]:
print("theta is : ",model.theta)

theta is :  [-0.64266626  0.35600026  0.85182347 -0.16480043  0.16528704  0.02288199
  0.03134143  0.46495281  0.22764651]


In [75]:
y_pred_train=model.predict(x_train)
y_pred_valid=model.predict(x_valid)

In [76]:
train_acc=metrics.accuracy_score(y_train,y_pred_train)
valid_acc=metrics.accuracy_score(y_valid,y_pred_valid)

In [77]:
print("traing accuracy is :",train_acc)
print("validation accuracy is :",valid_acc)

traing accuracy is : 0.767479674796748
validation accuracy is : 0.7304347826086957


In [78]:
y_pred_test=model.predict(x_test)
test_acc=metrics.accuracy_score(y_test,y_pred_test)
print("traing accuracy is :",test_acc)

traing accuracy is : 0.7320261437908496


In [79]:
num_splits=5
kfold=StratifiedKFold(num_splits,shuffle=True,random_state=1)
train_accs,test_accs=[],[]
for train_index,test_index in kfold.split(x,y):
    x_train,x_test=x[train_index],x[test_index]
    y_train,y_test=y[train_index],y[test_index]
    
    model.fit(x_train,y_train)
    y_pred_train=model.predict(x_train)
    y_pred_test=model.predict(x_test)
    
    train_accs.append(metrics.accuracy_score(y_train,y_pred_train)*100)
    test_accs.append(metrics.accuracy_score(y_test,y_pred_test)*100)

In [80]:
ave_train_acc=0
ave_test_acc=0
print("\t","training_acc","\t","testing_acc")

for i in range(num_splits):
    print(i,"\t",train_accs[i],"\t",test_accs[i])
    
    ave_train_acc+=train_accs[i]/num_splits
    ave_test_acc+=test_accs[i]/num_splits
print("AV","\t",ave_train_acc,"\t",ave_test_acc)

	 training_acc 	 testing_acc
0 	 76.0586319218241 	 75.32467532467533
1 	 75.2442996742671 	 74.67532467532467
2 	 73.9413680781759 	 81.16883116883116
3 	 77.72357723577235 	 71.89542483660131
4 	 76.7479674796748 	 73.20261437908496
AV 	 75.94316887794285 	 75.25337407690348
