In [1]:
import numpy as  np 
import pandas as pd 

In [2]:
data = pd.read_csv("healthcare-dataset-stroke-data.csv")

In [3]:
df=pd.DataFrame(data)

In [4]:
df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [5]:
#performing label encoding on the gender column
gender_map ={"Male":1,"Female":0}
df["gender"]=df["gender"].map(gender_map)

In [6]:
smoking_encoded = {'formerly smoked':0.5, 'never smoked':0, 'smokes':1 }
df["smoking_status"] = df["smoking_status"].map(smoking_encoded)

In [7]:
df.isnull().sum()

id                      0
gender                  1
age                     0
hypertension            0
heart_disease           0
ever_married            0
work_type               0
Residence_type          0
avg_glucose_level       0
bmi                   201
smoking_status       1544
stroke                  0
dtype: int64

In [8]:
df = df.dropna(subset=["smoking_status"])


In [9]:
df = df.dropna(subset=["bmi"])

In [10]:
df = df.dropna(subset=["gender"])


In [11]:
df = df.reset_index()

In [12]:
df

Unnamed: 0,index,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,0,9046,1.0,67.0,0,1,Yes,Private,Urban,228.69,36.6,0.5,1
1,2,31112,1.0,80.0,0,1,Yes,Private,Rural,105.92,32.5,0.0,1
2,3,60182,0.0,49.0,0,0,Yes,Private,Urban,171.23,34.4,1.0,1
3,4,1665,0.0,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,0.0,1
4,5,56669,1.0,81.0,0,0,Yes,Private,Urban,186.21,29.0,0.5,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3420,5100,68398,1.0,82.0,1,0,Yes,Self-employed,Rural,71.97,28.3,0.0,0
3421,5102,45010,0.0,57.0,0,0,Yes,Private,Rural,77.93,21.7,0.0,0
3422,5106,44873,0.0,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,0.0,0
3423,5107,19723,0.0,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,0.0,0


In [13]:
#normalising age,avg_glucose_level,bmi 
df["age"] = (df["age"]-df["age"].min())/(df["age"].max()-df["age"].min())
df["avg_glucose_level"] = (df["avg_glucose_level"]-df["avg_glucose_level"].min())/(df["avg_glucose_level"].max()-df["avg_glucose_level"].min())
df["bmi"] = (df["bmi"]-df["bmi"].min())/(df["bmi"].max()-df["bmi"].min())

In [14]:
df.drop("index",axis=1,inplace=True)

In [15]:
x_train =df[['gender', 'age', 'hypertension', 'heart_disease', 'avg_glucose_level', 'bmi','smoking_status']] 

In [16]:
x_train.isnull().sum()

gender               0
age                  0
hypertension         0
heart_disease        0
avg_glucose_level    0
bmi                  0
smoking_status       0
dtype: int64

In [17]:
y_train = df["stroke"]

In [18]:
# we know that linear regression= sigmoid = = 1/1+e^-z
#where z = w.x + z 
def sigmoid(z):
    g= 1/(1+np.exp(-z))
    
    return g

In [19]:
sigmoid(0)

0.5

In [20]:
#calculating cost for the model 
def comp_cost(x,y,w,b):
    
    m,n = x.shape
   
    z_wb =0
    loss_sum=0
    epsilon = 1e-15 
    
    for i in range(m):
        for j in range(n):
            z_wb_j =  w[j]*x.iloc[i][j] 
            z_wb+=z_wb_j
        z_wb+=b
        #print(z_wb)
        func = sigmoid(z_wb)
        #print(func)
        loss = (-y.iloc[i] * np.log(func + epsilon)) - (1 - y.iloc[i]) * np.log(1 - func + epsilon)
        #print("loss",loss)
        loss_sum+= loss
        #print("loss_sum",loss_sum)
        #print("i",i)
    cost = loss_sum/m
    return cost
    

In [21]:
n = 7
# Compute and display cost with w and b initialized to zeros
initial_w = np.zeros(n)
initial_b = 0
cost = comp_cost(x_train, y_train, initial_w, initial_b)
print('Cost at initial w and b (zeros): {:.3f}'.format(cost))

Cost at initial w and b (zeros): 0.693


In [22]:
#finding differentiated w and b value which needs to be substituted in gradient descent
def grad_func(x,y,w,b):
    
    m,n=x.shape
    z_wb=0
    sum_error=0
    dj_j=np.zeros(w.shape)
    
    for i in range(m):
        for j in range(n):
            z_ij = w[j]*x.iloc[i,j]
            z_wb+=z_ij
        z_wb+=b
        func = sigmoid(z_wb)
        error = func - y.iloc[i]
        sum_error+= error
        db = sum_error/m
        
        for j in range(n):
            dj_i = error*x.iloc[i,j]
            dj_j[j] += dj_i
        dw = dj_j/m
        
        
        return dw,db
    
    
    

In [23]:
initial_w = np.zeros(n)
initial_b = 0
dw,db = grad_func(x_train, y_train, initial_w, initial_b)
print(f"dw at initial : {dw} \n   db at intial : {db}")

dw at initial : [-1.45985401e-04 -1.15571776e-04  0.00000000e+00 -1.45985401e-04
 -1.16972976e-04 -4.55184295e-05 -7.29927007e-05] 
   db at intial : -0.00014598540145985403


In [24]:
# gradient descent is the method of finding suitanle w and b for our model 
def gradient_descent(x,y,w,b,comp_cost,grad_func,alpha,num_iters) :
    
    
    j_hist = []
    for i in range(num_iters):
        
        dw,db = grad_func(x,y,w,b)
        w = w-alpha*dw
        b = b-alpha*db
        
        
        cost = comp_cost(x,y,w,b)
        j_hist.append(cost)
        
        
        if i%5==0 :
            print(f"iterations  : {i}   cost :{j_hist[-1]}")

    return w,b,j_hist


        
        

In [25]:
w = np.array([0.1,0.2,0.3,0.4,0.5,0.6,0.7])
print("w",w)
b = 5
num_iters = 100
alpha = 100

new_w,new_b,_=gradient_descent(x_train,y_train,w,b,comp_cost,grad_func,alpha,num_iters)
print(f"w and b found by gradient descent is w :{new_w} \n b: {new_b}")

w [0.1 0.2 0.3 0.4 0.5 0.6 0.7]
iterations  : 0   cost :32.72360022468938
iterations  : 5   cost :32.72360022432206
iterations  : 10   cost :32.72360022395542
iterations  : 15   cost :32.72360022358945
iterations  : 20   cost :32.723600223224146
iterations  : 25   cost :32.72360022285952
iterations  : 30   cost :32.723600222495556
iterations  : 35   cost :32.72360022213226
iterations  : 40   cost :32.72360022176962
iterations  : 45   cost :32.723600221407644
iterations  : 50   cost :32.72360022104633
iterations  : 55   cost :32.72360022068568
iterations  : 60   cost :32.72360022032568
iterations  : 65   cost :32.72360021996633
iterations  : 70   cost :32.72360021960764
iterations  : 75   cost :32.7236002192496
iterations  : 80   cost :32.7236002188922
iterations  : 85   cost :32.72360021853546
iterations  : 90   cost :32.72360021817936
iterations  : 95   cost :32.7236002178239
w and b found by gradient descent is w :[0.10394642 0.20312425 0.3        0.40394642 0.50316213 0.6012305
 0.7

In [26]:
#from the obtained new w and b the output can be predicted 

def pred(x,w,b):
    
    m,n = x.shape 
    z_wb=0
    p = np.zeros(m)
    for i in range(m):
        
        for j in range(n):
            z_wb_j = w[j]*x[i][j]
            z_wb+=z_wb_j
        z_wb+=b
        func =  sigmoid(z_wb)
        
        if func>=0.5 :
            p[i] = 1
        if func<0.5:
            p[i]=0
            
    return p
        

In [27]:
# lets test it with an example 
print("first value :\n", df.iloc[0])
print("second value :\n", df.iloc[1])


first value :
 id                       9046
gender                    1.0
age                  0.791667
hypertension                0
heart_disease               1
ever_married              Yes
work_type             Private
Residence_type          Urban
avg_glucose_level    0.801265
bmi                  0.311801
smoking_status            0.5
stroke                      1
Name: 0, dtype: object
second value :
 id                      31112
gender                    1.0
age                  0.972222
hypertension                0
heart_disease               1
ever_married              Yes
work_type             Private
Residence_type          Rural
avg_glucose_level    0.234512
bmi                   0.26087
smoking_status            0.0
stroke                      1
Name: 1, dtype: object


In [28]:
x = np.array([[1,0.791667,0,1,0.801265,0.311801,0.5],[1,0.972222,0,1,0.234512,0.26087,0]])
p = pred(x,new_w,new_b)
p

array([1., 1.])