# Logistic Regression


In [None]:
import numpy as np
import pandas as pd
import math

In [2]:
#Read in the dataset 'classification'
dataset = pd.read_table('classification.txt',sep=',',header=None,names=['x1','x2','x3','na','label'])
dataset = dataset.drop(['na'],axis=1)

#Add the X0=1 to each data point:
x0 = []
for i in range(len(dataset)):
    x0.append(1)

first_col = pd.DataFrame(columns=['x0'],data=x0)

dataset = first_col.merge(dataset,how='inner',left_index=True, right_index=True)

In [3]:
#Check our dataset
dataset

Unnamed: 0,x0,x1,x2,x3,label
0,1,0.750072,0.977408,0.885658,1
1,1,0.877914,0.019251,0.506711,-1
2,1,0.777325,0.994066,0.822244,1
3,1,0.181158,0.460749,0.525477,1
4,1,0.114564,0.067555,0.128920,1
...,...,...,...,...,...
1995,1,0.302021,0.049354,0.973333,-1
1996,1,0.196709,0.598557,0.252530,-1
1997,1,0.515506,0.153544,0.012755,1
1998,1,0.228226,0.971554,0.183059,1


In [4]:
#Divide our dataset into X(coordinates of points) and Y(label):
X = dataset.drop(['label'],axis=1)
Y = dataset.loc[:,'label']  

In [5]:
#Randomize the initial weights:
weights = np.random.random(4)
print('The initial weights are',weights)

The initial weights are [0.37727986 0.0682116  0.33581682 0.0951821 ]


In [6]:
#Define sigmoid function:
def sigmoid(s):
    sig = 1/(1 + np.exp(-s))
    return sig 

In [7]:
#Calculate the cost, using cost function:
summation_cost_init = 0
for i in range(len(dataset)):
    xi = X.iloc[i]
    yi = Y.iloc[i]
    summation_cost_init = summation_cost_init + ((yi*np.log(sigmoid(np.dot(weights,xi)))) 
                             + ((1-yi)*np.log(1-sigmoid(np.dot(weights,xi)))))
cost_init = (-1/2000)*summation_cost_init
print('The initial cost of Logistic Regression is',cost_init)

The initial cost of Logistic Regression is 1.0622678621172403


In [8]:
#Define iteration times and the learning rate alpha:
n_iteration=7000
alpha = 0.01 
n=0

In [9]:
#Update the weights 7000 times:
while n < n_iteration:
    weights =  weights- alpha*(1/2000)*np.dot((X.T),(sigmoid(np.dot(X,weights))-Y))
    n=n+1
print('The weights of logistic regression after',n_iteration,'times iteration is',weights)    

The weights of logistic regression after 7000 times iteration is [-3.33088092 -2.13445914 -1.10370491 -1.38789613]


In [10]:
#Calculate the cost after weights updated; the value of cost should be lower 
summation_cost_end = 0
for i in range(len(dataset)):
    xi = X.iloc[i]
    yi = Y.iloc[i]
    summation_cost_end = summation_cost_end + ((yi*np.log(sigmoid(np.dot(weights,xi)))) 
                             + ((1-yi)*np.log(1-sigmoid(np.dot(weights,xi)))))
    
cost_end = (-1/2000)*summation_cost_end
print('The cost of Logistic Regression after',n_iteration,'times iteration is',cost_end)

The cost of Logistic Regression after 7000 times iteration is -0.06893605176642506


In [11]:
#Add the predicted results to our dataset:
predict=[]
for i in range(len(dataset)):
    xi = X.iloc[i]
    yi = Y.iloc[i]
    if sigmoid(np.dot(weights,xi)) >= 0.5:
        predict.append(1)
    elif sigmoid(np.dot(weights,xi)) < 0.5:
        predict.append(-1)
        
predict_col = pd.DataFrame(columns=['predicted label'],data=predict)
new_dataset = dataset.merge(predict_col,how='inner',left_index=True, right_index=True)   

In [13]:
new_dataset

Unnamed: 0,x0,x1,x2,x3,label,predicted label
0,1,0.750072,0.977408,0.885658,1,-1
1,1,0.877914,0.019251,0.506711,-1,-1
2,1,0.777325,0.994066,0.822244,1,-1
3,1,0.181158,0.460749,0.525477,1,-1
4,1,0.114564,0.067555,0.128920,1,-1
...,...,...,...,...,...,...
1995,1,0.302021,0.049354,0.973333,-1,-1
1996,1,0.196709,0.598557,0.252530,-1,-1
1997,1,0.515506,0.153544,0.012755,1,-1
1998,1,0.228226,0.971554,0.183059,1,-1


In [12]:
#Calculate the accuracy rate: 
accurate = 0
for i in range(len(new_dataset)):
    if new_dataset.iloc[i][4] == new_dataset.iloc[i][5]:
            accurate+=1
    else:
        pass

accuracy_rate = accurate/(len(new_dataset))
print ('The accuracy of Logistic regression after model iterates',n_iteration,'times is',accuracy_rate)

The accuracy of Logistic regression after model iterates 7000 times is 0.506
