In [195]:
import pandas as pd 
import numpy as np 



class LogisticRegression:
    def __init__(self, lr=0.2, n_iter=1000):
        '''constructor will take in the learning rate and the number of iterations as parameters,
        default values are 0.1 and 100 respectively
        '''
        self.lr = lr 
        self.n_iter = n_iter 
    
    def predict(self, X):
        """Takes the input features and predicts the result. This depends on two other methods: linear and non_linear"""
        X = self.normalize(X)

        linear = self._linear(X)
   
        preds = self._non_linear(linear)

        return (preds >= 0.5).astype('int')
    
    def initialize_weights(self, X):
        #same number of weights as there are number of features 
        self.weights = np.random.rand(X.shape[1], 1)
        
        self.bias = np.zeros((1,))
    
    def _linear(self, X):
        """This will take in the input features and apply the weighted sum"""
        return np.dot(X, self.weights) + self.bias
        
    def _non_linear(self, X):
        """Takes in the result from the linear function as input and applies sigmoid formula"""
        return  1 / (1+np.exp(-X))
    
    def fit(self, X_train, y_train):
        """This is the gradient descent process, takes in the features and labels to fine-tune weights using gradient descent"""
        self.initialize_weights(X_train)
        
        
        #get mean and std for normalization 
        self.x_mean = X_train.mean(axis=0).T
        self.x_std = X_train.std(axis=0).T
        
        #normalize data
        X_train = self.normalize(X_train)
        
        for i in range(self.n_iter):
            #make predictions
            probs = self._non_linear(self._linear(X_train))
            y_train = y_train.reshape((y_train.shape[0],1))
            diff = probs - y_train
    
            delta_w = np.mean(diff*X_train, axis=0, keepdims=True).T
            delta_b = np.mean(diff)
            
            #update weights
            self.weights = self.weights - (self.lr * delta_w)
            self.bias = self.bias - (self.lr * delta_b)
        return self 
            
    
    def normalize(self, X):
        """Takes in the input features as inputs and will normalize these values"""
        X = (X - self.x_mean)/(self.x_std)
        return X
    
    def accuracy(self, X, y):
        """Reports the accuracy of the model. The mean number of correct predictions"""
        preds = self.predict(X)
        print("Predictions are: ", preds)
        print("Actual labels: ", y)
        return np.mean(preds==y)
    
    def loss(self, X, y):
        """Computes the cross-entropy"""
        probs = self._non_linear(self._linear(X))
        
        #entropy when true class is positive
        pos_log = y * np.log(probs + 1e-15)
        
        #entropy when true class is negative
        neg_log = (1-y) * np.log((1-probs) + 1e-15)
        
        loss = -np.mean(pos_log + neg_log)
        return loss

    

In [28]:
data = pd.read_csv('./4900A1/parkinsons.csv')

In [29]:
x = data.drop(['status', 'name'], axis=1)
y = data['status']

In [34]:
from sklearn.model_selection import train_test_split


In [198]:
X_train, X_test, y_train, y_test = train_test_split(x,y, test_size=0.2)

ValueError: Found input variables with inconsistent numbers of samples: [195, 207]

In [169]:
clf = LogisticRegression()

In [170]:
clf.fit(np.array(X_train), np.array(y_train))

<__main__.LogisticRegression at 0x192cd79d408>

In [171]:
clf.predict(np.array(X_test))

array([[0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [0],
       [0],
       [1],
       [1],
       [0],
       [0],
       [0],
       [1],
       [1],
       [0],
       [0],
       [1],
       [0],
       [1],
       [0],
       [0],
       [1],
       [0],
       [0],
       [0],
       [1],
       [1],
       [0],
       [0],
       [0],
       [0],
       [0]])

In [173]:
acc = clf.accuracy(np.array(X_test), np.array(y_test))
loss = clf.loss(np.array(X_test), np.array(y_test))
print(acc)
print(loss)

Predictions are:  [[0]
 [0]
 [0]
 [1]
 [0]
 [0]
 [0]
 [0]
 [0]
 [1]
 [0]
 [0]
 [0]
 [0]
 [1]
 [1]
 [0]
 [0]
 [0]
 [1]
 [1]
 [0]
 [0]
 [1]
 [0]
 [1]
 [0]
 [0]
 [1]
 [0]
 [0]
 [0]
 [1]
 [1]
 [0]
 [0]
 [0]
 [0]
 [0]]
Actual labels:  [0 1 1 1 0 1 0 1 0 1 0 1 1 0 1 1 1 1 1 1 1 0 0 1 0 1 1 0 1 0 1 1 1 1 1 0 0
 0 1]
0.43852728468113084
12.398535116121783


In [149]:
def test_learning_rates(X_train, X_test, y_train, y_test, lrs):
    accuracy = []
    for lr in lrs:
        clf = LogisticRegression(lr=lr)
        clf.fit(np.array(X_train), np.array(y_train))
        clf.predict(np.array(X_test))
        print(clf.accuracy(np.array(X_test), np.array(y_test)))
    

        

In [155]:
lrs = [0.1, 0.01, 0.001, 0.05, 0.2, 0.002, 0.0000001]
results = test_learning_rates(X_train, X_test, y_train, y_test, lrs)
results

Predictions are:  [[1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [0]
 [0]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [0]
 [1]
 [1]
 [0]
 [0]
 [1]
 [0]
 [1]
 [1]
 [0]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [0]
 [0]
 [1]
 [1]]
Actual labels:  [0 1 1 1 0 1 0 1 0 1 0 1 1 0 1 1 1 1 1 1 1 0 0 1 0 1 1 0 1 0 1 1 1 1 1 0 0
 0 1]
0.5759368836291914
Predictions are:  [[1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [0]
 [0]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [0]
 [1]
 [1]
 [0]
 [1]
 [1]
 [0]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [1]
 [0]
 [1]
 [1]
 [1]]
Actual labels:  [0 1 1 1 0 1 0 1 0 1 0 1 1 0 1 1 1 1 1 1 1 0 0 1 0 1 1 0 1 0 1 1 1 1 1 0 0
 0 1]
0.5976331360946746
Predictions are:  [[1]
 [1]
 [1]
 [1]
 [0]
 [1]
 [1]
 [1]
 [1]
 [1]
 [0]
 [0]
 [0]
 [1]
 [1]
 [1]
 [0]
 [1]
 [0]
 [1]
 [1]
 [1]
 [0]
 [1]
 [0]
 [1]
 [1]
 [1]
 [1]
 [0]
 [1]
 [1]
 [1]
 [1]
 [0]
 [0]
 [0]
 [1]
 [1]]
Actual labels:  [0 1 1 1 0 1 0 1 0 1 0 1 1 0 1 1 1 1 1 1 1 0 0 1 0 1 1 0 1 0 1 1 1 1 1 0 0
 0 1]
0.5542406311637081
Prediction

0.5687047994740303

TypeError: object of type 'NoneType' has no len()

In [174]:
data1 = pd.read_csv('./4900A1/sonar.csv')

In [176]:
data1 

Unnamed: 0,0.0200,0.0371,0.0428,0.0207,0.0954,0.0986,0.1539,0.1601,0.3109,0.2111,...,0.0027,0.0065,0.0159,0.0072,0.0167,0.0180,0.0084,0.0090,0.0032,R
0,0.0453,0.0523,0.0843,0.0689,0.1183,0.2583,0.2156,0.3481,0.3337,0.2872,...,0.0084,0.0089,0.0048,0.0094,0.0191,0.0140,0.0049,0.0052,0.0044,R
1,0.0262,0.0582,0.1099,0.1083,0.0974,0.2280,0.2431,0.3771,0.5598,0.6194,...,0.0232,0.0166,0.0095,0.0180,0.0244,0.0316,0.0164,0.0095,0.0078,R
2,0.0100,0.0171,0.0623,0.0205,0.0205,0.0368,0.1098,0.1276,0.0598,0.1264,...,0.0121,0.0036,0.0150,0.0085,0.0073,0.0050,0.0044,0.0040,0.0117,R
3,0.0762,0.0666,0.0481,0.0394,0.0590,0.0649,0.1209,0.2467,0.3564,0.4459,...,0.0031,0.0054,0.0105,0.0110,0.0015,0.0072,0.0048,0.0107,0.0094,R
4,0.0286,0.0453,0.0277,0.0174,0.0384,0.0990,0.1201,0.1833,0.2105,0.3039,...,0.0045,0.0014,0.0038,0.0013,0.0089,0.0057,0.0027,0.0051,0.0062,R
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
202,0.0187,0.0346,0.0168,0.0177,0.0393,0.1630,0.2028,0.1694,0.2328,0.2684,...,0.0116,0.0098,0.0199,0.0033,0.0101,0.0065,0.0115,0.0193,0.0157,M
203,0.0323,0.0101,0.0298,0.0564,0.0760,0.0958,0.0990,0.1018,0.1030,0.2154,...,0.0061,0.0093,0.0135,0.0063,0.0063,0.0034,0.0032,0.0062,0.0067,M
204,0.0522,0.0437,0.0180,0.0292,0.0351,0.1171,0.1257,0.1178,0.1258,0.2529,...,0.0160,0.0029,0.0051,0.0062,0.0089,0.0140,0.0138,0.0077,0.0031,M
205,0.0303,0.0353,0.0490,0.0608,0.0167,0.1354,0.1465,0.1123,0.1945,0.2354,...,0.0086,0.0046,0.0126,0.0036,0.0035,0.0034,0.0079,0.0036,0.0048,M


In [180]:
x2 = data1.iloc[:,:60]

In [186]:
y2 = data1.iloc[:,-1]

In [188]:
cleanup = {'R' : 1, 'M' : 0}
y2.replace(cleanup, inplace=True)
y2

0      1
1      1
2      1
3      1
4      1
      ..
202    0
203    0
204    0
205    0
206    0
Name: R, Length: 207, dtype: int64

In [203]:
X_train, X_test, y_train, y_test = train_test_split(x2,y2, test_size=0.2)

In [204]:
clf = LogisticRegression()
clf.fit(np.array(X_train), np.array(y_train))
clf.predict(np.array(X_test))
clf.accuracy(np.array(X_test), np.array(y_test))

Predictions are:  [[0]
 [0]
 [0]
 [0]
 [0]
 [1]
 [1]
 [0]
 [1]
 [0]
 [0]
 [0]
 [1]
 [0]
 [1]
 [0]
 [1]
 [0]
 [1]
 [1]
 [0]
 [1]
 [0]
 [1]
 [1]
 [1]
 [1]
 [0]
 [1]
 [1]
 [0]
 [0]
 [0]
 [0]
 [1]
 [0]
 [1]
 [1]
 [0]
 [1]
 [1]
 [0]]
Actual labels:  [0 0 0 1 1 1 1 0 1 0 1 0 1 0 0 0 0 0 1 1 0 1 1 1 1 0 1 0 1 1 0 1 0 0 1 1 1
 1 0 0 1 1]


0.4977324263038549