In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import math
from sklearn.model_selection import train_test_split
from dataclasses import dataclass
import matplotlib as plt

In [None]:
# Uncomment to install the Excel reader used by pd.read_excel (no spaces around "=="):
# %pip install xlrd==1.2.0

In [None]:
@dataclass
class LogisticRegression:
    """Binary logistic regression fitted with batch gradient descent.

    The dataset is read from an Excel workbook when the instance is created,
    and a bias column of ones is prepended to both feature matrices.

    Attributes created after construction:
        train_X, train_y: training features (with bias column) and labels.
        test_X, test_y: second-sheet features (with bias column) and labels.
        w: weight vector, created by ``fit``.
    """
    # Step size applied to every gradient-descent update.
    learningRate: float
    # Training stops once the cost changes by less than this between iterations.
    tolerance: float
    # Upper bound on the number of gradient-descent iterations.
    maxIteration: int
    # Path of the Excel workbook read by ``readDataset``.
    filePath: str

    def __post_init__(self):
        """Load both data splits and prepend the bias column.

        ``__post_init__`` is the hook the generated dataclass ``__init__``
        calls; any other spelling is silently ignored.
        """
        self.train_X, self.train_y, self.test_X, self.test_y = self.readDataset()
        self.train_X = self.addX0(self.train_X)
        self.test_X = self.addX0(self.test_X)

    def readDataset(self):
        """Read the two sheets of ``filePath`` and split labels from features.

        In each sheet the first column is taken as the label and the remaining
        columns as features.

        Returns:
            Tuple ``(train_X, train_y, test_X, test_y)`` of float64 arrays.

        Raises:
            ValueError: if a sheet contains values that cannot be cast to float.
        """
        train_df = pd.read_excel(self.filePath, sheet_name='2004--2005 Data')
        test_df = pd.read_excel(self.filePath, sheet_name='2004--2007 Data')
        # Cast once here so exp/log below always operate on a numeric array.
        train = train_df.values.astype(np.float64)
        test = test_df.values.astype(np.float64)

        train_X, train_y = train[:, 1:], train[:, 0]
        test_X, test_y = test[:, 1:], test[:, 0]

        return train_X, train_y, test_X, test_y

    def addX0(self, X):
        """Return ``X`` with a leading column of ones (the bias term)."""
        return np.column_stack([np.ones(X.shape[0]), X])

    def sigmoid(self, z):
        """Return the logistic function ``1 / (1 + exp(-z))`` element-wise.

        Computed as ``exp(-log(1 + exp(-z)))`` through ``np.logaddexp`` so
        that large negative ``z`` does not overflow ``exp``.
        """
        return np.exp(-np.logaddexp(0.0, -z))

    def predict(self, X):
        """Return the predicted probability of class 1 for every row of ``X``.

        Requires ``self.w`` to be set (``fit`` initialises it).
        """
        return self.sigmoid(X.dot(self.w))

    def costFunction(self, X, y):
        """Return the negative log-likelihood of labels ``y`` given ``X``.

        Uses the identity
        ``-[y*log(s) + (1-y)*log(1-s)] = log(1 + exp(z)) - y*z`` with
        ``z = X.w`` and ``s = sigmoid(z)``, which avoids taking ``log(0)``
        when a prediction saturates.
        """
        z = X.dot(self.w)
        return float(np.sum(np.logaddexp(0.0, z) - y * z))

    def gradient(self, X, y):
        """Return the gradient of ``costFunction`` with respect to ``self.w``."""
        # predict() already applies the sigmoid; it must not be applied again.
        residual = self.predict(X) - y
        return residual.dot(X)

    def gradientDescent(self, X, y):
        """Update ``self.w`` in place by batch gradient descent.

        Runs at most ``maxIteration`` steps and stops early when the absolute
        change in cost between two consecutive steps is below ``tolerance``.
        """
        last = float('inf')

        for _ in tqdm(range(self.maxIteration)):
            self.w = self.w - self.learningRate * self.gradient(X, y)
            current = self.costFunction(X, y)
            diff = abs(last - current)
            last = current
            if diff < self.tolerance:
                print("model stopped learning")
                break

    def fit(self):
        """Train on the training split and print F1, precision and recall on it."""
        print('solve using Gradient Descent')
        self.w = np.ones(self.train_X.shape[1], dtype=np.float64)
        self.gradientDescent(self.train_X, self.train_y)

        f_score, precision, recall = self.evaluate(self.train_X, self.train_y)

        print("F1 score {} and precision {}, recall {}".format(f_score, precision, recall))

    def evaluate(self, X, y):
        """Return ``(f_score, precision, recall)`` for the positive class.

        A row is predicted positive when its probability is at least 0.5.
        Any ratio whose denominator is zero is reported as 0.0 instead of
        producing NaN.
        """
        actual = (np.asarray(y) == 1)
        predicted = (self.predict(X) >= 0.5)

        true_pos = int(np.sum(actual & predicted))
        n_predicted = int(np.sum(predicted))
        n_actual = int(np.sum(actual))

        precision = true_pos / n_predicted if n_predicted else 0.0
        recall = true_pos / n_actual if n_actual else 0.0

        if precision + recall == 0:
            f_score = 0.0
        else:
            f_score = 2 * (precision * recall) / (precision + recall)

        return f_score, precision, recall
    
    # def plot functions from lab 2 hw

In [None]:
# Hyper-parameters and workbook path are spelled out by keyword for readability.
lr = LogisticRegression(
    learningRate=0.00001,
    tolerance=0.0005,
    maxIteration=50000,
    filePath='type_in_ur_file_name.xls',
)

In [None]:
# Run gradient descent on the training split, then print F1 score, precision and recall.
lr.fit()