# COMP551 Group101 Logistic Regression, Evaluation, and Experiment
 Eric Shen 260798146


## Logistic Regression


In [0]:
import numpy as np
import pandas as pd
import csv
from tqdm import tqdm
import math
import seaborn as sns
import matplotlib.pyplot as plt

class Logistic(object):
    def __init__(self, learningR, Iterations):
        self.learningRate = learningR
        self.gradientDescentIterations = Iterations
        self.weights = []

    def sigmoid(self, gamma):
        if gamma < 0:
            return 1 - 1 / (1 + math.exp(gamma))
        else:
            return 1 / (1 + math.exp(-gamma))

    def addup(self, MatrixW, MatrixX, Y):
        addOn = 0.0
        for num in range(0, len(MatrixX)):
            addOn = addOn + MatrixW[num] * MatrixX[num]
        addOn = Y - self.sigmoid(addOn)
        MatrixAdd = []

        for num in range(0, len(MatrixX)):
            MatrixAdd.append(MatrixX[num] * addOn)

        return MatrixAdd

    def fit(self, trainingDataMatrixX, trainingDataMatrixY):
        numOfRow = len(trainingDataMatrixX)
        numOfColumn = len(trainingDataMatrixX[0])
        MatrixAddAll = []
        if len(self.weights) > 0:
            for num in range(0, numOfColumn + 1):
                MatrixAddAll.append(0)
        else:
            for num in range(0, numOfColumn + 1):
                self.weights.append(0.01)
                MatrixAddAll.append(0)

        for num in range(0, self.gradientDescentIterations):
            for numOne in range(0, numOfRow):
                MaX = trainingDataMatrixX[numOne]
                MaX = np.append([1],MaX)
                matrixAdd = self.addup(self.weights, MaX, trainingDataMatrixY[numOne])
                for numTwo in range(0, numOfColumn + 1):
                    MatrixAddAll[numTwo] = matrixAdd[numTwo] + MatrixAddAll[numTwo]
            for numThree in range(0, numOfColumn + 1):
                MatrixAddAll[numThree] = self.learningRate * MatrixAddAll[numThree]
            for numFour in range(0, numOfColumn + 1):
                self.weights[numFour] = self.weights[numFour] + MatrixAddAll[numFour]
            for numFive in range(0, numOfColumn + 1):
                MatrixAddAll[numFive] = 0
        return

    def predict(self, trainingDataMatrixX):
        outPutY = []
        for numOne in range(0, len(trainingDataMatrixX)):
            MatrixTemp = trainingDataMatrixX[numOne]
            sig = self.weights[0]
            for num in range(0, len(MatrixTemp)):
                sig = sig + self.weights[num + 1] * MatrixTemp[num]
            if self.sigmoid(sig) >= 0.5:
                outPutY.append(1)
            else:
                outPutY.append(0)
        return outPutY


## Normaliztion from Edwin's code


In [0]:
import numpy as np

#==============================================================================================================
#
#   Vector Normalizer
#
#     Takes an input vector of numbers and normalizes its values between 0 and 1.
#
#==============================================================================================================
def normalize_vector(vector):
  #Obtain Normalization Values
  min_value = vector[0]
  max_value = vector[0]
  for i in range(len(vector)):
    if vector[i] < min_value:
      min_value = vector[i]
    elif vector[i] > max_value:
      max_value = vector[i]
  #Normalize all vector elements
  for i in range(len(vector)):
    vector[i] = ( vector[i] - min_value )/(max_value-min_value)
  return vector

## Processes for four datasets

### train_test_split

In [0]:
def train_test_split(mydataset: np.ndarray, k: int, Normalize: bool):

    #Normalize all feature input columns
    if(Normalize):
      for column in range( len(mydataset[0]) - 1 ):
        vector = []
        for instance in range(len(mydataset)):
          vector.append( mydataset[instance][column] )
        vector = normalize_vector(vector)
        for instance in range(len(mydataset)):
          mydataset[instance][column] = vector[instance]

    # First I did a shuffle for whole set
    np.random.shuffle(mydataset)
    rows = mydataset.shape[0]
    # I add k/10 of set to train set, the rest is test set
    mydataset_train = mydataset[: (int)(k * rows/10), :]
    mydataset_test = mydataset[(int)(k * rows/10):, :]
    # Then I split both train set and test set into features and classification
    mydataset_train_x = mydataset_train[:, :-1]
    mydataset_train_y = mydataset_train[:, -1]
    mydataset_test_x = mydataset_test[:, :-1]
    mydataset_test_y = mydataset_test[:, -1]

    return mydataset_train_x, mydataset_train_y, mydataset_test_x, mydataset_test_y


### Process Ionosphere

In [0]:
def process_ionosphere():

    ionosphere = pd.read_csv("ionosphere.csv", header=None)
    ionosphere          = ionosphere.drop([1],axis=1).to_numpy()

    classIndex = len(ionosphere[0]) -1
    for i in range(len(ionosphere)): 
      if( ionosphere[i][classIndex] == "g" ): 
          ionosphere[i][classIndex] = 1
      else:
          ionosphere[i][classIndex] = 0


    ionosphere = np.array(ionosphere[0:])

    return train_test_split(ionosphere, 9, True)

### One Hot Encoding for adult data

In [0]:
import pandas as pd
from sklearn import preprocessing
import matplotlib.pyplot as plt
import numpy as np


def clean(data):
    data = data.dropna(axis='index')
    return data.reset_index(drop=True)

def preprocess(data):
    le = preprocessing.LabelEncoder()
    le.fit(data['workclass'])
    data['workclass'] = le.transform(data['workclass'])
    le.fit(data['education'])
    data['education'] = le.transform(data['education'])
    le.fit(data['marital-status'])
    data['marital-status'] = le.transform(data['marital-status'])
    le.fit(data['occupation'])
    data['occupation'] = le.transform(data['occupation'])
    le.fit(data['relationship'])
    data['relationship'] = le.transform(data['relationship'])
    le.fit(data['race'])
    data['race'] = le.transform(data['race'])
    le.fit(data['sex'])
    data['sex'] = le.transform(data['sex'])
    le.fit(data['native-country'])
    data['native-country'] = le.transform(data['native-country'])
    le.fit(data['id'])
    data['id'] = le.transform(data['id'])
    temp = data.drop(columns='id').copy()
    data = data[temp.apply(lambda x: np.abs(x - x.mean()) / x.std() < 3).all(axis=1)]
    return data.reset_index(drop=True)


def one_hot_encoder(data):
    values = data.to_numpy()

    label_encoder = LabelEncoder()
    integer_encoded = label_encoder.fit_transform(values)

    onehot_encoder = OneHotEncoder(sparse=False)
    integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
    onehot_encoded = onehot_encoder.fit_transform(integer_encoded)
    return onehot_encoded


### Process Adult

In [0]:
def process_Adult():
  data = pd.read_csv('adult.data', engine='python', sep=',\s', na_values=['?'],
                    names=['age', 'workclass', 'fnlwgt', 'education', 'education-num', 
                           'marital-status', 'occupation','relationship', 'race', 'sex', 
                           'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'id'])
  target = data['id']

  data = clean(data)
  data = preprocess(data)

  workclass = one_hot_encoder(data['workclass'])
  data = pd.concat([data, pd.DataFrame(workclass)], axis=1)

  education = one_hot_encoder(data['education'])
  data = pd.concat([data, pd.DataFrame(education)], axis=1)

  marital_status = one_hot_encoder(data['marital-status'])
  data = pd.concat([data, pd.DataFrame(marital_status)], axis=1)

  occupation = one_hot_encoder(data['occupation'])
  data = pd.concat([data, pd.DataFrame(occupation)], axis=1)

  relationship = one_hot_encoder(data['relationship'])
  data = pd.concat([data, pd.DataFrame(relationship)], axis=1)

  race = one_hot_encoder(data['race'])
  data = pd.concat([data, pd.DataFrame(race)], axis=1)

  native_country = one_hot_encoder(data['native-country'])
  data = pd.concat([data, pd.DataFrame(native_country)], axis=1)

  del data['workclass']
  del data['education']
  del data['marital-status']
  del data['occupation']
  del data['race']
  del data['relationship']
  del data['native-country']
  ids = data['id'].copy()
  del data["id"]
  data.insert(data.shape[1], "id", ids)

  adult_dataset_result = data.to_numpy()

  return train_test_split(adult_dataset_result, 9, True)

### Process wines


In [0]:
def process_wines():
    with open("winequality-white.csv", 'r') as f:
        wines = list(csv.reader(f, delimiter=";"))
    global wines_header
    wines_header = np.array(wines[0])  # with label header
    wines = np.array(wines[1:], dtype=np.float)  # with label

    # clean malinformed values by deleting the rows they inhabit
    invalid_index = []
    for i in range(len(wines)):
        for number in wines[i]:
            if math.isnan(number):
                np.delete(wines, i, 0)

    # differentiate labels
    for i in tqdm(range(len(wines[:, -1]))):
        if wines[:, -1][i] > 5:
            wines[:, -1][i] = 1
        else:
            wines[:, -1][i] = 0

    return train_test_split(wines, 9, True)


### Process Breast Cancer

In [0]:
def process_cancer():
    with open("breast-cancer-wisconsin.csv", 'r') as f:
        tumors = list(csv.reader(f, delimiter=";"))

    global tumors_header
    tumors_header = ["clump thickness", "cell size", "cell shape", "marginal adhesion", \
                     "single epithelial cell size", "number of bare nuclei", "bland chromatin", \
                     "number of normal nuclei", "mitosis", "label"]  # with label header but no IDs
    invalid_index = []
    for i in tqdm(range(len(tumors))):
        tumors[i] = tumors[i][0].split(",")
        for j in range(len(tumors[i])):
            if tumors[i][j].isnumeric() == False:
                invalid_index.append(i) 
        # change labels into binary
        if int(tumors[i][-1]) <= 2:
            tumors[i][-1] = '0'
        else:
            tumors[i][-1] = '1'

    # I clean malformed values by deleting the whole row
    invalid_index.sort(reverse=True)
    for i in invalid_index:
        tumors.remove(tumors[i])

    tumors = np.array(tumors[0:], dtype=np.float)
    tumors = tumors[:, 1:]

    return train_test_split(tumors, 9, True)

## Evaluation

In [0]:
def evaluation(prediction: np.ndarray, groundtruth: np.ndarray):
    # sanity check
    if len(prediction) != len(groundtruth):
        raise TypeError
    
    tn,fp,fn,tp = 0,0,0,0 # these stands for true negative, false positive, false negative, true positive
    
    for i in range(len(prediction)):
        if prediction[i] == 0 and groundtruth[i] == 0:
            tn += 1
        if prediction[i] == 1 and groundtruth[i] == 0:
            fp += 1
        if prediction[i] == 0 and groundtruth[i] == 1:
            fn += 1
        if prediction[i] == 1 and groundtruth[i] == 1:
            tp += 1
    return tn,fp,fn,tp

In [0]:
def merge_chunks(data_split,indices):
    indices = list(indices).sort()
    if len([indices]) < 2:
        return data_split[0]
    data_merged = data_split[indices[0]]
    indices.remove(indices[0])
    for i in indices:
        data_merged = np.concatenate(data_merged,data_split[i],axis=0)
        
    return data_merged

In [0]:
def confusion_m(prediction: np.ndarray, groundtruth: np.ndarray):
    tn,fp,fn,tp = evaluation(prediction,groundtruth)
    confusion_matrix = [[tp, fp],[fn,tn]]
    return confusion_matrix

In [0]:
def evaluate_acc(prediction: np.ndarray, groundtruth: np.ndarray):
    tn,fp,fn,tp = evaluation(prediction,groundtruth)
    return 1.0*(tp+tn)/(tp+tn+fp+fn)

## Cross Validation

In [0]:
def cross_validation(model,x: np.ndarray,y: np.ndarray, k: int):
    
    data = np.zeros((len(x),len(x[0])+1))
    #combine and save to "data"
    for i in range(len(x)):
        data[i] = np.append(x[i],[y[i]])
    # I shuffle the whole dataset then just split    
    np.random.shuffle(data)
    data_split = np.array_split(data,k)
    indices = set(range(k))
    # the list contains all the output accuracies by k folds
    acc_list = [] 
    for fold in range(k):
        # merge the numpy arrays except for the validation set for training
        other_indices = indices - set([fold])
        training_set = merge_chunks(data_split,other_indices)
        test_set = data_split[fold]
        x_train = training_set[:,:-1]
        y_train = training_set[:,-1]
        x_test = test_set[:,:-1]
        y_test = test_set[:,-1]
        
        model.fit(x_train,y_train)
        y_prediction = model.predict(x_test)
        
        acc_list.append(evaluate_acc(y_prediction,y_test))
    return sum(acc_list) / len(acc_list)

## Four Test

In [553]:
import time
ionosphere_train_x, ionosphere_train_y, ionosphere_test_x, ionosphere_test_y = process_ionosphere()
start = time.time()
clf = Logistic(0.001,1000)
print("LR on Ionosphere's training set by using 5-fold CV: ",cross_validation(clf,ionosphere_train_x,ionosphere_train_y,5))
end = time.time()
print("Final accuracy on Ionosphere's testing set: ", evaluate_acc(clf.predict(ionosphere_test_x),ionosphere_test_y))
print("LR on Ionosphere train set time: ", (end - start)/5)

LR on Ionosphere's training set by using 5-fold CV:  0.8603174603174603
Final accuracy on Ionosphere's testing set:  0.8611111111111112
LR on Ionosphere train set time:  2.4480565547943116


In [564]:
import time
Adult_train_x, Adult_train_y, Adult_test_x, Adult_test_y  = process_Adult()
start = time.time()
clf = Logistic(0.001,1000)
print("LR on Adult",cross_validation(clf,Adult_train_x,Adult_train_y,5))
end = time.time()
print("Final accuracy on Adult's testing set: ", evaluate_acc(clf.predict(Adult_test_x),Adult_test_y))
print("LR on Adult train set time: ", (end - start)/5)

LR on Adult 0.819357996220147
Final accuracy on Adult's testing set:  0.8260869565217391
LR on Adult train set time:  363.43937520980836


In [559]:
import time
cancer_train_x, cancer_train_y, cancer_test_x, cancer_test_y = process_cancer()
start = time.time()
clf = Logistic(0.001,1000)
print("LR on Cancer's training set by using 5-fold CV: ",cross_validation(clf,cancer_train_x, cancer_train_y,5))
end = time.time()
print("Final accuracy on Cancer's testing set: ", evaluate_acc(clf.predict(cancer_test_x),cancer_test_y))
print("LR on Cancer train set time: ", (end - start)/5)

100%|██████████| 699/699 [00:00<00:00, 138653.04it/s]


LR on Cancer's training set by using 5-fold CV:  0.9608556577369052
Final accuracy on Cancer's testing set:  1.0
LR on Cancer train set time:  2.233937931060791


In [560]:
import time
x_wines_train, y_wines_train, x_wines_test, y_wines_test = process_wines()
start = time.time()
clf = Logistic(0.001,1000)
print("LR on wines' training set by using 5-fold CV: ",cross_validation(clf,x_wines_train,y_wines_train,5))
end = time.time()
print("Final accuracy on Wine's testing set: ", evaluate_acc(clf.predict(x_wines_test),y_wines_test))
print("LR on wines train set time: ", (end - start)/5)

100%|██████████| 4898/4898 [00:00<00:00, 510770.52it/s]


LR on wines' training set by using 5-fold CV:  0.7454675551643283
Final accuracy on Wine's testing set:  0.7163265306122449
LR on wines train set time:  17.35471272468567
