In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import cv2
import os
import tarfile
import io
import pandas as pd
import numpy as np
from numpy import dot as mmult
from numpy.linalg import inv
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from scipy.special import expit
import warnings

# Report the versions of the key libraries in one call so the environment
# that produced the results below is recorded alongside them.
print(
    f"cv2 version: {cv2.__version__}",
    f"pandas version: {pd.__version__}",
    f"numpy version: {np.__version__}",
    f"matplotlib version: {plt.matplotlib.__version__}",
    f"seaborn version: {sns.__version__}",
    f"missingno version: {msno.__version__}",
    f"sklearn version: {sklearn.__version__}",
    sep="\n",
)

cv2 version: 4.6.0
pandas version: 2.1.4
numpy version: 1.26.3
matplotlib version: 3.8.0
seaborn version: 0.12.2
missingno version: 0.4.2
sklearn version: 1.2.2


In [2]:
# Download (or load from the local scikit-learn cache) the red wine quality
# dataset from OpenML, then separate the feature table from the target column.
wine_quality = fetch_openml(name='wine-quality-red', version=1, parser='auto')
X, Y = wine_quality.data, wine_quality.target

In [3]:
def handle_missing_values(X, Y):
    """
    Remove every row that has a missing value in either X or Y.

    A warning with the number of missing cells is printed when anything is
    missing. Rows are matched between X and Y by index label, so both inputs
    are expected to share the same index.

    Parameters
    ----------
    X : pandas.DataFrame or pandas.Series
        Feature data.
    Y : pandas.Series or pandas.DataFrame
        Target data, indexed like X.

    Returns
    -------
    tuple
        (X_cleaned, Y_cleaned). When nothing is missing the inputs are
        returned unchanged (the very same objects). Otherwise both are
        filtered to the rows that are complete in X *and* Y, and their
        index is reset to 0..n-1.
    """
    def _row_has_missing(data):
        # One boolean per row: True when the row holds at least one missing value.
        flags = data.isnull()
        return flags.any(axis=1) if flags.ndim > 1 else flags

    # Count missing cells; going through to_numpy() yields a plain scalar
    # whether the input is a Series or a DataFrame.
    missing_in_X = int(X.isnull().to_numpy().sum())
    missing_in_Y = int(Y.isnull().to_numpy().sum())

    if missing_in_X == 0 and missing_in_Y == 0:
        return X, Y

    # Print warning if missing values exist in X or Y
    print("Warning: Input data contains missing values!")
    if missing_in_X > 0:
        print(f"Missing values in X: {missing_in_X}")
    if missing_in_Y > 0:
        print(f"Missing values in Y: {missing_in_Y}")
    print()

    # Keep only the rows that are complete on both sides. Filtering on X alone
    # would leave rows whose target is missing in the result.
    keep = ~(_row_has_missing(X) | _row_has_missing(Y))
    X_cleaned = X.loc[keep].reset_index(drop=True)
    Y_cleaned = Y.loc[keep].reset_index(drop=True)

    return X_cleaned, Y_cleaned

# Produce the cleaned feature/target pair used by every later cell.
X_cleaned, Y_cleaned = handle_missing_values(X, Y)

In [4]:
def is_same_number_rows(X, Y):
    """
    Tell whether X and Y contain the same number of rows.

    A pandas Series passed as Y is first wrapped in a DataFrame; any other
    object is used as given and only needs a ``shape`` attribute.

    Returns
    -------
    bool
        True when the first dimension of X equals the first dimension of Y.
    """
    y_table = pd.DataFrame(Y) if isinstance(Y, pd.Series) else Y
    rows_in_x = X.shape[0]
    rows_in_y = y_table.shape[0]
    return rows_in_x == rows_in_y
# Sanity check: the cleaned features and targets must still line up row for row.
print(f'is_same_number_rows - {is_same_number_rows(X_cleaned, Y_cleaned)}')

is_same_number_rows - True


In [5]:
def evaluate_model(X, Y, label='Dataset', test_size=0.33, random_state=33, cv=5):
    """
    Split the data and report cross-validated random-forest scores.

    The data is split once into train and test portions. A
    RandomForestClassifier is then cross-validated on the training portion
    only, and the per-fold scores plus their mean are printed. The test
    portion is not used here; it is simply handed back to the caller.

    Parameters
    ----------
    X, Y : features and targets to split.
    label : str
        Prefix used in the printed score lines.
    test_size : float
        Passed to train_test_split.
    random_state : int
        Seed used for both the split and the random forest.
    cv : int
        Passed to cross_val_score.

    Returns
    -------
    tuple
        (X_train, X_test, y_train, y_test) from the split.
    """
    splits = train_test_split(X, Y, test_size=test_size, random_state=random_state)
    X_tr, X_te, y_tr, y_te = splits

    forest = RandomForestClassifier(random_state=random_state)
    fold_scores = cross_val_score(forest, X_tr, y_tr, cv=cv)

    print(f"{label} Cross-Validation Scores:", fold_scores)
    print(f"{label} Mean Score:", fold_scores.mean())
    print()

    return X_tr, X_te, y_tr, y_te

In [6]:
# Hold out 30% of the cleaned data for testing; the remaining 70% is what the
# random-forest baseline is cross-validated on inside evaluate_model.
# NOTE(review): the label mentions "PCA 10", but no PCA transform is applied
# in the cells shown here -- confirm whether the label is stale.
test_size = 0.3
X_train, X_test, y_train, y_test = evaluate_model(X_cleaned, Y_cleaned, f'PCA 10 & test_size: {test_size}', test_size)

PCA 10 & test_size: 0.3 Cross-Validation Scores: [0.67857143 0.69642857 0.63392857 0.65625    0.68609865]
PCA 10 & test_size: 0.3 Mean Score: 0.6702554452274183



In [7]:
class BinaryLogisticRegressionBase:
    """
    Prediction-only core of a binary logistic regression model.

    This class stores the hyper-parameters and knows how to turn a weight
    vector ``self.w_`` into probabilities and predictions. It never sets
    ``self.w_`` itself; a subclass is expected to do so when training.
    """

    def __init__(self, eta, iterations=20):
        # eta: learning rate, iterations: number of update steps to run.
        # The weights are kept in self.w_ (sklearn naming) once they exist.
        self.iters = iterations
        self.eta = eta

    def __str__(self):
        return 'Base Binary Logistic Regression Object, Not Trainable'

    # --- private static helpers -------------------------------------------
    @staticmethod
    def _sigmoid(theta):
        """Logistic function 1 / (1 + e^-theta), applied element-wise."""
        denominator = 1+np.exp(-theta)
        return 1/denominator

    @staticmethod
    def _add_intercept(X):
        """Return X with a leading column of ones (the bias term)."""
        bias_column = np.ones((X.shape[0],1))
        return np.hstack((bias_column, X))

    # --- public API -------------------------------------------------------
    def predict_proba(self, X, add_intercept=True):
        """
        Probability that y=1 for the given samples.

        Set ``add_intercept=False`` when X already carries the bias column.
        """
        if add_intercept:
            design = self._add_intercept(X)
        else:
            design = X
        scores = design @ self.w_
        return self._sigmoid(scores)

    def predict(self,X):
        """Boolean prediction: True where the probability exceeds 0.5."""
        probabilities = self.predict_proba(X)
        return (probabilities>0.5)

In [8]:
class BinaryLogisticRegression(BinaryLogisticRegressionBase):
    """
    Trainable binary logistic regression.

    Training starts from all-zero weights and takes ``self.iters`` steps of
    size ``self.eta`` along the gradient, which is accumulated one sample at
    a time.
    """

    # private:
    def __str__(self):
        # The weights only exist once fit() has been called.
        if not hasattr(self,'w_'):
            return 'Untrained Binary Logistic Regression Object'
        return 'Binary Logistic Regression Object with coefficients:\n'+ str(self.w_)

    def _get_gradient(self,X,y):
        """
        Average over all samples of (y_i - g(x_i)) * x_i.

        X must already include the bias column. The result has the same
        shape as ``self.w_``.
        """
        accumulated = np.zeros(self.w_.shape)
        for (sample, target) in zip(X,y):
            # contribution of this single sample to the sum
            error = target - self.predict_proba(sample,add_intercept=False)
            contribution = error*sample
            # shape it like the weight vector before adding it in
            accumulated += contribution.reshape(self.w_.shape)

        return accumulated/float(len(y))

    # public:
    def fit(self, X, y):
        """Learn ``self.w_`` from features X and binary targets y."""
        Xb = self._add_intercept(X) # prepend the bias column
        _n_rows, n_cols = Xb.shape

        # start from an all-zero column vector of weights
        self.w_ = np.zeros((n_cols,1))

        # fixed number of update steps
        for _ in range(self.iters):
            step = self._get_gradient(Xb,y)*self.eta # scale by learning rate
            self.w_ += step

In [9]:
class VectorBinaryLogisticRegression(BinaryLogisticRegression):
    """
    Binary logistic regression with a vectorized gradient.

    Inherits training and prediction from the parent class and replaces two
    pieces: the sigmoid (scipy's ``expit``, for numerical stability) and the
    gradient, which is computed for all samples at once instead of in a
    Python loop.
    """

    @staticmethod
    def _sigmoid(theta):
        """Numerically stable logistic function."""
        return expit(theta) # same value as 1/(1+np.exp(-theta))

    def _get_gradient(self,X,y):
        """
        Mean over all samples of (y_i - g(x_i)) * x_i.

        Parameters
        ----------
        X : array of samples that already includes the bias column.
        y : binary targets; a pandas Series, NumPy array or list is accepted.

        Returns
        -------
        numpy.ndarray with the same shape as ``self.w_``.
        """
        # Coerce the targets to a flat float array first. Relying on the
        # pandas-only ``.values`` attribute would fail for array/list targets.
        targets = np.asarray(y, dtype=float).ravel()
        ydiff = targets - self.predict_proba(X,add_intercept=False).ravel() # per-sample error
        # make ydiff a column vector so it scales every feature of its row
        gradient = np.mean(X * ydiff[:,np.newaxis], axis=0)

        return gradient.reshape(self.w_.shape)

In [10]:
class LogisticRegression:
    """
    Multi-class logistic regression built from one-vs-rest binary models.

    One VectorBinaryLogisticRegression is trained per distinct class value,
    each separating "this class" from "everything else". Prediction picks the
    class whose binary model reports the highest probability.

    Attributes set by fit
    ---------------------
    unique_ : the distinct class values found in y.
    classifiers_ : list of trained binary models, in the order of unique_.
    w_ : weight matrix with one row per class.
    """

    def __init__(self, eta, iterations=20):
        # eta: learning rate, iterations: update steps for each binary model.
        # The weights are kept in self.w_ to keep with sklearn conventions.
        self.eta = eta
        self.iters = iterations

    def __str__(self):
        if(hasattr(self,'w_')):
            # weights exist only once the object has been trained
            return 'MultiClass Logistic Regression Object with coefficients:\n'+ str(self.w_)
        else:
            return 'Untrained MultiClass Logistic Regression Object'

    def fit(self,X,y):
        """
        Train one binary classifier per class value found in y.

        Returns
        -------
        self, so calls can be chained as with scikit-learn estimators.
        """
        self.unique_ = np.unique(y) # every distinct class value
        self.classifiers_ = [] # one trained binary model per class

        for yval in self.unique_:
            y_binary = (y==yval) # this class versus all the others
            blr = VectorBinaryLogisticRegression(self.eta,
                                                 self.iters)
            blr.fit(X,y_binary)
            self.classifiers_.append(blr)

        # Stack the per-class weight columns side by side, then transpose so
        # that each row of w_ holds the weights of one class.
        self.w_ = np.hstack([clf.w_ for clf in self.classifiers_]).T
        return self

    def predict_proba(self,X):
        """
        Per-class probabilities, one column per class in the order of unique_.

        Each column comes from an independent binary model, so the values in
        a row are not normalized to sum to 1.
        """
        probs = [blr.predict_proba(X) for blr in self.classifiers_]
        return np.hstack(probs) # make into single matrix

    def predict(self,X):
        """Class value with the highest probability for every row of X."""
        return self.unique_[np.argmax(self.predict_proba(X),axis=1)] # take argmax along row

In [11]:
# Train the one-vs-rest model with learning rate 0.1 for 500 iterations on the
# training split, then print its weight matrix.
# NOTE(review): StandardScaler is imported but no scaling is applied in the
# cells shown here; unscaled features may slow convergence -- confirm.
lr = LogisticRegression(0.1,500)
lr.fit(X_train,y_train)
print(lr)

MultiClass Logistic Regression Object with coefficients:
[[-1.45432965e-03 -1.18119713e-01  1.39371979e-01 -3.81536705e-02
   9.06825276e-02  1.34509130e-02 -4.91475036e-02 -3.31481763e-02
  -1.20810866e-03  2.54739563e-02 -3.22423224e-02 -2.55328269e-01]
 [ 3.38519293e-02 -3.39633977e-01  2.86163892e-01 -1.66800376e-01
   2.68078961e-01 -1.16778797e-02 -2.72381031e-01 -7.55762883e-01
   3.36574317e-02  2.02793969e-01 -9.83362286e-02 -1.54341591e-01]
 [ 3.14323323e-01  5.15845869e-01  1.04436854e+00 -4.64736768e-01
  -3.14902734e-01  1.37080936e-01 -3.35854868e+00  9.78120731e-01
   3.18376287e-01  1.01525532e+00 -5.84115861e-01 -4.60110580e+00]
 [-5.03777114e-02  9.00197174e-02 -5.75357001e-01  1.91315229e-01
  -1.09370557e+00 -3.18439228e-02  2.39272077e+00 -4.84025928e-01
  -5.13516620e-02 -1.78933587e-01  3.29475702e-01  1.97475380e+00]
 [-4.58353040e-01 -8.61309727e-01 -9.06891774e-01  4.57168544e-01
   3.55602060e-01 -9.83622481e-02 -7.92311752e-01 -2.72254522e+00
  -4.59634489e-

In [12]:
# Score the trained model on the held-out test split.
yhat = lr.predict(X_test)
print('Accuracy of: ',accuracy_score(y_test,yhat))

Accuracy of:  0.40208333333333335
