## Implementing Logistic Regression with KFold

In [63]:
# Importing libraries
import numpy as np
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score

In [135]:
## Logistic regression class

class logisticRegressionClf:
    """Binary logistic regression trained by per-sample stochastic gradient descent.

    Labels are expected to be encoded as -1 / +1. No intercept term is fitted;
    the model is ``P(y = +1 | x) = 1 / (1 + exp(-w . x))``.

    Parameters
    ----------
    eta : float, default 0.1
        Learning rate applied to every per-sample gradient step.
    nitermax : int, default 50
        Number of full passes over the training data.

    Attributes
    ----------
    w : numpy.ndarray or None
        Weight vector, one entry per feature; ``None`` until ``fit`` is called.
    marginmax : float
        Largest margin ``y_i * (w . x_i)`` observed during the last ``fit``
        (stays 0 if no margin was ever positive).
    """

    def __init__(self, eta = 0.1, nitermax = 50):
        self.eta = eta
        self.nitermax = nitermax
        self.w = None
        self.marginmax = 0

    def fit(self, X, y):
        """Learn the weight vector from training data.

        Parameters
        ----------
        X : array-like of shape (m, n)
            Training samples, one row per sample.
        y : array-like of shape (m,)
            Labels in {-1, +1}.

        Returns
        -------
        logisticRegressionClf
            The fitted estimator itself, so calls can be chained.
        """
        # Accept lists / other array-likes, not only ndarrays.
        X = np.asarray(X, dtype=float)
        y = np.asarray(y)

        # m samples, n features; one weight per feature.
        m, n = X.shape
        w = np.zeros(n)
        self.marginmax = 0

        for _ in range(self.nitermax):
            for i in range(m):
                xymargin = y[i] * np.dot(w, X[i])
                if xymargin > self.marginmax:
                    self.marginmax = xymargin

                # sigmoid(-margin) = 1 / (1 + exp(margin)), evaluated through
                # logaddexp so that large margins do not overflow in exp().
                philog = np.exp(-np.logaddexp(0.0, xymargin))

                # Gradient of the logistic loss log(1 + exp(-margin)) w.r.t. w.
                deltaJ = -philog * y[i] * X[i]
                w = w - self.eta * deltaJ

        self.w = w
        return self

    def predict(self, X, y = None):
        """Predict a -1 / +1 label for every row of ``X``.

        Parameters
        ----------
        X : array-like of shape (m, n)
            Samples to classify.
        y : ignored
            Not used; kept so existing callers passing it keep working.

        Returns
        -------
        numpy.ndarray
            Integer labels: +1 where ``w . x > 0``, otherwise -1.

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        """
        if self.w is None:
            raise RuntimeError("logisticRegressionClf.predict called before fit")

        xw = np.dot(np.asarray(X, dtype=float), self.w)

        # P(+1) > P(-1) holds exactly when w . x > 0, so the sign decides the
        # label. This avoids evaluating exp() on large scores (overflow) and
        # keeps ties (w . x == 0) mapped to -1.
        return 2 * (xw > 0) - 1

    @staticmethod
    def main():
        """Run 5-fold cross-validation on the breast-cancer data and print F1 scores.

        Prints the F1 score of every fold followed by their mean. Takes no
        arguments and returns nothing; callable on the class or on an instance.
        """
        # Imported here because the dataset is only needed by this demo.
        from sklearn.datasets import load_breast_cancer

        X, y = load_breast_cancer(return_X_y=True)
        # Convert {0,1} output into {-1,+1}
        y = 2 * y - 1

        # Max-abs scaling: divide every feature by its largest absolute value.
        # NOTE(review): the scale is computed on the full data set, i.e. before
        # the train/test split.
        scale = np.max(np.abs(X), axis=0)
        scale[scale == 0] = 1.0  # leave all-zero columns untouched (no 0/0)
        X = X / scale

        nitermax = 50  # number of passes over the training data
        eta = 0.1      # learning rate
        nfold = 5      # number of folds

        # Split the data into consecutive, unshuffled folds.
        cselection = KFold(n_splits=nfold, random_state=None, shuffle=False)

        clogreg = logisticRegressionClf(eta=eta, nitermax=nitermax)

        xf1 = np.zeros(nfold)

        for n, (index_train, index_test) in enumerate(cselection.split(X)):
            Xtrain, ytrain = X[index_train], y[index_train]
            Xtest, ytest = X[index_test], y[index_test]

            clogreg.fit(Xtrain, ytrain)
            yprediction = clogreg.predict(Xtest)

            # f1_score expects (y_true, y_pred) in that order.
            f1 = f1_score(ytest, yprediction)

            print("F1 score is:", f1)
            xf1[n] = f1

        print("The average F1: ", np.mean(xf1))

In [136]:
# `clf` is bound to the class object itself (no instance is created);
# the cross-validation demo is then started through the class.
clf = logisticRegressionClf
logisticRegressionClf.main()

F1 score is: 0.8823529411764706
F1 score is: 0.9285714285714286
F1 score is: 0.9605263157894737
F1 score is: 0.9941520467836257
F1 score is: 0.983050847457627
The average F1:  0.949730715955725
