My own library to implement machine learning
(Regression and Logistic Regression models for now)

In [13]:
import numpy as np; import pandas as pd;
from matplotlib import pyplot as plt;
import math;
#I generally add semicolons even in python. 

#designed for logistic Regression currently
class LogisticRegression:
    """Binary logistic regression trained with batch gradient descent.

    The numerical routines (``sigmoid``, ``FindTotalCostLogistic``,
    ``CalculateGradientLogistic`` and ``gradient_descent``) are static and are
    called through the class. ``UnnormalizedRegressionTrain`` stores the
    learned parameters on the instance as ``globalW`` / ``globalB`` and
    ``predict`` reads them back when no explicit parameters are given.
    """

    # Class-level defaults; every instance overrides them in __init__.
    globalW = 0
    globalB = 0

    # Costs are only recorded for the first this-many iterations so that the
    # history list cannot grow without bound on very long runs.
    _MAX_COST_HISTORY = 100000

    def __init__(self) -> None:
        self.globalW = 0
        self.globalB = 0

    @staticmethod
    def sigmoid(z):
        """Return the sigmoid 1 / (1 + exp(-z)), element-wise if z is a numpy array.

        Useful for classification problems using logistic regression.
        """
        return 1 / (1 + np.exp(-z))

    @staticmethod
    def FindTotalCostLogistic(X, y, w, b, regularizationFactor=0):
        """
        Computes the (optionally L2-regularized) logistic cost over all examples.

        Arguments as follows
        X : (ndarray Shape (m,n)) data, m examples by n features
        y : (array_like Shape (m,)) target value
        w : (array_like Shape (n,)) Values of parameters of the model
        b : scalar Values of bias parameter of the model
        regularizationFactor: used for regularization to prevent overfitting,
            0 for no regularization
        Returns:
        total_cost: (scalar)         cost
        """
        m, _ = X.shape
        z_wb = np.dot(X, w) + b  # one z per example, shape (m,)

        # log(sigmoid(z)) == -logaddexp(0, -z) and
        # log(1 - sigmoid(z)) == -logaddexp(0, z).
        # Using logaddexp avoids log(0) = -inf (and the resulting 0 * inf = NaN)
        # when the sigmoid saturates to exactly 0 or 1.
        loss = y * np.logaddexp(0, -z_wb) + (1 - y) * np.logaddexp(0, z_wb)

        regularizedCost = (regularizationFactor / (2 * m)) * np.dot(w, w)
        total_cost = np.sum(loss) / m + regularizedCost
        return total_cost

    @staticmethod
    def CalculateGradientLogistic(X, y, w, b, regularizationFactor=0):
        """
        Computes the gradient of the logistic cost for the input data.

        Args:
        X : (ndarray Shape (m,n)) data, m examples by n features
        y : (array_like Shape (m,)) actual value
        w : (array_like Shape (n,)) values of parameters of the model
        b : (scalar)                value of parameter of the model
        regularizationFactor: for regularization. 0 for no regularization.
        Returns
        dJdB: (scalar)                The gradient of the cost w.r.t. the parameter b.
        dJdW: (array_like Shape (n,)) The gradient of the cost w.r.t. the parameters w.
        """
        m, _ = X.shape
        # Prediction error per example, shape (m,).
        error = LogisticRegression.sigmoid(np.dot(X, w) + b) - y
        dJdB = np.sum(error) / m
        # Fully vectorized over examples and features; the second term is the
        # regularization of the w coefficients (b is not regularized).
        dJdW = np.dot(X.T, error) / m + w * regularizationFactor / m
        return dJdB, dJdW

    @staticmethod
    def gradient_descent(X, y, w_in, b_in, cost_function, gradient_function, alpha, num_iters, regularizationFactor):
        """
        Performs batch gradient descent to learn w and b by taking
        num_iters gradient steps with learning rate alpha. Prints the cost
        roughly ten times over the run.

        Args:
        X :    (array_like Shape (m, n)
        y :    (array_like Shape (m,))
        w_in : (array_like Shape (n,))  Initial values of parameters of the model
        b_in : (scalar)                 Initial value of parameter of the model
        cost_function:                  function to compute cost, called as
                                        cost_function(X, y, w, b, regularizationFactor)
        gradient_function:              function to compute the gradient, called as
                                        gradient_function(X, y, w, b, regularizationFactor)
                                        and returning (dJdB, dJdW)
        alpha : (float)                 Learning rate
        num_iters : (int)               number of iterations to run gradient descent
        regularizationFactor (scalar, float)         regularization constant

        Returns:
        w : (array_like Shape (n,)) Updated values of parameters of the model after
            running gradient descent
        b : (scalar)                Updated value of parameter of the model after
            running gradient descent
        J_history : (list) cost after each of the first 100000 iterations
        w_history : (list) values of w at the iterations where the cost was printed
        """
        # Histories of cost J and of w, primarily for graphing later.
        J_history = []
        w_history = []

        # Loop-invariant reporting interval; max(1, ...) only matters for
        # num_iters == 0, where the loop body never runs anyway.
        report_every = max(1, math.ceil(num_iters / 10))

        for i in range(num_iters):
            # Calculate the gradient and update the parameters.
            dJdB, dJdW = gradient_function(X, y, w_in, b_in, regularizationFactor)
            w_in = w_in - alpha * dJdW
            b_in = b_in - alpha * dJdB

            # Save cost J at each iteration, capped to prevent resource exhaustion.
            if i < LogisticRegression._MAX_COST_HISTORY:
                J_history.append(cost_function(X, y, w_in, b_in, regularizationFactor))

            # Report about ten times over the run, and always on the last iteration.
            if i % report_every == 0 or i == (num_iters - 1):
                w_history.append(w_in)
                print(f"Iteration {i:4}: Cost {float(J_history[-1]):8.2f}   ")

        return w_in, b_in, J_history, w_history

    def predict(self, X, w=None, b=None):
        """
        Gives the final output 0 or 1 for every row of X using a threshold at 0.5.

        Args:
        X : (ndarray Shape (m, n))
        w : (array_like Shape (n,))      Parameters of the model; None means
                                         use the parameters stored by training
        b : (scalar, float)              Parameter of the model; None means
                                         use the parameter stored by training
        Returns:
        p: (ndarray (m,))
            1.0 where the predicted probability is >= 0.5, else 0.0
        Raises:
        ValueError: if the weights do not have shape (n,), e.g. because the
            model has not been trained and no w was passed.
        """
        # `is None` rather than `== None`: comparing an ndarray with `==`
        # is element-wise and cannot be used as an `if` condition.
        if w is None:
            w = self.globalW
        if b is None:
            b = self.globalB

        m, n = X.shape
        w = np.asarray(w, dtype=float)
        if w.shape != (n,):
            raise ValueError(
                f"expected weights of shape ({n},) but got {w.shape}; "
                "train the model first or pass w explicitly"
            )

        probabilities = LogisticRegression.sigmoid(np.dot(X, w) + b)
        return (probabilities >= 0.5).astype(float)

    def UnnormalizedRegressionTrain(self, x_train, y_train, alpha=0.01, iterations=1000, regularizationFactor=0):
        """
        Trains the model on the features as given (no normalization is applied)
        and stores the learned parameters in self.globalW and self.globalB,
        so that predict can afterwards be called without explicit parameters.

        Args:
        x_train : (ndarray Shape (m, n)) m examples by n features
        y_train : (array_like Shape (m,)) 0's and 1's corresponding to the outcome
        alpha : (float)                 Learning rate
        iterations : (int)              number of gradient descent steps
        regularizationFactor : (scalar) regularization constant, 0 for none
        Returns:
        CostHistory, WHistory : the cost and w histories from gradient_descent
        """
        m, n = x_train.shape
        # Start from all-zero parameters.
        w = np.zeros(n)
        b = 0
        self.globalW, self.globalB, CostHistory, WHistory = LogisticRegression.gradient_descent(
            x_train, y_train, w, b,
            LogisticRegression.FindTotalCostLogistic,
            LogisticRegression.CalculateGradientLogistic,
            alpha, iterations, regularizationFactor,
        )
        return CostHistory, WHistory
    

Testing on a small toy dataset:

In [14]:
# Each inner list below holds one feature measured on the five examples;
# transposing gives the (examples, features) layout the model expects.
x_train = np.array([
    [2, 3, 5, 4, 3],
    [3, 5, 3, 4, 7],
    [4, 6, 3, 1, 9],
    [4, 3, 4, 5, 5],
    [2, 1, 9, 3, 8],
    [4, 1, 11, 6, 10],
    [10, 4, 9, 12, 3],
]).transpose()
y_train = np.array([1, 0, 0, 0, 1])

myLR = LogisticRegression()
myLR.UnnormalizedRegressionTrain(x_train, y_train, iterations=10000, alpha=0.001)

# Predict on the training inputs with the parameters stored by training;
# the result should reproduce y_train.
print(myLR.predict(x_train, None, None))

Iteration    0: Cost     0.69   
Iteration 1000: Cost     0.44   
Iteration 2000: Cost     0.34   
Iteration 3000: Cost     0.27   
Iteration 4000: Cost     0.22   
Iteration 5000: Cost     0.19   
Iteration 6000: Cost     0.16   
Iteration 7000: Cost     0.14   
Iteration 8000: Cost     0.12   
Iteration 9000: Cost     0.11   
Iteration 9999: Cost     0.10   
[1. 0. 0. 0. 1.]


Trying the Titanic Disaster Machine Learning competition on Kaggle

In [27]:
# Directory holding the Titanic CSV files; edit this one constant to run elsewhere.
DATA_DIR = "/OneDrive - IIT Delhi/Pictures/_PythonProjects/Machine Learning/CSV"

train_raw = pd.read_csv(f"{DATA_DIR}/TitanicTraining.csv", index_col=0)
test_data = pd.read_csv(f"{DATA_DIR}/test.csv", index_col=0)

# Survived is the target (y_train) for this problem.
y_train = train_raw.loc[:, 'Survived']

# Positions of the features used to test my ML library:
# Pclass, Sex, Age, SibSp, Parch, Fare and Embarked.
usefulColumns = [1, 3, 4, 5, 6, 8, 10]

# Encode the text columns as numbers, fill missing ages, and make everything
# float64. Done column-wise instead of row by row, so it does not depend on
# the PassengerId index being exactly 1..m and never writes into a slice.
train_data = (
    train_raw.iloc[:, usefulColumns]
    .assign(
        # 'S' -> 0, 'Q' -> -1, anything else (including missing) -> 1
        Embarked=lambda df: df['Embarked'].map({'S': 0, 'Q': -1}).fillna(1),
        # 'male' -> 0, anything else -> 1
        Sex=lambda df: (df['Sex'] != 'male').astype('float64'),
        # missing ages are replaced by 30
        Age=lambda df: df['Age'].fillna(float(30)),
    )
    .astype('float64')
)
m, n = train_data.shape

# Now we have effectively reduced the number of features to input to our model.
train_data.to_csv(f"{DATA_DIR}/myTrainingDataOnlyFirst.csv")
train_data.head()

Unnamed: 0_level_0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,3.0,0.0,22.0,1.0,0.0,7.25,0.0
2,1.0,1.0,38.0,1.0,0.0,71.2833,1.0
3,3.0,1.0,26.0,0.0,0.0,7.925,0.0
4,1.0,1.0,35.0,1.0,0.0,53.1,0.0
5,3.0,0.0,35.0,0.0,0.0,8.05,0.0


In [34]:
# Reload the saved feature table, discard the PassengerId column, and turn
# both the features and the labels into NumPy arrays for the model.
newTrainData = pd.read_csv(
    "/OneDrive - IIT Delhi/Pictures/_PythonProjects/Machine Learning/CSV/myTrainingDataOnlyFirst.csv"
).drop(columns='PassengerId')
arrayXdata = newTrainData.to_numpy()
arrayYdata = y_train.to_numpy()

# Sanity check: features should be (m, n) and labels (m,).
for arr in (arrayXdata, arrayYdata):
    print(arr.shape)

(891, 7)
(891,)


In [41]:
# Train the logistic regression model on the Titanic features
# (no regularization, 100000 gradient descent steps).
TitanicLR = LogisticRegression()
TitanicLR.UnnormalizedRegressionTrain(
    arrayXdata,
    arrayYdata,
    alpha=0.001,
    iterations=100000,
    regularizationFactor=0,
);

Iteration    0: Cost     0.67   
Iteration 10000: Cost     0.52   
Iteration 20000: Cost     0.48   
Iteration 30000: Cost     0.46   
Iteration 40000: Cost     0.46   
Iteration 50000: Cost     0.45   
Iteration 60000: Cost     0.45   
Iteration 70000: Cost     0.45   
Iteration 80000: Cost     0.45   
Iteration 90000: Cost     0.45   
Iteration 99999: Cost     0.45   
