My own library to implement machine learning
(Regression and Logistic Regression models for now)

In [8]:
import numpy as np; import pandas as pd;
from matplotlib import pyplot as plt;
import math;
#I generally add semicolons even in python. 

#designed for logistic Regression currently
class LogisticRegression:
    globalW = np.zeros(1);
    criteria = 0.5; 
    globalB = 0;
    features = 0; #(n )
    MeanOfFeaturesTaken = np.zeros(features); 
    SDofFeaturesTaken = np.zeros(features); 
    Normalized = False;  #a bool to store the method of regression used
    def __init__(self) -> None:
        self.globalW = np.zeros(1);
        self.globalB = 0;
        pass
    '''returns the sigmoid function for all elements of z(if z is a numpy array).
    useful for classification problems using logistic regression'''
    def sigmoid(z): 
        g = 1/(1 + np.exp(-z)); 
        return g;

    '''uses numpy's vectorization to effeciently carry out finding the total cost'''
    def FindTotalCostLogistic(X, y, w, b, regularizationFactor = 0):
        """
        Computes the cost over all examples
        Arguments as follows
        X : (ndarray Shape (m,n)) data, m examples by n features
        y : (array_like Shape (m,)) target value 
        w : (array_like Shape (n,)) Values of parameters of the model      
        b : scalar Values of bias parameter of the model
        regularizationFactor: used for regularization with higher order terms to prevent overfitting
        Returns:
        total_cost: (scalar)         cost 
        """
    

        m, n = X.shape; 
        newX = X.transpose(); 

        z_wb = np.dot(w,newX) + b; #Now z_wb is an m sized array
        f_wb = LogisticRegression.sigmoid(z_wb);    #f2_wb = np.log(1-f_wb); f1_wb = np.log(f_wb);
        f2_wb= np.log(1-f_wb); f1_wb = np.log(f_wb);
        #print(z_wb); 
        # for i in range(m):
        #     if(f_wb[i] == 1):
        #         f2_wb[i] = -1; 
        #     else:
        #         f2_wb[i] = np.log(1-f_wb[i]); 

        #f_wb[np.isnan(f_wb)] = 0; f2_wb[np.isnan(f2_wb)] = 0; f1_wb[np.isnan(f2_wb)] = 0; 
        loss = -y*f1_wb - (1-y)*f2_wb; 
        #important fact ist that this number must be positive always
        #print(loss); 
        #print(np.sum(loss)); 
        regularizedCost = (regularizationFactor/(2 * m))*np.dot(w,w); 
        total_cost = np.sum(loss)/m + regularizedCost; 
        return total_cost; 
   
    '''Computes the gradient for the input data'''
    def CalculateGradientLogistic(X, y, w, b, regularizationFactor=0): 
        """
        Computes the gradient for logistic regression 
    
        Args:
        X : (ndarray Shape (m,n)) variable such as house size 
        y : (array_like Shape (m,1)) actual value 
        w : (array_like Shape (n,1)) values of parameters of the model      
        b : (scalar)                 value of parameter of the model 
        regularizationFactor: for regularization. 0 for no regularization.
        Returns
        dJdW: (array_like Shape (n,1)) The gradient of the cost w.r.t. the parameters w. 
        dJdB: (scalar)                The gradient of the cost w.r.t. the parameter b. 
        """
        m, n = X.shape; 
        dJdW = np.zeros(w.shape); 
        T = X.transpose(); 
        allZ = np.dot(w,T) + b; #vectorized
        allF = LogisticRegression.sigmoid(allZ) - y; #vectorized
        dJdB = np.sum(allF)/m; #vectorized
        
        modifiedAllF = allF*T; #vectorized
        for j in range(n):
            dJdW[j] = np.sum(modifiedAllF[j])/m; #vectorized over the examples, but not over the features      
        
        regularizedAddition = w*regularizationFactor/m; 
        dJdW += regularizedAddition; #to implement regularization of the w coeffecients
        return dJdB, dJdW; 
    
    '''performs gradient descent to learn and improve the variables'''
    def gradient_descent(X, y, w_in, b_in, cost_function, gradient_function, alpha, num_iters, regularizationFactor): 
        """
        Performs batch gradient descent to learn theta. Updates theta by taking 
        num_iters gradient steps with learning rate alpha
        
        Args:
        X :    (array_like Shape (m, n)
        y :    (array_like Shape (m,))
        w_in : (array_like Shape (n,))  Initial values of parameters of the model
        b_in : (scalar)                 Initial value of parameter of the model
        cost_function:                  function to compute cost
        alpha : (float)                 Learning rate
        num_iters : (int)               number of iterations to run gradient descent
        regularizationFactor (scalar, float)         regularization constant
        
        Returns:
        w : (array_like Shape (n,)) Updated values of parameters of the model after
            running gradient descent
        b : (scalar)                Updated value of parameter of the model after
            running gradient descent
        """
        
        # number of training examples
        m = len(X); 
        
        # An array to store cost J and w's at each iteration primarily for graphing later
        J_history = []; 
        w_history = []; 
        
        for i in range(num_iters):

            # Calculate the gradient and update the parameters
            dJdB, dJdW = gradient_function(X, y, w_in, b_in, regularizationFactor); 
            # Update Parameters using w, b, alpha and gradient
            w_in = w_in - alpha * dJdW;               
            b_in = b_in - alpha * dJdB;             
        
            # Save cost J at each iteration for checking later
            if i<100000:      # prevent resource exhaustion 
                cost =  cost_function(X, y, w_in, b_in, regularizationFactor); 
                J_history.append(cost); 

            # Print cost every at intervals 10 times or as many iterations if < 10
            if i% math.ceil(num_iters/10) == 0 or i == (num_iters-1):
                w_history.append(w_in); 
                print(f"Iteration {i:4}: Cost {float(J_history[-1]):8.6f}   "); 
            
        return w_in, b_in, J_history, w_history; #return w and J,w history for graphing
    

    '''gives the final output 0 or 1 using the learned parameters
    returns a list of 0's and 1's that depict whether the ith input was predicted as 0 or 1'''
    def predict(self, X, w, b):
        if(w == None):
            w = self.globalW; 
        if(b == None):
            b = self.globalB;  
        """        
        Args:
        X : (ndarray Shape (m, n))
        w : (array_like Shape (n,))      Parameters of the model
        b : (scalar, float)              Parameter of the model
        Returns:
        p: (ndarray (m,1))
            The predictions for X using a threshold at 0.5
        """
        # number of training examples
        m, n = X.shape   
        p = np.zeros(m)
        #important, we should probably be trying to normalize everything first
        
        T = X.transpose();
        print("here is the SD")
        print(self.SDofFeaturesTaken); 
        #then convert each value
        for i in range(n):
            T[i] = (T[i] - self.MeanOfFeaturesTaken[i])/self.SDofFeaturesTaken[i]; 
        print(T[:5]); 
        allZ = np.dot(w,T) + b;
        allF = LogisticRegression.sigmoid(allZ);
        for i in range(0,m):
            if(allF[i] >= self.criteria):
                p[i] = 1;
        return p


    def predictNormalized(self, X, w, b):
        if(w == None):
            w = self.globalW;
        if(b == None):
            b = self.globalB;  
        # number of training examples
        m, n = X.shape   
        p = np.zeros(m)
        #important, we should probably be trying to normalize everything first
        
        T = X.transpose();
        #then convert each value
        for i in range(n):
            T[i] = (T[i] - self.MeanOfFeaturesTaken[i])/self.SDofFeaturesTaken[i]; 
        
        #after this everything should be normalized
        allZ = np.dot(w,T) + b;
        allF = LogisticRegression.sigmoid(allZ);
        for i in range(0,m):
            if(allF[i] >= self.criteria):
                p[i] = 1;
        return p
    def UnnormalizedRegressionTrain(self,x_train,y_train,alpha = 0.01,iterations=1000,regularizationFactor=0):
        #here x_train is an mxn array of numbers
        #and y_train is a list of 0's and 1's  corresponding to the outcome
        #first we calculate the values of w and 
        m,n = x_train.shape; 
        if(self.globalW.shape != (n,)):
            self.globalW = np.zeros(n); 
        
        # self.MeanOfFeaturesTaken = np.zeros(n); 
        # self.SDofFeaturesTaken = np.ones(n);  
        self.globalW,self.globalB,CostHistory,WHistory = LogisticRegression.gradient_descent(x_train,y_train,self.globalW,self.globalB,LogisticRegression.FindTotalCostLogistic,
        LogisticRegression.CalculateGradientLogistic,alpha,iterations,regularizationFactor);

        return CostHistory, WHistory;  
        #now that we have these parameters, we can call the predict function directly
    
    '''used for implementing normalization on the data so the ranges of each feature become comparable to each other
    results in faster gradient descent'''
    def NormalizedRegressionTrain(self, x_train, y_train, alpha = 0.01, iterations = 1000, regularizationFactor = 0):
        #first we need to convert each of xtrain inputs to (x - mu)/ sd where mu is average and sd is standard deviation
        #we achieve this through a simple loop
        self.Normalized = True; 
        m,n = x_train.shape; 
        self.MeanOfFeaturesTaken = np.zeros(n); 
        self.SDofFeaturesTaken = np.zeros(n);  
        T = x_train.transpose(); 
        for i in range(n):
            self.MeanOfFeaturesTaken[i] = np.sum(T[i])/m; 
        #after the mean has been processed, we can also find the standard deviation similarly
        #DisplacedT = T - self.MeanOfFeaturesTaken; 
        DisplacedT = np.zeros(n*m).reshape(n,m); 
        for i in range(n):
            #self.SDofFeaturesTaken[i] = math.sqrt(np.dot(T[i]-self.MeanOfFeaturesTaken[i],T[i]-self.MeanOfFeaturesTaken[i])/m); 
            DisplacedT[i] = T[i] - self.MeanOfFeaturesTaken[i];
            self.SDofFeaturesTaken[i] = math.sqrt(np.dot(DisplacedT[i],DisplacedT[i])/m); 
         
        #now the average and SD has been computed
        #take note that SD must not be 0, and hence no input should be constant for using this version of regression Training.
        #now transform the inputs before training
        for i in range(n):
            DisplacedT[i] /= self.SDofFeaturesTaken[i]; 
        X = DisplacedT.transpose(); 
        # print(self.MeanOfFeaturesTaken);   
        # print(self.SDofFeaturesTaken); 
        
        return self.UnnormalizedRegressionTrain(X,y_train,alpha,iterations,regularizationFactor); 
    
    '''converts features to higher degrees, regularization required to prevent overfitting'''
    def FeatureExtraDegree(self, X_train):
        #should form n*(n+1)/2 total features at the end
        m,n = X_train.shape; 
        final = np.zeros(m*(int(n*(n+1)/2) + n)).reshape((int(n*(n+1)/2) + n),m); 
        T = X_train.transpose(); 
        pos = 0; 
        for i in range(n):
            for j in range(i,n):
                #multiply the two terms together to get a new factor
                final[pos] = T[i]*T[j]; 
                pos+=1;
        for j in range(n):
            final[pos] = T[i]; 
            pos+=1; 
        return (final.transpose());  

    def FeatureHigherDegree(X_train, degree):
        #will be used to make higher degree terms of the feature combinations
        #can be done directly
        
        pass;


Testing on some short data:- 

In [11]:
x_train = np.array([[2,3,5,4,3],[3,5,3,4,7],[4,6,3,1,9],[4,3,4,5,5],[2,1,9,3,8],[4,1,11,6,10],[10,4,9,12,3]]); 
x_train = x_train.transpose(); 
y_train = np.array([1,0,0,0,1]); 
myLR = LogisticRegression(); 
myLR.NormalizedRegressionTrain(x_train,y_train,iterations=10000,alpha=0.001);

print(myLR.predict(x_train,None,None)); 

pd.DataFrame()

Iteration    0: Cost 0.692861   
Iteration 1000: Cost 0.503382   
Iteration 2000: Cost 0.405594   
Iteration 3000: Cost 0.341363   
Iteration 4000: Cost 0.294354   
Iteration 5000: Cost 0.258092   
Iteration 6000: Cost 0.229231   
Iteration 7000: Cost 0.205748   
Iteration 8000: Cost 0.186309   
Iteration 9000: Cost 0.169988   
Iteration 9999: Cost 0.156129   
here is the SD
[1.0198039  1.49666295 2.72763634 0.74833148 3.26190129 3.72021505
 3.49857114]
[[-1  0  1  0  0]
 [ 0  0  0  0  1]
 [ 0  0  0 -1  1]
 [ 0 -1  0  1  1]
 [ 0 -1  1  0  1]]
[1. 0. 0. 0. 1.]


Trying the Titanic Disaster Machine Learning competition on Kaggle,
Initially with only linear parameters of just 7 features


In [12]:
train_data = pd.read_csv("/OneDrive - IIT Delhi/Pictures/_PythonProjects/Machine Learning/CSV/TitanicTraining.csv",index_col=0) 
#m,n = train_data.size; 
train_data.head() 
usefulColumns = [1,3,4,5,6,8,10];
y_train = train_data.loc[:,'Survived']; 
train_data = train_data.iloc[:,usefulColumns]
m,n = train_data.shape
for i in range(1,m+1):
    c = train_data.loc[i,'Embarked'];
    if(c == 'S'):
        train_data.loc[i,'Embarked'] = 0; 
    elif(c == 'Q'):
        train_data.loc[i,'Embarked'] = -1; 
    else:
        train_data.loc[i,'Embarked'] = 1;
    if(train_data.loc[i,'Sex'] == 'male'):
        train_data.loc[i,'Sex'] = 0; 
    else:
        train_data.loc[i,'Sex'] = 1; 
train_data.Age =  train_data.Age.fillna(float(30))



#now we have effectively reduced the number of features to input to our model
#Survived should be a y_train for this
#the features I will use to test my ML library are Pclass, Sex, Age, SibSp, Parch, Fare and Embarked
train_data.Sex = train_data.Sex.astype('float64');
train_data.Age = train_data.Age.astype('float64');
train_data.Fare = train_data.Fare.astype('float64');
train_data.Embarked = train_data.Embarked.astype('float64');
train_data = train_data.astype('float64');
train_data.to_csv("/OneDrive - IIT Delhi/Pictures/_PythonProjects/Machine Learning/CSV/myTrainingDataOnlyFirst.csv")
train_data.head()
 

Unnamed: 0_level_0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,3.0,0.0,22.0,1.0,0.0,7.25,0.0
2,1.0,1.0,38.0,1.0,0.0,71.2833,1.0
3,3.0,1.0,26.0,0.0,0.0,7.925,0.0
4,1.0,1.0,35.0,1.0,0.0,53.1,0.0
5,3.0,0.0,35.0,0.0,0.0,8.05,0.0


In [13]:
newTrainData = pd.read_csv("/OneDrive - IIT Delhi/Pictures/_PythonProjects/Machine Learning/CSV/myTrainingDataOnlyFirst.csv"); 
newTrainData = newTrainData.drop('PassengerId',axis = 'columns')
arrayXdata = newTrainData.to_numpy();
arrayYdata =  y_train.to_numpy();  
print(arrayXdata.shape);
print(arrayYdata.shape);

(891, 7)
(891,)


In [14]:
#starting the machine learning algorithm
TitanicLR = LogisticRegression(); 
#TitanicLR.UnnormalizedRegressionTrain(arrayXdata,arrayYdata,0.001,100000,20);
TitanicLR.Normalized = True; 
TitanicLR.criteria = 0.5; 
AddedDegrees = TitanicLR.FeatureExtraDegree(arrayXdata); 
TitanicLR.NormalizedRegressionTrain(arrayXdata,arrayYdata,0.0001,100000,0); 

#then we test the prediction given on the testcase we have


#print(AddedDegrees.shape); 
# print(TitanicLR.Normalized); 

Iteration    0: Cost 0.693134   
Iteration 10000: Cost 0.597264   
Iteration 20000: Cost 0.546063   
Iteration 30000: Cost 0.515764   
Iteration 40000: Cost 0.496411   
Iteration 50000: Cost 0.483347   
Iteration 60000: Cost 0.474155   
Iteration 70000: Cost 0.467476   
Iteration 80000: Cost 0.462499   
Iteration 90000: Cost 0.458712   
Iteration 99999: Cost 0.455779   


In [19]:
#testing the prediction rate of our algorithm on the training data
predictedOutputTrain = TitanicLR.predict(arrayXdata,None,None); 
matchings = 0; 
actualDeaths = 0; predictedDeaths = 0; 
ActualDeathToLife = 0; actualLifeToDeath = 0;
for i in range(len(predictedOutputTrain)):
    if(predictedOutputTrain[i] == arrayYdata[i]):
        matchings += 1; 
    else:
        if(arrayYdata[i] == 1):
            actualLifeToDeath+=1;
        else:
            ActualDeathToLife += 1;
        
    if(predictedOutputTrain[i] == 0):
        predictedDeaths+=1;
    if(arrayYdata[i] == 0):
        actualDeaths += 1; 

print("% correct =" ,matchings*100/len(predictedOutputTrain)); 
print("predicted deaths =" , predictedDeaths); print("actual deaths=", actualDeaths); 
print("actually died but predicted lived ", ActualDeathToLife);
print("actually lived but predicted died ", actualLifeToDeath);
print(arrayXdata.transpose()[:5]); 

here is the SD
[ 0.83560193  0.47772176 12.99527138  1.10212444  0.80560476 49.66553444
  0.51606398]
[[-13.06452586 -17.97397556 -13.06452586 ... -13.06452586 -17.97397556
  -13.06452586]
 [-26.44431454  13.74632038  13.74632038 ...  13.74632038 -26.44431454
  -26.44431454]
 [ -2.48081895  -2.48077578  -2.48080816 ...  -2.48079737  -2.48080816
   -2.48079197]
 [ -1.35693797  -1.35693797  -1.97189794 ...  -1.35693797  -1.97189794
   -1.97189794]
 [ -3.82204932  -3.82204932  -3.82204932 ...   2.0720829   -3.82204932
   -3.82204932]]
% correct = 78.67564534231201
predicted deaths = 577
actual deaths= 549
actually died but predicted lived  81
actually lived but predicted died  109
[[-13.06452586 -17.97397556 -13.06452586 ... -13.06452586 -17.97397556
  -13.06452586]
 [-26.44431454  13.74632038  13.74632038 ...  13.74632038 -26.44431454
  -26.44431454]
 [ -2.48081895  -2.48077578  -2.48080816 ...  -2.48079737  -2.48080816
   -2.48079197]
 [ -1.35693797  -1.35693797  -1.97189794 ...  -1.356

Testing the code above ->

In [26]:
 
#then we remove the Name column and passenger ID as those are unnecessary
#and convert all the string inputs to numerical inputs
test_data = pd.read_csv("/OneDrive - IIT Delhi/Pictures/_PythonProjects/Machine Learning/CSV/test.csv",index_col=0);

test_data = test_data.drop('Name',axis = 'columns'); 
test_data = test_data.drop('Ticket',axis = 'columns'); 
test_data = test_data.drop('Cabin',axis = 'columns'); 
m,n = test_data.shape
test_data.Age =  test_data.Age.fillna(float(30))
#print(test_data.shape)
#test_data.loc['Embarked']; 
# for i in range(896,891+m+1):
#     c = train_data.loc[i,'Embarked'];
    # if(c == 'S'):
    #     train_data.loc[i,'Embarked'] = 0; 
    # elif(c == 'Q'):
    #     train_data.loc[i,'Embarked'] = -1; 
    # else:
    #     train_data.loc[i,'Embarked'] = 1;
#     if(train_data.loc[i,'Sex'] == 'male'):
#         train_data.loc[i,'Sex'] = 0; 
#     else:
#         train_data.loc[i,'Sex'] = 1; 
for i in range(892,1310):
    c = test_data.loc[i,'Embarked']; 
    if(c == 'S'):
        test_data.loc[i,'Embarked'] = 0; 
    elif(c == 'Q'):
        test_data.loc[i,'Embarked'] = -1; 
    else:
        test_data.loc[i,'Embarked'] = 1;
    if(test_data.loc[i,'Sex'] == 'male'):
        test_data.loc[i,'Sex'] = 0; 
    else:
        test_data.loc[i,'Sex'] = 1; 
test_data.Sex = test_data.Sex.astype('float64');
test_data.Age = test_data.Age.astype('float64');
test_data.Fare = test_data.Fare.astype('float64');
test_data.Embarked = test_data.Embarked.astype('float64');
test_data = test_data.astype('float64');
test_data.to_csv("/OneDrive - IIT Delhi/Pictures/_PythonProjects/Machine Learning/CSV/EditedTestingFile2.csv", index=False)
test_data = pd.read_csv("/OneDrive - IIT Delhi/Pictures/_PythonProjects/Machine Learning/CSV/EditedTestingFile2.csv", index_col=False)
print(test_data.shape); 
testingXdata = test_data.to_numpy(); 
print(TitanicLR.Normalized); 
#testingXdata = TitanicLR.FeatureExtraDegree(testingXdata);  #use when second degree features are considered
print(testingXdata.shape); 




(418, 7)
True
(418, 7)


In [27]:
outputY = TitanicLR.predict(testingXdata,None,None); 

here is the SD
[ 0.83560193  0.47772176 12.99527138  1.10212444  0.80560476 49.66553444
  0.51606398]
[[ 0.82737724  0.82737724 -0.36936484 ...  0.82737724  0.82737724
   0.82737724]
 [-0.73769513  1.35557354 -0.73769513 ... -0.73769513 -0.73769513
  -0.73769513]
 [ 0.36483356  1.3267219   2.48098791 ...  0.67263783  0.01855376
   0.01855376]
 [-0.4745452   0.43279337 -0.4745452  ... -0.4745452  -0.4745452
   0.43279337]
 [-0.47367361 -0.47367361 -0.47367361 ... -0.47367361 -0.47367361
   0.76762988]]


In [28]:
print(TitanicLR.globalW); 
print(TitanicLR.globalB); 

[-0.53805332  1.00468557 -0.25966058 -0.20307676 -0.03268501  0.24328424
  0.10012872]
-0.47084978609827316


Converting the output received back to a dataframe and then to a csv to upload

In [29]:
indices = [i for i in range(892,1310)]; 
#print(indices); 
finalOutputDataFrame = pd.DataFrame({'PassengerId': indices ,'Survived': outputY}); 

# print(outputY.size); 
# print(len(indices))
# print(testingXdata.shape)
finalOutputDataFrame.Survived = finalOutputDataFrame.Survived.astype('Int64'); 
finalOutputDataFrame.PassengerId = finalOutputDataFrame.PassengerId.astype('Int64'); 
finalOutputDataFrame.to_csv("/OneDrive - IIT Delhi/Pictures/_PythonProjects/Machine Learning/CSV/TitanicPredictions/AnswerFirstDegree4.csv", index=False)
finalOutputDataFrame.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
