#Prerequisites

In [None]:
#First we import all the required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
from google.colab import drive
drive.mount('/content/drive')

# **Linear Regression**
In this part we will train a linear regression model with multiple features

In [None]:
# Load the linear-regression training set and split it into inputs and labels
df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/linear_train.csv") # Read the CSV from the mounted Google Drive
linear_train_data = np.array(df) # Convert the DataFrame into a 2-D numpy array
# Column 0 is skipped here (presumably an id column, as in the test file -- TODO confirm)
X_train = linear_train_data[:,1:21] # Columns 1..20: the 20 input features
Y_train = linear_train_data[:,21] # Column 21: the target label
Y_train = Y_train.reshape(Y_train.size,1) # Make the labels a column vector of shape (m,1)
# Print the first 5 rows of the inputs and labels, followed by the full shapes
print("Input\n",X_train[:5])
print("labels\n",Y_train[:5])
print(X_train.shape)
print(Y_train.shape)

## Creating the model
As we are making a linear model, we will first initialize a random array **w** having n = 20 features and a random scalar value **b**

In [None]:
# Initialize the starting parameters w (20 weights) and b (bias)
np.random.seed(1) # Fix the seed so that repeated executions give the same starting point
w_initial = 0.01*(np.random.randn(20,1)-0.5) # Small weights: standard normal draws shifted by -0.5 and scaled by 0.01
b_initial = np.random.randn() # Bias is a single unscaled standard normal draw (not necessarily small)
print(w_initial)
print(b_initial)

##Predict function
We know f_wb is given as f_wb = **w** * **x** + b
Here we create a predict function to make a prediction on the given data


In [None]:
def predict(X,w,b):
  """
  Evaluate the linear model X.w + b for every row of X.

  Arguments
  X(2D numpy array of examples) Shape(m,n)
  w(numpy array of model parameters) Shape(n,1)
  b(model parameter) scalar

  Returns
  f_wb(numpy array of predictions) Shape(m,1)
  """
  num_examples, _ = X.shape
  # The reshape guarantees a column vector even when w is passed as a 1-D array
  return (X @ w).reshape(num_examples, 1) + b

Now lets test our model on the first 5 training examples of our data

In [None]:
  f_wb = predict(X_train[:5],w_initial,b_initial)
  print("Prediction:-\n",f_wb)
  print("Expected Output:-\n",Y_train[:5])

##Computing Cost
As you can see the prediction is far off from the expected output so now we compute the cost function , as a metric to measure how far off is our model from the training examples 

In [None]:
def compute_cost(X,Y,w,b):
  """
  Squared-error cost of the linear model on (X, Y): sum((f_wb - Y)^2) / (2m).

  Arguments
  X(2-D numpy array containing the examples) Shape(m,n)
  Y(numpy array containing the output label of every example) Shape(m,1)
  w(numpy array of model parameters) Shape(n,1)
  b(model parameter) scalar

  Returns
  cost(scalar)
  """
  num_examples = len(Y)
  residuals = predict(X, w, b) - Y
  # The reshape rejects a mis-shaped Y (e.g. 1-D), which would otherwise broadcast to (m,m)
  squared_errors = np.square(residuals).reshape(num_examples, 1)
  return squared_errors.sum() / (2 * num_examples)

Computing Cost of our initial prediction:

In [None]:
J = compute_cost(X_train,Y_train,w_initial,b_initial)
print("Cost:",J)

##Gradient descent
After knowing the cost of the model, we now define a function to reduce the cost using gradient descent

In [None]:
def gradient_descent(X,Y,w_in,b_in,alpha,iters,predict,compute_cost):
  """
  Batch gradient descent for linear regression.

  Arguments
  X(2-D numpy array of training examples) Shape(m,n)
  Y(numpy array of labels) Shape(m,1)
  w_in(initial weights, copied so the caller's array is not modified) Shape(n,1)
  b_in(initial bias) scalar
  alpha(learning rate) scalar
  iters(number of iterations) int
  predict(callable predict(X,w,b) returning predictions of shape (m,1))
  compute_cost(callable compute_cost(X,Y,w,b) returning a scalar cost)

  Returns
  w(final weights), b(final bias), J_history(list with the cost after every iteration)
  """
  J_history = [] # Cost after every iteration (for later plotting and analysis)
  w = w_in.copy()
  b = b_in
  m = X.shape[0] # Number of training examples
  # Progress is printed roughly 10 times per run. Integer ceil-division is used
  # instead of np.math.ceil because the np.math alias was removed in NumPy 2.0.
  report_every = max(1, (iters + 9) // 10)
  for i in range(iters):
    # Compute gradients dj_dw and dj_db
    err = predict(X,w,b) - Y
    dj_dw = (np.matmul(err.T,X).T)/m
    dj_db = np.sum(err)/m
    # Update parameters w,b (simultaneously: both gradients were computed from the old values)
    w = w-alpha*dj_dw
    b = b-alpha*dj_db
    # Record the cost obtained with the updated parameters
    J_history.append(compute_cost(X, Y, w, b))
    if i%report_every == 0:
      print("Iteration :",i," Cost :",J_history[-1])
  return w, b, J_history

##Tuning Hyperparameters

###Choosing the learning rate(alpha)
Let's start by choosing the learning rate alpha by trying 3 examples 1e-7 , 1e-8 and 1e-6 and see which fits best

In [None]:
##For alpha = 1e-7
w_final,b_final,J_history = gradient_descent(X_train,Y_train,w_initial,b_initial,0.0000001,1000,predict,compute_cost)
plt.figure(figsize=(12,4))
plt.plot(J_history)
plt.title("Cost v.s iteration")
plt.ylabel("Cost")
plt.xlabel("Number of iterations")
plt.show()

In [None]:
# Now trying for alpha = 1e-6 and reduce the number of iterations to 100 (will be justified)
w_final,b_final,J_history = gradient_descent(X_train,Y_train,w_initial,b_initial,0.000001,100,predict,compute_cost)
plt.figure(figsize=(12,4))
plt.plot(J_history)
plt.title("Cost v.s iteration")
plt.ylabel("Cost")
plt.xlabel("Number of iterations")
plt.show()

As you can see the model is clearly overshooting,the cost keeps on increasing , so we can conclude the learning rate is too high and we don't need to go above 100 iterations

In [None]:
# Now trying for alpha = 1e-8 
w_final,b_final,J_history = gradient_descent(X_train,Y_train,w_initial,b_initial,0.00000001,1000,predict,compute_cost)
plt.figure(figsize=(12,4))
plt.plot(J_history)
plt.title("Cost v.s iteration")
plt.ylabel("Cost")
plt.xlabel("Number of iterations")
plt.show()

As you can see, after some iterations the cost function decreases very slowly. Although a higher number of iterations might further decrease the cost, it would be computationally intensive, and a higher learning rate might lead to faster convergence.

From the above results we can conclude the learning rate 1e-7 is ideal for our model

###Deciding the number of iterations
Let's try 4 different values for the number of iterations (100, 1000, 10000 and 100000) and measure performance in each of the 4 cases

In [None]:
##For 100 iterations
w_final,b_final,J_history = gradient_descent(X_train,Y_train,w_initial,b_initial,0.0000001,100,predict,compute_cost)
plt.figure(figsize=(12,4))
plt.plot(J_history)
plt.title("Cost v.s iteration")
plt.ylabel("Cost")
plt.xlabel("Number of iterations")
plt.show()

Here it seems the function could still be decreasing further and might yield better results at higher number of iterations

In [None]:
##For 1000 iterations
w_final,b_final,J_history = gradient_descent(X_train,Y_train,w_initial,b_initial,0.0000001,1000,predict,compute_cost)
plt.figure(figsize=(12,4))
plt.plot(J_history)
plt.title("Cost v.s iteration")
plt.ylabel("Cost")
plt.xlabel("Number of iterations")
plt.show()

Here it seems that the cost function has pretty much converged, however let's zoom in to the tail end of the plot

In [None]:
fig, (ax1, ax2) = plt.subplots(1, 2, constrained_layout=True, figsize=(12,4)) # Two side-by-side subplots: ax1 for the start of training, ax2 for the tail
ax1.plot(J_history[:200]) # 1st subplot: cost over the first 200 iterations
ax2.plot(J_history[200:]) # 2nd subplot: cost from iteration 200 onwards (x axis restarts at 0)
ax1.set_title("Cost vs. iteration(start)");  ax2.set_title("Cost vs. iteration (end)")
ax1.set_ylabel('Cost')            ;  ax2.set_ylabel('Cost') 
ax1.set_xlabel('Number of iterations')  ;  ax2.set_xlabel('Number of iterations') 
plt.show()

It seems the cost function could further decrease if we increased the number of iterations. Let's try 10000 iterations

In [None]:
##For 10000 iterations
w_final,b_final,J_history = gradient_descent(X_train,Y_train,w_initial,b_initial,0.0000001,10000,predict,compute_cost)
plt.figure(figsize=(12,4))
plt.plot(J_history)
plt.title("Cost v.s iteration")
plt.ylabel("Cost")
plt.xlabel("Number of iterations")
plt.show()

As we can see that our cost function has come down significantly lets try to go even higher and run our model for 100,000 iterations and see the improvements

In [None]:
##For 100000 iterations
w_final,b_final,J_history = gradient_descent(X_train,Y_train,w_initial,b_initial,0.0000001,100000,predict,compute_cost)
plt.figure(figsize=(12,4))
plt.plot(J_history)
plt.title("Cost v.s iteration")
plt.ylabel("Cost")
plt.xlabel("Number of iterations")
plt.show()

Now we will take the pretrained parameters w and b and again run gradient descent for 100,000 iterations to see if the cost further decreases significantly

In [None]:
#Using pretrained parameters we run the gradient descent for another 100000 iterations
w_final,b_final,J_history = gradient_descent(X_train,Y_train,w_final,b_final,0.0000001,100000,predict,compute_cost)
plt.figure(figsize=(12,4))
plt.plot(J_history)
plt.title("Cost v.s iteration")
plt.ylabel("Cost")
plt.xlabel("Number of iterations")
plt.show()

We can see that the cost function saturates after a point, so there is no benefit in running a higher number of iterations

###Visualization of parameters
Here we will plot 20 different plots depicting the ith feature of X against the output label y(Predicted by our model) and we will also show the actual training data on the plot. 

In [None]:
for i in range(20):
  plt.scatter(X_train[:,i],Y_train, marker='.',c='r',label="Training data")
  plt.xlabel(f"Feature no:-{i+1}")
  plt.ylabel("Output")
  a = np.zeros(20)
  a[i] = 1
  a = a.reshape(20,1)
  f_wb = predict(X_train,a*w_final,b_final)
  plt.plot(X_train[:,i],f_wb,label="Prediction")
  plt.legend()
  plt.show()

##Feature scaling
Taking a closer look at the data we find out that the data in X_train is spread across a huge range (from near 0 to upto 1000).It seems that applying z-score normalization to all the features of x will allow it to come under similar ranges and hence give us a better chance at making more accurate predictions.(**Note:- Even the test data must be z-score normalized before being used as input for the model**)

In [None]:
def z_score_normalization(X):
  """
  Standardise every feature (column) of X to zero mean and unit standard deviation.

  Arguments
  X(the input data which is to be normalized): Shape (m,n)

  Returns
  X_normalized(the standardised data): Shape (m,n)
  mean(per-feature mean): Shape (1,n)
  std_dev(per-feature population standard deviation): Shape (1,n)

  A feature that is constant has zero standard deviation; its std_dev is
  reported as 1 so that the column normalises to 0 instead of NaN, and so that
  the returned statistics can be reused on other data without dividing by zero.
  """
  X = np.asarray(X, dtype=float)
  if X.ndim != 2:
    raise ValueError("X must be a 2-D array of shape (m,n)")
  mean = X.mean(axis=0).reshape(1,-1)
  # np.std computes the population standard deviation from the centred data,
  # which avoids the cancellation (and possible negative variance) of E[x^2]-E[x]^2
  std_dev = X.std(axis=0).reshape(1,-1)
  std_dev = np.where(std_dev == 0, 1.0, std_dev)
  # Broadcasting applies the (1,n) statistics to every row
  X_normalized = (X - mean)/std_dev
  return X_normalized, mean, std_dev

Let's print the first 5 rows of our new normalized data!!

In [None]:
X_train_normalized,u,s = z_score_normalization(X_train)
print("Data:-",X_train_normalized[:5])
print("Mean:-",u)
print("std_dev:-",s)

As you can see after normalization most of our training data belongs to the range -3 to 3 which makes gradient descent much easier.Let's train our model on the new normalized data!!

In [None]:
#Training the model on 10000 iterations
w_final,b_final,J_history = gradient_descent(X_train_normalized,Y_train,w_initial,b_initial,0.001,10000,predict,compute_cost)
plt.figure(figsize=(12,4))
plt.plot(J_history)
plt.title("Cost v.s iteration")
plt.ylabel("Cost")
plt.xlabel("Number of iterations")
plt.show()

The cost decreases very slowly in the end so we will try a higher learning rate

In [None]:
w_final,b_final,J_history = gradient_descent(X_train_normalized,Y_train,w_initial,b_initial,0.1,10000,predict,compute_cost)
plt.figure(figsize=(12,4))
plt.plot(J_history)
plt.title("Cost v.s iteration")
plt.ylabel("Cost")
plt.xlabel("Number of iterations")
plt.show()

In [None]:
print(w_final)
print(b_final)

###Visualizing model after feature scaling

In [None]:
# One figure per feature: training labels against the normalized feature, plus the
# model output when only that feature's weight is kept (all other weights zeroed)
for i in range(20):
  plt.scatter(X_train_normalized[:,i],Y_train, marker='.',c='r',label = "Training data")
  plt.xlabel(f"Feature no:-{i+1}")
  plt.ylabel("Output")
  # Mask that keeps only the weight of feature i
  a = np.zeros(20)
  a[i] = 1
  a = a.reshape(20,1)
  f_wb = predict(X_train_normalized,a*w_final,b_final)
  plt.plot(X_train_normalized[:,i],f_wb, label="Prediction")
  plt.legend() # Show the labels given above (as in the plots made before feature scaling)
  plt.show()

##Creating a cross validation set
No matter how well our model does on the training set, there is always a chance of overfitting, so we create a cross-validation set. The data is split 80% for training and 20% for cross-validation.

In [None]:
def train_test_split(X,Y,ratio):
    """
    Shuffle the examples and split them into a training part and a cross-validation part.

    Arguments
    X(2-D numpy array of examples) Shape(m,n)
    Y(labels) Shape(m,1); a 1-D array of length m is also accepted
    ratio(fraction of the examples that goes into the training part)

    Returns
    X_train_r, X_cv_r, Y_train_r, Y_cv_r (the label arrays have shape (k,1))

    The shuffle is deterministic (fixed seed 261). A private RandomState is used
    so that the caller's global numpy random state is left untouched; it yields
    the same permutation as seeding the global generator with 261.
    X and Y themselves are not modified.
    """
    m,n = X.shape
    split_at = round(m*ratio) # number of rows in the training part
    # Glue inputs and labels together so that rows stay paired while shuffling
    Z = np.append(X,np.reshape(Y,(m,1)),axis=1)
    np.random.RandomState(261).shuffle(Z)
    X_shuffled = Z[:,:-1]
    Y_shuffled = Z[:,-1].reshape(m,1)
    X_train_r = X_shuffled[:split_at]
    X_cv_r = X_shuffled[split_at:]
    Y_train_r = Y_shuffled[:split_at]
    Y_cv_r = Y_shuffled[split_at:]
    return X_train_r,X_cv_r,Y_train_r,Y_cv_r

In [None]:
X_train_r,X_cv_r,Y_train_r,Y_cv_r = train_test_split(X_train_normalized,Y_train,0.8)
print(X_train_r.shape)
print(Y_train_r.shape)
print(X_cv_r.shape)
print(Y_cv_r.shape)

Now we define another gradient descent function to account for J_cv

In [None]:
def gradient_descent_cv(X,Y,w_in,b_in,alpha,iters,ratio,predict,compute_cost):
  """
  Batch gradient descent that trains on one part of the data and tracks the cost
  on a held-out cross-validation part as well.

  Arguments
  X(2-D numpy array of examples) Shape(m,n)
  Y(numpy array of labels) Shape(m,1)
  w_in(initial weights, copied so the caller's array is not modified) Shape(n,1)
  b_in(initial bias) scalar
  alpha(learning rate) scalar
  iters(number of iterations) int
  ratio(fraction of the examples used for training, passed to train_test_split)
  predict(callable predict(X,w,b))
  compute_cost(callable compute_cost(X,Y,w,b) returning a scalar cost)

  Returns
  w, b, J_train_history, J_cv_history (the two lists hold one cost per iteration)
  """
  J_train_history = [] # Training cost after every iteration
  J_cv_history = []    # Cross-validation cost after every iteration
  w = w_in.copy()
  b = b_in
  X_tr,X_cv,Y_tr,Y_cv = train_test_split(X,Y,ratio)
  m = X_tr.shape[0] # Number of training examples (the cross-validation rows are excluded)
  # Progress is printed roughly 10 times per run. Integer ceil-division is used
  # instead of np.math.ceil because the np.math alias was removed in NumPy 2.0.
  report_every = max(1, (iters + 9) // 10)
  for i in range(iters):
    # Gradients are computed on the training part only
    err = predict(X_tr,w,b) - Y_tr
    dj_dw = (np.matmul(err.T,X_tr).T)/m
    dj_db = np.sum(err)/m
    # Update parameters w,b (simultaneously)
    w = w-alpha*dj_dw
    b = b-alpha*dj_db
    # Record both costs with the updated parameters
    J_train_history.append(compute_cost(X_tr, Y_tr, w, b))
    J_cv_history.append(compute_cost(X_cv, Y_cv, w, b))
    if i%report_every == 0:
      print("Iteration :",i," Cost :",J_train_history[-1])
  return w, b, J_train_history,J_cv_history

In [None]:
w_final,b_final,J_train_history,J_cv_history = gradient_descent_cv(X_train_normalized,Y_train,w_initial,b_initial,0.01,1000,0.8,predict,compute_cost)
plt.figure(figsize=(12,4))
plt.plot(J_train_history , label = "J train")
plt.plot(J_cv_history , label = "J cv")
plt.title("Cost v.s iteration")
plt.ylabel("Cost")
plt.xlabel("Number of iterations")
plt.legend()
plt.show()

J train and J cv are almost overlapping

##Regularization
Now we finally introduce a regularized model(to ensure the values of weights are small enough to not overfit on the data)

In [None]:
def compute_cost_l(X,Y,w,b,l):
  """
  Regularized squared-error cost:
  sum((f_wb - Y)^2) / (2m) + (l / (2m)) * sum(w^2)

  Arguments
  X(2-D numpy array containing the examples) Shape(m,n)
  Y(numpy array containing the output label of every example) Shape(m,1)
  w(numpy array of model parameters) Shape(n,1)
  b(model parameter) scalar; the bias is not regularized
  l(regularization strength) scalar

  Returns
  cost(scalar)
  """
  m = len(Y) # Number of examples
  f_wb = predict(X,w,b)
  err = (f_wb - Y)**2
  err = err.reshape(m,1)
  cost = np.sum(err)/(2*m)
  # The penalty is l/(2m) * sum(w^2). The parentheses matter: l/2*m would
  # evaluate to (l/2)*m and make the penalty grow with the data size.
  cost += (l/(2*m))*np.sum(w**2)
  return cost

In [None]:
def gradient_descent_cv_l(X,Y,w_in,b_in,alpha,iters,ratio,l,predict,compute_costs):
  """
  Regularized batch gradient descent with a held-out cross-validation part.

  Arguments
  X(2-D numpy array of examples) Shape(m,n)
  Y(numpy array of labels) Shape(m,1)
  w_in(initial weights, copied so the caller's array is not modified) Shape(n,1)
  b_in(initial bias) scalar
  alpha(learning rate) scalar
  iters(number of iterations) int
  ratio(fraction of the examples used for training, passed to train_test_split)
  l(regularization strength; adds (l/m)*w to the weight gradient, the bias is not regularized)
  predict(callable predict(X,w,b))
  compute_costs(callable compute_costs(X,Y,w,b) returning a scalar cost; it is
                called without l, so the recorded costs do not include the penalty)

  Returns
  w, b, J_train_history, J_cv_history (the two lists hold one cost per iteration)
  """
  J_train_history = [] # Training cost after every iteration
  J_cv_history = []    # Cross-validation cost after every iteration
  w = w_in.copy()
  b = b_in
  X_tr,X_cv,Y_tr,Y_cv = train_test_split(X,Y,ratio)
  m = X_tr.shape[0] # Number of training examples (the cross-validation rows are excluded)
  # Progress is printed roughly 10 times per run. Integer ceil-division is used
  # instead of np.math.ceil because the np.math alias was removed in NumPy 2.0.
  report_every = max(1, (iters + 9) // 10)
  for i in range(iters):
    # Gradients are computed on the training part only
    err = predict(X_tr,w,b) - Y_tr
    dj_dw = (np.matmul(err.T,X_tr).T)/m
    dj_dw += (l/m)*w # Regularization term
    dj_db = np.sum(err)/m
    # Update parameters w,b (simultaneously)
    w = w-alpha*dj_dw
    b = b-alpha*dj_db
    # Record both costs with the updated parameters
    J_train_history.append(compute_costs(X_tr, Y_tr, w, b))
    J_cv_history.append(compute_costs(X_cv, Y_cv, w, b))
    if i%report_every == 0:
      print("Iteration :",i," Cost :",J_train_history[-1])
  return w, b, J_train_history,J_cv_history

In [None]:
#Final answer
w_ans,b_ans,J_train_history,J_cv_history = gradient_descent_cv_l(X_train_normalized,Y_train,w_initial,b_initial,0.01,10000,0.8,0.000001,predict,compute_cost)
plt.figure(figsize=(12,4))
plt.plot(J_train_history , label = "J train")
plt.plot(J_cv_history , label = "J cv")
plt.title("Cost v.s iteration")
plt.ylabel("Cost")
plt.xlabel("Number of iterations")
plt.legend()
plt.show()

J-train and J-cv are almost overlapping

##Calculating accuracy of model (Using R_2 square method)

In [None]:
def compute_R_2(w,b,X,Y,compute_cost):
  """
  R^2 score of the model (w, b) on (X, Y): 1 - 2*cost / var(Y).

  Arguments
  w, b(model parameters, forwarded to compute_cost)
  X(2-D numpy array of examples)
  Y(numpy array of labels) Shape(m,1)
  compute_cost(callable compute_cost(X,Y,w,b) returning the squared-error cost,
               i.e. sum of squared errors divided by 2m)

  Returns
  r_2(scalar)
  """
  num_examples, _ = Y.shape
  # Population variance of the labels, written as E[y^2] - E[y]^2
  mean_y = np.sum(Y) / num_examples
  mean_sq_y = np.sum(np.square(Y)) / num_examples
  variance_y = mean_sq_y - mean_y**2
  # 2*cost is the mean squared error, so this is 1 - MSE / var(Y)
  return 1 - 2*compute_cost(X,Y,w,b)/variance_y

In [None]:
#Computing the accuracy of our trained model
r_2 = compute_R_2(w_ans,b_ans,X_train_normalized,Y_train,compute_cost)
print("Accuracy:",r_2*100,"%")

##Generalizing Linear regression for n features (USING CLASSES)

The code below will be later used to make my own python library

In [None]:
class LinearRegression:
  """
  Linear regression trained with (optionally regularized) batch gradient descent.

  The training data is stored on the instance; predict() and compute_cost()
  always operate on that stored data.
  """
  def __init__(self,X,Y,alpha,l,iters):
    """
    Arguments
    X(2-D numpy array of training examples) Shape(m,n)
    Y(numpy array of labels) Shape(m,1)
    alpha(learning rate)
    l(regularization strength)
    iters(number of iterations performed by each gradient_descent() call)
    """
    self.X = X
    self.Y = Y
    m,n = X.shape
    self.m = m
    self.n = n
    self.alpha = alpha
    self.l = l
    self.iters = iters
    self.reset_model()

  def predict(self):
    """Return the model output X.w + b for the stored training data, Shape(m,1)."""
    a = np.matmul(self.X,self.w)
    a = a.reshape(self.m,1)
    f_wb = a + self.b
    return f_wb

  def compute_cost(self):
    """Return the (unregularized) squared-error cost on the stored data: sum(err^2)/(2m)."""
    f_wb = self.predict()
    err = (f_wb - self.Y)**2
    err = err.reshape(self.m,1)
    cost = np.sum(err)
    cost = cost/(2*(self.m))
    return cost

  def gradient_descent(self):
    """
    Run self.iters iterations of gradient descent, continuing from the current w and b.

    Returns
    w, b, J_history (cost after every iteration of this call; also stored in self.J_history)
    """
    J_history = []
    # Progress is printed roughly 10 times per run. Integer ceil-division is used
    # instead of np.math.ceil because the np.math alias was removed in NumPy 2.0.
    report_every = max(1, (self.iters + 9) // 10)
    for i in range(self.iters):
      f_wb = self.predict()
      err = f_wb - self.Y
      dj_dw = (np.matmul(err.T,self.X).T)/(self.m)
      dj_dw = dj_dw + (self.l/self.m)*(self.w) # Regularization term (bias is not regularized)
      dj_db = np.sum(err)/(self.m)
      self.w = self.w-(self.alpha)*dj_dw
      self.b = self.b-(self.alpha)*dj_db
      J_history.append(self.compute_cost())
      if i%report_every == 0:
        print("Iteration :",i," Cost :",J_history[-1])
    self.J_history = J_history
    return self.w, self.b, J_history

  def reset_model(self):
    """Re-initialise w and b to their (seeded, hence reproducible) starting values and clear the history."""
    np.random.seed(2)
    self.w = 0.01*(np.random.randn(self.n,1)-0.5)
    self.b = np.random.randn()
    self.J_history = []

##Using our trained model to make a prediction on the test dataset

In [None]:
#Now we load and print our test data
df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/linear_test_data.csv") #Loading data from google drive
linear_test_data = np.array(df) #Converting loaded data in a matrix
test_id = linear_test_data[:,0] #Getting the Id's of all labels
test_id = test_id.reshape(test_id.size,1)
X_test = linear_test_data[:,1:21] #Extracting X_test
# Printing the first 5 rows of our loaded data
print("Input:-\n",X_test[:5])
print("Shape of test data:-",X_test.shape)
print("Test Id's:-\n",test_id[:5])

Now we make a prediction on the test data

In [None]:
# The test inputs must be normalized with the statistics of the TRAINING data
# (not their own), because w_ans and b_ans were fitted on data scaled that way
X_train_normalized,mean,std_dev = z_score_normalization(X_train) # Recompute the training mean / std_dev (also rebinds X_train_normalized)
X_test_normalized = (X_test -mean)/std_dev
Y_test_prediction = predict(X_test_normalized,w_ans,b_ans)
# Put the ids next to the predictions: column 0 = id, column 1 = prediction
prediction = np.append(test_id,Y_test_prediction,axis = 1)
print(prediction[:5])

Now finally we export our prediction back into a csv file

In [None]:
dfp=pd.DataFrame(prediction) #Convert matrix to dataframe
dfp.columns = ["ids","prediction"] #Set labels to our dataframe
dfp.to_csv("linear_test_prediction.csv",index=False)#Convert dataframe to csv file

Also, for ease of access, we store our trained w and b in npy files

In [None]:
#These files can later be loaded using np.load()
np.save("linear_regression_w.npy",w_ans)
np.save("linear_regression_b.npy",b_ans)

In [None]:
w_ans = np.load("linear_regression_w.npy")
b_ans = np.load("linear_regression_b.npy")
print("Weights:-\n",w_ans)
print("Bias:-\n",b_ans)

#Polynomial Regression
Here we will train a polynomial model using multiple features

##Creating the model

In [None]:
df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/polynomial_train.csv") #Loading data from google drive
polynomial_train_data = np.array(df) #Converting loaded data in a matrix
X_train_p = polynomial_train_data[:,1:4]
Y_train_p = polynomial_train_data[:,4]
Y_train_p = Y_train_p.reshape(Y_train_p.size,1)

In [None]:
print(X_train_p[:5])
print(Y_train_p[:5])

In this type of regression we have 3 different features A,B and C.We will combine these features to come up with a 3 degree polynomial model for making a prediction.The features are:-
a^3 ,b^3 ,c^3 ,(b^2)*c ,b*(c^2) ,(a^2)*b ,(a^2)*c ,a*(b^2) ,a*(c^2) ,a*b*c ,a^2 ,b^2 ,c^2 ,a*b ,b*c ,c*a ,a ,b ,c
So we have 19 weights and 1 bias.Now we will restructure our data into a matrix having 19 features (each representing one polynomial term) 

In [None]:
# Build the 19 polynomial terms (degree 1 to 3) from the three raw features
m,n = X_train_p.shape
A = X_train_p[:,0].reshape(m,1) # Each raw feature as a column vector of shape (m,1)
B = X_train_p[:,1].reshape(m,1)
C = X_train_p[:,2].reshape(m,1)
# Column order: 0-2 pure cubes, 3-9 mixed cubic terms, 10-12 squares,
# 13-15 pairwise products, 16-18 the raw features A, B, C.
# The visualization cells further below rely on this order.
X_train_p_c = np.concatenate((A*A*A,B*B*B,C*C*C,B*B*C,B*C*C,A*A*B,A*A*C,A*B*B,A*C*C,A*B*C,A*A,B*B,C*C,A*B,B*C,C*A,A,B,C), axis=1)
print(X_train_p_c[:5])

In [None]:
#Now we normalize our data
X_train_normalized_p_c,u,s = z_score_normalization(X_train_p_c)
print(X_train_normalized_p_c[:5])

##Running Gradient descent

In [None]:
#Now we can run our function on predefined gradient descent for linear regression
np.random.seed(3)
w_initial_p = 0.01*(np.random.randn(19,1)-0.5)
w_initial_p = w_initial_p.reshape(19,1)
b_initial_p = np.random.randn()
w_final_p,b_final_p,J_train_history_p,J_cv_history_p = gradient_descent_cv_l(X_train_normalized_p_c,Y_train_p,w_initial_p,b_initial_p,0.1,10000,0.8,0,predict,compute_cost)
plt.figure(figsize=(12,4))
plt.plot(J_train_history_p , label = "J train")
plt.plot(J_cv_history_p , label = "J cv")
plt.title("Cost v.s iteration")
plt.ylabel("Cost")
plt.xlabel("Number of iterations")
plt.legend()
plt.show()

##Visualization of the 3 degree Polynomial model
We will now plot our output prediction against each of the features A,B,C.Note :- While plotting for A features of B and C will be 0,and so on for B as well as C

###Feature A

In [None]:
plt.scatter(X_train_normalized_p_c[:,-3],Y_train_p, marker='.',c='r', label = "Training data")
plt.xlabel("Feature A")
plt.ylabel("Output")
a = np.array([[1],[0],[0],[0],[0],[0],[0],[0],[0],[0],[1],[0],[0],[0],[0],[0],[1],[0],[0]])
f_wb = predict(X_train_normalized_p_c,a*w_final_p,b_final_p)
plt.plot(X_train_normalized_p_c[:,-3],f_wb, label = "Prediction")
plt.legend()
plt.show()

###Feature B

In [None]:
plt.scatter(X_train_normalized_p_c[:,-2],Y_train_p, marker='.',c='r',label = "Training data")
plt.xlabel("Feature B")
plt.ylabel("Output")
b = np.array([[0],[1],[0],[0],[0],[0],[0],[0],[0],[0],[0],[1],[0],[0],[0],[0],[0],[1],[0]])
f_wb = predict(X_train_normalized_p_c,b*w_final_p,b_final_p)
plt.plot(X_train_normalized_p_c[:,-2],f_wb, label = "Prediction")
plt.legend()
plt.show()

###Feature C

In [None]:
plt.scatter(X_train_normalized_p_c[:,-1],Y_train_p, marker='.',c='r' , label = "Training data")
plt.xlabel("Feature C")
plt.ylabel("Output")
c = np.array([[0],[0],[1],[0],[0],[0],[0],[0],[0],[0],[0],[0],[1],[0],[0],[0],[0],[0],[1]])
f_wb = predict(X_train_normalized_p_c,c*w_final_p,b_final_p)
plt.plot(X_train_normalized_p_c[:,-1],f_wb, label = "Prediction")
plt.legend()
plt.show()

##Computing Accuracy of 3 degree polynomial model(Using R_2 score):-

In [None]:
r_2 = compute_R_2(w_final_p,b_final_p,X_train_normalized_p_c,Y_train_p,compute_cost)
print("Accuracy:",r_2*100,"%")

##Generalizing Polynomial Regression for n features(USING CLASSES)
Following code will later be used to make my own python library

In [None]:
##Functions defined outside of classes are not to be directly accessible by the users of the library
def combo(k,i):
  """
  Enumerate every way of writing i as an ordered sum of k non-negative integers.

  Arguments
  k(number of entries per row, >= 1)
  i(required row sum)

  Returns
  Integer numpy array with one row per combination, Shape(rows,k).
  Rows are ordered with the first entry descending from i to 0.
  """
  if k == 1:
    return np.array([[i]],dtype=int)
  # Start from an empty (0,k) block so the result is well formed even when the loop does not run
  blocks = [np.empty((0,k),dtype=int)]
  for j in range(i+1):
    # The remaining k-1 entries must sum to j; compute them once per j
    tail = combo(k-1,j)
    head = np.full((tail.shape[0],1),i-j,dtype=int)
    blocks.append(np.append(head,tail,axis=1))
  return np.concatenate(blocks,axis=0)

def get_terms(X,d):
  """
  Expand X into all monomials of its features with total degree 1 up to d.

  Arguments
  X(2-D numpy array of raw features) Shape(m,n)
  d(highest total degree to generate)

  Returns
  2-D numpy array with one column per monomial, grouped by increasing degree;
  inside a degree the columns follow the row order of combo(n, degree).
  """
  num_rows, num_features = X.shape
  # Seed with an empty float block so the result is float and well formed for d < 1
  columns = [np.empty((num_rows, 0))]
  for degree in range(1, d + 1):
    for exponents in combo(num_features, degree):
      # Raise every feature to its exponent and multiply across the row
      monomial = np.prod(np.power(X, exponents), axis=1)
      columns.append(monomial.reshape(num_rows, 1))
  return np.concatenate(columns, axis=1)

def z_score_normalization(X):
  """
  Standardise every feature (column) of X to zero mean and unit standard deviation.

  Arguments
  X(the input data which is to be normalized): Shape (m,n)

  Returns
  X_normalized(the standardised data): Shape (m,n)
  mean(per-feature mean): Shape (1,n)
  std_dev(per-feature population standard deviation): Shape (1,n)

  A feature that is constant has zero standard deviation; its std_dev is
  reported as 1 so that the column normalises to 0 instead of NaN, and so that
  the returned statistics can be reused on other data without dividing by zero.
  """
  X = np.asarray(X, dtype=float)
  if X.ndim != 2:
    raise ValueError("X must be a 2-D array of shape (m,n)")
  # Both statistics are returned as (1,n) row vectors
  mean = X.mean(axis=0).reshape(1,-1)
  # np.std computes the population standard deviation from the centred data,
  # which avoids the cancellation (and possible negative variance) of E[x^2]-E[x]^2
  std_dev = X.std(axis=0).reshape(1,-1)
  std_dev = np.where(std_dev == 0, 1.0, std_dev)
  X_normalized = (X - mean)/std_dev
  return X_normalized,mean,std_dev

def compute_cost(X,Y,w,b):
  """
  Squared-error cost of the linear model on (X, Y): sum((f_wb - Y)^2) / (2m).

  X has Shape(m,n), Y has Shape(m,1), w has Shape(n,1) and b is a scalar.
  """
  num_examples = len(Y)
  residuals = predict(X, w, b) - Y
  # The reshape rejects a mis-shaped Y (e.g. 1-D), which would otherwise broadcast to (m,m)
  squared_errors = np.square(residuals).reshape(num_examples, 1)
  return squared_errors.sum() / (2 * num_examples)

def predict (X,w,b):
  """
  Evaluate the linear model X.w + b for every row of X.

  X has Shape(m,n), w has Shape(n,1) and b is a scalar; the result has Shape(m,1).
  """
  num_examples, _ = X.shape
  # The reshape guarantees a column vector even when w is passed as a 1-D array
  return (X @ w).reshape(num_examples, 1) + b

class PolynomialRegression:
  """
  Polynomial regression trained with regularized batch gradient descent and a
  held-out cross-validation split.

  The raw features are expanded into all monomials up to `degree` and then
  z-score normalized; the model parameters w and b refer to that normalized,
  expanded feature space.
  """
  def __init__(self,X,degree,Y,alpha,l,iters,ratio):
    """
    Arguments
    X(2-D numpy array of raw features) Shape(m,k)
    degree(highest total degree of the generated monomials)
    Y(numpy array of labels) Shape(m,1)
    alpha(learning rate)
    l(regularization strength)
    iters(number of iterations performed by each gradient_descent_cv_l() call)
    ratio(fraction of the examples used for training; the rest is used for cross validation)
    """
    self.degree = degree
    G = get_terms(X,self.degree)
    # Keep the normalization statistics: new data must be scaled with the same
    # mean / std_dev before it can be fed to the trained parameters
    self.X,self.mean,self.std_dev = z_score_normalization(G)
    self.Y = Y
    m,n = self.X.shape
    self.m = m
    self.n = n
    self.alpha = alpha
    self.l = l
    self.iters = iters
    self.ratio = ratio
    self.reset_model()

  @staticmethod
  def _predict_on(X,w,b):
    """Model output X.w + b for an arbitrary feature matrix X, Shape(rows,1)."""
    return np.matmul(X,w).reshape(X.shape[0],1) + b

  @staticmethod
  def _cost_on(X,Y,w,b):
    """Unregularized squared-error cost sum(err^2)/(2*rows) on an arbitrary (X, Y)."""
    rows = len(Y)
    err = (PolynomialRegression._predict_on(X,w,b) - Y)**2
    return np.sum(err.reshape(rows,1))/(2*rows)

  def predict(self):
    """Return the model output for the stored (expanded, normalized) data, Shape(m,1)."""
    a = np.matmul(self.X,self.w)
    a = a.reshape(self.m,1)
    f_wb = a + self.b
    return f_wb

  def compute_cost(self):
    """Return the unregularized squared-error cost on the whole stored data set."""
    f_wb = self.predict()
    err = (f_wb - self.Y)**2
    err = err.reshape(self.m,1)
    cost = np.sum(err)
    cost = cost/(2*(self.m))
    return cost

  def gradient_descent_cv_l(self):
    """
    Run self.iters iterations of regularized gradient descent, continuing from
    the current w and b.

    The stored data is shuffled with a fixed seed (so every call uses the same
    split) and divided into a training part (self.ratio of the rows) and a
    cross-validation part. Only the training part drives the updates.

    Returns
    w, b, J_train_history, J_cv_history (one cost per iteration of this call;
    both lists are also stored on the instance)
    """
    J_train_history = []
    J_cv_history = []
    split_at = round(self.m*self.ratio)
    # Glue inputs and labels together so that rows stay paired while shuffling
    Z = np.append(self.X,self.Y,axis=1)
    # A private RandomState gives the same permutation as seeding the global
    # generator with 262, without disturbing the caller's random state
    np.random.RandomState(262).shuffle(Z)
    X_int = Z[:,:-1]
    Y_int = Z[:,-1].reshape(-1,1)
    X_tr, X_cv = X_int[:split_at], X_int[split_at:]
    Y_tr, Y_cv = Y_int[:split_at], Y_int[split_at:]
    # Gradients are averaged over the training rows only (not over self.m,
    # which also counts the cross-validation rows)
    m_train = X_tr.shape[0]
    # Progress is printed roughly 10 times per run. Integer ceil-division is used
    # instead of np.math.ceil because the np.math alias was removed in NumPy 2.0.
    report_every = max(1, (self.iters + 9) // 10)
    for i in range(self.iters):
      err = self._predict_on(X_tr,self.w,self.b) - Y_tr
      dj_dw = np.matmul(err.T,X_tr).T/m_train
      dj_dw += (self.l/m_train)*(self.w) # Regularization term (bias is not regularized)
      dj_db = np.sum(err)/m_train
      self.w = self.w-(self.alpha)*dj_dw
      self.b = self.b-(self.alpha)*dj_db
      J_train_history.append(self._cost_on(X_tr, Y_tr, self.w, self.b))
      J_cv_history.append(self._cost_on(X_cv, Y_cv, self.w, self.b))
      if i%report_every == 0:
        print("Iteration :",i," Cost :",J_train_history[-1])
    self.J_train_history = J_train_history
    self.J_cv_history = J_cv_history
    return self.w, self.b, J_train_history,J_cv_history

  def reset_model(self):
    """
    Re-initialise w and b to their (seeded, hence reproducible) starting values
    and clear both cost histories. The weights get the same small scale that the
    constructor uses.
    """
    np.random.seed(4)
    self.w = 0.01*(np.random.randn(self.n,1)-0.5)
    self.b = np.random.randn()
    self.J_train_history = []
    self.J_cv_history = []

  def r2_score(self):
    """Return the R^2 score on the whole stored data set: 1 - 2*cost / var(Y)."""
    m,n = self.Y.shape
    mean = np.sum(self.Y)/m
    sq_Y = (self.Y)**2
    sq_mean = np.sum(sq_Y)/m
    var_y = sq_mean - (mean)**2 # Population variance of the labels
    r_2 = 1 -2*(self.compute_cost())/(var_y)
    return r_2

In [None]:
test2 = PolynomialRegression(X_train_p,5,Y_train_p,0.01,0,100000,0.8)
print(test2.predict())
print(test2.compute_cost())

In [None]:
#Note the parameters w and b, which we receive are of the modified test set(After normalization)
test2.reset_model()
test2.w,test2.b,test2.J_train_history,test2.J_cv_history = test2.gradient_descent_cv_l()
plt.figure(figsize=(12,4))
plt.plot(test2.J_train_history, label="J train")
plt.plot(test2.J_cv_history, label="J cv" )
plt.title("Cost v.s iteration")
plt.ylabel("Cost")
plt.xlabel("Number of iterations")
plt.legend()
plt.show()

In [None]:
#We continue training from the current w and b for another 100000 iterations (test2 was created with iters=100000)
test2.w,test2.b,test2.J_train_history,test2.J_cv_history = test2.gradient_descent_cv_l()
plt.figure(figsize=(12,4))
plt.plot(test2.J_train_history, label="J train")
plt.plot(test2.J_cv_history, label="J cv" )
plt.title("Cost v.s iteration")
plt.ylabel("Cost")
plt.xlabel("Number of iterations")
plt.legend()
plt.show()

In [None]:
#We continue training from the current w and b for another 100000 iterations (test2 was created with iters=100000)
test2.w,test2.b,test2.J_train_history,test2.J_cv_history = test2.gradient_descent_cv_l()
plt.figure(figsize=(12,4))
plt.plot(test2.J_train_history, label="J train")
plt.plot(test2.J_cv_history, label="J cv" )
plt.title("Cost v.s iteration")
plt.ylabel("Cost")
plt.xlabel("Number of iterations")
plt.legend()
plt.show()

In [None]:
#We continue training from the current w and b for another 100000 iterations (test2 was created with iters=100000)
test2.w,test2.b,test2.J_train_history,test2.J_cv_history = test2.gradient_descent_cv_l()
plt.figure(figsize=(12,4))
plt.plot(test2.J_train_history, label="J train")
plt.plot(test2.J_cv_history, label="J cv" )
plt.title("Cost v.s iteration")
plt.ylabel("Cost")
plt.xlabel("Number of iterations")
plt.legend()
plt.show()

In [None]:
#We continue training from the current w and b for another 100000 iterations (test2 was created with iters=100000)
test2.w,test2.b,test2.J_train_history,test2.J_cv_history = test2.gradient_descent_cv_l()
plt.figure(figsize=(12,4))
plt.plot(test2.J_train_history, label="J train")
plt.plot(test2.J_cv_history, label="J cv" )
plt.title("Cost v.s iteration")
plt.ylabel("Cost")
plt.xlabel("Number of iterations")
plt.legend()
plt.show()

In [None]:
#We continue training from the current w and b for another 100000 iterations (test2 was created with iters=100000)
test2.w,test2.b,test2.J_train_history,test2.J_cv_history = test2.gradient_descent_cv_l()
plt.figure(figsize=(12,4))
plt.plot(test2.J_train_history, label="J train")
plt.plot(test2.J_cv_history, label="J cv" )
plt.title("Cost v.s iteration")
plt.ylabel("Cost")
plt.xlabel("Number of iterations")
plt.legend()
plt.show()

In [None]:
#We continue training from the current w and b for another 100000 iterations (test2 was created with iters=100000)
test2.w,test2.b,test2.J_train_history,test2.J_cv_history = test2.gradient_descent_cv_l()
plt.figure(figsize=(12,4))
plt.plot(test2.J_train_history, label="J train")
plt.plot(test2.J_cv_history, label="J cv" )
plt.title("Cost v.s iteration")
plt.ylabel("Cost")
plt.xlabel("Number of iterations")
plt.legend()
plt.show()

In [None]:
#We continue training from the current w and b for another 100000 iterations (test2 was created with iters=100000)
test2.w,test2.b,test2.J_train_history,test2.J_cv_history = test2.gradient_descent_cv_l()
plt.figure(figsize=(12,4))
plt.plot(test2.J_train_history, label="J train")
plt.plot(test2.J_cv_history, label="J cv" )
plt.title("Cost v.s iteration")
plt.ylabel("Cost")
plt.xlabel("Number of iterations")
plt.legend()
plt.show()

In [None]:
#We continue training from the current w and b for another 100000 iterations (test2 was created with iters=100000)
test2.w,test2.b,test2.J_train_history,test2.J_cv_history = test2.gradient_descent_cv_l()
plt.figure(figsize=(12,4))
plt.plot(test2.J_train_history, label="J train")
plt.plot(test2.J_cv_history, label="J cv" )
plt.title("Cost v.s iteration")
plt.ylabel("Cost")
plt.xlabel("Number of iterations")
plt.legend()
plt.show()

In [None]:
#We continue training from the current w and b for another 100000 iterations (test2 was created with iters=100000)
test2.w,test2.b,test2.J_train_history,test2.J_cv_history = test2.gradient_descent_cv_l()
plt.figure(figsize=(12,4))
plt.plot(test2.J_train_history, label="J train")
plt.plot(test2.J_cv_history, label="J cv" )
plt.title("Cost v.s iteration")
plt.ylabel("Cost")
plt.xlabel("Number of iterations")
plt.legend()
plt.show()

##Computing accuracy of 5 degree polynomial model

In [None]:
##Checking Accuracy
r2 = test2.r2_score()
print("Accuracy:",r2*100,"%")

##Using our trained model to make a prediction on the test dataset

In [None]:
#Read the polynomial test set from google drive and split it into ids and features
df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/polynomial_test_data.csv")
polynomial_test_data = df.to_numpy() #Dataframe converted to a matrix
test_id_p = polynomial_test_data[:,:1] #First column holds the ids, kept as a column vector
X_test_p = polynomial_test_data[:,1:] #Every remaining column is an input feature
#Show the first 5 rows of what was loaded
print("Input:-\n",X_test_p[:5])
print("Shape of test data:-",X_test_p.shape)
print("Test Id's:-\n",test_id_p[:5])

Now we make a prediction on the test data

In [None]:
#Before making a prediction we first have modify and normalize our data
#The degree 5 terms of the training set are rebuilt only to recover the mean and std_dev used for normalization
G = get_terms(X_train_p,5)
X_train_p_normalized,mean,std_dev = z_score_normalization(G)
#The test terms are scaled with the training mean and std_dev (not with statistics of the test set)
X_test_m_normalized = (get_terms(X_test_p,5)-mean)/std_dev
Y_test_prediction_p = predict(X_test_m_normalized,test2.w,test2.b)
#We also append the id's of data with our prediction
prediction_p = np.append(test_id_p,Y_test_prediction_p,axis=1) #Column 0 = id, column 1 = prediction
print(prediction_p[:5])

Now finally we export our prediction back into a csv file

In [None]:
dfp=pd.DataFrame(prediction_p) #Convert matrix to dataframe
dfp.columns = ["ids","prediction"] #Set labels to our dataframe
dfp.to_csv("polynomial_test_prediction.csv",index=False)#Convert dataframe to csv file

Saving trained parameters for easier access later

In [None]:
#These files can later be loaded using np.load()
np.save("polynomial_regression_w.npy",test2.w)
np.save("polynomial_regression_b.npy",test2.b)

In [None]:
#First save the files in the same folder of this notebook
w_ans = np.load("polynomial_regression_w.npy")
b_ans = np.load("polynomial_regression_b.npy")
print("Weights:-\n",w_ans)
print("Bias:-\n",b_ans)

#Visualizing the data for classification tasks

In [None]:
df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/classification_train.csv") #Loading data from google drive
classification_train_data = np.array(df) #Converting loaded data in a matrix
X_train_c = classification_train_data[:,2:786]
Y_train_c = classification_train_data[:,1]
Y_train_c = Y_train_c.reshape(Y_train_c.size,1)

In [None]:
print(X_train_c[:5])
print(Y_train_c[:5])

In [None]:
#Here we write code to display some of the images with their labels
#Each row of X_train_c holds 784 values which are reshaped into a 28x28 image, one figure is shown per example
for i in range(9):
  data = X_train_c[i].reshape(28,28)
  plt.imshow(data,cmap="Greys_r")
  plt.title(f"{Y_train_c[i]}") #Y_train_c[i] is a 1 element array so the title is shown with brackets
  plt.show()

# KNN
(K nearest neighbours algorithm)

##Creating the model

In [None]:
def predict_knn(X,Y,k,p):
  """
  Predicts a label for every row of p by a majority vote of its k nearest training examples
  Arguments
  X(2D numpy array of training examples) Shape(N,n)
  Y(numpy array of numeric training labels) Shape(N,1)
  k(number of neighbours that take part in the vote) scalar
  p(2D numpy array of points to classify) Shape(m,n)
  Returns
  ans(numpy int array of predicted labels) Shape(m,1)
  """
  #Squared euclidean distance expanded as |p|^2 - 2*p.x + |x|^2 so all pairs are handled by one matrix product
  dist = np.sum((p)**2,axis=1).reshape(-1,1)
  dist = dist - 2*np.matmul(p,X.T)
  dist = dist + np.sum((X.T)**2,axis=0).reshape(1,-1)
  order = np.argsort(dist,axis=1) #Entry j of a row is the index of its j'th closest training example
  neighbour_labels = Y[order].reshape(-1,Y.size)[:,:k] #Labels of the k closest examples of every point
  #mode(axis=1) returns the most frequent label(s) of every row, the first mode column is the one kept
  votes = np.array(pd.DataFrame(neighbour_labels).mode(axis=1,numeric_only=True))
  return np.array(votes[:,0],dtype=int).reshape(-1,1)

In [None]:
p = X_train_c[:5]
predict_knn(X_train_c,Y_train_c,3,p)

##Visualizing KNN data

In [None]:
#We pick the first 2 features (pixels) of the raw training data and plot them on the 2D plane, one colour per label
#(X_train_c is used as loaded, no normalization is applied here)
X_plot = X_train_c[:,:2]
Y_plot = Y_train_c
#Colour used for every label, position in the list = label
class_colors = ['k','b','g','r','c','m','y','#3d251e','#ed7014','#8f00ff']
labels_flat = np.asarray(Y_plot).reshape(-1)
for digit,color in enumerate(class_colors):
  #A boolean mask selects all points of one label at once and also works when a label has no points
  points = X_plot[labels_flat == digit]
  plt.scatter(points[:,0],points[:,1],color=color,marker='o',label=digit)
plt.title("Labels plotted against the 1st two pixel grayscale values")
plt.xlabel("First pixel")
plt.ylabel("Second pixel")
plt.legend()
plt.show()

##Computing accuracy of KNN model
For computing accuracy we first need to split our training dataset

In [None]:
#A ratio of 0.999 keeps 99.9% of the rows for training and leaves the remaining 0.1% as the cross validation set
X_train_c_t,X_cv_c,Y_train_c_t,Y_cv_c = train_test_split(X_train_c,Y_train_c,0.999)
print(X_train_c_t.shape)
print(X_cv_c.shape)
print(Y_train_c_t.shape)
print(Y_cv_c.shape)

In [None]:
def accuracy_knn(X_train,X_cv,Y_train,Y_cv,k):
  """
  Fraction of the cross validation examples whose knn prediction equals the label
  Arguments
  X_train,Y_train(training examples and labels handed to predict_knn)
  X_cv(2D numpy array of examples to score) Shape(m,n)
  Y_cv(labels of X_cv) Shape(m,1)
  k(number of neighbours) scalar
  """
  n_cv,_ = X_cv.shape
  is_correct = predict_knn(X_train,Y_train,k,X_cv) == Y_cv
  return is_correct.sum()/n_cv

##Plotting K v.s Accuracy

In [None]:
#Takes time(approx 2 minutes)
#Accuracy on the cross validation set is measured for k = 1..24
k_values = list(range(1,25))
acc = []
for i in k_values:
  acc.append(accuracy_knn(X_train_c_t,X_cv_c,Y_train_c_t,Y_cv_c,i))
plt.figure(figsize=(12,4))
#k is passed as the x values, otherwise the list index (0..23) would be shown and every point would sit one k too far to the left
plt.plot(k_values,acc)
plt.title("Accuracy v.s k")
plt.ylabel("Accuracy")
plt.xlabel("k")
plt.show()

The best value of k seems to be 4

##Generalizing knn using classes

In [None]:
def train_test_split(X,Y,ratio):
    """
    Shuffles the examples with a fixed seed and splits them into a train and a cross validation part
    Arguments
    X(2D numpy array of examples) Shape(m,n)
    Y(numpy array of labels) Shape(m,1)
    ratio(size of the train set divided by the size of the complete set) scalar
    Returns
    X_train,X_cv,Y_train,Y_cv (the first round(m*ratio) shuffled rows are the train part)
    Note:- X and Y are joined in one matrix for the shuffle, so both come back with the common dtype of that matrix
    Note:- the global numpy random state is re-seeded with 261 on every call
    """
    n_rows,_ = X.shape
    cut = round(n_rows*ratio)
    #Labels are glued on as the last column so that every row stays with its label while shuffling
    stacked = np.concatenate((X,Y),axis=1)
    np.random.seed(261)
    np.random.shuffle(stacked)
    features = stacked[:,:-1]
    labels = stacked[:,-1].reshape(-1,1)
    return features[:cut],features[cut:],labels[:cut],labels[cut:]

def predict_knn(X,Y,k,p):
   """
   Predicts a label for every row of p by a majority vote of its k nearest training examples
   Arguments
   X(2D numpy array of training examples) Shape(N,n)
   Y(numpy array of numeric training labels) Shape(N,1)
   k(number of neighbours that take part in the vote) scalar
   p(2D numpy array of points to classify) Shape(m,n)
   Returns
   ans(numpy int array of predicted labels) Shape(m,1)
   """
   #Squared euclidean distance expanded as |p|^2 - 2*p.x + |x|^2 so all pairs are handled by one matrix product
   dist = np.sum((p)**2,axis=1).reshape(-1,1)
   dist = dist - 2*np.matmul(p,X.T)
   dist = dist + np.sum((X.T)**2,axis=0).reshape(1,-1)
   order = np.argsort(dist,axis=1) #Entry j of a row is the index of its j'th closest training example
   neighbour_labels = Y[order].reshape(-1,Y.size)[:,:k] #Labels of the k closest examples of every point
   #mode(axis=1) returns the most frequent label(s) of every row, the first mode column is the one kept
   votes = np.array(pd.DataFrame(neighbour_labels).mode(axis=1,numeric_only=True))
   return np.array(votes[:,0],dtype=int).reshape(-1,1)

class knn:
  """
  K nearest neighbours classifier
  X(2D numpy array of training examples) Shape(N,n)
  Y(numpy array of numeric training labels) Shape(N,1)
  k(number of neighbours that take part in the vote) scalar
  p(2D numpy array of points to classify) Shape(m,n)
  ratio(fraction of the data used as train set when computing accuracy) scalar
  """
  def __init__(self,X,Y,k,p,ratio):
    self.X = X
    self.Y = Y
    self.k = k
    self.p = p
    self.ratio = ratio
    self.f = len(np.unique(self.Y)) #Number of different labels

  @staticmethod
  def _nearest_labels(X,Y,k,p):
    #Majority label among the k training examples closest to every row of p, Shape(m,1)
    #Squared euclidean distance expanded as |p|^2 - 2*p.x + |x|^2 so all pairs are handled by one matrix product
    dist = np.sum((p)**2,axis=1).reshape(-1,1)
    dist = dist - 2*np.matmul(p,X.T)
    dist = dist + np.sum((X.T)**2,axis=0).reshape(1,-1)
    order = np.argsort(dist,axis=1)
    neighbour_labels = Y[order].reshape(-1,Y.size)[:,:k]
    #mode(axis=1) returns the most frequent label(s) of every row, the first mode column is the one kept
    votes = np.array(pd.DataFrame(neighbour_labels).mode(axis=1,numeric_only=True))
    return np.array(votes[:,0],dtype=int).reshape(-1,1)

  def predict_knn(self):
    """Returns the predicted label of every row of self.p as an int array of Shape(m,1)"""
    return self._nearest_labels(self.X,self.Y,self.k,self.p)

  def accuracy_knn(self):
    """
    Splits (X,Y) with train_test_split(ratio) and returns the fraction of the
    cross validation examples that are classified correctly from the train part
    """
    X_train_c_t,X_cv_c_t,Y_train_c_t,Y_cv_c_t = train_test_split(self.X,self.Y,self.ratio)
    m,n = X_cv_c_t.shape
    cnt = (self._nearest_labels(X_train_c_t,Y_train_c_t,self.k,X_cv_c_t) == Y_cv_c_t).sum()
    return cnt/m

##Visualizing performance of knn model

In [None]:
#The first 9 training images are shown with their label and the knn (k=3) prediction for that image
#Note:- the image being classified is itself part of the training data handed to predict_knn
for i in range(9):
  data = X_train_c[i].reshape(28,28)
  plt.imshow(data,cmap="Greys_r")
  Y_knn = predict_knn(X_train_c,Y_train_c,3,X_train_c[i].reshape(1,-1)) #reshape(1,-1) turns the single example into a 1 row matrix
  plt.title(f"label:{Y_train_c[i]} knn:{Y_knn}")
  plt.show()

##Making a prediction on test dataset

In [None]:
#Read the classification test set from google drive and split it into ids and features
df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/classification_test.csv")
classification_test_data = df.to_numpy() #Dataframe converted to a matrix
test_id_c = classification_test_data[:,:1] #First column holds the ids, kept as a column vector
X_test_c = classification_test_data[:,1:] #Every remaining column is an input feature
#Show the first 5 rows of what was loaded
print("Input:-\n",X_test_c[:5])
print("Shape of test data:-",X_test_c.shape)
print("Test Id's:-\n",test_id_c[:5])

Now we make a prediction on the test data

In [None]:
#Code takes time approx 15 minutes
Y_test_prediction_c = predict_knn(X_train_c,Y_train_c,3,X_test_c)
print(Y_test_prediction_c)

In [None]:
#We also append the id's of data with our prediction
prediction_c = np.append(test_id_c,Y_test_prediction_c,axis=1)
print(prediction_c[:5])

Now finally we export our prediction back into a csv file

In [None]:
dfc=pd.DataFrame(prediction_c) #Convert matrix to dataframe
dfc.columns = ["ids","prediction"] #Set labels to our dataframe
dfc.to_csv("classification_test_prediction_knn.csv",index=False)#Convert dataframe to csv file

#Logistic Regression

##Creating the model
We are going to create a softmax model for making a prediction on multiple classes(SOFTMAX)

###Initializing model parameters

In [None]:
np.random.seed(5)
#For initializing small values near 0
#As there are 10 different classes and 784 different features
w_initial_lo = 0.0001*(np.random.randn(10,784)-0.5)
b_initial_lo = np.random.randn(1,10)
print(w_initial_lo)
print(b_initial_lo)

###Modifying the data
Before running gradient descent we one-hot encode Y_train_c and also build a vector holding the number of examples of every class. The one-hot matrix is used in the gradient of w and the vector of class counts is used in the gradient of b.

In [None]:
def modify_data_lo(X,Y):
  """
  Prepares the labels for softmax gradient descent
  Arguments
  X(not used, kept so the call signature stays the same)
  Y(numpy array of non negative integer labels) Shape(m,1)
  Returns
  one_hot(one column per label value present in Y) Shape(m,number of present labels)
  class_counts(number of examples of every label from 0 to max(Y)) Shape(1,max(Y)+1)
  """
  class_counts = np.bincount(Y.flatten()).reshape(1,-1)
  labels_frame = pd.DataFrame(Y,columns = ["Category"])
  one_hot = pd.get_dummies(labels_frame,columns = ["Category"]).to_numpy()
  return one_hot,class_counts

In [None]:
Y_train_cw,Y_train_cb = modify_data_lo(X_train_c,Y_train_c)
print("For updating w:-\n",Y_train_cw)
print("For updating b:-\n",Y_train_cb)

##Predict Function
We define 2 predict functions one for probability and other for actual prediction

In [None]:
def predict_lo (X,w,b):
  """
  Softmax prediction
  Arguments
  X(2D numpy array of examples) Shape(m,n)
  w(numpy array of model parameters, one row per class) Shape(k,n)
  b(numpy array of biases, one per class) Shape(1,k)
  Returns
  probability(probability of every class for every example) Shape(m,k)
  prediction(index of the most probable class) Shape(m,1)
  """
  m,n = X.shape
  a = np.matmul(X,w.T) + b
  #Applying softmax activation
  #The largest score of every row is subtracted first: the result is mathematically the same
  #but np.exp can no longer overflow to inf (which made the probabilities nan)
  c = np.exp(a - np.max(a,axis=1,keepdims=True))
  d = np.sum(c,axis=1).reshape(m,1)
  probability = c/d
  prediction = probability.argmax(axis=1).reshape(m,1)
  return probability,prediction

In [None]:
probability,prediction = predict_lo(X_train_c,w_initial_lo,b_initial_lo)
print(probability)
print(prediction)

##Computing Cost

In [None]:
def compute_cost_lo(X,Y,w,b):
  """
  Average cross entropy loss of the softmax model
  Arguments
  X(2D numpy array of examples) Shape(m,n)
  Y(numpy array of integer labels) Shape(m,1)
  w,b(model parameters as expected by predict_lo)
  """
  n_examples = len(Y)
  probability,_ = predict_lo(X,w,b)
  #For every example pick the probability the model gives to its true class
  true_class_probability = np.choose(Y.T,probability.T).T
  return np.sum((-1)*np.log(true_class_probability))/n_examples

In [None]:
print(compute_cost_lo(X_train_c,Y_train_c,w_initial_lo,b_initial_lo))

##Computing cost(with regularization)

In [None]:
def compute_cost_lo_l(X,Y,w,b,l):
  """
  Cross entropy cost with L2 regularization: cost + (l/(2m))*sum(w^2)
  Arguments
  X,Y,w,b(same as compute_cost_lo)
  l(regularization parameter lambda) scalar
  """
  m = len(Y)
  cost = compute_cost_lo(X,Y,w,b)
  #The term is lambda/(2m): writing l/2*m would multiply by m instead of dividing by it
  cost += (l/(2*m))*np.sum(w**2) #Adding regularized term
  return cost

In [None]:
print(compute_cost_lo_l(X_train_c,Y_train_c,w_initial_lo,b_initial_lo,0.01))

##Running Gradient descent

In [None]:
def gradient_descent_lo(X,Y,w_in,b_in,alpha,iters,ratio,l,predict,compute_cost_lo,modify_data_lo):
  """
  Batch gradient descent for the softmax model
  Arguments
  X(2D numpy array of examples) Shape(m,n)
  Y(numpy array of labels) Shape(m,1)
  w_in,b_in(initial parameters) Shape(k,n) and Shape(1,k)
  alpha(learning rate) scalar
  iters(number of iterations) scalar
  ratio(fraction of the data used as train set, handed to train_test_split) scalar
  l(regularization parameter, only used in the gradient of w) scalar
  predict(function (X,w,b) -> (probability,prediction))
  compute_cost_lo(function (X,Y,w,b) -> cost)
  modify_data_lo(function (X,Y) -> (one hot labels,class counts))
  Returns
  w,b(final parameters) and J_train_history,J_cv_history(costs recorded 10 times during the run)
  """
  import math #np.math was only an alias of the standard math module and is not available in NumPy 2.0

  J_train_history = [] #Training cost at every logging step (for later plotting and analysis)
  J_cv_history = []
  w = w_in.copy()
  b = b_in

  X_train,X_cv,Y_train,Y_cv = train_test_split(X,Y,ratio)
  m,n = X_train.shape #Getting the number of training examples
  Y_train = Y_train.astype(np.int64)
  Y_cv = Y_cv.astype(np.int64)
  X_w,X_b = modify_data_lo(X_train,Y_train)
  log_every = math.ceil(iters / 10) #Cost is recorded and printed once every log_every iterations
  for i in range(iters):
    #Compute gradient dj_dw and dj_db
    probability,prediction = predict(X_train,w,b) #The function handed in by the caller is the one used
    dj_db = (np.sum(probability,axis=0).reshape(1,-1) - X_b)/m #Here -1 in .reshape() automatically gets the possible dimension(According to the number of elements in the array)
    dj_dw = (l*w - np.matmul((X_w - probability).T,X_train))/m
    #Update parameters w,b (Simultaneously)
    w = w-alpha*dj_dw
    b = b-alpha*dj_db
    #Record and print cost J once every log_every iterations,[-1] refers to the last element(reverse indexing)
    if i%log_every == 0:
      #Note:- Even though we have applied regularization the method for computing training and testing error is kept same
      J_train_history.append(compute_cost_lo(X_train, Y_train, w, b)) #Add the current cost to J_history
      J_cv_history.append(compute_cost_lo(X_cv, Y_cv, w, b))
      print("Iteration :",i," Cost :",J_train_history[-1])
  return w, b, J_train_history,J_cv_history #return final w,b and J_history(for graphing)

In [None]:
#Running logistic gradient descent on the raw (not normalized) data
w_final_lo,b_final_lo,J_train_history,J_cv_history = gradient_descent_lo(
    X_train_c,Y_train_c,w_initial_lo,b_initial_lo,
    0.00001,1000,0.8,0,
    predict_lo,compute_cost_lo,modify_data_lo)
#Note:- the histories hold one entry per logging step of gradient_descent_lo, not one per iteration
fig, ax = plt.subplots(figsize=(12,4))
ax.plot(J_train_history , label = "J train")
ax.plot(J_cv_history , label = "J cv")
ax.set_title("Cost v.s iteration")
ax.set_ylabel("Cost")
ax.set_xlabel("Number of iterations")
ax.legend()
plt.show()

As we can see, the cost function fluctuates, so we apply z-score normalization to the data

In [None]:
#Normalize the training data (u and s are the mean and std_dev returned by z_score_normalization) and train from the initial parameters
X_train_cn,u,s = z_score_normalization(X_train_c)
w_final_lo,b_final_lo,J_train_history,J_cv_history = gradient_descent_lo(
    X_train_cn,Y_train_c,w_initial_lo,b_initial_lo,
    0.1,1000,0.8,0,
    predict_lo,compute_cost_lo,modify_data_lo)
#Note:- the histories hold one entry per logging step of gradient_descent_lo, not one per iteration
fig, ax = plt.subplots(figsize=(12,4))
ax.plot(J_train_history , label = "J train")
ax.plot(J_cv_history , label = "J cv")
ax.set_title("Cost v.s iteration")
ax.set_ylabel("Cost")
ax.set_xlabel("Number of iterations")
ax.legend()
plt.show()

In [None]:
#Continuing the descent from the parameters found so far, with the same learning rate (0.1)
w_final_lo,b_final_lo,J_train_history,J_cv_history = gradient_descent_lo(
    X_train_cn,Y_train_c,w_final_lo,b_final_lo,
    0.1,1000,0.8,0,
    predict_lo,compute_cost_lo,modify_data_lo)
#Note:- the histories hold one entry per logging step of gradient_descent_lo, not one per iteration
fig, ax = plt.subplots(figsize=(12,4))
ax.plot(J_train_history , label = "J train")
ax.plot(J_cv_history , label = "J cv")
ax.set_title("Cost v.s iteration")
ax.set_ylabel("Cost")
ax.set_xlabel("Number of iterations")
ax.legend()
plt.show()

The cost function has almost saturated, so now we compute the accuracy

##Computing Accuracy of classification model

In [None]:
def accuracy_lo(X,Y,w,b):
  """
  Fraction of examples whose predicted class equals the label
  Arguments
  X(2D numpy array of examples) Shape(m,n)
  Y(numpy array of labels) Shape(m,1)
  w,b(model parameters as expected by predict_lo)
  """
  n_examples = len(Y)
  _,prediction = predict_lo(X,w,b)
  n_correct = (Y == prediction).sum()
  return n_correct/n_examples

In [None]:
a = accuracy_lo(X_train_cn,Y_train_c,w_final_lo,b_final_lo)
print(f"Accuracy:- {a*100}%")

##Plotting the prediction
Here we used our trained model to make prediction on the training dataset(So it can be compared with the previous plot of the classification data in knn)

In [None]:
#We pick the first 2 features (pixels) of the raw training data and plot them on the 2D plane, one colour per predicted label
X_plot = X_train_c[:,:2]
#The prediction is made on the normalized data with the trained parameters
#(the trained bias b_final_lo is used, not the initial bias b_initial_lo)
probability,Y_plot = predict_lo(X_train_cn,w_final_lo,b_final_lo)
#Colour used for every label, position in the list = label
class_colors = ['k','b','g','r','c','m','y','#3d251e','#ed7014','#8f00ff']
predicted_flat = np.asarray(Y_plot).reshape(-1)
for digit,color in enumerate(class_colors):
  #A boolean mask selects all points of one predicted label at once and also works when a label is never predicted
  points = X_plot[predicted_flat == digit]
  plt.scatter(points[:,0],points[:,1],color=color,marker='o',label=digit)
plt.title("Labels plotted against the 1st two pixel grayscale values")
plt.xlabel("First pixel")
plt.ylabel("Second pixel")
plt.legend()
plt.show()

##Visualizing performance of logistic model

In [None]:
#The prediction for the whole training set is computed once, it does not change between images
prob,Y_lo = predict_lo(X_train_cn,w_final_lo,b_final_lo)
#The first 9 training images are shown with their label and the prediction of the logistic model
for i in range(9):
  data = X_train_c[i].reshape(28,28)
  plt.imshow(data,cmap="Greys_r")
  ans = Y_lo[i]
  plt.title(f"label:{Y_train_c[i]} logistic:{ans}")
  plt.show()

##Generalizing Logistic Regression for n features(USING CLASSES)
Following code will later be used to make my own python library

In [None]:
##Functions defined outside of classes are not to be directly accessible by the users of the library
def predict_lo(X,w,b):
  """
  Softmax prediction
  Arguments
  X(2D numpy array of examples) Shape(m,n)
  w(numpy array of model parameters, one row per class) Shape(k,n)
  b(numpy array of biases, one per class) Shape(1,k)
  Returns
  probability(probability of every class for every example) Shape(m,k)
  prediction(index of the most probable class) Shape(m,1)
  """
  m,n = X.shape
  a = np.matmul(X,w.T) + b
  #The largest score of every row is subtracted first: the result is mathematically the same
  #but np.exp can no longer overflow to inf (which made the probabilities nan)
  c = np.exp(a - np.max(a,axis=1,keepdims=True))
  d = np.sum(c,axis=1).reshape(m,1)
  probability = c/d
  prediction = probability.argmax(axis=1).reshape(m,1)
  return probability,prediction

def compute_cost_lo(X,Y,w,b):
  """
  Average cross entropy loss of the softmax model
  Arguments
  X(2D numpy array of examples) Shape(m,n)
  Y(numpy array of integer labels) Shape(m,1)
  w,b(model parameters as expected by predict_lo)
  """
  n_examples = len(Y)
  probability,_ = predict_lo(X,w,b)
  #For every example pick the probability the model gives to its true class
  true_class_probability = np.choose(Y.T,probability.T).T
  return np.sum((-1)*np.log(true_class_probability))/n_examples

def train_test_split(X,Y,ratio):
    """
    Splits the examples into a train and a cross validation part without shuffling
    Arguments
    X(2D numpy array of examples) Shape(m,n)
    Y(numpy array of labels with one row per example)
    ratio(size of the train set divided by the size of the complete set) scalar
    Returns
    X_train,X_cv,Y_train,Y_cv (the first round(m*ratio) rows are the train part, the rest is the cv part)
    """
    n_rows,_ = X.shape
    cut = round(n_rows*ratio)
    return X[:cut],X[cut:],Y[:cut],Y[cut:]

def modify_data_lo(X,Y):
  """
  Prepares the labels for softmax gradient descent
  Arguments
  X(not used, kept so the call signature stays the same)
  Y(numpy array of non negative integer labels) Shape(m,1)
  Returns
  one_hot(one column per label value present in Y) Shape(m,number of present labels)
  class_counts(number of examples of every label from 0 to max(Y)) Shape(1,max(Y)+1)
  """
  class_counts = np.bincount(Y.flatten()).reshape(1,-1)
  labels_frame = pd.DataFrame(Y,columns = ["Category"])
  one_hot = pd.get_dummies(labels_frame,columns = ["Category"]).to_numpy()
  return one_hot,class_counts

class LogisticRegression:
  """
  Softmax (multi class logistic) regression trained with batch gradient descent
  X(2D numpy array of examples, normalized with z_score_normalization on creation) Shape(m,n)
  Y(numpy array of integer labels) Shape(m,1)
  alpha(learning rate) scalar
  l(regularization parameter) scalar
  iters(number of iterations done by one call of gradient_descent) scalar
  ratio(fraction of the data used as train set) scalar
  """
  def __init__(self,X,Y,alpha,l,iters,ratio):
    self.X,self.mean,self.std_dev = z_score_normalization(X)
    self.Y = Y
    self.m,self.n = self.X.shape
    self.k = len(np.unique(self.Y)) #Number of classes
    self.alpha = alpha
    self.l = l
    self.iters = iters
    self.ratio = ratio
    self.reset_model() #Sets w,b and empties both cost histories

  @staticmethod
  def _softmax(a):
    #Row wise softmax, the row maximum is subtracted first so np.exp cannot overflow to inf
    c = np.exp(a - np.max(a,axis=1,keepdims=True))
    return c/np.sum(c,axis=1).reshape(-1,1)

  def predict(self):
    """Returns probability Shape(m,k) and predicted class Shape(m,1) for the stored data self.X"""
    probability = self._softmax(np.matmul(self.X,(self.w).T) + self.b)
    prediction = probability.argmax(axis=1).reshape(self.m,1)
    return probability,prediction

  def compute_cost(self):
    """Average cross entropy loss on the stored data (self.X,self.Y)"""
    probability,prediction = self.predict()
    l = np.choose((self.Y).T,probability.T).T #Probability given to the true class of every example
    loss = (-1)*np.log(l)
    cost = np.sum(loss)/(self.m)
    return cost

  def gradient_descent(self):
    """
    Runs self.iters iterations starting from the current self.w,self.b
    Returns w,b and the train/cv costs recorded 10 times during the run
    """
    import math #np.math was only an alias of the standard math module and is not available in NumPy 2.0
    J_train_history = []
    J_cv_history = []
    X_train,X_cv,Y_train,Y_cv = train_test_split(self.X,self.Y,self.ratio)
    m,n = X_train.shape
    Y_train = Y_train.astype(np.int64)
    Y_cv = Y_cv.astype(np.int64)
    X_w,X_b = modify_data_lo(X_train,Y_train) #One hot labels and class counts
    log_every = math.ceil(self.iters/10)
    for i in range(self.iters):
      probability = self._softmax(np.matmul(X_train,(self.w).T) + self.b)
      dj_db = (np.sum(probability,axis=0).reshape(1,-1) - X_b)/m
      dj_dw = ((self.l)*(self.w) - np.matmul((X_w - probability).T,X_train))/m
      self.w = self.w-self.alpha*dj_dw
      self.b = self.b-self.alpha*dj_db
      #Costs are only recorded during the first 100000 iterations
      if i<=100000 and i%log_every == 0:
        J_train_history.append(compute_cost_lo(X_train, Y_train, self.w, self.b))
        J_cv_history.append(compute_cost_lo(X_cv, Y_cv, self.w, self.b))
        print(f"Iteration :{i} Cost :{J_train_history[-1]}")
    self.J_train_history = J_train_history
    self.J_cv_history = J_cv_history
    return self.w, self.b, J_train_history,J_cv_history

  def reset_model(self):
    """Puts w and b back to their (seeded, so always identical) initial values and empties the cost histories"""
    np.random.seed(7)
    self.w = 0.0001*(np.random.randn(self.k,self.n)-0.5)
    self.b = np.random.randn(1,self.k)
    self.J_train_history = []
    self.J_cv_history = []

  def accuracy(self):
    """Fraction of the stored examples whose predicted class equals the label"""
    probability,prediction = self.predict()
    cnt = (self.Y == prediction).sum()
    return cnt/(self.m)

##Using our trained data to make a prediction on test dataset

In [None]:
#Now we load and print our test data
df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/classification_test.csv") #Loading data from google drive
classification_test_data = np.array(df) #Converting loaded data in a matrix
test_id_c = classification_test_data[:,0] #Getting the Id's of all labels
test_id_c = test_id_c.reshape(test_id_c.size,1)
X_test_c = classification_test_data[:,1:] #Extracting X_test
# Printing the first 5 rows of our loaded data
print("Input:-\n",X_test_c[:5])
print("Shape of test data:-",X_test_c.shape)
print("Test Id's:-\n",test_id_c[:5])

Now we make a prediction on the test data

In [None]:
#We normalize data before making prediction
#The test data has to be scaled with the mean and std_dev of the TRAINING data, because these are the
#statistics the weights were trained with (scaling the test set with its own statistics shifts every feature)
_,mean_train_c,std_dev_train_c = z_score_normalization(X_train_c)
X_test_cn = (X_test_c - mean_train_c)/std_dev_train_c
probability,Y_test_prediction_c = predict_lo(X_test_cn,w_final_lo,b_final_lo)

In [None]:
#We also append the id's of data with our prediction
prediction_c = np.append(test_id_c,Y_test_prediction_c,axis=1)
print(prediction_c[:5])

Now finally we export our prediction back into a csv file

In [None]:
dfc=pd.DataFrame(prediction_c) #Convert matrix to dataframe
dfc.columns = ["ids","prediction"] #Set labels to our dataframe
dfc.to_csv("classification_test_prediction_logistic.csv",index=False)#Convert dataframe to csv file

Saving trained parameters for easier access later

In [None]:
#These files can later be loaded using np.load()
np.save("logistic_w.npy",w_final_lo)
np.save("logistic_b.npy",b_final_lo)

In [None]:
w_ans = np.load("logistic_w.npy")
b_ans = np.load("logistic_b.npy")
print("Weights:-\n",w_ans)
print("Bias:-\n",b_ans)

# Neural Networks

##Creating the model

In [None]:
#For the L'th layer W is a matrix where each column corresponds to the weights of each neuron and b is the bias vector(with just 1 row)
def dense(A_in,W,b,activation,z_list):
  """
  Forward pass through one fully connected layer
  Arguments
  A_in(2D numpy array, output of the previous layer) Shape(m,units of previous layer)
  W(weights, one column per neuron) Shape(units of previous layer,units)
  b(bias vector) Shape(1,units)
  activation(one of "sigmoid","linear","relu","tanh","softmax")
  z_list(list to which z = A_in.W + b of this layer is appended, changed in place)
  Returns
  A_out(activation of the layer) Shape(m,units) and the same z_list
  Raises
  ValueError for an unknown activation name (nothing is appended to z_list in that case)
  """
  if activation not in ("sigmoid","linear","relu","tanh","softmax"):
    raise ValueError(f"Unknown activation: {activation}")
  z = np.matmul(A_in,W) + b
  z_list.append(z)
  if activation == "sigmoid":
    A_out = 1/(1+np.exp(-z))
  elif activation == "linear":
    A_out = z
  elif activation == "relu":
    A_out = np.maximum(z,0)
  elif activation == "tanh":
    A_out = np.tanh(z) #np.tanh stays finite for large |z|, the exp based formula gave inf/inf = nan
  else: #softmax
    #The row maximum is subtracted first: same result mathematically, but np.exp cannot overflow
    shifted = np.exp(z - np.max(z,axis=1,keepdims=True))
    A_out = shifted/np.sum(shifted,axis=1).reshape(-1,1)
  return A_out,z_list

In [None]:
#Now we will construct the gen_w_b function which defines the architecture of our neural network
def gen_w_b(X,units):
  """
  Creates small random initial weights and biases for every layer
  Arguments
  X(2D numpy array of examples, only its number of columns is used) Shape(m,n)
  units(list with the number of neurons of every layer)
  Returns
  W_list(one matrix per layer) Shape(inputs of the layer,units of the layer)
  b_list(one bias vector per layer) Shape(1,units of the layer)
  Note:- the global numpy random state is re-seeded with 264 on every call
  """
  scale = 0.0000001
  _,fan_in = X.shape #The first layer receives one input per feature
  np.random.seed(264)
  W_list = []
  b_list = []
  for width in units:
    W_list.append(scale*(np.random.randn(fan_in,width)-0.5))
    b_list.append(scale*(np.random.randn(1,width)-0.5))
    fan_in = width #The next layer receives the outputs of this one
  return W_list,b_list

## Forward propagation

In [None]:
#This function performs forward propagation through the network
def f_prop(X,W_list,b_list,activation):
  """
  Arguments
  X(2D numpy array of examples) Shape(m,n)
  W_list,b_list(weights and biases of every layer)
  activation(list with the activation name of every layer)
  Returns
  a_out(output of the last layer)
  z_list(X followed by the z of every layer)
  a_out_l(X followed by the activation of every layer)
  """
  z_list = [X] #X is kept at position 0 so both lists line up with the layers
  a_out_l = [X]
  a_out = X
  for layer,act in enumerate(activation):
    a_out,z_list = dense(a_out,W_list[layer],b_list[layer],act,z_list)
    a_out_l.append(a_out)
  return a_out,z_list,a_out_l

##Cost function
The cost function of the neural network depends on the type of problem we are solving, and so does the activation of the output layer.

In [None]:
def compute_cost(X,Y,W_list,b_list,activation,cost):
   """
   Cost of the network on (X,Y) without regularization
   Arguments
   X(2D numpy array of examples) Shape(m,n)
   Y(targets: values for regression, 0/1 for binary and class indices for multi class) Shape(m,1)
   W_list,b_list,activation(network as used by f_prop)
   cost("mean_squared_error","binary_cross_entropy" or "cross_entropy")
   Raises
   ValueError for an unknown cost name (before, the name itself was silently returned as the cost)
   """
   if cost not in ("mean_squared_error","binary_cross_entropy","cross_entropy"):
      raise ValueError(f"Unknown cost function: {cost}")
   m = len(Y) #Getting the number of training examples
   f_wb,z_list,a_out_l = f_prop(X,W_list,b_list,activation)
   if cost == "mean_squared_error": #Regression problems
      err = (f_wb - Y)**2
      err = err.reshape(m,1) #Fails when f_wb and Y were broadcast to something that is not m values
      return np.sum(err)/(2*m)
   if cost == "binary_cross_entropy": #Binary class classification
      loss = -Y*np.log(f_wb)-(1-Y)*np.log(1-f_wb)
      return np.sum(loss)/m
   #cross_entropy: Multi class classification
   labels = np.array(Y,dtype="int64") #np.choose needs integer indices
   true_class_probability = np.choose(labels.T,f_wb.T).T
   return np.sum((-1)*np.log(true_class_probability))/m

##Regularized cost function

In [None]:
def compute_cost_l(X,Y,W_list,b_list,activation,cost,l):
   """
   Cost of the network on (X,Y) with L2 regularization: cost + (l/(2m))*(sum of all squared weights)
   Arguments
   X,Y,W_list,b_list,activation,cost(same as compute_cost)
   l(regularization parameter lambda) scalar
   """
   m = len(Y) #Getting the number of training examples
   #The unregularized part is taken from compute_cost, so both functions always agree
   #(this also keeps l intact: before, the cross entropy branch overwrote l with the probability array)
   total = compute_cost(X,Y,W_list,b_list,activation,cost)
   squared_weights = 0
   for W in W_list:
     squared_weights += np.sum(W**2)
   #The factor is lambda/(2m): writing l/2*m would multiply by m instead of dividing by it
   total += (l/(2*m))*squared_weights
   return total

## Back Propagation

In [None]:
#Now we compute the function which has derivatives of all activation functions
def func_d(z,activation):
  """
  Elementwise derivative of an activation function evaluated at z
  Arguments
  z(numpy array of pre-activations)
  activation(one of "sigmoid","linear","relu","tanh","softmax")
  Returns
  numpy array with the shape of z
  Note:- for "softmax" a - a^2 is returned for every element (a = softmax(z) taken over axis 1)
  Raises
  ValueError for an unknown activation name
  """
  if activation == "sigmoid":
    #sigmoid written with tanh stays finite for any z, exp(-z)/(1+exp(-z))^2 became inf/inf = nan for very negative z
    s = 0.5*(1+np.tanh(0.5*z))
    A_out = s*(1-s)
  elif activation == "linear":
    A_out = np.full(z.shape,1)
  elif activation == "relu":
    #1 where z>0 and 0 elsewhere, computed on the whole array at once (also works for empty arrays)
    A_out = (z>0).astype(int)
  elif activation == "tanh":
    A_out = 1 - np.tanh(z)**2
  elif activation == "softmax":
    #The row maximum is subtracted first: same result mathematically, but np.exp cannot overflow
    shifted = np.exp(z - np.max(z,axis=1,keepdims=True))
    a_out = shifted/np.sum(shifted,axis=1).reshape(-1,1)
    A_out = a_out - (a_out)**2
  else:
    raise ValueError(f"Unknown activation: {activation}")
  return A_out

###Assumptions
For the neural network to work correctly it is assumed that the output layer uses the "sigmoid" activation when the cost is "binary_cross_entropy" and the "softmax" activation when the cost is "cross_entropy".

In [None]:
def b_prop(X,Y,W_list,b_list,activation,cost,l,a_out_l,z_list):
  """
  Back propagation: gradients of the (regularized) cost for every layer
  Arguments
  X(2D numpy array of examples, only its number of rows m is used) Shape(m,n)
  Y(targets) Shape(m,1)
  W_list,b_list(weights and biases of every layer, b_list is not used)
  activation(list with the activation name of every layer)
  cost("mean_squared_error","cross_entropy", anything else is handled like binary cross entropy)
  l(regularization parameter) scalar
  a_out_l,z_list(lists returned by f_prop: X followed by the activation / z of every layer)
  Returns
  dj_dw,dj_db(lists of gradients ordered from the first to the last layer)
  """
  m,_ = X.shape
  n_layers = len(activation)
  #For both cross entropy costs the derivative of the output activation is already contained in
  #(prediction - target), so the output layer is treated as "linear"
  if cost == "mean_squared_error":
    out_act = activation[-1]
    targets = Y
  elif cost == "cross_entropy":
    out_act = "linear"
    labels_frame = pd.DataFrame(Y,columns = ["Category"])
    targets = np.array(pd.get_dummies(labels_frame,columns = ["Category"])) #One hot labels
  else:
    out_act = "linear"
    targets = Y
  grads_w = []
  grads_b = []
  #Output layer
  delta = (a_out_l[-1]-targets)/m
  delta = delta*func_d(z_list[-1],out_act)
  grad_w = np.matmul((a_out_l[-2]).T,delta)
  grad_b = np.sum(delta,axis=0).reshape(1,-1)
  grad_w += (l/m)*W_list[-1]
  grads_w.append(grad_w)
  grads_b.append(grad_b)
  #Hidden layers, walking from the last hidden layer back to the first (nothing to do for a 1 layer network)
  for depth in range(2,n_layers+1):
    delta = np.matmul(delta,(W_list[1-depth]).T) #Error pushed back through the weights of the following layer
    delta = delta*func_d(z_list[-depth],activation[-depth])
    grad_w = np.matmul((a_out_l[-(depth+1)]).T,delta)
    grad_b = np.sum(delta,axis=0).reshape(1,-1)
    grad_w += (l/m)*W_list[-depth]
    grads_w.append(grad_w)
    grads_b.append(grad_b)
  #The gradients were collected from the last layer to the first, so the order is turned around
  return grads_w[::-1],grads_b[::-1]

##Gradient Descent

In [None]:
def gradient_descent_nn(X,Y,W_list,b_list,alpha,iters,ratio,l,cost,activation):
  """
  Batch gradient descent for the neural network
  Arguments
  X(2D numpy array of examples) Shape(m,n)
  Y(targets) Shape(m,1)
  W_list,b_list(initial weights and biases of every layer, the lists of the caller are left unchanged)
  alpha(learning rate) scalar
  iters(number of iterations) scalar
  ratio(fraction of the data used as train set, handed to train_test_split) scalar
  l(regularization parameter, only used in the gradients) scalar
  cost(name of the cost function as used by compute_cost and b_prop)
  activation(list with the activation name of every layer)
  Returns
  W_list,b_list(final parameters) and J_train_history,J_cv_history(cost after every iteration)
  """
  import math #np.math was only an alias of the standard math module and is not available in NumPy 2.0
  J_train_history = [] #We create a list containg cost after every iteration(For later plotting and analysis)
  J_cv_history = []
  #Both lists are copied: the updates below assign into them and would otherwise overwrite
  #the entries of the list handed in by the caller (before, only W_list was copied)
  W_list = list(W_list)
  b_list = list(b_list)
  X_train,X_cv,Y_train,Y_cv = train_test_split(X,Y,ratio)
  m,n = X_train.shape #Getting the number of training examples
  log_every = math.ceil(iters / 10) #Cost is printed once every log_every iterations
  for i in range(iters):
    #Compute gradient dj_dw and dj_db
    f_wb,z_list,a_out_l = f_prop(X_train,W_list,b_list,activation)
    dj_dw,dj_db = b_prop(X_train,Y_train,W_list,b_list,activation,cost,l,a_out_l,z_list)
    #Update parameters W,b (Simultaneously)
    for j in range(len(W_list)):
      W_list[j] = W_list[j] - alpha*(dj_dw[j])
      b_list[j] = b_list[j] - alpha*(dj_db[j])
    #Record Cost J after every iteration
    #Note:- Even though we have applied regularization the method for computing training and testing error is still same
    J_train_history.append(compute_cost(X_train,Y_train,W_list,b_list,activation,cost)) #Add the current cost to J_history
    J_cv_history.append(compute_cost(X_cv,Y_cv,W_list,b_list,activation,cost))
    # Print cost 10 times during the run,[-1] refers to the last element(reverse indexing)
    if i%log_every == 0:
      print("Iteration :",i," Cost :",J_train_history[-1])
  return W_list, b_list, J_train_history,J_cv_history #return final w,b and J_history(for graphing)

##Unilayer regression by neural network

###Neural network trained on linear train data(single layer)

In [None]:
#Now we load and print our training data
df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/linear_train.csv") #Loading data from google drive
linear_train_data = np.array(df) #Converting loaded data in a matrix
X_train_l = linear_train_data[:,1:21] #Extracting X_train(Input data)
Y_train_l = linear_train_data[:,21] ##Extracting Y_train(Labels)
Y_train_l = Y_train_l.reshape(Y_train_l.size,1)
# Printing the first 5 rows of our loaded data
print("Input\n",X_train_l[:5])
print("labels\n",Y_train_l[:5])
print(X_train_l.shape)
print(Y_train_l.shape)

In [None]:
#Normalizing the data
X_train_ln,u,s = z_score_normalization(X_train_l)
print("Normalized Data:-",X_train_ln[:5])
print("Mean of orignal data:-",u)
print("std_dev of orignal data:-",s)

In [None]:
units_lin = [20,1]
activation_lin = ["relu","linear"]
W_lin,b_lin = gen_w_b(X_train_ln,units_lin)

In [None]:
#Running gradient descent for the neural network on the normalized linear data
W_lin,b_lin,J_train_history,J_cv_history = gradient_descent_nn(
    X_train_ln,Y_train_l,W_lin,b_lin,
    0.001,1000,0.8,0,
    "mean_squared_error",activation_lin)
#Plot the training and cross validation cost on one set of axes
fig, ax = plt.subplots(figsize=(12,4))
ax.plot(J_train_history , label = "J train")
ax.plot(J_cv_history , label = "J cv")
ax.set_title("Cost v.s iteration")
ax.set_ylabel("Cost")
ax.set_xlabel("Number of iterations")
ax.legend()
plt.show()

###Neural network trained on polynomial train data(single layer)

In [None]:
df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/polynomial_train.csv") #Loading data from google drive
polynomial_train_data = np.array(df) #Converting loaded data in a matrix
X_train_p = polynomial_train_data[:,1:4]
Y_train_p = polynomial_train_data[:,4]
Y_train_p = Y_train_p.reshape(Y_train_p.size,1)

In [None]:
print(X_train_p[:5])
print(Y_train_p[:5])

In [None]:
def combo(k,i):
  """
  All ways to write i as an ordered sum of k non negative integers
  (used as the exponents of the k features in the terms of total degree i)
  Arguments
  k(number of integers in every sum, at least 1) scalar
  i(value every row has to add up to) scalar
  Returns
  matrix(int numpy array, one row per way) Shape(number of ways,k)
  Example: combo(2,2) = [[2,0],[1,1],[0,2]]
  """
  if(k == 1):
    return np.array([[i]],dtype=int)
  #The blocks are collected in a list and joined once, the empty block keeps the (0,k) int result when i<0
  blocks = [np.empty((0,k),dtype=int)]
  for j in range(i+1):
    rest = combo(k-1,j) #Computed once per j (it was evaluated twice before, doubling the work on every recursion level)
    first = np.full([rest.shape[0],1],i-j,dtype=int) #The first integer takes what is left of i
    blocks.append(np.append(first,rest,axis=1))
  return np.concatenate(blocks,axis=0)

def get_terms(X,d):
  """
  Build every monomial of the columns of X with total degree 1..d.

  Arguments
  X(2D numpy array of examples) Shape(m,n)
  d(highest total degree to generate) int

  Returns
  e(2D numpy array with one column per monomial) Shape(m,number of monomials)
  Columns are grouped by degree (1 first, d last); inside a degree they follow
  the row order of combo(n,degree). The constant (degree 0) term is not included.
  """
  m,n = X.shape
  # The empty float block keeps the original result dtype and makes d < 1 return shape (m,0)
  columns = [np.empty((m,0))]
  for degree in range(1,d+1):
    # Each row of combo(n,degree) is one exponent vector (one exponent per input column)
    for exponents in combo(n,degree):
      term = np.prod(np.power(X,exponents),axis=1)
      columns.append(term.reshape(m,1))
  # Concatenate once at the end instead of re-allocating the matrix for every new column
  return np.concatenate(columns,axis=1)

In [None]:
#Getting the terms for a degree-5 polynomial (get_terms is called with d=5)
X_train_pm = get_terms(X_train_p,5)
print(X_train_pm[:5])
print(X_train_pm.shape)

In [None]:
#Normalizing the data
# u and s are the per-feature mean and standard deviation of the polynomial terms
# (these names are re-bound here; they previously held the statistics of the linear data)
X_train_pmn,u,s = z_score_normalization(X_train_pm)
print("Normalized Data:-",X_train_pmn[:5])
print("Mean of orignal data:-",u)
print("std_dev of orignal data:-",s)

In [None]:
# Architecture: one hidden layer of 20 ReLU units followed by a single linear output unit
units_poly = [20,1]
activation_poly = ["relu","linear"]
# gen_w_b returns one weight matrix and one bias row per entry of units_poly
W_poly,b_poly = gen_w_b(X_train_pmn,units_poly)

In [None]:
#Takes a lot of time
# Positional arguments after b_poly (per the gradient_descent_nn signature in this notebook):
# alpha=0.0000001, iters=1000, ratio=0.8 (train share of the split), l=0 (regularization strength)
W_poly,b_poly,J_train_history,J_cv_history = gradient_descent_nn(X_train_pmn,Y_train_p,W_poly,b_poly,0.0000001,1000,0.8,0,"mean_squared_error",activation_poly)
# Plot the training and cross-validation cost recorded at every iteration
plt.figure(figsize=(12,4))
plt.plot(J_train_history , label = "J train")
plt.plot(J_cv_history , label = "J cv")
plt.title("Cost v.s iteration")
plt.ylabel("Cost")
plt.xlabel("Number of iterations")
plt.legend()
plt.show()

### Neural network trained on classification data (single layer)

In [None]:
# Load the classification training CSV from the mounted Google Drive.
df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/classification_train.csv") #Loading data from google drive
classification_train_data = np.array(df) #Converting loaded data in a matrix
X_train_c = classification_train_data[:,2:786] #Columns 2..785: 784 features per example (later reshaped to 28x28 for display)
Y_train_c = classification_train_data[:,1] #Column 1 holds the class label (column 0 is not used here)
Y_train_c = Y_train_c.reshape(Y_train_c.size,1) #Turn the labels into a column vector of shape (m,1)

In [None]:
# Show the first 5 rows of the classification inputs and labels as a sanity check
print(X_train_c[:5])
print(Y_train_c[:5])

In [None]:
#Normalizing the data
# u and s (per-feature mean and standard deviation) are re-bound here but not printed
X_train_cn,u,s = z_score_normalization(X_train_c)
print("Normalized Data:-",X_train_cn[:5])

In [None]:
# Architecture: one hidden layer of 100 ReLU units followed by a 10-unit softmax output (one unit per class)
units_cl = [100,10]
activation_cl = ["relu","softmax"]
# gen_w_b returns one weight matrix and one bias row per entry of units_cl
W_cl,b_cl = gen_w_b(X_train_cn,units_cl)

In [None]:
# Train the single-hidden-layer classifier.
# Positional arguments after b_cl (per the gradient_descent_nn signature in this notebook):
# alpha=0.1, iters=1000, ratio=0.8 (train share of the split), l=0 (regularization strength)
W_cl,b_cl,J_train_history,J_cv_history = gradient_descent_nn(X_train_cn,Y_train_c,W_cl,b_cl,0.1,1000,0.8,0,"cross_entropy",activation_cl)
# Plot the training and cross-validation cost recorded at every iteration
plt.figure(figsize=(12,4))
plt.plot(J_train_history , label = "J train")
plt.plot(J_cv_history , label = "J cv")
plt.title("Cost v.s iteration")
plt.ylabel("Cost")
plt.xlabel("Number of iterations")
plt.legend()
plt.show()

## Generalizing the neural network to n layers using classes

In [None]:
def z_score_normalization(X):
  """
  Standardize every column of X to zero mean and unit (population) standard deviation.

  Arguments
  X(2D numpy array of examples) Shape(m,n)

  Returns
  X_normalized(standardized copy of X) Shape(m,n)
  mean(per-feature mean of X) Shape(1,n)
  std_dev(per-feature divisor that was applied) Shape(1,n)
  For a constant feature the divisor is 1 instead of 0, so that column becomes
  all zeros rather than NaN. Reuse mean and std_dev to normalize new data the same way.
  """
  mean = np.mean(X,axis=0).reshape(1,-1)
  # np.std works on deviations from the mean, which avoids the cancellation (and possible
  # negative value under the square root) of the E[X^2] - E[X]^2 formula
  std_dev = np.std(X,axis=0).reshape(1,-1)
  # A feature with no spread cannot be rescaled; dividing by 1 leaves it centred at 0
  std_dev = np.where(std_dev == 0,1.0,std_dev)
  X_normalized = (X - mean)/std_dev
  return X_normalized, mean, std_dev

def train_test_split(X,Y,ratio):
    """
    Shuffle the examples with a fixed seed and cut them into a training part
    and a cross-validation part.

    Arguments
    X(2D numpy array of examples) Shape(m,n)
    Y(2D numpy array of labels, one column) Shape(m,1)
    ratio(fraction of the m examples that goes to the training part) scalar

    Returns
    X_train_r, X_cv_r, Y_train_r, Y_cv_r
    The first round(m*ratio) shuffled rows form the training part, the rest the
    cross-validation part. Labels come back as a column and share the dtype of
    the stacked [X|Y] matrix.

    Side effect: re-seeds numpy's global random generator with 261.
    """
    m,_ = X.shape
    n_train = round(m*ratio)
    # Stack labels next to the features so each row keeps its own label while shuffling
    stacked = np.concatenate((X,Y),axis=1)
    np.random.seed(261)
    np.random.shuffle(stacked)
    features = stacked[:,:-1]
    labels = stacked[:,-1].reshape(-1,1)
    return features[:n_train],features[n_train:],labels[:n_train],labels[n_train:]

def dense(A_in,W,b,activation,z_list):
  """
  Forward pass through one fully connected layer.

  Arguments
  A_in(activations coming from the previous layer) Shape(m,n_in)
  W(weight matrix of this layer) Shape(n_in,n_out)
  b(bias row of this layer) Shape(1,n_out)
  activation(one of "sigmoid", "linear", "relu", "tanh", "softmax") str
  z_list(list collecting the pre-activation of every layer; appended to in place) list

  Returns
  A_out(activations of this layer) Shape(m,n_out)
  z_list(the same list object, now ending with this layer's z)

  Raises
  ValueError if activation is not one of the supported names.
  """
  z = np.matmul(A_in,W) + b
  if activation == "sigmoid":
    A_out = 1/(1+np.exp(-z))
  elif activation == "linear":
    A_out = z
  elif activation == "relu":
    A_out = np.maximum(z,0)
  elif activation == "tanh":
    # np.tanh saturates at +-1 where the exp-based formula would evaluate inf/inf
    A_out = np.tanh(z)
  elif activation == "softmax":
    # Subtracting the row maximum does not change the softmax but keeps exp from overflowing
    exp_z = np.exp(z - np.max(z,axis=1,keepdims=True))
    A_out = exp_z/np.sum(exp_z,axis=1,keepdims=True)
  else:
    raise ValueError(f"Unsupported activation: {activation!r}")
  # Record z only after the activation name has been validated
  z_list.append(z)
  return A_out,z_list

def gen_w_b(X,units,seed=111,scale=1):
  """
  Draw reproducible random initial weights and biases for a fully connected network.

  Arguments
  X(2D numpy array of examples; only its number of columns is used) Shape(m,n)
  units(number of units in each layer, in order) list of int
  seed(value used to re-seed numpy's global random generator) int, default 111
  scale(factor applied to every drawn value) scalar, default 1

  Returns
  W_list(one weight matrix per layer) Shapes (n,units[0]), (units[0],units[1]), ...
  b_list(one bias row per layer) Shapes (1,units[0]), (1,units[1]), ...
  Every entry is scale*(standard normal sample - 0.5).

  Side effect: re-seeds numpy's global random generator with seed.
  """
  _,n = X.shape
  np.random.seed(seed)
  # Input width of each layer: the feature count, then the width of the previous layer
  fan_in = [n] + list(units[:-1])
  W_list = []
  b_list = []
  for n_in,n_out in zip(fan_in,units):
    # Draw order (weights, then bias, layer by layer) determines the generated values
    W_list.append(scale*(np.random.randn(n_in,n_out)-0.5))
    b_list.append(scale*(np.random.randn(1,n_out)-0.5))
  return W_list,b_list

def f_prop(X,W_list,b_list,activation):
  """
  Forward propagation through every layer of the network.

  Arguments
  X(2D numpy array of examples) Shape(m,n)
  W_list(weight matrix of every layer) list
  b_list(bias row of every layer) list
  activation(activation name of every layer; its length sets the number of layers) list of str

  Returns
  a_out(activations of the last layer)
  z_list(X followed by the pre-activation z of every layer)
  a_out_l(X followed by the activations of every layer)
  X sits at index 0 of both lists so that index k refers to layer k.
  """
  z_list = [X]
  a_out_l = [X]
  layer_output = X
  for idx,act_name in enumerate(activation):
    # dense appends this layer's z to z_list and hands the same list back
    layer_output,z_list = dense(layer_output,W_list[idx],b_list[idx],act_name,z_list)
    a_out_l.append(layer_output)
  return layer_output,z_list,a_out_l

def compute_cost(X,Y,W_list,b_list,activation,cost):
  """
  Evaluate the (unregularized) cost of the network on the given data.

  Arguments
  X(2D numpy array of examples) Shape(m,n)
  Y(labels) Shape(m,1); for "cross_entropy" these are integer class indices 0..k-1
  W_list, b_list(parameters of every layer) list
  activation(activation name of every layer) list of str
  cost("mean_squared_error", "binary_cross_entropy" or "cross_entropy") str

  Returns
  Scalar cost:
    mean_squared_error   -> sum((f_wb - Y)^2) / (2m)
    binary_cross_entropy -> mean of -Y*log(f_wb) - (1-Y)*log(1-f_wb)
    cross_entropy        -> mean of -log(probability assigned to the true class)

  Raises
  ValueError if cost is not one of the supported names (previously the cost
  string itself was returned silently).
  """
  m = len(Y)
  if cost not in ("mean_squared_error","binary_cross_entropy","cross_entropy"):
    raise ValueError(f"Unsupported cost: {cost!r}")
  # One forward pass serves every cost type
  f_wb,z_list,a_out_l = f_prop(X,W_list,b_list,activation)
  if cost == "mean_squared_error":
    # The reshape doubles as a check that prediction and label shapes line up as (m,1)
    err = ((f_wb - Y)**2).reshape(m,1)
    return np.sum(err)/(2*m)
  if cost == "binary_cross_entropy":
    loss = -Y*np.log(f_wb)-(1-Y)*np.log(1-f_wb)
    return np.sum(loss)/m
  # cross_entropy: pick, for every example, the predicted probability of its true class
  labels = np.array(Y,dtype="int64").reshape(-1,1)
  picked = np.take_along_axis(f_wb,labels,axis=1)
  loss = (-1)*np.log(picked)
  return np.sum(loss)/m

def func_d(z,activation):
  """
  Element-wise derivative of an activation function with respect to its input z.

  Arguments
  z(pre-activation values of one layer) Shape(m,units)
  activation(one of "sigmoid", "linear", "relu", "tanh", "softmax") str

  Returns
  A_out(derivative evaluated at z) same shape as z
  For "softmax" only the diagonal term a*(1-a) of the Jacobian is returned.

  Raises
  ValueError if activation is not one of the supported names.
  """
  if activation == "sigmoid":
    # The sigmoid derivative is an even function of z, so evaluating it at -|z|
    # gives the same value while exp can no longer overflow (which produced inf/inf)
    e = np.exp(-np.abs(z))
    A_out = e/(1+e)**2
  elif activation == "linear":
    A_out = np.full(z.shape,1)
  elif activation == "relu":
    # Vectorized step function: 1 where z > 0, otherwise 0 (integer result)
    A_out = np.where(z > 0,1,0)
  elif activation == "tanh":
    A_out = 1 - np.tanh(z)**2
  elif activation == "softmax":
    # Shift by the row maximum so exp stays finite; the softmax value is unchanged
    e = np.exp(z - np.max(z,axis=1,keepdims=True))
    a_out = e/np.sum(e,axis=1,keepdims=True)
    A_out = a_out - (a_out)**2
  else:
    raise ValueError(f"Unsupported activation: {activation!r}")
  return A_out

def b_prop(X,Y,W_list,b_list,activation,cost,l,a_out_l,z_list):
  """
  Back-propagation: gradients of the regularized cost for every layer.

  Arguments
  X(2D numpy array of examples; only its row count m is used) Shape(m,n)
  Y(labels) Shape(m,1); for "cross_entropy" integer class indices 0..k-1
  W_list(weight matrix of every layer) list
  b_list(bias row of every layer; not used, kept for a uniform call signature) list
  activation(activation name of every layer) list of str
  cost("mean_squared_error", "cross_entropy" or anything else, e.g. "binary_cross_entropy") str
  l(regularization strength; adds (l/m)*W to every weight gradient) scalar
  a_out_l(X followed by the activations of every layer, as returned by f_prop) list
  z_list(X followed by the pre-activations of every layer, as returned by f_prop) list

  Returns
  dj_dw(gradient for every weight matrix, first layer first) list
  dj_db(gradient for every bias row, first layer first) list
  """
  m,_ = X.shape
  n = len(activation)
  if cost == "mean_squared_error":
    act = activation[-1]
    Y_s = Y
  elif cost == "cross_entropy":
    # For the cross-entropy costs the output delta is simply (a - y),
    # hence the "linear" derivative (all ones) for the last layer
    act = "linear"
    # One-hot encode against the width of the output layer, so a class that is
    # missing from this split cannot shrink the target matrix
    n_classes = a_out_l[-1].shape[1]
    Y_s = np.eye(n_classes)[np.asarray(Y,dtype="int64").reshape(-1)]
  else:
    act = "linear"
    Y_s = Y
  # Output layer
  s = (a_out_l[-1]-Y_s)/m
  s = s*func_d(z_list[-1],act)
  dj_dw = [np.matmul((a_out_l[-2]).T,s) + (l/m)*W_list[-1]]
  dj_db = [np.sum(s,axis=0).reshape(1,-1)]
  # Hidden layers, walking from the last hidden layer back to the first
  # (the range is empty for a one-layer network)
  for i in range(2,n+1):
    # W_list[1-i] is the weight matrix of the layer just processed (the one after layer -i)
    s = np.matmul(s,(W_list[1-i]).T)
    s = s*func_d(z_list[-i],activation[-i])
    dj_dw.append(np.matmul((a_out_l[-(i+1)]).T,s) + (l/m)*W_list[-i])
    dj_db.append(np.sum(s,axis=0).reshape(1,-1))
  # Gradients were collected last layer first; return them first layer first
  dj_dw.reverse()
  dj_db.reverse()
  return dj_dw,dj_db

def gradient_descent_nn(X,Y,W_list,b_list,alpha,iters,ratio,l,cost,activation):
  """
  Train the network with batch gradient descent on a train / cross-validation split.

  Arguments
  X(2D numpy array of examples) Shape(m,n)
  Y(labels) Shape(m,1)
  W_list, b_list(initial parameters of every layer; the passed lists are not modified) list
  alpha(learning rate) scalar
  iters(number of gradient descent iterations) int
  ratio(fraction of the examples used for training, the rest is cross-validation) scalar
  l(regularization strength forwarded to b_prop) scalar
  cost(cost name forwarded to b_prop and compute_cost) str
  activation(activation name of every layer) list of str

  Returns
  W_list, b_list(trained parameters)
  J_train_history, J_cv_history(cost on the training and cross-validation part
  after every iteration)

  Prints the training cost roughly ten times over the course of the run.
  """
  J_train_history = []
  J_cv_history = []
  # Copy both lists so that re-assigning their entries below never touches the caller's lists
  W_list = list(W_list)
  b_list = list(b_list)
  X_train,X_cv,Y_train,Y_cv = train_test_split(X,Y,ratio)
  # Log every ceil(iters/10) iterations; np.ceil is used because np.math no longer exists in NumPy 2
  log_every = max(1,int(np.ceil(iters / 10)))
  for i in range(iters):
    f_wb,z_list,a_out_l = f_prop(X_train,W_list,b_list,activation)
    dj_dw,dj_db = b_prop(X_train,Y_train,W_list,b_list,activation,cost,l,a_out_l,z_list)
    for j in range(len(W_list)):
      W_list[j] = W_list[j] - alpha*(dj_dw[j])
      b_list[j] = b_list[j] - alpha*(dj_db[j])
    # Costs are measured with the parameters just updated in this iteration
    J_train_history.append(compute_cost(X_train,Y_train,W_list,b_list,activation,cost))
    J_cv_history.append(compute_cost(X_cv,Y_cv,W_list,b_list,activation,cost))
    if i % log_every == 0:
      print("Iteration :",i," Cost :",J_train_history[-1])
  return W_list, b_list, J_train_history,J_cv_history

class NeuralNetwork:
  """
  Fully connected neural network with an arbitrary number of layers, built on the
  notebook's helper functions (z_score_normalization, gen_w_b, f_prop,
  compute_cost, gradient_descent_nn).

  Attributes
  X          z-score normalized inputs (normalization happens in the constructor)
  mean       per-feature mean used for that normalization
  std_dev    per-feature standard deviation used for that normalization
  Y          labels
  units      number of units of every layer
  activation activation name of every layer
  cost       cost name ("mean_squared_error", "binary_cross_entropy", "cross_entropy")
  w, b       current parameters of every layer
  alpha, iters, ratio, l   hyper-parameters handed to gradient_descent_nn
  J_train_history, J_cv_history   cost curves of the most recent gradient_descent call
  """
  def __init__(self,X,Y,units,activation,cost,alpha,iters,ratio,l):
    """
    Arguments
    X(2D numpy array of examples; normalized here before being stored) Shape(m,n)
    Y(labels) Shape(m,1)
    units(number of units of every layer) list of int
    activation(activation name of every layer) list of str
    cost(name of the cost function) str
    alpha(learning rate) scalar
    iters(iterations per gradient_descent call) int
    ratio(training share of the train / cross-validation split) scalar
    l(regularization strength) scalar
    """
    # Keep the statistics so new data can be normalized exactly like the training data
    self.X,self.mean,self.std_dev = z_score_normalization(X)
    self.Y = Y
    self.units = units
    self.activation = activation
    self.cost = cost
    self.w,self.b = gen_w_b(self.X,self.units)
    self.alpha = alpha
    self.iters = iters
    self.ratio = ratio
    self.l = l
    self.J_train_history = []
    self.J_cv_history = []

  def predict(self):
    """
    Predict for the data currently stored in self.X.

    Returns
    binary_cross_entropy -> 0/1 labels (output >= 0.5 maps to 1) Shape(m,1)
    cross_entropy        -> index of the most probable class Shape(m,1)
    any other cost       -> raw output of the last layer
    """
    a_out,_,_ = f_prop(self.X,self.w,self.b,self.activation)
    if self.cost == "binary_cross_entropy":
      A_out = np.where(a_out >= 0.5,1,0)
    elif self.cost == "cross_entropy":
      A_out = a_out.argmax(axis=1).reshape(-1,1)
    else:
      A_out = a_out
    return A_out

  def compute_cost(self):
    """Cost of the current parameters on self.X / self.Y."""
    return compute_cost(self.X,self.Y,self.w,self.b,self.activation,self.cost)

  def gradient_descent(self):
    """
    Run self.iters iterations of gradient descent starting from the current
    parameters, store the result on the instance and return
    (w, b, J_train_history, J_cv_history).
    """
    self.w,self.b,self.J_train_history,self.J_cv_history = gradient_descent_nn(self.X,self.Y,self.w,self.b,self.alpha,self.iters,self.ratio,self.l,self.cost,self.activation)
    return self.w,self.b,self.J_train_history,self.J_cv_history

  def accuracy(self):
    """
    Score of the current parameters on self.X / self.Y.

    Returns
    classification costs -> fraction of examples whose prediction equals the label
    any other cost       -> 1 - 2*cost/var(Y), i.e. the R^2 score when the cost
                            is the halved mean squared error
    """
    m,n = self.Y.shape
    if self.cost == "binary_cross_entropy" or self.cost == "cross_entropy":
      cnt = (self.predict() == self.Y).sum()
      acc = cnt/(m)
    else:
      mean = np.sum(self.Y)/m
      sq_mean = np.sum((self.Y)**2)/m
      var_y = sq_mean - (mean)**2
      # compute_cost returns MSE/2, hence the factor 2
      acc = 1 -2*(self.compute_cost())/(var_y)
    return acc

  def reset_model(self):
    """Re-draw the initial parameters and clear both cost histories."""
    self.w,self.b = gen_w_b(self.X,self.units)
    self.J_train_history = []
    self.J_cv_history = []

### Regression neural network for linear data

In [None]:
# Number of examples that end up in the training part when 80% of the data is used for training
m,n = X_train_l.shape
print(0.8*m)

In [None]:
#Defining the network
# Layers: 20 -> 10 -> 5 ReLU units and 1 linear output unit; cost = mean squared error,
# alpha=0.0001, iters=1000, ratio=0.8, l=0. The raw X_train_l is passed because the
# constructor normalizes its input itself.
nn_l = NeuralNetwork(X_train_l,Y_train_l,[20,10,5,1],["relu","relu","relu","linear"],"mean_squared_error",0.0001,1000,0.8,0)

In [None]:
#Prediction
# Output of the network with its randomly initialized parameters (no training has run yet)
print(nn_l.predict())

In [None]:
#Computing cost
# Cost of the current parameters on the full data stored in the model
print(nn_l.compute_cost())

In [None]:
#Running gradient descent
# reset_model re-draws the initial parameters, so re-running this cell restarts training from scratch
nn_l.reset_model()
# gradient_descent already stores w and b on the instance; the histories are also kept as notebook globals
nn_l.w,nn_l.b,J_train_history,J_cv_history = nn_l.gradient_descent()
# Plot the training and cross-validation cost recorded at every iteration of this run
plt.figure(figsize=(12,4))
plt.plot(J_train_history , label = "J train")
plt.plot(J_cv_history , label = "J cv")
plt.title("Cost v.s iteration")
plt.ylabel("Cost")
plt.xlabel("Number of iterations")
plt.legend()
plt.show()

In [None]:
#Running for another 1000 iterations
# No reset here: training continues from the current nn_l.w / nn_l.b
nn_l.w,nn_l.b,J_train_history,J_cv_history = nn_l.gradient_descent()
# The histories (and therefore the plot) cover only this run, not the earlier iterations
plt.figure(figsize=(12,4))
plt.plot(J_train_history , label = "J train")
plt.plot(J_cv_history , label = "J cv")
plt.title("Cost v.s iteration")
plt.ylabel("Cost")
plt.xlabel("Number of iterations")
plt.legend()
plt.show()

In [None]:
#Running for another 500 iterations
# iters is changed on the instance, so later gradient_descent calls also run 500 iterations
nn_l.iters = 500
nn_l.w,nn_l.b,J_train_history,J_cv_history = nn_l.gradient_descent()
# The histories (and therefore the plot) cover only this run
plt.figure(figsize=(12,4))
plt.plot(J_train_history , label = "J train")
plt.plot(J_cv_history , label = "J cv")
plt.title("Cost v.s iteration")
plt.ylabel("Cost")
plt.xlabel("Number of iterations")
plt.legend()
plt.show()

### Computing accuracy

In [None]:
#Computing accuracy
# For a mean-squared-error model accuracy() returns 1 - 2*cost/var(Y), evaluated on
# all data stored in the model (training and cross-validation part together)
print("Accuracy:",nn_l.accuracy()*100,"%")

### Making a prediction on the test dataset

In [None]:
#Now we load and print our test data
df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/linear_test_data.csv") #Loading data from google drive
linear_test_data = np.array(df) #Converting loaded data in a matrix
test_id = linear_test_data[:,0] #Column 0 is used as the id of every test example
test_id = test_id.reshape(test_id.size,1) #Column vector, so it can be placed next to the predictions
X_test = linear_test_data[:,1:21] #Columns 1..20 are the 20 input features
# Printing the first 5 rows of our loaded data
print("Input:-\n",X_test[:5])
print("Shape of test data:-",X_test.shape)
print("Test Id's:-\n",test_id[:5])

In [None]:
#Before making a prediction we first have to normalize our data
# The test data is scaled with the mean / std-dev of the TRAINING data
X_train_normalized,mean,std_dev = z_score_normalization(X_train_l)
X_test_normalized = (X_test -mean)/std_dev
# NOTE(review): this replaces the data stored in the model; from here on nn_l.predict(),
# nn_l.compute_cost() and nn_l.accuracy() operate on the test inputs, not the training inputs
nn_l.X = X_test_normalized
Y_test_prediction = nn_l.predict()
#We also append the id's of data with our prediction
prediction = np.append(test_id,Y_test_prediction,axis = 1)
print(prediction[:5])

Now finally we export our prediction back into a csv file

In [None]:
# Write the (id, prediction) pairs to a CSV file in the current working directory
dfp=pd.DataFrame(prediction) #Convert matrix to dataframe
dfp.columns = ["ids","prediction"] #Set labels to our dataframe
dfp.to_csv("neural_network_linear_test_prediction.csv",index=False)#Convert dataframe to csv file

Also, for ease of access, we store our trained w and b in npz files

In [None]:
#These files can later be loaded using np.load()
# np.savez appends ".npz" to the name; every array passed positionally is stored
# under the key arr_0, arr_1, ... in layer order
np.savez("neural_network_linear_w",*nn_l.w)
np.savez("neural_network_linear_b",*nn_l.b)

In [None]:
# Read the saved parameters back to check the round trip.
# The archives are opened in a with-statement so both files are closed again afterwards.
with np.load("neural_network_linear_w.npz") as w_ans, np.load("neural_network_linear_b.npz") as b_ans:
  # Arrays saved positionally by np.savez are stored under the keys arr_0, arr_1, ...
  for i in range(len(w_ans)):
    print(f"Weight {i}:-\n",w_ans[f'arr_{i}'])
    print(f"Bias {i}:-\n",b_ans[f'arr_{i}'])

### Regression neural network for polynomial data

In [None]:
def gen_w_b(X,units):
  """
  Draw reproducible random initial weights and biases (seed 793).

  Arguments
  X(2D numpy array of examples; only its number of columns is used) Shape(m,n)
  units(number of units in each layer, in order) list of int

  Returns
  W_list(one weight matrix per layer) Shapes (n,units[0]), (units[0],units[1]), ...
  b_list(one bias row per layer) Shapes (1,units[0]), (1,units[1]), ...
  Every entry is a standard normal sample shifted by -0.5.

  Side effect: re-seeds numpy's global random generator with 793.
  """
  _,n_features = X.shape #the first dimension is the no. of training examples
  np.random.seed(793)
  weights = []
  biases = []
  # Pair the input width of each layer with its own width
  layer_shapes = zip([n_features]+list(units[:-1]),units)
  for n_in,n_out in layer_shapes:
    # Draw order (weights, then bias, layer by layer) determines the generated values
    weights.append(np.random.randn(n_in,n_out)-0.5)
    biases.append(np.random.randn(1,n_out)-0.5)
  return weights,biases

In [None]:
# Shape of the normalized polynomial features and the number of examples in an 80% training split
print(X_train_pmn.shape)
m,n = X_train_pmn.shape
print(0.8*m)

In [None]:
#Defining the network
# Layers: 50 -> 20 ReLU units and 1 linear output unit; cost = mean squared error,
# alpha=0.000000001, iters=100, ratio=0.8, l=0.
# NOTE(review): X_train_pmn is already z-score normalized and the constructor normalizes
# its input again - confirm that normalizing twice is intended.
nn_p = NeuralNetwork(X_train_pmn,Y_train_p,[50,20,1],["relu","relu","linear"],"mean_squared_error",0.000000001,100,0.8,0)

In [None]:
#Prediction
# Output of the network with its randomly initialized parameters (no training has run yet)
print(nn_p.predict())

In [None]:
#Computing cost
# Cost of the current parameters on the full data stored in the model
print(nn_p.compute_cost())

In [None]:
#Running gradient descent
# reset_model re-draws the initial parameters, so re-running this cell restarts training from scratch
nn_p.reset_model()
# gradient_descent already stores w and b on the instance; the histories are also kept as notebook globals
nn_p.w,nn_p.b,J_train_history,J_cv_history = nn_p.gradient_descent()
# Plot the training and cross-validation cost recorded at every iteration of this run
plt.figure(figsize=(12,4))
plt.plot(J_train_history , label = "J train")
plt.plot(J_cv_history , label = "J cv")
plt.title("Cost v.s iteration")
plt.ylabel("Cost")
plt.xlabel("Number of iterations")
plt.legend()
plt.show()

In [None]:
# Continue training from the current parameters for another nn_p.iters iterations (no reset)
nn_p.w,nn_p.b,J_train_history,J_cv_history = nn_p.gradient_descent()
# The histories (and therefore the plot) cover only this run
plt.figure(figsize=(12,4))
plt.plot(J_train_history , label = "J train")
plt.plot(J_cv_history , label = "J cv")
plt.title("Cost v.s iteration")
plt.ylabel("Cost")
plt.xlabel("Number of iterations")
plt.legend()
plt.show()

In [None]:
# Raise the number of iterations per run to 1000 and keep training from the current parameters
nn_p.iters = 1000
nn_p.w,nn_p.b,J_train_history,J_cv_history = nn_p.gradient_descent()
# The histories (and therefore the plot) cover only this run
plt.figure(figsize=(12,4))
plt.plot(J_train_history , label = "J train")
plt.plot(J_cv_history , label = "J cv")
plt.title("Cost v.s iteration")
plt.ylabel("Cost")
plt.xlabel("Number of iterations")
plt.legend()
plt.show()

In [None]:
# Another 1000 iterations from the current parameters.
# NOTE(review): this cell repeats the previous one verbatim; a small helper that trains
# and plots would remove the duplication.
nn_p.iters = 1000
nn_p.w,nn_p.b,J_train_history,J_cv_history = nn_p.gradient_descent()
plt.figure(figsize=(12,4))
plt.plot(J_train_history , label = "J train")
plt.plot(J_cv_history , label = "J cv")
plt.title("Cost v.s iteration")
plt.ylabel("Cost")
plt.xlabel("Number of iterations")
plt.legend()
plt.show()

In [None]:
# Another 1000 iterations from the current parameters (same steps as the previous cell)
nn_p.iters = 1000
nn_p.w,nn_p.b,J_train_history,J_cv_history = nn_p.gradient_descent()
# The histories (and therefore the plot) cover only this run
plt.figure(figsize=(12,4))
plt.plot(J_train_history , label = "J train")
plt.plot(J_cv_history , label = "J cv")
plt.title("Cost v.s iteration")
plt.ylabel("Cost")
plt.xlabel("Number of iterations")
plt.legend()
plt.show()

In [None]:
# Another 1000 iterations from the current parameters (same steps as the previous cell)
nn_p.iters = 1000
nn_p.w,nn_p.b,J_train_history,J_cv_history = nn_p.gradient_descent()
# The histories (and therefore the plot) cover only this run
plt.figure(figsize=(12,4))
plt.plot(J_train_history , label = "J train")
plt.plot(J_cv_history , label = "J cv")
plt.title("Cost v.s iteration")
plt.ylabel("Cost")
plt.xlabel("Number of iterations")
plt.legend()
plt.show()

In [None]:
# Reduce the learning rate by a factor of 10 (1e-9 at construction, 1e-10 here) and train 1000 more iterations
nn_p.alpha = 0.0000000001
nn_p.iters = 1000
nn_p.w,nn_p.b,J_train_history,J_cv_history = nn_p.gradient_descent()
# The histories (and therefore the plot) cover only this run
plt.figure(figsize=(12,4))
plt.plot(J_train_history , label = "J train")
plt.plot(J_cv_history , label = "J cv")
plt.title("Cost v.s iteration")
plt.ylabel("Cost")
plt.xlabel("Number of iterations")
plt.legend()
plt.show()

### Computing accuracy

In [None]:
#Computing accuracy
# For a mean-squared-error model accuracy() returns 1 - 2*cost/var(Y), evaluated on
# all data stored in the model (training and cross-validation part together)
print("Accuracy:",nn_p.accuracy()*100,"%")

### Making a prediction on the test dataset

In [None]:
#Now we load and print our test data
df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/polynomial_test_data.csv") #Loading data from google drive
polynomial_test_data = np.array(df) #Converting loaded data in a matrix
test_id_p = polynomial_test_data[:,0] #Column 0 is used as the id of every test example
test_id_p = test_id_p.reshape(test_id_p.size,1) #Column vector, so it can be placed next to the predictions
X_test_p = polynomial_test_data[:,1:] #Every remaining column is treated as an input feature
# Printing the first 5 rows of our loaded data
print("Input:-\n",X_test_p[:5])
print("Shape of test data:-",X_test_p.shape)
print("Test Id's:-\n",test_id_p[:5])

In [None]:
#Before making a prediction we first have to normalize our data
# Rebuild the degree-5 terms of the training data to obtain their mean / std-dev,
# then expand the test data the same way and scale it with the TRAINING statistics
X_train_pm = get_terms(X_train_p,5)
X_train_pmn,mean,std_dev = z_score_normalization(X_train_pm)
X_test_pm = get_terms(X_test_p,5)
X_test_pmn = (X_test_pm -mean)/std_dev
# NOTE(review): this replaces the data stored in the model; from here on nn_p.predict(),
# nn_p.compute_cost() and nn_p.accuracy() operate on the test inputs, not the training inputs
nn_p.X = X_test_pmn
Y_test_prediction = nn_p.predict()
#We also append the id's of data with our prediction
prediction = np.append(test_id_p,Y_test_prediction,axis = 1)
print(prediction[:5])

Now finally we export our prediction back into a csv file

In [None]:
# Write the (id, prediction) pairs to a CSV file in the current working directory
dfp=pd.DataFrame(prediction) #Convert matrix to dataframe
dfp.columns = ["ids","prediction"] #Set labels to our dataframe
dfp.to_csv("neural_network_polynomial_test_prediction.csv",index=False)#Convert dataframe to csv file

Also, for ease of access, we store our trained parameters w and b in npz files

In [None]:
#These files can later be loaded using np.load()
# np.savez appends ".npz" to the name; every array passed positionally is stored
# under the key arr_0, arr_1, ... in layer order
np.savez("neural_network_polynomial_w",*nn_p.w)
np.savez("neural_network_polynomial_b",*nn_p.b)

In [None]:
# Read the saved parameters back to check the round trip.
# The archives are opened in a with-statement so both files are closed again afterwards.
with np.load("neural_network_polynomial_w.npz") as w_ans, np.load("neural_network_polynomial_b.npz") as b_ans:
  # Arrays saved positionally by np.savez are stored under the keys arr_0, arr_1, ...
  for i in range(len(w_ans)):
    print(f"Weight {i}:-\n",w_ans[f'arr_{i}'])
    print(f"Bias {i}:-\n",b_ans[f'arr_{i}'])

### Classification neural network

In [None]:
# Shape (examples, features) of the normalized classification inputs
print(X_train_cn.shape)

In [None]:
def gen_w_b(X,units):
  """
  Draw reproducible random initial weights and biases (seed 793), with the
  weights scaled down by a factor of 0.1.

  Arguments
  X(2D numpy array of examples; only its number of columns is used) Shape(m,n)
  units(number of units in each layer, in order) list of int

  Returns
  W_list(one weight matrix per layer) Shapes (n,units[0]), (units[0],units[1]), ...
  Entries are 0.1*(standard normal sample - 0.5).
  b_list(one bias row per layer) Shapes (1,units[0]), (1,units[1]), ...
  Entries are a standard normal sample shifted by -0.5 (not scaled).

  Side effect: re-seeds numpy's global random generator with 793.
  """
  _,n_features = X.shape #the first dimension is the no. of training examples
  np.random.seed(793)
  weights = []
  biases = []
  # Pair the input width of each layer with its own width
  layer_shapes = zip([n_features]+list(units[:-1]),units)
  for n_in,n_out in layer_shapes:
    # Draw order (weights, then bias, layer by layer) determines the generated values
    weights.append(0.1*(np.random.randn(n_in,n_out)-0.5))
    biases.append(np.random.randn(1,n_out)-0.5)
  return weights,biases

In [None]:
#Defining the network
# Layers: 10 -> 10 ReLU units and a 10-unit softmax output; cost = cross entropy,
# alpha=0.1, iters=1000, ratio=0.8, l=0.1 (regularization is switched on here).
# NOTE(review): X_train_cn is already z-score normalized and the constructor normalizes
# its input again - confirm that normalizing twice is intended.
nn_c = NeuralNetwork(X_train_cn,Y_train_c,[10,10,10],["relu","relu","softmax"],"cross_entropy",0.1,1000,0.8,0.1)

In [None]:
#Prediction
# Predicted class index per example with the randomly initialized parameters (no training has run yet)
print(nn_c.predict())

In [None]:
#Computing cost
# Cross-entropy cost of the current parameters on the full data stored in the model
print(nn_c.compute_cost())

In [None]:
#Running Gradient descent
# reset_model re-draws the initial parameters, so re-running this cell restarts training from scratch
nn_c.reset_model()
# gradient_descent already stores w and b on the instance; the histories are also kept as notebook globals
nn_c.w,nn_c.b,J_train_history,J_cv_history = nn_c.gradient_descent()
# Plot the training and cross-validation cost recorded at every iteration of this run
plt.figure(figsize=(12,4))
plt.plot(J_train_history , label = "J train")
plt.plot(J_cv_history , label = "J cv")
plt.title("Cost v.s iteration")
plt.ylabel("Cost")
plt.xlabel("Number of iterations")
plt.legend()
plt.show()

In [None]:
# Continue training from the current parameters; alpha is set to 0.1, the value also given at construction
nn_c.alpha=0.1
nn_c.w,nn_c.b,J_train_history,J_cv_history = nn_c.gradient_descent()
# The histories (and therefore the plot) cover only this run
plt.figure(figsize=(12,4))
plt.plot(J_train_history , label = "J train")
plt.plot(J_cv_history , label = "J cv")
plt.title("Cost v.s iteration")
plt.ylabel("Cost")
plt.xlabel("Number of iterations")
plt.legend()
plt.show()

In [None]:
# Lower the learning rate from 0.1 to 0.01 and continue training from the current parameters
nn_c.alpha=0.01
nn_c.w,nn_c.b,J_train_history,J_cv_history = nn_c.gradient_descent()
# The histories (and therefore the plot) cover only this run
plt.figure(figsize=(12,4))
plt.plot(J_train_history , label = "J train")
plt.plot(J_cv_history , label = "J cv")
plt.title("Cost v.s iteration")
plt.ylabel("Cost")
plt.xlabel("Number of iterations")
plt.legend()
plt.show()

In [None]:
# Another run of 1000 iterations at alpha=0.01, continuing from the current parameters
nn_c.alpha=0.01
nn_c.iters=1000
nn_c.w,nn_c.b,J_train_history,J_cv_history = nn_c.gradient_descent()
# The histories (and therefore the plot) cover only this run
plt.figure(figsize=(12,4))
plt.plot(J_train_history , label = "J train")
plt.plot(J_cv_history , label = "J cv")
plt.title("Cost v.s iteration")
plt.ylabel("Cost")
plt.xlabel("Number of iterations")
plt.legend()
plt.show()

In [None]:
# Another run of 1000 iterations at alpha=0.01 (same steps as the previous cell).
# NOTE(review): the train-and-plot cells are repeated verbatim; a small helper would remove the duplication.
nn_c.alpha=0.01
nn_c.iters = 1000
nn_c.w,nn_c.b,J_train_history,J_cv_history = nn_c.gradient_descent()
plt.figure(figsize=(12,4))
plt.plot(J_train_history , label = "J train")
plt.plot(J_cv_history , label = "J cv")
plt.title("Cost v.s iteration")
plt.ylabel("Cost")
plt.xlabel("Number of iterations")
plt.legend()
plt.show()

### Visualizing the performance of the classification neural network

In [None]:
# Predict once for all stored examples; the result does not change between loop
# iterations, so there is no need to re-run the whole forward pass for every image
Y_nn = nn_c.predict()
# Show the first 9 training images (784 raw pixel values reshaped to 28x28)
# together with their true label and the network's prediction
for i in range(9):
  data = X_train_c[i].reshape(28,28)
  plt.imshow(data,cmap="Greys_r")
  plt.title(f"label:{Y_train_c[i]} Neural Network:{Y_nn[i]}")
  plt.show()

### Computing accuracy

In [None]:
#Computing accuracy
# For a cross-entropy model accuracy() is the fraction of stored examples whose predicted
# class equals the label (training and cross-validation part together)
print("Accuracy:",nn_c.accuracy()*100,"%")

### Making a prediction on the test dataset

In [None]:
#Now we load and print our test data
df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/classification_test.csv") #Loading data from google drive
classification_test_data = np.array(df) #Converting loaded data in a matrix
test_id_c = classification_test_data[:,0] #Column 0 is used as the id of every test example
test_id_c = test_id_c.reshape(test_id_c.size,1) #Column vector, so it can be placed next to the predictions
X_test_c = classification_test_data[:,1:] #Every remaining column is treated as an input feature
# Printing the first 5 rows of our loaded data
print("Input:-\n",X_test_c[:5])
print("Shape of test data:-",X_test_c.shape)
print("Test Id's:-\n",test_id_c[:5])

In [None]:
#Before making a prediction we first have to normalize our data
# The test data is scaled with the mean / std-dev of the TRAINING data
X_train_cn,mean,std_dev = z_score_normalization(X_train_c)
X_test_normalized = (X_test_c - mean)/std_dev
# NOTE(review): this replaces the data stored in the model; from here on nn_c.predict(),
# nn_c.compute_cost() and nn_c.accuracy() operate on the test inputs, not the training inputs
nn_c.X = X_test_normalized
Y_test_prediction = nn_c.predict()
#We also append the id's of data with our prediction
prediction = np.append(test_id_c,Y_test_prediction,axis = 1)
print(prediction[:5])

In [None]:
# Write the (id, predicted class) pairs to a CSV file in the current working directory
dfp=pd.DataFrame(prediction) #Convert matrix to dataframe
dfp.columns = ["ids","prediction"] #Set labels to our dataframe
dfp.to_csv("neural_network_classification_test_prediction.csv",index=False)#Convert dataframe to csv file

In [None]:
#These files can later be loaded using np.load()
# np.savez appends ".npz" to the name; every array passed positionally is stored
# under the key arr_0, arr_1, ... in layer order
np.savez("neural_network_classification_w",*nn_c.w)
np.savez("neural_network_classification_b",*nn_c.b)

In [None]:
# Read the saved parameters back to check the round trip.
# The archives are opened in a with-statement so both files are closed again afterwards.
with np.load("neural_network_classification_w.npz") as w_ans, np.load("neural_network_classification_b.npz") as b_ans:
  # Arrays saved positionally by np.savez are stored under the keys arr_0, arr_1, ...
  for i in range(len(w_ans)):
    print(f"Weight {i}:-\n",w_ans[f'arr_{i}'])
    print(f"Bias {i}:-\n",b_ans[f'arr_{i}'])