In [None]:
#importing the libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

#### Linear Regression with one variable

In [None]:
#read only the predictor (all_mcqs_avg_n20) and the STEP_1 column from the CSV
BSOM_data = pd.read_csv(
    'BSOM_DataSet_for_HW2.csv',
    usecols=['all_mcqs_avg_n20', 'STEP_1'],
)
#count the missing values in each column (displayed as the cell output)
BSOM_data.isna().sum()

In [None]:
#handle missing values: replace missing STEP_1 entries with the column mean
# NOTE(review): STEP_1 appears to be the regression target, and the mean is taken
# over all rows before the train/test split -- confirm that imputing it is intended.
step1_mean = BSOM_data['STEP_1'].mean()
BSOM_data['STEP_1'] = BSOM_data['STEP_1'].fillna(step1_mean)
#re-check the per-column missing-value counts
BSOM_data.isna().sum()

In [None]:
#split the data into train(80%) and test(20%) datasets
# Pick the target by name instead of by position: read_csv keeps the column order
# of the file (the order of the names passed to usecols is ignored), so slicing
# with iloc only works when STEP_1 happens to be the last selected column.
features_X = BSOM_data.drop(columns=['STEP_1']).to_numpy()
y = BSOM_data['STEP_1'].to_numpy()
from sklearn.model_selection import train_test_split
# random_state is fixed so the split is reproducible across runs
Xtrain, Xtest, ytrain, ytest = train_test_split(features_X, y, test_size = 0.2, random_state = 0)

In [None]:
#scatter plot of the training feature against the actual target values
x1 = Xtrain
plt.title("LinearRegression with One variable")
plt.xlabel("mcqs_avg")
plt.ylabel("STEP_1")
plt.scatter(x1, ytrain)
plt.show()

In [None]:
#build the design matrices and row-vector targets for the train and test sets
#number of examples in each split
m_train = Xtrain.shape[0]
m_test = Xtest.shape[0]

#prepend a column of ones (bias term), then transpose so that each column is one example
train_X = np.concatenate((np.ones((m_train, 1)), Xtrain), axis=1).T
test_X = np.concatenate((np.ones((m_test, 1)), Xtest), axis=1).T

#targets as 1 x m row vectors, matching the shape returned by hypothesis()
train_y = ytrain.reshape(1, -1)
test_y = ytest.reshape(1, -1)


In [None]:
# starting point for gradient descent: all parameters set to zero
def initial_parameters(size):
    """Return a (size, 1) column vector of zeros to use as the initial thetas."""
    return np.zeros(shape=(size, 1))

In [None]:
#linear model prediction
def hypothesis(X,thetas):
    """Return the linear predictions thetas^T . X.

    In this notebook thetas is an (n, 1) column and X is (n, m) with one column
    per example, so the result is a (1, m) row of predictions.
    """
    return np.dot(np.transpose(thetas), X)

In [None]:
#squared-error cost of the linear model
def calc_cost(thetas,X,y):
    """Return J = (1/(2m)) * sum((h - y)^2), where h = hypothesis(X, thetas).

    m is the number of columns of X (one column per example).
    """
    residuals = hypothesis(X, thetas) - y
    n_examples = X.shape[1]
    return (1/(2*n_examples))*np.sum(residuals**2)

In [None]:
#batch gradient descent for linear regression
def Gradientdescent(X,y,alpha,tol=0.0,max_iter=None):
    """Fit the linear-regression parameters with batch gradient descent.

    Starts from all-zero parameters and repeatedly applies
    ``thetas = thetas - (alpha/m) * X . (h - y)^T`` until the cost stops changing.

    Parameters
    ----------
    X : ndarray, shape (n, m)
        Design matrix with one column per example (bias row included by the caller).
    y : ndarray, shape (1, m)
        Target values as a row vector.
    alpha : float
        Learning rate.
    tol : float, optional
        Stop when two consecutive costs differ by at most ``tol``. The default
        of 0.0 keeps the original "costs are exactly equal" criterion.
    max_iter : int or None, optional
        Upper bound on the number of parameter updates; ``None`` (default)
        means no bound, as before.

    Returns
    -------
    thetas : ndarray, shape (n, 1)
        Parameters at the point where the loop stopped.
    cost_list : list of float
        Cost recorded at every iteration, including the final one.
    iterations : int
        Number of parameter updates performed.
    ypred : ndarray, shape (1, m)
        Predictions made with the returned ``thetas``.

    Notes
    -----
    If the cost becomes inf or nan (learning rate too large) the loop stops and
    reports divergence. Previously a nan cost made the loop spin forever
    (nan never equals nan) and two consecutive inf costs were reported as
    convergence. Messages are printed rather than raised as warnings because
    this notebook silences warnings globally.
    """
    m=X.shape[1]
    thetas=initial_parameters(X.shape[0])
    cost_list=[]
    iterations=0
    while True:
        ypred=hypothesis(X,thetas)
        cost=calc_cost(thetas,X,y)
        cost_list.append(cost)

        # A non-finite cost can never satisfy a meaningful convergence test: bail out.
        if not np.isfinite(cost):
            print("gradient descent diverged at iteration  ",str(iterations),"when alpha is: ",str(alpha))
            break

        # cost_list[-2] is the cost of the previous iteration.
        if (len(cost_list)>=2) and (abs(cost_list[-2]-cost)<=tol):
            print("convergence is reached at iteration  ",str(iterations),"when alpha is: ",str(alpha))
            print("minimum cost is :",str(cost))
            break

        if (max_iter is not None) and (iterations>=max_iter):
            print("stopped without convergence after iteration  ",str(iterations),"when alpha is: ",str(alpha))
            break

        # Gradient of the cost w.r.t. thetas is (1/m) * X . (h - y)^T.
        thetas=thetas-(alpha/m)*np.matmul(X,(ypred-y).T)
        iterations+=1

    return thetas,cost_list,iterations,ypred

In [None]:
#fit the one-variable model on the training set once per candidate learning rate
alphas_list = [0.1, 0.5, 0.6, 0.7]
for alpha in alphas_list:
    print("learning_rate :", alpha)
    bestcoef, J_list, iter_num, train_pred = Gradientdescent(train_X, train_y, alpha)
    print("best parameters ", bestcoef)

In [None]:
#refit with the chosen learning rate (0.7) and keep the parameters and training predictions
(bestcoef_train,
 J_list_train,
 iter_num_train,
 train_pred_1) = Gradientdescent(train_X, train_y, alpha=0.7)
print("best thetas : ", bestcoef_train)

In [None]:
#plotting the cost (vs) iterations graph, sampled every 500 iterations
# Use the history of the chosen-learning-rate fit (J_list_train / iter_num_train).
# The previous version read J_list / iter_num, i.e. whatever the last pass of the
# learning-rate sweep left behind, which only matched while 0.7 was the last
# entry of alphas_list.
iterations = list(np.arange(0, iter_num_train, 500))
cost_J = [J_list_train[i] for i in iterations]

plt.plot(iterations, cost_J)
plt.xlabel("#Iterations")
plt.ylabel("J (cost)")
plt.title("LinearRegression with One variable")
plt.show()

In [None]:
#training data (scatter) with the fitted line from the chosen learning rate (red)
x1 = Xtrain
plt.title("Prediction on train dataset with One variable")
plt.xlabel("mcqs_avg")
plt.ylabel("STEP_1")
plt.scatter(x1, ytrain)
#train_pred_1 is a 1 x m row; transpose it so it lines up with the m x 1 feature column
plt.plot(x1, train_pred_1.T, color='red')
plt.show()


In [None]:
#apply the parameters learned on the training set to the test features
test_pred = hypothesis(test_X, thetas=bestcoef_train)

In [None]:
#test data (scatter) with the line predicted by the trained parameters (red)
xt = Xtest
plt.title("Prediction on test set with One variable")
plt.xlabel("mcqs_avg")
plt.ylabel("STEP_1")
plt.scatter(xt, ytest)
#test_pred is a 1 x m row; transpose it so it lines up with the m x 1 feature column
plt.plot(xt, test_pred.T, color='red')
plt.show()

In [None]:
#calculate Mean squared error
def meansquarederror(actual,predicted):
    """Return the mean of the squared differences between predicted and actual.

    Parameters
    ----------
    actual, predicted : array-like
        Values to compare; any shape numpy can broadcast together (the notebook
        passes 1 x m row vectors).

    Returns
    -------
    float
        sum((predicted - actual)^2) divided by the number of elements in ``actual``.
    """
    actual=np.asarray(actual)
    predicted=np.asarray(predicted)
    # Divide by the element count rather than shape[1]: with shape[1] a column
    # vector (m, 1) was divided by 1 and a 1-D array raised IndexError.
    m=actual.size
    MSE=(1/m)*np.sum((predicted-actual)**2)
    return MSE

In [None]:
#coefficient of determination
def Rsquared(actual,predicted):
    """Return R^2 = 1 - SSE/SST.

    SSE is the sum of squared differences between actual and predicted;
    SST is the sum of squared deviations of actual from its own mean.
    """
    residual_ss = np.sum((actual - predicted)**2)
    total_ss = np.sum((actual - np.mean(actual))**2)
    return 1-(residual_ss/total_ss)

In [None]:
#Pearson correlation coefficient between actual and predicted values
def Correlation_coef(actual,predicted):
    """Return sum(da*dp) / sqrt(sum(da^2) * sum(dp^2)).

    da and dp are the deviations of actual and predicted from their respective means.
    """
    actual_dev = actual - np.mean(actual)
    predicted_dev = predicted - np.mean(predicted)
    cross_term = np.sum(actual_dev*predicted_dev)
    spread_term = np.sqrt(np.sum(actual_dev**2)*np.sum(predicted_dev**2))
    return cross_term/spread_term
    

In [None]:
#R2 score on the training data (one variable)
rs1 = Rsquared(train_y, train_pred_1)
print("R2 score of training data :", rs1)

In [None]:
#mean squared error on the training data (one variable)
ms1 = meansquarederror(train_y, train_pred_1)
print("MSE of training data : ", ms1)

In [None]:
#Pearson correlation between actual and predicted values on the training data (one variable)
pc1 = Correlation_coef(train_y, train_pred_1)
print("correlation coefficient on training data : ", pc1)

In [None]:
#R2 score on the test data (one variable)
rs2 = Rsquared(test_y, test_pred)
print("R2 score of test data :", rs2)

In [None]:
#mean squared error on the test data (one variable)
ms2 = meansquarederror(test_y, test_pred)
print("MSE of test data : ", ms2)

In [None]:
#Calculate Pearson correlation coefficient of test data with one variable
pc2=Correlation_coef(test_y,test_pred)
# The label previously said "training data" although the metric is computed on the test set.
print("correlation coefficient on test data : ",str(pc2))

#### Linear Regression with two variables

In [None]:
#read the two predictors (all_mcqs_avg_n20, all_NBME_avg_n4) and the STEP_1 column from the CSV
BSOM_data_2 = pd.read_csv(
    'BSOM_DataSet_for_HW2.csv',
    usecols=['all_mcqs_avg_n20', 'all_NBME_avg_n4', 'STEP_1'],
)
#count the missing values in each column (displayed as the cell output)
BSOM_data_2.isna().sum()

In [None]:
#handle missing values: replace missing STEP_1 entries with the column mean
# NOTE(review): STEP_1 appears to be the regression target, and the mean is taken
# over all rows before the train/test split -- confirm that imputing it is intended.
step1_mean_2 = BSOM_data_2['STEP_1'].mean()
BSOM_data_2['STEP_1'] = BSOM_data_2['STEP_1'].fillna(step1_mean_2)
#re-check the per-column missing-value counts
BSOM_data_2.isna().sum()

In [None]:
#split the data into train(80%) and test(20%) datasets
# Pick the target by name instead of by position: read_csv keeps the column order
# of the file (the order of the names passed to usecols is ignored), so slicing
# with iloc only works when STEP_1 happens to be the last selected column.
# Dropping STEP_1 leaves the two feature columns in their original order.
features_X_2 = BSOM_data_2.drop(columns=['STEP_1']).to_numpy()
y_2 = BSOM_data_2['STEP_1'].to_numpy()
from sklearn.model_selection import train_test_split
# random_state is fixed so the split is reproducible across runs
Xtrain_2, Xtest_2, ytrain_2, ytest_2 = train_test_split(features_X_2, y_2, test_size = 0.2, random_state = 0)

In [None]:
#build the design matrices and row-vector targets for the two-variable train and test sets
#number of examples in each split
m_train_2 = Xtrain_2.shape[0]
m_test_2 = Xtest_2.shape[0]

#prepend a column of ones (bias term), then transpose so that each column is one example
train_X_2 = np.concatenate((np.ones((m_train_2, 1)), Xtrain_2), axis=1).T
test_X_2 = np.concatenate((np.ones((m_test_2, 1)), Xtest_2), axis=1).T

#targets as 1 x m row vectors, matching the shape returned by hypothesis()
train_y_2 = ytrain_2.reshape(1, -1)
test_y_2 = ytest_2.reshape(1, -1)


In [None]:
#fit the two-variable model on the training set once per candidate learning rate
alphas_list = [0.1, 0.5, 0.6, 0.7]
for alpha in alphas_list:
    print("learning_rate :", alpha)
    coef, costs_J, num_iter, train_pred_2v = Gradientdescent(train_X_2, train_y_2, alpha)
    print("best parameters ", coef)

In [None]:
#refit the two-variable model with the chosen learning rate (0.7)
(bestcoef_train2,
 J_list_train2,
 iter_num_train2,
 train_pred_2) = Gradientdescent(train_X_2, train_y_2, alpha=0.7)

In [None]:
#plotting the cost (vs) iterations graph, sampled every 10000 iterations
# Use the history of the chosen-learning-rate fit (J_list_train2 / iter_num_train2).
# The previous version read costs_J / num_iter, i.e. whatever the last pass of the
# learning-rate sweep left behind, which only matched while 0.7 was the last
# entry of alphas_list.
iterations = list(np.arange(0, iter_num_train2, 10000))
cost_J = [J_list_train2[i] for i in iterations]

plt.plot(iterations, cost_J)
plt.xlabel("#Iterations")
plt.ylabel("J (cost)")
plt.title("LinearRegression with 2 variables")
plt.show()

In [None]:
#apply the two-variable parameters learned on the training set to the test features
test_pred_2 = hypothesis(test_X_2, thetas=bestcoef_train2)

In [None]:
#R2 score on the training data (two variables)
rs2v = Rsquared(train_y_2, train_pred_2)
print("R2 score of training data :", rs2v)

In [None]:
#mean squared error on the training data (two variables)
ms2v = meansquarederror(train_y_2, train_pred_2)
print("MSE of training data : ", ms2v)

In [None]:
#Pearson correlation between actual and predicted values on the training data (two variables)
pc2v = Correlation_coef(train_y_2, train_pred_2)
print("correlation coefficient on training data : ", pc2v)

In [None]:
#R2 score on the test data (two variables)
rstest2 = Rsquared(test_y_2, test_pred_2)
print("R2 score of test data :", rstest2)

In [None]:
#mean squared error on the test data (two variables)
ms2test = meansquarederror(test_y_2, test_pred_2)
print("MSE of test data : ", ms2test)

In [None]:
#Calculate Pearson correlation coefficient of test data with 2 variables
pc2v_test=Correlation_coef(test_y_2,test_pred_2)
# The label previously said "training data" although the metric is computed on the test set.
print("correlation coefficient on test data : ",str(pc2v_test))