In [None]:
#Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.metrics import confusion_matrix,precision_score,recall_score,f1_score
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [None]:
#load the dataset with 2 variables
# Only the two score columns and the class label are read from the CSV.
BSOM_data = pd.read_csv(
    'BSOM_DataSet_for_HW2.csv',
    usecols=['all_mcqs_avg_n20', 'all_NBME_avg_n4', 'LEVEL'],
)
#checking for missing values
# Per-column count of missing entries (shown as the cell output).
BSOM_data.isna().sum()

In [None]:
#removing the rows with missing values
# dropna() works row-wise by default: a row is removed if any of its columns is missing.
BSOM_data = BSOM_data.dropna()
# Re-run the per-column missing count to confirm nothing is left.
BSOM_data.isna().sum()

In [None]:
#plot the BSOM data with different actual classes
def plot_data(BSOM_data):
    """Scatter the two score columns, one marker style per actual LEVEL class.

    Rows are grouped by the value of the ``LEVEL`` column ('A' to 'D').
    ``all_NBME_avg_n4`` is drawn on the x-axis and ``all_mcqs_avg_n20`` on the
    y-axis. The figure is shown immediately; nothing is returned.
    """
    # (class label, colour, opacity, marker) in drawing / legend order
    class_styles=(
        ('A','g',0.5,'o'),
        ('B','r',0.8,'x'),
        ('C','b',0.5,'v'),
        ('D','y',0.8,'s'),
    )
    groups=[BSOM_data[BSOM_data['LEVEL']==level] for level,_,_,_ in class_styles]

    plt.figure(figsize=(10,10))
    for rows,(level,colour,opacity,shape) in zip(groups,class_styles):
        plt.scatter(rows['all_NBME_avg_n4'],rows['all_mcqs_avg_n20'],
                    c=colour,alpha=opacity,marker=shape,label=level,s=60)
    plt.xlabel("NBME_avg")
    plt.ylabel("MCQs_avg")
    plt.title("LogisticRegression with two variables")
    plt.legend(loc='upper left')
    plt.show()

In [None]:
# Show the data set (after the missing-value rows were dropped) coloured by its actual LEVEL class.
print("Plotting the BSOM data with actual labels...")
plot_data(BSOM_data)

In [None]:
#initialise the parameters with zeros
def initial_parameters(size):
    """Return a ``(size, 1)`` column vector of zeros to start the optimisation from."""
    return np.zeros(shape=(size,1))

In [None]:
#predicting the probabilities
def hypothesis(X,thetas):
    """Evaluate the logistic (sigmoid) model ``1 / (1 + exp(-thetas.T @ X))``.

    Parameters
    ----------
    X : ndarray
        Feature matrix with one sample per column (the notebook passes an
        array of shape ``(n_features, n_samples)`` that includes the bias row).
    thetas : ndarray
        Parameter column vector (``(n_features, 1)`` as created by
        ``initial_parameters``).

    Returns
    -------
    ndarray
        Predicted probabilities with the shape of ``thetas.T @ X``. Every
        entry is clipped to ``[1e-5, 1 - 1e-5]`` so that neither ``log(h)``
        nor ``log(1 - h)`` can hit ``log(0)`` in the cost function.
    """
    eps=0.00001
    z=np.dot(np.transpose(thetas),X)
    # exp(-z) overflows to inf for very negative z; 1 / (1 + inf) == 0 is the
    # correct limit, so only the overflow warning is silenced here.
    with np.errstate(over='ignore'):
        h=1 / (1 + np.exp(-z))
    # Element-wise clamp. The previous check only fired when *every* entry was
    # exactly zero and never handled entries equal to one.
    return np.clip(h,eps,1-eps)

In [None]:
#calculating the cost function
def Calc_cost(thetas,X,y):
    """Return the mean binary cross-entropy of the logistic model on ``(X, y)``.

    Parameters
    ----------
    thetas : ndarray
        Model parameters, forwarded to ``hypothesis``.
    X : ndarray
        Feature matrix with one sample per column; ``X.shape[1]`` is used as
        the number of samples ``m``.
    y : ndarray
        0/1 targets, broadcast against the predicted probabilities.

    Returns
    -------
    float
        ``-(1/m) * sum(y*log(h) + (1-y)*log(1-h))``.
    """
    m=X.shape[1]
    h=hypothesis(X,thetas).astype(float)
    # A saturated prediction (exactly 0 or 1) would give log(0) = -inf and
    # 0 * -inf = NaN; keep the probabilities strictly inside (0, 1).
    tiny=1e-15
    h=np.clip(h,tiny,1-tiny)
    J=(-1/(m))*np.sum(y*np.log(h)+(1-y)*np.log(1-h))
    return J

In [None]:
#calculating the gradient descent and updating the parameters
def Gradientdescent(X,y,alpha,tol=0.00001,max_iter=None):
    """Fit one logistic-regression classifier by batch gradient descent.

    Starting from all-zero parameters, the model and its cost are evaluated
    and a step of size ``alpha`` is taken against the gradient, until the cost
    drops by less than ``tol`` between two consecutive evaluations.

    Note that the stopping test is one-sided (``previous - current < tol``):
    it also fires when the cost *rises*, e.g. after an overshooting step, and
    that case is reported as convergence as well.

    Parameters
    ----------
    X : ndarray
        Feature matrix with one sample per column; ``X.shape[0]`` parameters
        are fitted and ``X.shape[1]`` is used as the sample count.
    y : ndarray
        0/1 targets for this classifier (the notebook passes one column of the
        one-hot label matrix).
    alpha : float
        Learning rate.
    tol : float, default 0.00001
        Minimum cost decrease required to keep iterating.
    max_iter : int or None, default None
        Optional cap on the number of parameter updates. ``None`` keeps the
        loop bounded only by the ``tol`` test.

    Returns
    -------
    tuple
        ``(thetas, cost_list, count, final_h)``: the fitted parameters, the
        cost at every evaluation (``count + 1`` entries), the number of
        updates performed, and the predictions for the returned parameters.

    Raises
    ------
    FloatingPointError
        If the cost evaluates to NaN. Every comparison with NaN is False, so
        the stopping test could never fire and the loop would not terminate.
    """
    m=X.shape[1]
    thetas=initial_parameters(X.shape[0])
    cost_list=[]
    count=0
    while True:
        ypred=hypothesis(X,thetas)
        cost=Calc_cost(thetas,X,y)
        if np.isnan(cost):
            raise FloatingPointError(
                "cost became NaN at iteration "+str(count)+
                "; try a smaller learning rate or rescale the features")
        cost_list.append(cost)

        # Compare the two most recent costs (needs at least two evaluations).
        if (len(cost_list)>=2) and ((cost_list[-2]-cost_list[-1])<tol):
            print("convergence is reached at iteration",str(count),"\n")
            break
        if (max_iter is not None) and (count>=max_iter):
            print("stopped without convergence after",str(count),"iterations\n")
            break
        # Batch gradient of the cross-entropy: (1/m) * X @ (h - y)^T
        thetas=thetas-(alpha/m)*np.matmul(X,(ypred-y).T)
        count+=1
    return thetas,cost_list,count,ypred

In [None]:
#predicting the labels based on maximum probability among the 4 classifiers
def final_prediction(h1,h2,h3,h4):
    """Pick, per sample, the classifier with the highest predicted probability.

    Each argument is a 2-D container whose first row holds one classifier's
    probabilities. Returns ``(final_h, max_index)``: the winning probability
    and the 0-based position of the winning classifier for every sample.
    Ties go to the earliest classifier.
    """
    rows=[list(h)[0] for h in (h1,h2,h3,h4)]
    final_h=[]
    max_index=[]
    for position in range(len(rows[0])):
        scores=[row[position] for row in rows]
        best=max(scores)
        max_index.append(scores.index(best))
        final_h.append(best)
    return final_h,max_index

In [None]:
#split the data into train(70%) and test(30%) datasets
from sklearn.model_selection import train_test_split
# Select features and target by name instead of by position: read_csv keeps the
# CSV's own column order regardless of the order given in usecols, so positional
# slicing silently depends on the file layout.
# The feature order matches plot_predicteddata, which draws column 0 on the
# "NBME_avg" axis and column 1 on the "MCQs_avg" axis.
feature_columns = ['all_NBME_avg_n4', 'all_mcqs_avg_n20']
features_X = BSOM_data[feature_columns].to_numpy()
y = BSOM_data['LEVEL'].to_numpy()
# random_state is fixed so the split is reproducible between runs.
Xtrain, Xtest, ytrain, ytest = train_test_split(features_X, y, test_size = 0.3,random_state=0)

In [None]:
#adding bias term to the feature vector and create 4 classifier labels(for each of the 4 classes) in the train and test data sets
# Fixed class order: column k of the one-hot matrices always belongs to
# class_order[k], even if a class happens to be absent from one of the splits.
class_order = ['A', 'B', 'C', 'D']

# Prepend a column of ones (bias) and transpose so that samples are columns.
train_X = Xtrain
m_train=train_X.shape[0]
train_X=np.append(np.ones((m_train,1)),train_X,axis=1).T
# dtype=float keeps the targets numeric on every pandas version (get_dummies
# returns uint8 or bool depending on the release).
train_y=pd.get_dummies(pd.Categorical(ytrain,categories=class_order),dtype=float).to_numpy()

test_X = Xtest
m_test=test_X.shape[0]
test_X=np.append(np.ones((m_test,1)),test_X,axis=1).T
test_y=pd.get_dummies(pd.Categorical(ytest,categories=class_order),dtype=float).to_numpy()


In [None]:
#encode the class labels in the train and test data
# One mapping shared by both splits; an unexpected label raises KeyError here
# instead of slipping through as a string next to the integer codes.
label_codes = {'A': 0, 'B': 1, 'C': 2, 'D': 3}
actual_train = np.array([label_codes[label] for label in ytrain], dtype=int)
actual_test = np.array([label_codes[label] for label in ytest], dtype=int)

In [None]:
#training the model using the train dataset for different learning rates

alphas_list=[0.1,0.5,0.6,0.7]

# One banner per one-vs-rest classifier; banner k goes with column k of train_y.
classifier_banners=(
    "classifier 1(class A vs not class A)",
    "classifier 2(class B vs not class B)",
    "classifier 3(class C vs not class C)",
    "classifier 4(class D vs not class D)",
)

for i in alphas_list:
    print("learning_rate :\n",str(i))

    fits=[]
    for column,banner in enumerate(classifier_banners):
        print(banner)
        fits.append(Gradientdescent(train_X,train_y[:,column],i))
    # Keep each classifier's results available under its own name.
    (
        (coef1,costs_J1,num_iter1,train_pred1),
        (coef2,costs_J2,num_iter2,train_pred2),
        (coef3,costs_J3,num_iter3,train_pred3),
        (coef4,costs_J4,num_iter4,train_pred4),
    )=fits

    # Combine the four classifiers: the most confident one decides the label.
    fh,labels=final_prediction(train_pred1,train_pred2,train_pred3,train_pred4)
    final_labels=np.array(labels)
    truth=list(actual_train)
    guesses=list(final_labels)

    print("Confusion Matrix \n")
    cf=confusion_matrix(truth,guesses)
    print(cf)
    pr=precision_score(truth,guesses,average='macro')
    rc=recall_score(truth,guesses,average='macro')
    f1=f1_score(truth,guesses,average='macro')
    print("Precision : ",str(pr))
    print("Recall : ",str(rc))
    print("F1 score : ",str(f1))
    

In [None]:
def plot_costfunction(iter_num,J_list,classname):
    """Plot the first ``iter_num`` entries of ``J_list`` against their iteration index.

    ``classname`` is only used in the figure title
    ("class <name> vs not class <name>").
    """
    steps=list(np.arange(0,iter_num,1))
    cost_J=[J_list[step] for step in steps]
    label=str(classname)

    plt.plot(steps,cost_J)
    plt.xlabel("#Iterations")
    plt.ylabel("J (cost)")
    plt.title("Logistic Regression class "+label+" vs not class "+label)
    plt.show()

In [None]:
def plot_confusion_matrix(cf_matrix):
    """Draw ``cf_matrix`` as an annotated heat map with A-D tick labels.

    Parameters
    ----------
    cf_matrix : array-like
        Confusion matrix to display; rows are labelled "actual labels" and
        columns "Predicted labels".
    """
    class_names=['A','B','C','D']
    # Plot the matrix that was passed in. The previous body read the
    # notebook-level variable `cf` and ignored its argument.
    sns.heatmap(cf_matrix,xticklabels=class_names,yticklabels=class_names,annot=True,linecolor='white',linewidths=0.5,cmap='coolwarm')
    plt.xlabel("Predicted labels")
    plt.ylabel("actual labels")
    plt.show()

In [None]:
##best parameters with best alpha on training data set
best_alpha=0.6

# Refit the four one-vs-rest classifiers with the chosen learning rate;
# banner k goes with column k of train_y.
best_fits=[]
for column,banner in enumerate((
    "classifier 1(class A vs not class A)\n",
    "classifier 2(class B vs not class B)\n",
    "classifier 3(class C vs not class C)\n",
    "classifier 4(class D vs not class D)\n",
)):
    print(banner)
    best_fits.append(Gradientdescent(train_X,train_y[:,column],best_alpha))
(
    (best_coef1,costs_J1,num_iter1,best_train_pred1),
    (best_coef2,costs_J2,num_iter2,best_train_pred2),
    (best_coef3,costs_J3,num_iter3,best_train_pred3),
    (best_coef4,costs_J4,num_iter4,best_train_pred4),
)=best_fits

best_pred,best_labels=final_prediction(best_train_pred1,best_train_pred2,best_train_pred3,best_train_pred4)
final_labels_train=np.array(best_labels)
truth_train=list(actual_train)
guess_train=list(final_labels_train)

print("Confusion Matrix \n")
cf=confusion_matrix(truth_train,guess_train)
print(cf)

pr=precision_score(truth_train,guess_train,average='macro')
rc=recall_score(truth_train,guess_train,average='macro')
f1=f1_score(truth_train,guess_train,average='macro')
print("Precision : ",str(pr))
print("Recall : ",str(rc))
print("F1 score : ",str(f1))

#plot the cost function for all the classifiers
for iterations_run,cost_history,class_letter in (
    (num_iter1,costs_J1,'A'),
    (num_iter2,costs_J2,'B'),
    (num_iter3,costs_J3,'C'),
    (num_iter4,costs_J4,'D'),
):
    plot_costfunction(iterations_run,cost_history,class_letter)
print("confusion matrix of training data")
plot_confusion_matrix(cf)


In [None]:
#predicting on test data with best alpha and best parameters

# Score the held-out samples with each fitted one-vs-rest classifier.
test_pred1,test_pred2,test_pred3,test_pred4=[
    hypothesis(test_X,coefficients)
    for coefficients in (best_coef1,best_coef2,best_coef3,best_coef4)
]
pred_test,labels_test=final_prediction(test_pred1,test_pred2,test_pred3,test_pred4)
final_labels_test=np.array(labels_test)
truth_test=list(actual_test)
guess_test=list(final_labels_test)

print("Confusion Matrix of test data\n")
cf=confusion_matrix(truth_test,guess_test)
print(cf)
plot_confusion_matrix(cf)

pr=precision_score(truth_test,guess_test,average='macro')
rc=recall_score(truth_test,guess_test,average='macro')
f1=f1_score(truth_test,guess_test,average='macro')
print("Precision : ",str(pr))
print("Recall : ",str(rc))
print("F1 score : ",str(f1))

In [None]:
#plot the BSOM data with different classes
def plot_predicteddata(predict_data,name):
    """Scatter columns ``x1``/``x2`` of ``predict_data``, styled by predicted class.

    ``predict_data['predicted_labels']`` holds the encoded class (0-3), shown
    in the legend as A-D. ``name`` is appended to the figure title. The figure
    is shown immediately; nothing is returned.
    """
    # (encoded label, legend text, colour, opacity, marker) in drawing order
    label_styles=(
        (0,"A",'g',0.5,'o'),
        (1,"B",'r',0.8,'x'),
        (2,"C",'b',0.5,'v'),
        (3,"D",'y',0.8,'s'),
    )
    subsets=[predict_data[predict_data['predicted_labels']==code] for code,*_ in label_styles]

    plt.figure(figsize=(8,8))
    for points,(_,legend_text,colour,opacity,shape) in zip(subsets,label_styles):
        plt.scatter(points['x1'],points['x2'],
                    c=colour,alpha=opacity,marker=shape,label=legend_text,s=60)
    plt.xlabel("NBME_avg")
    plt.ylabel("MCQs_avg")
    plt.title("LogisticRegression prediction on "+str(name))
    plt.legend(loc='upper left')
    plt.show()

In [None]:
#plot the predicted classes for train data and test data sets
# Pair each sample's two feature values with the label predicted for it.
d_train = {'x1':list(Xtrain[:,0]), 'x2':list(Xtrain[:,1]),'predicted_labels':final_labels_train }

d_test = {'x1':list(Xtest[:,0]), 'x2':list(Xtest[:,1]),'predicted_labels':final_labels_test }
traindata_predict=pd.DataFrame(data=d_train)
testdata_predict=pd.DataFrame(data=d_test)
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the two frames
# the same way (each frame keeps its own row index, as append did by default).
predicted_data=pd.concat([traindata_predict,testdata_predict])

plot_predicteddata(traindata_predict,'training data')

plot_predicteddata(testdata_predict,'test data')

plot_predicteddata(predicted_data,'complete data')
