In [15]:
#importing required packages
import pandas as pd #Responsible for dataset processing
import numpy as np  #For arrays
import matplotlib.pyplot as plt #For plotting graph
from sklearn.linear_model import LinearRegression as LR  #This contains the linear regression model
from sklearn.metrics import mean_squared_error as mse, r2_score #For checking the accuracy of our model
import sklearn.model_selection as sk  #This is for dividing our model into test and train data
from sklearn import preprocessing      #This package contains our label encoder
from mpl_toolkits.mplot3d import Axes3D

In [16]:
#retrieving dataset
df=pd.read_excel(r"D:\MAIL\datastructure.xlsx")

In [17]:
#to view our dataset
df.head()

Unnamed: 0,YEAR,EXAM TYPE,TOPICS,SUBTOPICS,MARKS
0,2014-15,SEMESTER END,bst,construction,10
1,2014-15,SEMESTER END,bst,traversal,4
2,2014-15,SEMESTER END,threaded binary tree,theory,6
3,2014-15,SEMESTER END,deque,input restricted,8
4,2014-15,SEMESTER END,infix to postfix,algorithm,8


In [18]:
#Now we gonna label encode our columns
le=preprocessing.LabelEncoder()
#Encode Year and store the mappings
year=le.fit_transform(df.YEAR)
year_map = dict(zip(le.classes_, le.transform(le.classes_)))
#Encode Exam type and store the mapping
types=le.fit_transform(df['EXAM TYPE'])
types_map = dict(zip(le.classes_, le.transform(le.classes_)))
#Encode topics and store the mapping
top=le.fit_transform(df.TOPICS)
top_map = dict(zip(le.classes_, le.transform(le.classes_)))
#Encode subtopics and store the mapping
subtop=le.fit_transform(df.SUBTOPICS)
subtop_map = dict(zip(le.classes_, le.transform(le.classes_)))
year_map


{'2014-15': 0, '2015-16': 1, '2016-17': 2, '2017-18': 3, '2018-19': 4}

In [19]:
#Now first model will predict the marks when we feed the year and subtopic into it
independant=list(zip(year,subtop)) #First models independant features
model1=LR(); #creates the model, we just need to fit test data into it 
xtest,xtrain,ytest,ytrain=sk.train_test_split(independant,df["MARKS"],test_size=25,train_size=75,random_state=30)
#The above code just split our dataset into test and train data


In [20]:
#Now we are going to fit our data into the first linear regression model
model1.fit(np.array(xtrain).reshape(-1,2),np.array(ytrain).reshape(-1,1))

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [21]:
print("Accuracy based on train set")
ypred=model1.predict(xtrain)
accuracy=r2_score(ytrain,ypred)
print(mse(ytrain,ypred))
print(accuracy)

print("\nAccuracy based on entire dataset")
ypred=model1.predict(np.array(independant).reshape(-1,2))
accuracy=r2_score(df.MARKS,ypred)
print(mse(df['MARKS'],ypred))
print(accuracy)

print("\nAccuracy based on test set")
ypred=model1.predict(np.array(xtest).reshape(-1,2))
accuracy=r2_score(ytest,ypred)
print(mse(ytest,ypred))
print(accuracy)

Accuracy based on train set
4.578060593343497
0.21737202657557853

Accuracy based on entire dataset
8.567250312393032
-0.2250733270878198

Accuracy based on test set
11.96589032462786
-0.1973553398803094


In [22]:
ypred=model1.predict(np.array([0,0]).reshape(-1,2))
ypred

array([[4.68239759]])

In [23]:
#Now second model will predict the marks when we feed the year,examtype and subtopic into it
independant=list(zip(year,types,subtop)) #First models independant features
model2=LR(); #creates the model, we just need to fit test data into it 
xtest,xtrain,ytest,ytrain=sk.train_test_split(independant,df["MARKS"],test_size=25,train_size=75,random_state=30)
#The above code just split our dataset into test and train data

In [24]:
#Now we are going to fit our data into the second linear regression model
model2.fit(np.array(xtrain).reshape(-1,3),np.array(ytrain).reshape(-1,1))

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [25]:
print("Accuracy based on train set")
ypred=model2.predict(xtrain)
accuracy=r2_score(ytrain,ypred)
print(mse(ytrain,ypred))
print(accuracy)

print("\nAccuracy based on entire dataset")
ypred=model2.predict(np.array(independant).reshape(-1,3))
accuracy=r2_score(df.MARKS,ypred)
print(mse(df['MARKS'],ypred))
print(accuracy)

print("\nAccuracy based on test set")
ypred=model2.predict(np.array(xtest).reshape(-1,3))
accuracy=r2_score(ytest,ypred)
print(mse(ytest,ypred))
print(accuracy)

Accuracy based on train set
4.488712317181796
0.2326462805693048

Accuracy based on entire dataset
8.575430636692086
-0.22624307195814342

Accuracy based on test set
12.168015371652633
-0.21758078887013999


In [26]:
#Now second model will predict the marks when we feed the year, topic into it
independant=list(zip(year,top,subtop)) #First models independant features
model3=LR(); #creates the model, we just need to fit test data into it 
xtest,xtrain,ytest,ytrain=sk.train_test_split(independant,df["MARKS"],test_size=25,train_size=75,random_state=30)
#The above code just split our dataset into test and train data

In [27]:
#Now we are going to fit our data into the second linear regression model
model3.fit(np.array(xtrain).reshape(-1,3),np.array(ytrain).reshape(-1,1))

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [28]:
print("Accuracy based on train set")
ypred=model3.predict(xtrain)
accuracy=r2_score(ytrain,ypred)
print(mse(ytrain,ypred))
print(accuracy)

print("\nAccuracy based on entire dataset")
lpred=model3.predict(np.array(independant).reshape(-1,3))
accuracy=r2_score(df.MARKS,lpred)
print(mse(df['MARKS'],lpred))
print(accuracy)

print("\nAccuracy based on test set")
ypred=model3.predict(np.array(xtest).reshape(-1,3))
accuracy=r2_score(ytest,ypred)
print(mse(ytest,ypred))
print(accuracy)

Accuracy based on train set
4.153220826436043
0.28999917491178173

Accuracy based on entire dataset
9.458320150006141
-0.35249179285310683

Accuracy based on test set
12.23450660136269
-0.2242341700050723
