# Linear Regression Model

#### Importing libraries and datasets

In [1]:
#importing required packages
import pandas as pd #Responsible for dataset processing
import numpy as np  #For arrays
import matplotlib.pyplot as plt #For plotting graph
from sklearn.linear_model import LinearRegression as LR  #This contains the linear regression model
from sklearn.metrics import mean_squared_error as mse, r2_score #For checking the accuracy of our model
import sklearn.model_selection as sk  #This is for dividing our model into test and train data
from sklearn import preprocessing      #This package contains our label encoder
from mpl_toolkits.mplot3d import Axes3D


In [2]:
#retrieving dataset
df=pd.read_excel(r"C:\Users\Tushar\Desktop\Question Paper Prediction-ML project\oops-dataset.xlsx")

#### Lets view our dataset
As we can see, we have 5 attributes
Year, exam type, topic, subtopics are categorical whereas marks is numerical

In [3]:
#to view our dataset
df.head()

Unnamed: 0,YEAR,EXAM TYPE,TOPICS,SUBTOPICS,MARKS
0,2014-15,SEMESTER END,Introduction,OOPS Features,6.0
1,2014-15,SEMESTER END,Introduction,Pointers,4.0
2,2014-15,SEMESTER END,Functions,Default Arguments,6.0
3,2014-15,SEMESTER END,Functions,Inline Function,4.0
4,2014-15,SEMESTER END,Constructors,Types of Constructors,6.0


#### label encoding

In [4]:
#Now we gonna label encode our columns
le=preprocessing.LabelEncoder()
#Encode Year and store the mappings
year=le.fit_transform(df.YEAR)
year_map = dict(zip(le.classes_, le.transform(le.classes_)))
#Encode Exam type and store the mapping
types=le.fit_transform(df['EXAM TYPE'])
types_map = dict(zip(le.classes_, le.transform(le.classes_)))
#Encode topics and store the mapping
top=le.fit_transform(df.TOPICS)
top_map = dict(zip(le.classes_, le.transform(le.classes_)))
#Encode subtopics and store the mapping
subtop=le.fit_transform(df.SUBTOPICS)
subtop_map = dict(zip(le.classes_, le.transform(le.classes_)))
marks=le.fit_transform(df.MARKS)
marks_map = dict(zip(le.classes_, le.transform(le.classes_)))
print(list(top_map.keys())[list(top_map.values()).index(2)]) 

Files


#### Fitting dataframes into different models and checking accuracy

In [5]:
#Now first model will predict the marks when we feed the year and subtopic into it
independant=list(zip(year,subtop)) #First models independant features
model1=LR(); #creates the model, we just need to fit test data into it 
xtest,xtrain,ytest,ytrain=sk.train_test_split(independant,marks,test_size=25,train_size=75,random_state=30)
#The above code just split our dataset into test and train data


In [6]:
#Now we are going to fit our data into the first linear regression model
model1.fit(np.array(xtrain).reshape(-1,2),np.array(ytrain).reshape(-1,1))

LinearRegression()

In [7]:
print("Accuracy based on train set")
ypred=model1.predict(xtrain)
accuracy=r2_score(ytrain,ypred)
print(mse(ytrain,ypred))
print(accuracy)

print("\nAccuracy based on entire dataset")
ypred=model1.predict(np.array(independant).reshape(-1,2))
accuracy=r2_score(marks,ypred)
print(mse(marks,ypred))
print(accuracy)

print("\nAccuracy based on test set")
ypred=model1.predict(np.array(xtest).reshape(-1,2))
accuracy=r2_score(ytest,ypred)
print(mse(ytest,ypred))
print(accuracy)

Accuracy based on train set
5.0312231691055835
0.11422127304479157

Accuracy based on entire dataset
3.4627426920097104
0.01082767059427725

Accuracy based on test set
2.156786898136935
0.10465488545975954


In [8]:
ypred=model1.predict(np.array([0,0]).reshape(-1,2))
ypred

array([[5.57185644]])

In [9]:
#Now second model will predict the marks when we feed the year,examtype and subtopic into it
independant=list(zip(year,types,subtop)) #First models independant features
model2=LR(); #creates the model, we just need to fit test data into it 
xtest,xtrain,ytest,ytrain=sk.train_test_split(independant,marks,test_size=25,train_size=75,random_state=30)
#The above code just split our dataset into test and train data

In [10]:
#Now we are going to fit our data into the second linear regression model
model2.fit(np.array(xtrain).reshape(-1,3),np.array(ytrain).reshape(-1,1))

LinearRegression()

In [11]:
print("Accuracy based on train set")
ypred=model2.predict(xtrain)
accuracy=r2_score(ytrain,ypred)
print(mse(ytrain,ypred))
print(accuracy)

print("\nAccuracy based on entire dataset")
ypred=model2.predict(np.array(independant).reshape(-1,3))
accuracy=r2_score(marks,ypred)
print(mse(marks,ypred))
print(accuracy)

print("\nAccuracy based on test set")
ypred=model2.predict(np.array(xtest).reshape(-1,3))
accuracy=r2_score(ytest,ypred)
print(mse(ytest,ypred))
print(accuracy)

Accuracy based on train set
5.006748103472776
0.11853026347310291

Accuracy based on entire dataset
3.471284370845357
0.00838764166276329

Accuracy based on test set
2.1785856387672844
0.09560559276265868


In [12]:
#Now second model will predict the marks when we feed the year, topic into it
independant=list(zip(year,top,subtop)) #First models independant features
model3=LR(); #creates the model, we just need to fit test data into it 
xtest,xtrain,ytest,ytrain=sk.train_test_split(independant,marks,test_size=25,train_size=75,random_state=30)
#The above code just split our dataset into test and train data

In [13]:
#Now we are going to fit our data into the second linear regression model
model3.fit(np.array(xtrain).reshape(-1,3),np.array(ytrain).reshape(-1,1))

LinearRegression()

In [14]:
print("Accuracy based on train set")
ypred=model3.predict(xtrain)
accuracy=r2_score(ytrain,ypred)
print(mse(ytrain,ypred))
print(accuracy)

print("\nAccuracy based on entire dataset")
lpred=model3.predict(np.array(independant).reshape(-1,3))
accuracy=r2_score(marks,lpred)
print(mse(marks,lpred))
print(accuracy)

print("\nAccuracy based on test set")
ypred=model3.predict(np.array(xtest).reshape(-1,3))
accuracy=r2_score(ytest,ypred)
print(mse(ytest,ypred))
print(accuracy)

Accuracy based on train set
3.663222084297735
0.3550665344546241

Accuracy based on entire dataset
3.7470818394213308
-0.07039708151779833

Accuracy based on test set
2.9084833931561542
-0.207396242546374
