<h1><i><u>Student Perception Analysis using Multiple Linear Regression</u></i></h1>

## Importing libraries and understanding the data

In [None]:
import numpy as np
import pandas as pd
from pandas.plotting import table
import matplotlib.pyplot as plt
import matplotlib.colors as pltcol
import matplotlib.ticker as ticker
import seaborn as sns
%matplotlib inline
import math
import statsmodels.api as sm
from statsmodels.formula.api import ols
import statsmodels.tools.eval_measures as ev
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from patsy import dmatrices
from statsmodels.stats.outliers_influence import variance_inflation_factor
# importing r2_score module
from sklearn.metrics import mean_absolute_error, make_scorer
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
# predicting the accuracy score
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler,MinMaxScaler

In [None]:
# Load StudentData.csv into a DataFrame; the bare `len(data)` displays its row count.
data = pd.read_csv('StudentData.csv')
len(data)

In [None]:
# Print the column names, non-null counts and dtypes of the dataset.
data.info()
# The data has already been processed with feature-engineering techniques guided by bivariate analysis

In [None]:
# Summary statistics of the numeric columns.
data.describe()
# Comparing the median with the max and min suggests there may be outliers in Age, tdu, doc and Marks

#### ocd = Online class duration (H0)
#### eocd = expected online class duration
#### tdu = Total data usage
#### ss = self study
#### doc = Data online classes (H0)
#### ac = Academic Outcome (H0)
#### ic = Internet speed (H0)
<!--     (5 point likert scale data) to measure satisfaction-->
#### buc = better in understanding the concept (H0)
<!--     (ordinal scale) to measure degree of occurrence-->
#### poc = Participation in online classes (H0)
#### ata = availability of teacher's assistance (H0)
#### smu = social media usage (H0)
#### bc = bored in class (H0) 
#### ce = cheat in exams (H0)

    after testing different models:
    buc variable has no impact on response variable
    doc has many outliers and also not impacting the variable

In [None]:
# For every column, print its name followed by the frequency of each distinct
# value, with a separator line between columns.
for i in data.columns:
    print(i)
    print(data[i].value_counts())
    print('------------------------------')

## Exploratory Data Analysis

In [None]:
# Heatmap of Spearman's rank correlation between the numeric columns.
# The frame also holds string-valued columns (e.g. Gender, Education, ic, ac).
# pandas >= 2.0 no longer drops those silently in DataFrame.corr() and raises
# instead, so the numeric/bool columns are selected explicitly first. This
# gives the same matrix older pandas versions produced.
plt.figure(figsize=(12, 10))
sns.heatmap(
    data=data.select_dtypes(include=['number', 'bool']).corr(method='spearman'),
    annot=True,
    vmin=-1,
    cmap='winter',
)
# ss, ocd, eocd and doc show considerable correlation.
# These variables are selected and then validated with exploratory data analysis;
# since ocd and eocd are significantly correlated with each other, the collinearity must be removed.

### EDA / Univariate
Detect outliers or anomalies in the data so that they can be treated accordingly, cross-checking them against the bivariate data analysis.

In [None]:
# Line plot of Age against the row index, to spot unusual values.
plt.figure(figsize=(10, 10))
data['Age'].plot()
# The Age predictor gives a consistent line graph with possible outliers at ages 27-30
# (because less data is available from PhD students)

In [None]:
# Line plot of ss (self study) against the row index.
plt.figure(figsize=(10, 10))
data.ss.plot()
# consistent graph with no apparent outliers
# possibly a right-skewed distribution

In [None]:
# Histogram of ss (self study) with a bin width of 0.9.
plt.figure(figsize=(10, 10))
sns.histplot(data=data,binwidth=0.9, x='ss')

In [None]:
# Line plot of ocd (online class duration) against the row index.
plt.figure(figsize=(10, 10))
data.ocd.plot()
# consistent graph; the values at 1 are possible outliers, or may be consistent with the rest

In [None]:
# Histogram of ocd (online class duration) with a bin width of 1.
plt.figure(figsize=(10, 10))
sns.histplot(data=data, x= 'ocd', binwidth=1)
# left skewed

In [None]:
# Line plot of eocd (expected online class duration) against the row index.
plt.figure(figsize=(10, 10))
data.eocd.plot()
# consistent graph, possible outliers at 0

In [None]:
# Histogram of eocd (expected online class duration) with a KDE curve overlaid.
plt.figure(figsize=(10, 10))
sns.histplot(data=data, x='eocd', binwidth=0.9, kde=True)
# possibly left skewed, with most of the mass in the right part of the distribution

In [None]:
# Line plot of tdu (total data usage) against the row index.
plt.figure(figsize=(10, 10))
data.tdu.plot()
# outlier at 6 and possibly right skewed

In [None]:
# Histogram of tdu (total data usage) with a bin width of 1.
plt.figure(figsize=(10, 10))
sns.histplot(data=data, x='tdu', binwidth=1)
# right skewed with outliers on the right end of the distribution

In [None]:
# Line plot of doc (data for online classes) against the row index.
plt.figure(figsize=(10, 10))
data.doc.plot()
# possible outliers at 0 and 3; possibly right skewed

In [None]:
# Histogram of doc (data for online classes) with a bin width of 0.5.
plt.figure(figsize=(10, 10))
sns.histplot(data=data, x='doc', binwidth=0.5)

In [None]:
# Box plots of the columns other than the response (Marks), drawn on one axis
# without grid lines, for outlier analysis.
plt.figure(figsize=(12, 10))
data.drop('Marks', axis=1).boxplot(grid = False)
plt.xticks(size=11);
plt.yticks(size=13);
plt.xlabel('Predictor variables')
plt.title('Box plot for outlier analysis', size=20)
# Outliers are judged relative to the inter-quartile range

### EDA / Bivariate data analysis
Compare the response variable with the available ordinal variables to hypothesise their impact and to select the variables used for predicting the response variable.

In [None]:
# Stacked histogram of Marks (bin width 5), split by Gender.
plt.figure(figsize=(10, 10))
sns.histplot(x=data['Marks'], hue=data['Gender'], multiple='stack', binwidth=5)
# The Gender variable has no significant impact on the response variable
# variable not selected

In [None]:
# Stacked histogram of Marks (10 bins), split by Education.
plt.figure(figsize=(10, 10))
sns.histplot(data=data, x = 'Marks', bins=10, hue= 'Education', multiple='stack');
# Little data is available from PhD students and there is no significant difference in impact on the response variable
# variable not selected # undecided
# While testing models: PG students have lower marks than the other groups; even though the difference
# is not significant, including the variable increased accuracy by about 2%

In [None]:
# Stacked histogram of Marks, split by ic (internet speed).
plt.figure(figsize=(10, 10))
sns.histplot(data=data, x='Marks', hue='ic', multiple='stack');
# plt.legend( fontsize='x-large', title = "Internet speed", loc='upper left')
# The internet speed variable has an impact on the response variable: students with the best and good
# internet connections are more likely to get good marks and to agree that online classes are better
# variable selected

In [None]:
# Stacked histogram of Marks, split by ac (academic outcome), with 'Yes' listed before 'No'.
plt.figure(figsize=(10, 10))
sns.histplot(data=data, x='Marks', hue='ac', multiple='stack', hue_order=['Yes', 'No'])
# Academic outcome has a significant impact on the response variable
# variable selected

In [None]:
# Stacked histogram of Marks, split by buc (better in understanding the concept).
plt.figure(figsize=(10, 10))
sns.histplot(data=data, x='Marks', hue='buc', multiple='stack')
# Even though there is no significant difference in impact, most of the students with marks above 80
# agreed that online learning is better than offline learning
# variable selected

In [None]:
# Stacked histogram of Marks, split by poc (participation in online classes).
plt.figure(figsize=(10, 10))
sns.histplot(data=data, x='Marks', hue='poc', multiple='stack');
# no impact on the response variable
# variable not selected

In [None]:
# Stacked histogram of Marks, split by ata (availability of teacher's assistance).
plt.figure(figsize=(10, 10))
sns.histplot(data=data, x='Marks', hue='ata', multiple='stack')
# The higher the marks, the more students agreed that they are getting teacher's assistance
# Even though there is no significant impact, the difference in agreement in the good-marks region
# can impact the response variable moderately
# variable selected

In [None]:
# Stacked histogram of Marks, split by smu (social media usage).
plt.figure(figsize=(10, 10))
sns.histplot(data=data, x='Marks', hue='smu', multiple='stack')
# some students with marks above 75 do not use social media
# variable selected

In [None]:
# Stacked histogram of Marks, split by bc (bored in class).
plt.figure(figsize=(10, 10))
sns.histplot(data=data, x='Marks', hue= 'bc', multiple='stack')
# some students with marks above 75 have never been bored in online classes
# variable selected

In [None]:
# Stacked histogram of Marks, split by ce (cheat in exams).
plt.figure(figsize=(10, 10))
sns.histplot(data=data, x='Marks', hue='ce', multiple='stack')
# some students with more than 75 marks say they never cheated in exams

In [None]:
# Joint plot of Marks against doc with a fitted regression line (kind='reg').
sns.jointplot(data=data, x='Marks', y='doc', kind='reg')

## Feature Engineering

    # Missing values and anomalies were already processed and handled successfully

# MLR model and Variable selection(stepwise method)

The numeric variables are selected using the forward variable selection method, and the categorical variables are removed after building the model with all the categorical variables available. The ordinal scale variables are removed based on the significance values from the summary table, AIC, BIC and adjusted R-squared values.

In [None]:
# Model 1 (full model): regress Marks on every candidate predictor and print
# the fitted coefficients.
temp = ols('Marks ~ Gender + Age + Education + ss + ocd + eocd + tdu + doc + ic + ac + buc + poc + ata + smu+ bc + ce', data);
model1= temp.fit()
print(model1.params)
# 16 predictor variables

In [None]:
# Display the regression summary table for model1.
model1.summary2()

In [None]:
# Model 2: same as model 1 without eocd and tdu.
temp = ols('Marks ~ Gender + Age + Education + ss + ocd + doc + ic + ac + buc + poc + ata + smu+ bc + ce', data);
model2= temp.fit()
print(model2.params)
# ocd and tdu have high correlation with eocd, which may cause the model to overfit,
# but ocd has higher significance than the other two variables, and eocd and tdu have coefficients near 0
# eocd and tdu variables are removed
# no change in accuracy

In [None]:
# Display the regression summary table for model2.
model2.summary2()

In [None]:
# Model 3: same predictors as model 2 without doc.
temp = ols('Marks ~ Education + Age + Gender + ss + poc + ocd + ic + ac + buc + ata + smu+ bc + ce', data);
model3= temp.fit()
print(model3.params)
# the doc variable has low significance
# variable doc removed

In [None]:
# Display the regression summary table for model3.
model3.summary2()

In [None]:
# Model 4: same predictors as model 3 without poc, Age and Gender.
temp = ols('Marks ~ Education + ss + ocd + ic + ac + buc + ata + smu+ bc + ce', data);
model4= temp.fit()
print(model4.params)
# poc, Age and Gender are removed since they are not significant
# Including Age changes the AIC by less than 2 with no significant change in accuracy,
# so the variable is removed to reduce the number of predictors.

In [None]:
# Display the regression summary table for model4.
model4.summary2()

In [None]:
# Model 5 (final model): Marks regressed on Education, ss, ocd, ic and ac.
temp = ols('Marks ~ Education + ss + ocd + ic + ac', data);
model5= temp.fit()
print(model5.params)
# bc, buc, smu, ata and ce are removed

In [None]:
# Display the regression summary table for model5.
model5.summary2()

In [None]:
# Predict Marks for a single hypothetical student with model5.
# Only the predictors that appear in model5's formula (Education, ss, ocd, ic,
# ac) are supplied. model5 has no `ata` term, so no 'ata' value is passed:
# it would be ignored and would wrongly suggest that it affects the prediction.
model5.predict({'Education': 'ug', 'ss': 1, 'ocd': 6, 'ic': 'b', 'ac': 'No'})

In [None]:
def evaluateModel(model):
    """Print the residual sum of squares (RSS) and R-squared of a fitted model.

    The RSS is computed from the residuals stored on the results object
    (``model.resid``) instead of the notebook-level ``data`` frame and its
    ``Marks`` column, so the function gives the right answer for a model
    fitted on any dataset or response variable.

    Parameters
    ----------
    model : fitted regression results
        Must expose ``resid`` (observed minus fitted values) and
        ``rsquared``, as statsmodels OLS results do.

    Returns
    -------
    None
        Both metrics are written to stdout.
    """
    rss = (model.resid ** 2).sum()
    print("RSS = ", rss)
    print("R2 = ", model.rsquared)

In [None]:
# Report RSS and R-squared for the final model (model5).
evaluateModel(model5);
# R-squared is about 0.812, i.e. the model explains roughly 81.2% of the variance in Marks
# (R-squared measures goodness of fit on the fitted data, not prediction accuracy)