In [None]:
#This notebook tries to predict the math score of students using different linear regression methods
#The notebook is divided into these sections
#Information about the data and Exploratory data analysis
# The data can be accessed  via https://www.kaggle.com/datasets/spscientist/students-performance-in-exams


In [121]:
import pandas as pd
import seaborn as sns
import numpy as np


In [122]:
data = pd.read_csv('StudentsPerformance.csv')

In [123]:
# It is nice to save a copy of our data
# Let us perform some exploratory data analysis
raw = data.copy()
data.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [111]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   gender                       1000 non-null   object
 1   race/ethnicity               1000 non-null   object
 2   parental level of education  1000 non-null   object
 3   lunch                        1000 non-null   object
 4   test preparation course      1000 non-null   object
 5   math score                   1000 non-null   int64 
 6   reading score                1000 non-null   int64 
 7   writing score                1000 non-null   int64 
dtypes: int64(3), object(5)
memory usage: 62.6+ KB


In [34]:
data.shape

(1000, 8)

In [None]:
#Summary about Data
# The data contains a total of 1000 rows and 8 columns. 
# There are zero non null values
# The data has three columns with int and five with object data type
# The column are gender, race, parents level of education, lunch(type of lunch served), test perparation course


In [None]:
# Exploratory data analysis
# There are no null values
# Each column will be checked for typo

In [35]:
data.gender.unique()

array(['female', 'male'], dtype=object)

In [36]:
data['gender'].unique()

array(['female', 'male'], dtype=object)

In [39]:
data['parental level of education'].unique()

array(["bachelor's degree", 'some college', "master's degree",
       "associate's degree", 'high school', 'some high school'],
      dtype=object)

In [40]:
data['lunch'].unique()

array(['standard', 'free/reduced'], dtype=object)

In [19]:
data['test preparation course'].unique()

array(['none', 'completed'], dtype=object)

In [None]:
# A student that does well in writing or reading is likely to do well in math.
# In order not to biase our model we will drop writing and reading column

In [41]:
data.drop(['reading score', 'writing score'], axis=1)

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score
0,female,group B,bachelor's degree,standard,none,72
1,female,group C,some college,standard,completed,69
2,female,group B,master's degree,standard,none,90
3,male,group A,associate's degree,free/reduced,none,47
4,male,group C,some college,standard,none,76
...,...,...,...,...,...,...
995,female,group E,master's degree,standard,completed,88
996,male,group C,high school,free/reduced,none,62
997,female,group C,high school,free/reduced,completed,59
998,female,group D,some college,standard,completed,68


In [124]:
# I am converting categorical variables to dummy
data_dummy = pd.get_dummies(data, drop_first=True)

In [113]:
data_dummy.head()

Unnamed: 0,math score,reading score,writing score,gender_male,race/ethnicity_group B,race/ethnicity_group C,race/ethnicity_group D,race/ethnicity_group E,parental level of education_bachelor's degree,parental level of education_high school,parental level of education_master's degree,parental level of education_some college,parental level of education_some high school,lunch_standard,test preparation course_none
0,72,72,74,0,1,0,0,0,1,0,0,0,0,1,1
1,69,90,88,0,0,1,0,0,0,0,0,1,0,1,0
2,90,95,93,0,1,0,0,0,0,0,1,0,0,1,1
3,47,57,44,1,0,0,0,0,0,0,0,0,0,0,1
4,76,78,75,1,0,1,0,0,0,0,0,1,0,1,1


In [114]:
##data_dummy = data_dummy.drop(['reading score', 'writing score'], axis=1)

In [125]:
data_dummy.head()

Unnamed: 0,math score,reading score,writing score,gender_male,race/ethnicity_group B,race/ethnicity_group C,race/ethnicity_group D,race/ethnicity_group E,parental level of education_bachelor's degree,parental level of education_high school,parental level of education_master's degree,parental level of education_some college,parental level of education_some high school,lunch_standard,test preparation course_none
0,72,72,74,0,1,0,0,0,1,0,0,0,0,1,1
1,69,90,88,0,0,1,0,0,0,0,0,1,0,1,0
2,90,95,93,0,1,0,0,0,0,0,1,0,0,1,1
3,47,57,44,1,0,0,0,0,0,0,0,0,0,0,1
4,76,78,75,1,0,1,0,0,0,0,0,1,0,1,1


In [81]:
data_dummy.shape

(1000, 14)

In [126]:
#Separating our target variable
y = data_dummy['math score']
data_dummy = data_dummy.drop(['math score'], axis = 1)   

In [None]:
data_dummy.head()

In [127]:
# Let us apply linear regression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import (StandardScaler, 
                                   PolynomialFeatures)

In [128]:
# using simple linear regression as baseline
lr = LinearRegression()

In [129]:
X_train, X_test, y_train, y_test = train_test_split(data_dummy, y, test_size=0.3, 
                                                    random_state=72018)

In [130]:
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
score_lr = lr.score(X_test,y_test)
score_lr

0.8644707406632953

In [145]:

pd.DataFrame(zip(data_dummy.columns, lr.coef_)).sort_values(by = 1)

Unnamed: 0,0,1
4,race/ethnicity_group C,-7.417532
5,race/ethnicity_group D,-6.876884
3,race/ethnicity_group B,-6.862154
6,race/ethnicity_group E,-4.138887
0,reading score,-0.410742
12,lunch_standard,1.195203
1,writing score,1.240994
13,test preparation course_none,1.250678
7,parental level of education_bachelor's degree,2.246815
11,parental level of education_some high school,3.045897


In [147]:
#adding polynomial effects and checking performance
pf = PolynomialFeatures(degree=2, include_bias=False)
X_pf = pf.fit_transform(data_dummy)
X_train, X_test, y_train, y_test = train_test_split(X_pf, y, test_size=0.3, 
                                                    random_state=72018)

lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
score_poly = lr.score(X_test,y_test)
score_poly

0.8584590350868349

In [148]:
pd.DataFrame(zip(data_dummy.columns, lr.coef_)).sort_values(by = 1)

Unnamed: 0,0,1
4,race/ethnicity_group C,-7.417532
5,race/ethnicity_group D,-6.876884
3,race/ethnicity_group B,-6.862154
6,race/ethnicity_group E,-4.138887
0,reading score,-0.410742
12,lunch_standard,1.195203
1,writing score,1.240994
13,test preparation course_none,1.250678
7,parental level of education_bachelor's degree,2.246815
11,parental level of education_some high school,3.045897


In [119]:
from sklearn import linear_model


In [140]:
clf = linear_model.Lasso(alpha=0.001, max_iter=1000000)
clf.fit(X_train,y_train)
score_lasso = clf.score(X_test,y_test)
score_lasso

0.8590223410283184

In [149]:
pd.DataFrame(zip(data_dummy.columns, clf.coef_)).sort_values(by = 1)

Unnamed: 0,0,1
5,race/ethnicity_group D,-5.706436
4,race/ethnicity_group C,-4.950581
3,race/ethnicity_group B,-2.90707
6,race/ethnicity_group E,-2.107637
0,reading score,-0.44683
7,parental level of education_bachelor's degree,0.0
9,parental level of education_master's degree,0.612806
1,writing score,1.278573
11,parental level of education_some high school,2.184252
10,parental level of education_some college,2.623071


In [None]:
# From the above it can be seen, that the performance of the models are as follow
# Linear regression is performed best followed by lasso regression while polynomial regression performed worst
# This is derived from the value of their scores. Hence I recommend linear regression as my final model
# In terms of explainability, the strongest predictor are gener and ethnicity

In [None]:
# Summary of insights
#1. Linear regression is the best performer
#2. The accuracy of the lasso regression can be improved by reducing the value of alpha
#3. In terms of explainability, the strongest predictor are gener and ethnicity

In [None]:
# Suggestion for further analysis
# K-folds could be used to divided the datasets while the average of scores will be used to test performance
#2. Additional data could also be added to help the model. Additional data could include, number of hours studied etc