Importing the needed libraries

In [35]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

Importing data from our csv dataset into a pandas dataframe

In [2]:
dataset = pd.read_csv('StudentsPerformance.csv')

In [3]:
dataset.head(10)

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75
5,female,group B,associate's degree,standard,none,71,83,78
6,female,group B,some college,standard,completed,88,95,92
7,male,group B,some college,free/reduced,none,40,43,39
8,male,group D,high school,free/reduced,completed,64,64,67
9,female,group B,high school,free/reduced,none,38,60,50


# Regression on Math Score

Creating dummy data for numerical representation (0,1) of each attribute of a student, dropping the scores that are not being predicted

In [4]:
math_data = pd.get_dummies(dataset)

In [5]:
math_data.drop(['reading score', 'writing score'], axis=1, inplace=True)

In [6]:
math_data.head(10)

Unnamed: 0,math score,gender_female,gender_male,race/ethnicity_group A,race/ethnicity_group B,race/ethnicity_group C,race/ethnicity_group D,race/ethnicity_group E,parental level of education_associate's degree,parental level of education_bachelor's degree,parental level of education_high school,parental level of education_master's degree,parental level of education_some college,parental level of education_some high school,lunch_free/reduced,lunch_standard,test preparation course_completed,test preparation course_none
0,72,1,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,1
1,69,1,0,0,0,1,0,0,0,0,0,0,1,0,0,1,1,0
2,90,1,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,1
3,47,0,1,1,0,0,0,0,1,0,0,0,0,0,1,0,0,1
4,76,0,1,0,0,1,0,0,0,0,0,0,1,0,0,1,0,1
5,71,1,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,1
6,88,1,0,0,1,0,0,0,0,0,0,0,1,0,0,1,1,0
7,40,0,1,0,1,0,0,0,0,0,0,0,1,0,1,0,0,1
8,64,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,1,0
9,38,1,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,1


In [7]:
X = math_data.drop('math score', axis=1)

y = math_data['math score']

Splitting the data into training data and test data

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=42)

Creating the regression model

In [9]:
regression_model = LinearRegression()

regression_model.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

Displaying coefficients for each attribute

In [10]:
for idx, col_name in enumerate(X_train.columns):
    print("The coefficient for {} is {}".format(col_name, regression_model.coef_[idx]))

The coefficient for gender_female is -80354752456660.95
The coefficient for gender_male is -80354752456656.28
The coefficient for race/ethnicity_group A is -4107698157539.583
The coefficient for race/ethnicity_group B is -4107698157538.0947
The coefficient for race/ethnicity_group C is -4107698157538.1333
The coefficient for race/ethnicity_group D is -4107698157535.4546
The coefficient for race/ethnicity_group E is -4107698157530.0757
The coefficient for parental level of education_associate's degree is -68322216388.91806
The coefficient for parental level of education_bachelor's degree is -68322216386.2784
The coefficient for parental level of education_high school is -68322216393.03072
The coefficient for parental level of education_master's degree is -68322216386.18867
The coefficient for parental level of education_some college is -68322216388.39156
The coefficient for parental level of education_some high school is -68322216391.76358
The coefficient for lunch_free/reduced is 53758

Displaying the intercept for the regression model

In [11]:
intercept = regression_model.intercept_

print("The intercept for our model is {}".format(intercept))

The intercept for our model is 82719020217082.73


Displaying the accuracy of the model represented by R^2

In [12]:
print('R^2 is {}'.format(regression_model.score(X_test, y_test)))

R^2 is 0.20896570761560185


Process repeats as is for writing scores and reading scores

# Regression on Reading

In [13]:
reading_data = pd.get_dummies(dataset)

In [14]:
reading_data.drop(['math score', 'writing score'], axis=1, inplace=True)

In [15]:
reading_data.head(10)

Unnamed: 0,reading score,gender_female,gender_male,race/ethnicity_group A,race/ethnicity_group B,race/ethnicity_group C,race/ethnicity_group D,race/ethnicity_group E,parental level of education_associate's degree,parental level of education_bachelor's degree,parental level of education_high school,parental level of education_master's degree,parental level of education_some college,parental level of education_some high school,lunch_free/reduced,lunch_standard,test preparation course_completed,test preparation course_none
0,72,1,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,1
1,90,1,0,0,0,1,0,0,0,0,0,0,1,0,0,1,1,0
2,95,1,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,1
3,57,0,1,1,0,0,0,0,1,0,0,0,0,0,1,0,0,1
4,78,0,1,0,0,1,0,0,0,0,0,0,1,0,0,1,0,1
5,83,1,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,1
6,95,1,0,0,1,0,0,0,0,0,0,0,1,0,0,1,1,0
7,43,0,1,0,1,0,0,0,0,0,0,0,1,0,1,0,0,1
8,64,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,1,0
9,60,1,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,1


In [16]:
X = reading_data.drop('reading score', axis=1)

y = reading_data['reading score']

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=42)

In [18]:
regression_model = LinearRegression()

regression_model.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [19]:
for idx, col_name in enumerate(X_train.columns):
    print("The coefficient for {} is {}".format(col_name, regression_model.coef_[idx]))

The coefficient for gender_female is 213873567520226.38
The coefficient for gender_male is 213873567520218.6
The coefficient for race/ethnicity_group A is 10933118855941.0
The coefficient for race/ethnicity_group B is 10933118855941.61
The coefficient for race/ethnicity_group C is 10933118855942.455
The coefficient for race/ethnicity_group D is 10933118855944.043
The coefficient for race/ethnicity_group E is 10933118855945.902
The coefficient for parental level of education_associate's degree is 181847566115.1785
The coefficient for parental level of education_bachelor's degree is 181847566118.11163
The coefficient for parental level of education_high school is 181847566110.9764
The coefficient for parental level of education_master's degree is 181847566119.21973
The coefficient for parental level of education_some college is 181847566114.73984
The coefficient for parental level of education_some high school is 181847566112.35284
The coefficient for lunch_free/reduced is -1430844976126

In [20]:
intercept = regression_model.intercept_

print("The intercept for our model is {}".format(intercept))

The intercept for our model is -220166342558611.22


In [21]:
print('R^2 is {}'.format(regression_model.score(X_test, y_test)))

R^2 is 0.1285885124188132


# Writing Score

In [22]:
writing_data = pd.get_dummies(dataset)

In [23]:
writing_data.drop(['math score', 'reading score'], axis=1, inplace=True)

In [24]:
writing_data.head(10)

Unnamed: 0,writing score,gender_female,gender_male,race/ethnicity_group A,race/ethnicity_group B,race/ethnicity_group C,race/ethnicity_group D,race/ethnicity_group E,parental level of education_associate's degree,parental level of education_bachelor's degree,parental level of education_high school,parental level of education_master's degree,parental level of education_some college,parental level of education_some high school,lunch_free/reduced,lunch_standard,test preparation course_completed,test preparation course_none
0,74,1,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,1
1,88,1,0,0,0,1,0,0,0,0,0,0,1,0,0,1,1,0
2,93,1,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,1
3,44,0,1,1,0,0,0,0,1,0,0,0,0,0,1,0,0,1
4,75,0,1,0,0,1,0,0,0,0,0,0,1,0,0,1,0,1
5,78,1,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,1
6,92,1,0,0,1,0,0,0,0,0,0,0,1,0,0,1,1,0
7,39,0,1,0,1,0,0,0,0,0,0,0,1,0,1,0,0,1
8,67,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,1,0
9,50,1,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,1


In [27]:
X = writing_data.drop('writing score', axis=1)

y = writing_data['writing score']

In [28]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=42)

In [29]:
regression_model = LinearRegression()

regression_model.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [30]:
for idx, col_name in enumerate(X_train.columns):
    print("The coefficient for {} is {}".format(col_name, regression_model.coef_[idx]))

The coefficient for gender_female is -38839978552216.375
The coefficient for gender_male is -38839978552225.766
The coefficient for race/ethnicity_group A is -1985481921855.8743
The coefficient for race/ethnicity_group B is -1985481921855.1665
The coefficient for race/ethnicity_group C is -1985481921854.0623
The coefficient for race/ethnicity_group D is -1985481921850.8596
The coefficient for race/ethnicity_group E is -1985481921851.403
The coefficient for parental level of education_associate's degree is -33023976031.95556
The coefficient for parental level of education_bachelor's degree is -33023976027.891315
The coefficient for parental level of education_high school is -33023976037.295425
The coefficient for parental level of education_master's degree is -33023976026.866734
The coefficient for parental level of education_some college is -33023976032.32051
The coefficient for parental level of education_some high school is -33023976036.169582
The coefficient for lunch_free/reduced i

In [31]:
intercept = regression_model.intercept_

print("The intercept for our model is {}".format(intercept))

The intercept for our model is 39982762349098.31


In [32]:
print('R^2 is {}'.format(regression_model.score(X_test, y_test)))

R^2 is 0.23048743240113587
