# Test Score Regression

## Imports and Setup

In [1]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [2]:
csv_data = pd.read_csv('StudentsPerformance.csv')

In [3]:
df = pd.DataFrame(csv_data)

## Math Score Regression

In [4]:
# Convert categorical variables into dummy/indicator variables (Boolean variables) for each student
math_data = pd.get_dummies(df)

In [5]:
# Drop the Reading and Writing Scores
math_data.drop(['reading score','writing score'], axis=1, inplace=True)

In [6]:
# Display the first ten students 
math_data.head(10)

Unnamed: 0,math score,gender_female,gender_male,race/ethnicity_group A,race/ethnicity_group B,race/ethnicity_group C,race/ethnicity_group D,race/ethnicity_group E,parental level of education_associate's degree,parental level of education_bachelor's degree,parental level of education_high school,parental level of education_master's degree,parental level of education_some college,parental level of education_some high school,lunch_free/reduced,lunch_standard,test preparation course_completed,test preparation course_none
0,72,True,False,False,True,False,False,False,False,True,False,False,False,False,False,True,False,True
1,69,True,False,False,False,True,False,False,False,False,False,False,True,False,False,True,True,False
2,90,True,False,False,True,False,False,False,False,False,False,True,False,False,False,True,False,True
3,47,False,True,True,False,False,False,False,True,False,False,False,False,False,True,False,False,True
4,76,False,True,False,False,True,False,False,False,False,False,False,True,False,False,True,False,True
5,71,True,False,False,True,False,False,False,True,False,False,False,False,False,False,True,False,True
6,88,True,False,False,True,False,False,False,False,False,False,False,True,False,False,True,True,False
7,40,False,True,False,True,False,False,False,False,False,False,False,True,False,True,False,False,True
8,64,False,True,False,False,False,True,False,False,False,True,False,False,False,True,False,True,False
9,38,True,False,False,True,False,False,False,False,False,True,False,False,False,True,False,False,True


In [7]:
# Obtain the intended independent and dependent variables
math_x_values = math_data.drop('math score', axis=1)
math_y_value = math_data['math score']

In [8]:
# Split math_data into training and test data
math_x_train, math_x_test, math_y_train, math_y_test = train_test_split(math_x_values, math_y_value, test_size=0.10, random_state=42)

In [9]:
# Create the math linear regression model
math_regression_model = LinearRegression()
math_regression_model.fit(math_x_train, math_y_train)

In [10]:
# Display the coefficients of the math linear regression model
for idx, col_name in enumerate(math_x_train.columns):
    print(f'The coefficient for {col_name} is {math_regression_model.coef_[idx]}.') 

The coefficient for gender_female is 114816164235971.98.
The coefficient for gender_male is 114816164235976.73.
The coefficient for race/ethnicity_group A is -19272831418387.895.
The coefficient for race/ethnicity_group B is -19272831418386.273.
The coefficient for race/ethnicity_group C is -19272831418386.504.
The coefficient for race/ethnicity_group D is -19272831418383.69.
The coefficient for race/ethnicity_group E is -19272831418378.836.
The coefficient for parental level of education_associate's degree is -100142212300566.1.
The coefficient for parental level of education_bachelor's degree is -100142212300563.3.
The coefficient for parental level of education_high school is -100142212300570.02.
The coefficient for parental level of education_master's degree is -100142212300563.17.
The coefficient for parental level of education_some college is -100142212300565.52.
The coefficient for parental level of education_some high school is -100142212300568.92.
The coefficient for lunch_fre

In [11]:
# Display the intercept of the math linear regression model
print(f"The intercept of our model is {math_regression_model.intercept_}.")

The intercept of our model is -237427784310653.62.


In [12]:
# Display the accuracy of the math linear regression model represented by the coefficient of determination (R^2)
print(f'R^2 is {math_regression_model.score(math_x_test, math_y_test)}.')

R^2 is 0.20640977521046966.


## Reading Score Regression

In [13]:
# Convert categorical variables into dummy/indicator variables (Boolean variables) for each student
reading_data = pd.get_dummies(df)

In [14]:
# Drop the Math and Writing Scores
reading_data.drop(['math score','writing score'], axis=1, inplace=True)

In [15]:
# Display the first ten students 
reading_data.head(10)

Unnamed: 0,reading score,gender_female,gender_male,race/ethnicity_group A,race/ethnicity_group B,race/ethnicity_group C,race/ethnicity_group D,race/ethnicity_group E,parental level of education_associate's degree,parental level of education_bachelor's degree,parental level of education_high school,parental level of education_master's degree,parental level of education_some college,parental level of education_some high school,lunch_free/reduced,lunch_standard,test preparation course_completed,test preparation course_none
0,72,True,False,False,True,False,False,False,False,True,False,False,False,False,False,True,False,True
1,90,True,False,False,False,True,False,False,False,False,False,False,True,False,False,True,True,False
2,95,True,False,False,True,False,False,False,False,False,False,True,False,False,False,True,False,True
3,57,False,True,True,False,False,False,False,True,False,False,False,False,False,True,False,False,True
4,78,False,True,False,False,True,False,False,False,False,False,False,True,False,False,True,False,True
5,83,True,False,False,True,False,False,False,True,False,False,False,False,False,False,True,False,True
6,95,True,False,False,True,False,False,False,False,False,False,False,True,False,False,True,True,False
7,43,False,True,False,True,False,False,False,False,False,False,False,True,False,True,False,False,True
8,64,False,True,False,False,False,True,False,False,False,True,False,False,False,True,False,True,False
9,60,True,False,False,True,False,False,False,False,False,True,False,False,False,True,False,False,True


In [16]:
# Obtain the intended independent and dependent variables
reading_x_values = reading_data.drop('reading score', axis=1)
reading_y_value = reading_data['reading score']

In [17]:
# Split reading_data into training and test data
reading_x_train, reading_x_test, reading_y_train, reading_y_test = train_test_split(reading_x_values, reading_y_value, test_size=0.10, random_state=42)

In [18]:
# Create the reading linear regression model
reading_regression_model = LinearRegression()
reading_regression_model.fit(reading_x_train, reading_y_train)

In [19]:
# Display the coefficients of the reading linear regression model
for idx, col_name in enumerate(reading_x_train.columns):
    print(f'The coefficient for {col_name} is {reading_regression_model.coef_[idx]}.') 

The coefficient for gender_female is -89144661400372.75.
The coefficient for gender_male is -89144661400380.55.
The coefficient for race/ethnicity_group A is 41792561201851.82.
The coefficient for race/ethnicity_group B is 41792561201852.43.
The coefficient for race/ethnicity_group C is 41792561201853.18.
The coefficient for race/ethnicity_group D is 41792561201854.836.
The coefficient for race/ethnicity_group E is 41792561201856.54.
The coefficient for parental level of education_associate's degree is -60837474288972.3.
The coefficient for parental level of education_bachelor's degree is -60837474288969.24.
The coefficient for parental level of education_high school is -60837474288976.375.
The coefficient for parental level of education_master's degree is -60837474288968.125.
The coefficient for parental level of education_some college is -60837474288972.64.
The coefficient for parental level of education_some high school is -60837474288975.055.
The coefficient for lunch_free/reduced 

In [20]:
# Display the intercept of the reading linear regression model
print(f"The intercept of our model is {reading_regression_model.intercept_}.")

The intercept of our model is -151922558674002.2.


In [21]:
# Display the accuracy of the reading linear regression model represented by the coefficient of determination (R^2)
print(f'R^2 is {reading_regression_model.score(reading_x_test, reading_y_test)}.')

R^2 is 0.12633699681497168.


## Writing Score Regression

In [22]:
# Convert categorical variables into dummy/indicator variables (Boolean variables) for each student
writing_data = pd.get_dummies(df)

In [23]:
# Drop the Math and Reading Scores
writing_data.drop(['math score','reading score'], axis=1, inplace=True)

In [24]:
# Display the first ten students 
writing_data.head(10)

Unnamed: 0,writing score,gender_female,gender_male,race/ethnicity_group A,race/ethnicity_group B,race/ethnicity_group C,race/ethnicity_group D,race/ethnicity_group E,parental level of education_associate's degree,parental level of education_bachelor's degree,parental level of education_high school,parental level of education_master's degree,parental level of education_some college,parental level of education_some high school,lunch_free/reduced,lunch_standard,test preparation course_completed,test preparation course_none
0,74,True,False,False,True,False,False,False,False,True,False,False,False,False,False,True,False,True
1,88,True,False,False,False,True,False,False,False,False,False,False,True,False,False,True,True,False
2,93,True,False,False,True,False,False,False,False,False,False,True,False,False,False,True,False,True
3,44,False,True,True,False,False,False,False,True,False,False,False,False,False,True,False,False,True
4,75,False,True,False,False,True,False,False,False,False,False,False,True,False,False,True,False,True
5,78,True,False,False,True,False,False,False,True,False,False,False,False,False,False,True,False,True
6,92,True,False,False,True,False,False,False,False,False,False,False,True,False,False,True,True,False
7,39,False,True,False,True,False,False,False,False,False,False,False,True,False,True,False,False,True
8,67,False,True,False,False,False,True,False,False,False,True,False,False,False,True,False,True,False
9,50,True,False,False,True,False,False,False,False,False,True,False,False,False,True,False,False,True


In [25]:
# Obtain the intended independent and dependent variables
writing_x_values = writing_data.drop('writing score', axis=1)
writing_y_value = writing_data['writing score']

In [26]:
# Split writing_data into training and test data
writing_x_train, writing_x_test, writing_y_train, writing_y_test = train_test_split(writing_x_values, writing_y_value, test_size=0.10, random_state=42)

In [27]:
# Create the writing linear regression model
writing_regression_model = LinearRegression()
writing_regression_model.fit(writing_x_train, writing_y_train)

In [28]:
# Display the coefficients of the writing linear regression model
for idx, col_name in enumerate(writing_x_train.columns):
    print(f'The coefficient for {col_name} is {writing_regression_model.coef_[idx]}.') 

The coefficient for gender_female is 113900369554158.6.
The coefficient for gender_male is 113900369554149.36.
The coefficient for race/ethnicity_group A is 19064037731813.562.
The coefficient for race/ethnicity_group B is 19064037731814.176.
The coefficient for race/ethnicity_group C is 19064037731815.2.
The coefficient for race/ethnicity_group D is 19064037731818.543.
The coefficient for race/ethnicity_group E is 19064037731817.523.
The coefficient for parental level of education_associate's degree is -97942869070779.19.
The coefficient for parental level of education_bachelor's degree is -97942869070775.0.
The coefficient for parental level of education_high school is -97942869070784.34.
The coefficient for parental level of education_master's degree is -97942869070774.08.
The coefficient for parental level of education_some college is -97942869070779.53.
The coefficient for parental level of education_some high school is -97942869070783.38.
The coefficient for lunch_free/reduced is

In [29]:
# Display the intercept of the writing linear regression model
print(f"The intercept of our model is {writing_regression_model.intercept_}.")

The intercept of our model is -892061977514729.0.


In [30]:
# Display the accuracy of the writing linear regression model represented by the coefficient of determination (R^2)
print(f'R^2 is {writing_regression_model.score(writing_x_test, writing_y_test)}.')

R^2 is 0.23112388842901754.
