# Data preprocessing of Student Performance Data

## Importing libraries and dataset

In [54]:
import numpy as np
import pandas as pd

In [55]:
# Load the full dataset from the specified CSV file into a DataFrame
dataset = pd.read_csv('StudentPerformanceFactors.csv')

# Generate descriptive statistics for every column (numeric and categorical alike)
dataset.describe(include = "all")

Unnamed: 0,Hours_Studied,Attendance,Parental_Involvement,Access_to_Resources,Extracurricular_Activities,Sleep_Hours,Previous_Scores,Motivation_Level,Internet_Access,Tutoring_Sessions,Family_Income,Teacher_Quality,School_Type,Peer_Influence,Physical_Activity,Learning_Disabilities,Parental_Education_Level,Distance_from_Home,Gender,Exam_Score
count,6607.0,6607.0,6607,6607,6607,6607.0,6607.0,6607,6607,6607.0,6607,6529,6607,6607,6607.0,6607,6517,6540,6607,6607.0
unique,,,3,3,2,,,3,2,,3,3,2,3,,2,3,3,2,
top,,,Medium,Medium,Yes,,,Medium,Yes,,Low,Medium,Public,Positive,,No,High School,Near,Male,
freq,,,3362,3319,3938,,,3351,6108,,2672,3925,4598,2638,,5912,3223,3884,3814,
mean,19.975329,79.977448,,,,7.02906,75.070531,,,1.493719,,,,,2.96761,,,,,67.235659
std,5.990594,11.547475,,,,1.46812,14.399784,,,1.23057,,,,,1.031231,,,,,3.890456
min,1.0,60.0,,,,4.0,50.0,,,0.0,,,,,0.0,,,,,55.0
25%,16.0,70.0,,,,6.0,63.0,,,1.0,,,,,2.0,,,,,65.0
50%,20.0,80.0,,,,7.0,75.0,,,1.0,,,,,3.0,,,,,67.0
75%,24.0,90.0,,,,8.0,88.0,,,2.0,,,,,4.0,,,,,69.0


In [56]:
# Preview the first 10 rows of the dataset
dataset.head(10)

Unnamed: 0,Hours_Studied,Attendance,Parental_Involvement,Access_to_Resources,Extracurricular_Activities,Sleep_Hours,Previous_Scores,Motivation_Level,Internet_Access,Tutoring_Sessions,Family_Income,Teacher_Quality,School_Type,Peer_Influence,Physical_Activity,Learning_Disabilities,Parental_Education_Level,Distance_from_Home,Gender,Exam_Score
0,23,84,Low,High,No,7,73,Low,Yes,0,Low,Medium,Public,Positive,3,No,High School,Near,Male,67
1,19,64,Low,Medium,No,8,59,Low,Yes,2,Medium,Medium,Public,Negative,4,No,College,Moderate,Female,61
2,24,98,Medium,Medium,Yes,7,91,Medium,Yes,2,Medium,Medium,Public,Neutral,4,No,Postgraduate,Near,Male,74
3,29,89,Low,Medium,Yes,8,98,Medium,Yes,1,Medium,Medium,Public,Negative,4,No,High School,Moderate,Male,71
4,19,92,Medium,Medium,Yes,6,65,Medium,Yes,3,Medium,High,Public,Neutral,4,No,College,Near,Female,70
5,19,88,Medium,Medium,Yes,8,89,Medium,Yes,3,Medium,Medium,Public,Positive,3,No,Postgraduate,Near,Male,71
6,29,84,Medium,Low,Yes,7,68,Low,Yes,1,Low,Medium,Private,Neutral,2,No,High School,Moderate,Male,67
7,25,78,Low,High,Yes,6,50,Medium,Yes,1,High,High,Public,Negative,2,No,High School,Far,Male,66
8,17,94,Medium,High,No,6,80,High,Yes,0,Medium,Low,Private,Neutral,1,No,College,Near,Male,69
9,23,98,Medium,Medium,Yes,8,71,Medium,Yes,0,High,High,Public,Positive,5,No,High School,Moderate,Male,72


## Data Imputation (Missing data replacement)

In [58]:
# Report, for every column, how many entries are missing and what share of all rows that is
total_rows = len(dataset)
for column_name in dataset.columns:
    # Number of missing (NaN/None) cells in this column
    n_missing = dataset[column_name].isna().sum()
    # The same quantity expressed as a percentage of the row count
    share = n_missing / total_rows * 100
    print(f"Columns {column_name} => Missing entries count: {n_missing}, percentage: {share}")

Columns Hours_Studied => Missing entries count: 0, percentage: 0.0
Columns Attendance => Missing entries count: 0, percentage: 0.0
Columns Parental_Involvement => Missing entries count: 0, percentage: 0.0
Columns Access_to_Resources => Missing entries count: 0, percentage: 0.0
Columns Extracurricular_Activities => Missing entries count: 0, percentage: 0.0
Columns Sleep_Hours => Missing entries count: 0, percentage: 0.0
Columns Previous_Scores => Missing entries count: 0, percentage: 0.0
Columns Motivation_Level => Missing entries count: 0, percentage: 0.0
Columns Internet_Access => Missing entries count: 0, percentage: 0.0
Columns Tutoring_Sessions => Missing entries count: 0, percentage: 0.0
Columns Family_Income => Missing entries count: 0, percentage: 0.0
Columns Teacher_Quality => Missing entries count: 78, percentage: 1.1805660662933253
Columns School_Type => Missing entries count: 0, percentage: 0.0
Columns Peer_Influence => Missing entries count: 0, percentage: 0.0
Columns Physi

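Only three categorical columns contain missing entries (`Teacher_Quality`: 78, `Parental_Education_Level`: 90, `Distance_from_Home`: 67 — each roughly 1% of the rows), so no imputation is performed here. Note that these missing values are therefore still present when the categorical columns are encoded below.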

## Encode categorical data
### Encode independent variables (X)

In [61]:
from sklearn.preprocessing import OneHotEncoder

# Initialize the OneHotEncoder with sparse_output=False so the result is a dense
# array that can be wrapped in a DataFrame directly
onehot_encoder = OneHotEncoder(sparse_output=False)

# Categorical features are the columns stored with the generic object dtype
categorical_columns = dataset.select_dtypes(include=['object']).columns

# Fit the encoder and expand every categorical column into one indicator column per category.
# NOTE(review): three of these columns contain missing values; the 44-column result suggests
# each of them gets an extra "<column>_nan" indicator - confirm this is the intended handling.
encoded_columns = onehot_encoder.fit_transform(dataset[categorical_columns])

# Wrap the encoded array in a DataFrame with one named column per category.
# Reusing dataset.index keeps the rows aligned in the concat below even when `dataset`
# does not carry a default 0..n-1 index (concat with axis=1 aligns on index labels).
encoded_df = pd.DataFrame(
    encoded_columns,
    columns=onehot_encoder.get_feature_names_out(categorical_columns),
    index=dataset.index,
)

# Replace the original categorical columns with their one-hot encoded counterparts
final_df = pd.concat([dataset.drop(columns=categorical_columns), encoded_df], axis=1)

# Display information about the final DataFrame
final_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6607 entries, 0 to 6606
Data columns (total 44 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   Hours_Studied                          6607 non-null   int64  
 1   Attendance                             6607 non-null   int64  
 2   Sleep_Hours                            6607 non-null   int64  
 3   Previous_Scores                        6607 non-null   int64  
 4   Tutoring_Sessions                      6607 non-null   int64  
 5   Physical_Activity                      6607 non-null   int64  
 6   Exam_Score                             6607 non-null   int64  
 7   Parental_Involvement_High              6607 non-null   float64
 8   Parental_Involvement_Low               6607 non-null   float64
 9   Parental_Involvement_Medium            6607 non-null   float64
 10  Access_to_Resources_High               6607 non-null   float64
 11  Acce

### Encode dependent variable (y)

The dependent variable (`Exam_Score`) is already numeric, so there is no need to encode it.

## Splitting the dataset (X = data, y = output) into the Training set and Test set

In [65]:
# Feature matrix: every column of the encoded DataFrame except the target, as a NumPy array.
# The target is dropped by name so the selection does not silently pick a different
# column if the column order of final_df ever changes.
X = final_df.drop(columns="Exam_Score").values
X

array([[23., 84.,  7., ...,  0.,  0.,  1.],
       [19., 64.,  8., ...,  0.,  1.,  0.],
       [24., 98.,  7., ...,  0.,  0.,  1.],
       ...,
       [20., 90.,  6., ...,  0.,  1.,  0.],
       [10., 86.,  6., ...,  0.,  1.,  0.],
       [15., 67.,  9., ...,  0.,  0.,  1.]])

In [66]:
# Target vector: the exam score, selected by column name rather than by position
y = final_df["Exam_Score"].values
y

array([67, 61, 74, ..., 68, 68, 64], dtype=int64)

In [67]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set; the seed is fixed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=616,
)

In [68]:
# Training features produced by the split (not yet scaled at this point)
X_train

array([[24., 80.,  7., ...,  0.,  1.,  0.],
       [20., 69., 10., ...,  0.,  1.,  0.],
       [34., 87.,  6., ...,  0.,  0.,  1.],
       ...,
       [24., 80.,  6., ...,  0.,  1.,  0.],
       [19., 85.,  6., ...,  0.,  0.,  1.],
       [16., 71.,  6., ...,  0.,  0.,  1.]])

In [69]:
# Test features produced by the split (not yet scaled at this point)
X_test

array([[17., 89.,  5., ...,  0.,  1.,  0.],
       [23., 91.,  8., ...,  0.,  0.,  1.],
       [25., 99.,  8., ...,  0.,  1.,  0.],
       ...,
       [16., 62.,  7., ...,  0.,  1.,  0.],
       [31., 88.,  6., ...,  0.,  0.,  1.],
       [20., 93.,  8., ...,  0.,  0.,  1.]])

In [70]:
# Target values belonging to the training rows
y_train

array([70, 65, 72, ..., 68, 70, 82], dtype=int64)

In [71]:
# Target values belonging to the test rows
y_test

array([68, 68, 72, ..., 59, 73, 66], dtype=int64)

## Feature Scaling

In [73]:
from sklearn.preprocessing import StandardScaler

# The first six columns of X are the numeric features; the columns after them are
# the one-hot indicator columns, which are left unscaled
numeric_block = slice(0, 6)

sc = StandardScaler()
# Fit the scaler on the training rows only and standardise them.
# NOTE: the arrays are overwritten in place, so re-running this cell would scale
# values that have already been scaled.
X_train[:, numeric_block] = sc.fit_transform(X_train[:, numeric_block])
# The test set must use the SAME fitted scaler as the training set, hence transform only
X_test[:, numeric_block] = sc.transform(X_test[:, numeric_block])

In [74]:
# Training features after the first six columns have been standardised
X_train

array([[ 6.67602187e-01,  4.24157776e-03, -1.35280458e-02, ...,
         0.00000000e+00,  1.00000000e+00,  0.00000000e+00],
       [ 9.47266973e-05, -9.47820674e-01,  2.02920688e+00, ...,
         0.00000000e+00,  1.00000000e+00,  0.00000000e+00],
       [ 2.33637084e+00,  6.10099374e-01, -6.94439687e-01, ...,
         0.00000000e+00,  0.00000000e+00,  1.00000000e+00],
       ...,
       [ 6.67602187e-01,  4.24157776e-03, -6.94439687e-01, ...,
         0.00000000e+00,  1.00000000e+00,  0.00000000e+00],
       [-1.66782138e-01,  4.36997147e-01, -6.94439687e-01, ...,
         0.00000000e+00,  0.00000000e+00,  1.00000000e+00],
       [-6.67412733e-01, -7.74718446e-01, -6.94439687e-01, ...,
         0.00000000e+00,  0.00000000e+00,  1.00000000e+00]])

In [75]:
# Test features after the first six columns have been transformed with the fitted scaler
X_test

array([[-5.00535868e-01,  7.83201602e-01, -1.37535133e+00, ...,
         0.00000000e+00,  1.00000000e+00,  0.00000000e+00],
       [ 5.00725322e-01,  9.56303830e-01,  6.67383595e-01, ...,
         0.00000000e+00,  0.00000000e+00,  1.00000000e+00],
       [ 8.34479052e-01,  1.64871274e+00,  6.67383595e-01, ...,
         0.00000000e+00,  1.00000000e+00,  0.00000000e+00],
       ...,
       [-6.67412733e-01, -1.55367847e+00, -1.35280458e-02, ...,
         0.00000000e+00,  1.00000000e+00,  0.00000000e+00],
       [ 1.83574024e+00,  6.96650488e-01, -6.94439687e-01, ...,
         0.00000000e+00,  0.00000000e+00,  1.00000000e+00],
       [ 9.47266973e-05,  1.12940606e+00,  6.67383595e-01, ...,
         0.00000000e+00,  0.00000000e+00,  1.00000000e+00]])

## Model Training and Results

### Linear Regression

In [78]:
from sklearn.linear_model import LinearRegression
# Linear regression created with no arguments, fitted on the training set
LR = LinearRegression()
LR.fit(X_train, y_train)

In [95]:
# Predict exam scores for the held-out test features with the linear regression model
LR_predict = LR.predict(X_test)
LR_predict

array([68.2421875, 68.109375 , 72.3984375, ..., 59.59375  , 72.5546875,
       65.8515625])

In [97]:
# Side-by-side view of the linear regression predictions and the actual test targets
pd.DataFrame({'Predict': LR_predict, 'y_test': y_test})

Unnamed: 0,Predict,y_test
0,68.242188,68
1,68.109375,68
2,72.398438,72
3,68.632812,69
4,66.039062,66
...,...,...
1317,66.601562,67
1318,71.328125,71
1319,59.593750,59
1320,72.554688,73


### Random Forest

In [100]:
from sklearn.ensemble import RandomForestRegressor
# Random forest regressor with a fixed random_state, fitted on the training set
RFR = RandomForestRegressor(random_state = 616)
RFR.fit(X_train, y_train)

In [102]:
# Predict exam scores for the held-out test features with the random forest model
RFR_predict = RFR.predict(X_test)
RFR_predict

array([67.45, 69.12, 73.01, ..., 62.01, 73.12, 67.51])

In [104]:
# Side-by-side view of the random forest predictions and the actual test targets
pd.DataFrame({'Predict': RFR_predict, 'y_test': y_test})

Unnamed: 0,Predict,y_test
0,67.45,68
1,69.12,68
2,73.01,72
3,67.73,69
4,65.92,66
...,...,...
1317,66.08,67
1318,70.07,71
1319,62.01,59
1320,73.12,73


### Bayesian Ridge Regression

In [107]:
from sklearn.linear_model import BayesianRidge
# Bayesian ridge regression created with no arguments, fitted on the training set
BRR = BayesianRidge()
BRR.fit(X_train, y_train)

In [109]:
# Predict exam scores for the held-out test features with the Bayesian ridge model
BRR_predict = BRR.predict(X_test)
BRR_predict

array([68.21406023, 68.13165589, 72.42643135, ..., 59.59738554,
       72.54757858, 65.90260719])

In [111]:
# Side-by-side view of the Bayesian ridge predictions and the actual test targets
pd.DataFrame({'Predict': BRR_predict, 'y_test': y_test})

Unnamed: 0,Predict,y_test
0,68.214060,68
1,68.131656,68
2,72.426431,72
3,68.640441,69
4,66.018193,66
...,...,...
1317,66.611821,67
1318,71.298549,71
1319,59.597386,59
1320,72.547579,73


### Support Vector Regression

In [116]:
from sklearn.svm import SVR
# Support vector regressor created with no arguments, fitted on the training set
SVReg = SVR()
SVReg.fit(X_train, y_train)

In [120]:
# Predict exam scores for the held-out test features with the support vector regressor
SVR_predict = SVReg.predict(X_test)
SVR_predict

array([67.8348169 , 68.35449239, 72.30543135, ..., 59.25927   ,
       72.68287584, 65.81963038])

In [122]:
# Side-by-side view of the support vector regression predictions and the actual test targets
pd.DataFrame({'Predict': SVR_predict, 'y_test': y_test})

Unnamed: 0,Predict,y_test
0,67.834817,68
1,68.354492,68
2,72.305431,72
3,68.510606,69
4,65.728730,66
...,...,...
1317,66.466693,67
1318,70.706452,71
1319,59.259270,59
1320,72.682876,73
