importing libraries

In [69]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


Reading dataset

In [70]:
dataset = pd.read_csv("/content/StudentsPerformance.csv")

In [None]:
dataset.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


# Data cleaning

In [71]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   gender                       1000 non-null   object
 1   race/ethnicity               1000 non-null   object
 2   parental level of education  1000 non-null   object
 3   lunch                        1000 non-null   object
 4   test preparation course      1000 non-null   object
 5   math score                   1000 non-null   int64 
 6   reading score                1000 non-null   int64 
 7   writing score                1000 non-null   int64 
dtypes: int64(3), object(5)
memory usage: 62.6+ KB


In [72]:
# it doesnt contain null values
# it contain cat features
print(dataset['gender'].unique())
print(dataset['race/ethnicity'].unique())
print(dataset['parental level of education'].unique())
print(dataset['lunch'].unique())

['female' 'male']
['group B' 'group C' 'group A' 'group D' 'group E']
["bachelor's degree" 'some college' "master's degree" "associate's degree"
 'high school' 'some high school']
['standard' 'free/reduced']


In [73]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
dataset['gender'] = encoder.fit_transform(dataset['gender'])
dataset['lunch'] = encoder.fit_transform(dataset['lunch'])
dataset['test preparation course'] = encoder.fit_transform(dataset['test preparation course'])

In [74]:
dataset.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,0,group B,bachelor's degree,1,1,72,72,74
1,0,group C,some college,1,0,69,90,88
2,0,group B,master's degree,1,1,90,95,93
3,1,group A,associate's degree,0,1,47,57,44
4,1,group C,some college,1,1,76,78,75


In [75]:
print(dataset['race/ethnicity'].unique())
print(dataset['parental level of education'].unique())


['group B' 'group C' 'group A' 'group D' 'group E']
["bachelor's degree" 'some college' "master's degree" "associate's degree"
 'high school' 'some high school']


In [76]:
dataset_encoded = pd.get_dummies(dataset, drop_first=True).astype(int)

dataset_encoded.head()

Unnamed: 0,gender,lunch,test preparation course,math score,reading score,writing score,race/ethnicity_group B,race/ethnicity_group C,race/ethnicity_group D,race/ethnicity_group E,parental level of education_bachelor's degree,parental level of education_high school,parental level of education_master's degree,parental level of education_some college,parental level of education_some high school
0,0,1,1,72,72,74,1,0,0,0,1,0,0,0,0
1,0,1,0,69,90,88,0,1,0,0,0,0,0,1,0
2,0,1,1,90,95,93,1,0,0,0,0,0,1,0,0
3,1,0,1,47,57,44,0,0,0,0,0,0,0,0,0
4,1,1,1,76,78,75,0,1,0,0,0,0,0,1,0


# spilting data

In [77]:

Y  = dataset_encoded.iloc[: , 3]

X = dataset_encoded.drop("math score", axis=1)

from sklearn.model_selection import train_test_split
x_train ,x_test,y_train,y_test = train_test_split(X,Y,test_size =0.2,random_state=42)

In [78]:
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

(800, 14)
(800,)
(200, 14)
(200,)


# Data preprocessing

In [79]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

In [80]:
import numpy as np
import statsmodels.api as sm

X = np.append(arr=np.ones((x_train.shape[0],1)).astype(int), values=x_train, axis=1)

cols = list(range(X.shape[1]))

x_opt = X[:, cols]

# Backward Elimination
while True:
    model = sm.OLS(y_train, exog=x_opt).fit()
    p_values = model.pvalues
    max_p = max(p_values)

    if max_p > 0.05:
        max_index = np.argmax(p_values)
        print(f"Removing column {cols[max_index]} with P={max_p:.3f}")

        cols.pop(max_index)
        x_opt = X[:, cols]
    else:
        break

final_model = sm.OLS(y_train, exog=x_opt).fit()
print(final_model.summary())


Removing column 6 with P=0.648
Removing column 10 with P=0.288
Removing column 12 with P=0.182
Removing column 8 with P=0.092
Removing column 7 with P=0.285
                            OLS Regression Results                            
Dep. Variable:             math score   R-squared:                       0.873
Model:                            OLS   Adj. R-squared:                  0.872
Method:                 Least Squares   F-statistic:                     604.4
Date:                Thu, 06 Nov 2025   Prob (F-statistic):               0.00
Time:                        13:33:32   Log-Likelihood:                -2476.4
No. Observations:                 800   AIC:                             4973.
Df Residuals:                     790   BIC:                             5020.
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t

In [85]:
corr_matrix = dataset_encoded.corr()
# Now you can look at how much each attribute correlates with the median house value:
corr_matrix["math score"].sort_values(ascending=False)

Unnamed: 0,math score
math score,1.0
reading score,0.81758
writing score,0.802642
lunch,0.350877
race/ethnicity_group E,0.205855
gender,0.167982
parental level of education_bachelor's degree,0.079664
parental level of education_master's degree,0.060417
race/ethnicity_group D,0.050071
parental level of education_some college,0.037056


The importance of selecting the correct statistical model (like OLS in statsmodels) lies in achieving accurate and reliable predictions.
A well-fitted model correctly captures the relationship between variables and avoids misleading interpretations.
It also minimizes overfitting and underfitting, ensuring the model performs well on new, unseen data.
Finally, a proper model provides trustworthy insights for data-driven decision-making and research analysis.

In [None]:
x_opt=X[:,[1,2,3,4,5,6,7,8]]
x_opt.shape

(800, 8)

# Linear regression model

In [None]:
from sklearn.linear_model import Lasso
lr = Lasso(alpha=0.01)
lr.fit(x_opt,y_train)

In [None]:
x_test_pred = lr.predict(x_test[:,[1,2,3,4,5,6,7,8]])

In [None]:
from sklearn.metrics import root_mean_squared_error
rmse = root_mean_squared_error(y_test,x_test_pred)
print(rmse)

16.777872351435406


# polynomyal regression

In [None]:
#polynomial regression

from sklearn.preprocessing import PolynomialFeatures
poly_model = PolynomialFeatures(degree=2)
poly=poly_model.fit_transform(x_opt,y_train)

linear_model_2 = Lasso(alpha=0.1)
linear_model_2.fit(poly,y_train)

y_pred = linear_model_2.predict(poly_model.fit_transform(x_test[:,[1,2,3,4,5,6,7,8]]))
from sklearn.metrics import root_mean_squared_error
rmse = root_mean_squared_error(y_test,y_pred)
print(rmse)

16.27623759415123
