In [4]:
# Import required Python libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import SimpleImputer
import warnings
warnings.filterwarnings('ignore')



In [5]:
df = pd.read_csv('StudentsPerformance.csv')
df.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [6]:
df.describe()

Unnamed: 0,math score,reading score,writing score
count,1000.0,1000.0,1000.0
mean,66.089,69.169,68.054
std,15.16308,14.600192,15.195657
min,0.0,17.0,10.0
25%,57.0,59.0,57.75
50%,66.0,70.0,69.0
75%,77.0,79.0,79.0
max,100.0,100.0,100.0


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   gender                       1000 non-null   object
 1   race/ethnicity               1000 non-null   object
 2   parental level of education  1000 non-null   object
 3   lunch                        1000 non-null   object
 4   test preparation course      1000 non-null   object
 5   math score                   1000 non-null   int64 
 6   reading score                1000 non-null   int64 
 7   writing score                1000 non-null   int64 
dtypes: int64(3), object(5)
memory usage: 62.6+ KB


In [8]:
df.isnull().sum()

gender                         0
race/ethnicity                 0
parental level of education    0
lunch                          0
test preparation course        0
math score                     0
reading score                  0
writing score                  0
dtype: int64

In [9]:
#  Data Preprocessing and Target Variable Creation
# Create a copy of the dataset
df_processed = df.copy()

# Compute the target variable: Final Exam Score (Average of Math, Reading, Writing)
df_processed['final_exam_score'] = (df_processed['math score'] + 
                                     df_processed['reading score'] + 
                                     df_processed['writing score']) / 3

print("Target variable 'final_exam_score' created!")
print(f"\nTarget variable statistics:")
print(df_processed['final_exam_score'].describe())


Target variable 'final_exam_score' created!

Target variable statistics:
count    1000.000000
mean       67.770667
std        14.257326
min         9.000000
25%        58.333333
50%        68.333333
75%        77.666667
max       100.000000
Name: final_exam_score, dtype: float64


In [10]:
df_processed['parental level of education']

0       bachelor's degree
1            some college
2         master's degree
3      associate's degree
4            some college
              ...        
995       master's degree
996           high school
997           high school
998          some college
999          some college
Name: parental level of education, Length: 1000, dtype: object

In [11]:
from sklearn.preprocessing import OrdinalEncoder

education_order = [
    "some high school",
    "high school",
    "some college",
    "associate's degree",
    "bachelor's degree",
    "master's degree"
]

ordinal_encoder = OrdinalEncoder(categories=[education_order])

df_processed['parental_education_encoded'] = ordinal_encoder.fit_transform(
    df_processed[['parental level of education']]
)

In [12]:

label_encoders = {}
#  2. Encode test preparation course
le_test_prep = LabelEncoder()
df_processed['test_preparation_encoded'] = le_test_prep.fit_transform(
    df_processed['test preparation course']
)
label_encoders['test preparation course'] = le_test_prep

#  3. Encode gender
le_gender = LabelEncoder()
df_processed['gender_encoded'] = le_gender.fit_transform(
    df_processed['gender']
)
label_encoders['gender'] = le_gender

#  4. Encode race/ethnicity
le_race = LabelEncoder()
df_processed['race_encoded'] = le_race.fit_transform(
    df_processed['race/ethnicity']
)
label_encoders['race/ethnicity'] = le_race

# ✅ 5. Encode lunch
le_lunch = LabelEncoder()
df_processed['lunch_encoded'] = le_lunch.fit_transform(
    df_processed['lunch']
)
label_encoders['lunch'] = le_lunch

print("Categorical features encoded successfully!")
print("\nProcessed dataset columns:")
print(df_processed.columns.tolist())

Categorical features encoded successfully!

Processed dataset columns:
['gender', 'race/ethnicity', 'parental level of education', 'lunch', 'test preparation course', 'math score', 'reading score', 'writing score', 'final_exam_score', 'parental_education_encoded', 'test_preparation_encoded', 'gender_encoded', 'race_encoded', 'lunch_encoded']


In [27]:
np.random.seed(42)

# Study hours per day (correlated with scores)
df_processed['study_hours'] = np.random.uniform(1, 8, size=len(df_processed)) + \
                               (df_processed['final_exam_score'] - 50) / 20

# Attendance percentage (correlated with scores)
df_processed['attendance_percentage'] = np.random.uniform(60, 100, size=len(df_processed)) + \
                                        (df_processed['final_exam_score'] - 50) / 10

# Sleep hours (normally distributed, slight correlation with performance)
df_processed['sleep_hours'] = np.random.uniform(5, 9, size=len(df_processed)) + \
                              (df_processed['final_exam_score'] - 70) / 50

# Ensure values are within reasonable ranges
df_processed['study_hours'] = df_processed['study_hours'].clip(0.5, 10)
df_processed['attendance_percentage'] = df_processed['attendance_percentage'].clip(50, 100)
df_processed['sleep_hours'] = df_processed['sleep_hours'].clip(4, 10)

In [29]:
feature_columns = [
    'study_hours',
    'attendance_percentage',
    'parental_education_encoded',
    'test_preparation_encoded',
    'sleep_hours',
    
]
X = df_processed[feature_columns].copy()
y = df_processed['final_exam_score'].copy()

In [34]:
X = StandardScaler().fit_transform(X)

In [35]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [47]:
# TASK 8: Train a Multilinear Regression model
model = LinearRegression()
model.fit(X_train, y_train)

print("✓ Multilinear Regression model trained successfully!")
print(f"\nModel Parameters:")
print(f"  Intercept: {model.intercept_:.4f}")
print(f"\n  Coefficients:")
for feature, coef in zip(feature_columns, model.coef_):
    print(f"    {feature}: {coef:.4f}")

✓ Multilinear Regression model trained successfully!

Model Parameters:
  Intercept: 68.0219

  Coefficients:
    study_hours: 3.3774
    attendance_percentage: 0.5724
    parental_education_encoded: 2.4901
    test_preparation_encoded: -3.2648
    sleep_hours: 3.2919


In [48]:
y_pred = model.predict(X_test)

In [49]:
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print("MSE :", mse)
print("RMSE:", rmse)
print("R²  :", r2)

MSE : 155.54864647202936
RMSE: 12.471914306634302
R²  : 0.2743801009142315


In [50]:
features = [
    'Study Hours',
    'Attendance',
    'Parental Education',
    'Test Preparation',
    'Sleep Hours'
]

for f, c in zip(features, model.coef_):
    print(f"{f}: {c}")

Study Hours: 3.377357597926379
Attendance: 0.572395663689913
Parental Education: 2.490087740535858
Test Preparation: -3.264807751907771
Sleep Hours: 3.291919754842797


In [51]:
X_reduced = X[:, [0, 1, 3]]  # remove weak features

In [52]:
ridge = Ridge(alpha=1.0)
ridge.fit(X_train, y_train)

In [53]:
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print("MSE :", mse)
print("RMSE:", rmse)
print("R²  :", r2)

MSE : 155.54864647202936
RMSE: 12.471914306634302
R²  : 0.2743801009142315


In [54]:
lasso = Lasso(alpha=0.05)
lasso.fit(X_train, y_train)

In [55]:
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print("MSE :", mse)
print("RMSE:", rmse)
print("R²  :", r2)

MSE : 155.54864647202936
RMSE: 12.471914306634302
R²  : 0.2743801009142315
