## Load + Explore Dataset (EDA)

In [2]:
import os
from pathlib import Path

import pandas as pd

# Load the dataset.
# Source: https://www.kaggle.com/datasets/spscientist/students-performance-in-exams
#
# The CSV location is resolved at run time instead of being hard-coded to one
# user's machine. Candidates are tried in this order:
#   1. the path given in the STUDENTS_CSV environment variable (if set),
#   2. StudentsPerformance.csv in the current working directory,
#   3. StudentsPerformance.csv in the current user's Downloads folder.
DATA_FILENAME = "StudentsPerformance.csv"

_candidate_paths = [
    Path.cwd() / DATA_FILENAME,
    Path.home() / "Downloads" / DATA_FILENAME,
]
_override = os.environ.get("STUDENTS_CSV")
if _override:
    _candidate_paths.insert(0, Path(_override))

DATA_PATH = next((path for path in _candidate_paths if path.is_file()), None)
if DATA_PATH is None:
    # Fail early with an actionable message rather than a bare read_csv error.
    raise FileNotFoundError(
        f"{DATA_FILENAME} not found. Looked in: "
        + ", ".join(str(path) for path in _candidate_paths)
        + ". Download it from Kaggle or set the STUDENTS_CSV environment variable."
    )

df = pd.read_csv(DATA_PATH)

# Checking Structure

In [5]:
# Preview the first five rows to see the column names and raw values.
df.head(n=5)

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [6]:
# Print each column's name, non-null count and dtype, plus the frame's memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   gender                       1000 non-null   object
 1   race/ethnicity               1000 non-null   object
 2   parental level of education  1000 non-null   object
 3   lunch                        1000 non-null   object
 4   test preparation course      1000 non-null   object
 5   math score                   1000 non-null   int64 
 6   reading score                1000 non-null   int64 
 7   writing score                1000 non-null   int64 
dtypes: int64(3), object(5)
memory usage: 62.6+ KB


In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric score columns.
df.describe()

Unnamed: 0,math score,reading score,writing score
count,1000.0,1000.0,1000.0
mean,66.089,69.169,68.054
std,15.16308,14.600192,15.195657
min,0.0,17.0,10.0
25%,57.0,59.0,57.75
50%,66.0,70.0,69.0
75%,77.0,79.0,79.0
max,100.0,100.0,100.0


In [11]:
# Count missing values per column (all zeros means nothing needs imputing).
df.isna().sum()

gender                         0
race/ethnicity                 0
parental level of education    0
lunch                          0
test preparation course        0
math score                     0
reading score                  0
writing score                  0
dtype: int64

## Data processing

In [16]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Encode categorical columns as integer codes.
#
# Each column gets its own LabelEncoder, stored in `label_encoders`, so the
# code -> category mapping stays available afterwards, e.g.
# label_encoders['lunch'].classes_ or label_encoders['lunch'].inverse_transform(...).
# Reusing a single encoder would overwrite the mapping on every refit.
#
# NOTE(review): LabelEncoder is documented as a target (y) encoder and gives
# categories integer codes in sorted order. A linear model will read those
# codes as ordered magnitudes; consider one-hot encoding for nominal features
# (this would change the metrics reported further down).
categorical_columns = [
    'gender',
    'race/ethnicity',
    'parental level of education',
    'lunch',
    'test preparation course',
]

label_encoders = {}
for column in categorical_columns:
    le = LabelEncoder()  # fresh encoder per column; after the loop `le` is the last column's encoder
    df[column] = le.fit_transform(df[column])
    label_encoders[column] = le

In [17]:
# Re-inspect the first five rows: the categorical columns should now hold integer codes.
df.head(n=5)

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,0,1,1,1,1,72,72,74
1,0,2,4,1,0,69,90,88
2,0,1,3,1,1,90,95,93
3,1,0,0,0,1,47,57,44
4,1,2,4,1,1,76,78,75


In [18]:
# Target is the math score; every remaining column is used as a feature.
target_column = 'math score'
y = df[target_column]
X = df.drop(columns=[target_column])

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## Model training

In [21]:
from sklearn.linear_model import LinearRegression
# Fit an ordinary least-squares linear regression on the training split.
# fit() returns the estimator itself, so construction and fitting are chained.
model = LinearRegression().fit(X_train, y_train)
model  # show the fitted estimator as the cell output

## Evaluation

In [22]:
from sklearn.metrics import mean_squared_error, r2_score
# Predict on the held-out rows and score the predictions against the true math scores.
y_pred = model.predict(X_test)

test_mse = mean_squared_error(y_test, y_pred)  # mean of squared errors (squared marks)
test_r2 = r2_score(y_test, y_pred)             # coefficient of determination

print("MSE:", test_mse)
print("R^2 Score:", test_r2)

MSE: 28.275284506327317
R^2 Score: 0.8838026201112225


In [23]:
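# MSE - average of (predicted - actual)^2. Ours is ~28, measured in *squared* marks, so the typical error is RMSE = sqrt(28) ≈ 5.3 marks (not 28 marks).
# R^2 score ≈ 0.88: the model explains about 88% of the variation in the math marks on the test set (1 is a perfect fit; it can be negative for a very poor model).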


Our student score model gives an R² of 0.88 on the held-out test set.
That means it explains about 88% of the variation in math marks using just the available features.
The Mean Squared Error is about 28 (in squared marks), i.e. a root-mean-squared error of roughly 5.3 marks, which is acceptable given the mark range (0–100).
We can further improve accuracy by adding more features like attendance, sleep hours, or parental income.

## MLflow Logging (Basic)

In [30]:
import mlflow

# Record the run in MLflow: the model type as a parameter, the test-set
# metrics, and the fitted estimator itself as a logged model artifact.
with mlflow.start_run():
    mlflow.log_param("model_type", "LinearRegression")

    # Same metrics as the evaluation cell, recomputed from y_test / y_pred.
    for metric_name, metric_fn in (("mse", mean_squared_error), ("r2", r2_score)):
        mlflow.log_metric(metric_name, metric_fn(y_test, y_pred))

    mlflow.sklearn.log_model(model, "Project_student_grade_predictor")


