<a href="https://colab.research.google.com/github/Amiya-Kalita/Machine-Learning/blob/main/7_Student_Exam_score_Pred.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib

In [8]:
df =  pd.read_csv("/content/drive/MyDrive/DataSets/student_exam_scores.csv")
df.head()

Unnamed: 0,student_id,hours_studied,sleep_hours,attendance_percent,previous_scores,exam_score
0,S001,8.0,8.8,72.1,45,30.2
1,S002,1.3,8.6,60.7,55,25.0
2,S003,4.0,8.2,73.7,86,35.8
3,S004,3.5,4.8,95.1,66,34.0
4,S005,9.1,6.4,89.8,71,40.3


In [9]:
target_candidates = [c for c in df.columns if "score" in c.lower() or "exam" in c.lower() or "marks" in c.lower()]
if len(target_candidates) == 1:
    target_col = target_candidates[0]
else:
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    if len(numeric_cols) >= 1:
        target_col = numeric_cols[-1]
    else:
        raise ValueError("No numeric column found to use as target. Please set target_col manually.")

print("Using target column:", target_col)

Using target column: exam_score


In [10]:
X = df.drop(columns=[target_col])
y = df[target_col]

In [11]:
X.shape

(200, 5)

In [12]:
y.shape

(200,)

In [13]:
numeric_features = X.select_dtypes(include=[np.number]).columns.tolist()
categorical_features = X.select_dtypes(include=['object','category','bool']).columns.tolist()

print("Numeric features:", numeric_features)
print("Categorical features:", categorical_features)

Numeric features: ['hours_studied', 'sleep_hours', 'attendance_percent', 'previous_scores']
Categorical features: ['student_id']


In [14]:
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

In [16]:
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

In [17]:
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)
])

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [19]:
X_train.shape

(160, 5)

In [20]:
X_test.shape

(40, 5)

In [21]:
lr_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', LinearRegression())
])

In [22]:
rf_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor(n_estimators=200, random_state=42))
])

In [24]:
lr_pipeline.fit(X_train, y_train)
y_pred_lr = lr_pipeline.predict(X_test)
lr_metrics = {
    "MAE": mean_absolute_error(y_test, y_pred_lr),
    "RMSE": mean_squared_error(y_test, y_pred_lr)**0.5,
    "R2": r2_score(y_test, y_pred_lr)
}

In [26]:
rf_pipeline.fit(X_train, y_train)
y_pred_rf = rf_pipeline.predict(X_test)
rf_metrics = {
    "MAE": mean_absolute_error(y_test, y_pred_rf),
    "RMSE": mean_squared_error(y_test, y_pred_rf)**0.5,
    "R2": r2_score(y_test, y_pred_rf)
}

In [27]:
print("Linear Regression metrics:", lr_metrics)
print("Random Forest metrics:", rf_metrics)

best_model_pipeline = rf_pipeline if rf_metrics["RMSE"] < lr_metrics["RMSE"] else lr_pipeline
best_name = "RandomForestRegressor" if best_model_pipeline is rf_pipeline else "LinearRegression"
print("Best model selected:", best_name)

Linear Regression metrics: {'MAE': 2.3124125135091864, 'RMSE': 2.785790866013751, 'R2': 0.8537664193365823}
Random Forest metrics: {'MAE': 2.9253124999999973, 'RMSE': 3.3004965488165574, 'R2': 0.7947379509601451}
Best model selected: LinearRegression


In [28]:
joblib_path = "best_model.joblib"  # saved in current folder
joblib.dump(best_model_pipeline, joblib_path)
print("Saved best model to:", joblib_path)

Saved best model to: best_model.joblib


In [29]:
def predict_single(input_dict):
    row = pd.DataFrame([input_dict])
    pred = best_model_pipeline.predict(row)[0]
    return pred

In [30]:
example = {
     "hours_studied": 5.5,
     "sleep_hours": 7.0,
     "attendance_percent": 85,
     "previous_scores": 70,
     "student_id": "S123"
 }
print("Predicted exam score:", predict_single(example))

Predicted exam score: 33.453452173395505


In [31]:
sample = X_test.reset_index(drop=True).head(10).copy()
sample["predicted_"+str(target_col)] = best_model_pipeline.predict(sample)
sample[target_col] = y_test.reset_index(drop=True).head(10)
print("\nSample predictions (first 10 rows):")
print(sample.head(10))


Sample predictions (first 10 rows):
  student_id  hours_studied  ...  predicted_exam_score  exam_score
0       S096            5.2  ...             28.917113        28.7
1       S016            7.0  ...             29.703765        34.1
2       S031            9.9  ...             35.306774        34.5
3       S159            3.5  ...             31.198554        29.5
4       S129           10.7  ...             39.413833        36.1
5       S116           10.6  ...             41.643466        46.4
6       S070            3.9  ...             24.032368        23.9
7       S171           11.3  ...             45.650885        45.3
8       S175            9.9  ...             41.606273        44.8
9       S046            3.6  ...             27.514042        31.8

[10 rows x 7 columns]


In [32]:
sample.head()

Unnamed: 0,student_id,hours_studied,sleep_hours,attendance_percent,previous_scores,predicted_exam_score,exam_score
0,S096,5.2,6.8,84.0,43,28.917113,28.7
1,S016,7.0,9.0,51.2,41,29.703765,34.1
2,S031,9.9,4.4,55.3,67,35.306774,34.5
3,S159,3.5,4.5,89.1,78,31.198554,29.5
4,S129,10.7,8.0,51.0,68,39.413833,36.1
