# Predictions
This notebook runs predictions on new/unseen student data.

In [30]:
import os, sys
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname('__file__'), "..")))

import pandas as pd
from src.predict import run_prediction

In [31]:
# Set paths (relative to notebooks/ folder)
model_path = "../results/models/random_forest_math.pkl"
data_path = "../data/student-mat.csv"
output_dir = "../results/predictions"

print("Model path:", model_path)
print("Data path:", data_path)
print("Output directory:", output_dir)


Model path: ../results/models/random_forest_math.pkl
Data path: ../data/student-mat.csv
Output directory: ../results/predictions


In [32]:
# Check if files exist
print("Model exists:", os.path.exists(model_path))
print("Data exists:", os.path.exists(data_path))
print("Output directory exists:", os.path.exists(output_dir))

Model exists: True
Data exists: True
Output directory exists: True


In [33]:
# Load sample data to preview
if os.path.exists(data_path):
    sample_data = pd.read_csv(data_path)
    print("Data shape:", sample_data.shape)
    print("\nFirst few rows:")
    display(sample_data.head())
else:
    print("Data file not found!")

Data shape: (395, 1)

First few rows:


Unnamed: 0,school;sex;age;address;famsize;Pstatus;Medu;Fedu;Mjob;Fjob;reason;guardian;traveltime;studytime;failures;schoolsup;famsup;paid;activities;nursery;higher;internet;romantic;famrel;freetime;goout;Dalc;Walc;health;absences;G1;G2;G3
0,"GP;""F"";18;""U"";""GT3"";""A"";4;4;""at_home"";""teacher..."
1,"GP;""F"";17;""U"";""GT3"";""T"";1;1;""at_home"";""other"";..."
2,"GP;""F"";15;""U"";""LE3"";""T"";1;1;""at_home"";""other"";..."
3,"GP;""F"";15;""U"";""GT3"";""T"";4;2;""health"";""services..."
4,"GP;""F"";16;""U"";""GT3"";""T"";3;3;""other"";""other"";""h..."


In [34]:
# Run prediction
if os.path.exists(model_path) and os.path.exists(data_path):
    print("Running predictions...")
    predictions = run_prediction(model_path, data_path, output_dir)
    print("Predictions completed!")
else:
    print("Required files not found. Please check paths.")

Running predictions...
⚠️  Target column 'G3' removed from input data

PREDICTION SUMMARY
Model: random_forest_math.pkl
Input data: student-mat.csv
Number of predictions: 395
Prediction range: 0.0 - 19.4
Average prediction: 10.42

First 10 predictions:
  Student 1: 6.27 (rounded: 6)
  Student 2: 5.59 (rounded: 6)
  Student 3: 9.42 (rounded: 9)
  Student 4: 14.20 (rounded: 14)
  Student 5: 9.86 (rounded: 10)
  Student 6: 15.29 (rounded: 15)
  Student 7: 11.47 (rounded: 11)
  Student 8: 5.77 (rounded: 6)
  Student 9: 18.62 (rounded: 19)
  Student 10: 15.64 (rounded: 16)

✅ Predictions saved to: ../results/predictions\predictions_20250927_164831.csv
Predictions completed!


In [35]:

# Find the latest predictions file
files = glob.glob(os.path.join(output_dir, "predictions_*.csv"))
if files:
    latest_file = max(files, key=os.path.getctime)
    pred_df = pd.read_csv(latest_file)
    print(f"Loaded: {latest_file}")
    print("Predictions shape:", pred_df.shape)
    print("\nFirst 10 predictions:")
    display(pred_df.head(10))
else:
    print("No predictions file found in", output_dir)


Loaded: ../results/predictions\predictions_20250927_164831.csv
Predictions shape: (395, 2)

First 10 predictions:


Unnamed: 0,prediction,prediction_rounded
0,6.27,6
1,5.59,6
2,9.42,9
3,14.2,14
4,9.86,10
5,15.29,15
6,11.47,11
7,5.77,6
8,18.62,19
9,15.64,16
