In [1]:
import pandas as pd
import numpy as np
import joblib
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sn
import os

In [2]:
# Define file path
PROCESSED_DATA_PATH = '../data/processed/'
PREPROCESSED_DATA_FILE = os.path.join(PROCESSED_DATA_PATH, 'preprocessed_data.csv')

# Load the dataset
df = pd.read_csv(PREPROCESSED_DATA_FILE)

print("✅ Model-ready data loaded successfully.")
display(df.head())

✅ Model-ready data loaded successfully.


Unnamed: 0,Year,Opening Rank,Closing Rank,Is_Female_Only,Institute_encoded,Academic Program Name_encoded,Quota_encoded,Seat Type_encoded,Institute_Type_encoded
0,2018,20319,20319,1,0,3,0,2,0
1,2018,15903,17411,0,0,3,0,2,0
2,2018,54981,56345,1,0,3,0,4,0
3,2018,44634,57812,0,0,3,0,4,0
4,2018,2247,2247,0,0,3,0,5,0


In [3]:
# Define features (X) and target (y)
# X contains all our clues. We drop the ranks themselves.
X = df.drop(['Opening Rank', 'Closing Rank'], axis=1)
# y is what we want to predict.
y = df['Opening Rank']

In [4]:
# --- Train-Test Split ---
# We'll train the model on data up to 2024 and test it on 2025 data.
# This simulates a real-world scenario where we predict for a future year.
X_train = X[X['Year'] < 2024]
y_train = y[X['Year'] < 2024]

X_test = X[X['Year'] == 2024]
y_test = y[X['Year'] == 2024]

In [5]:
print(f"Training data shape: {X_train.shape}")
print(f"Testing data shape:  {X_test.shape}")

Training data shape: (54358, 7)
Testing data shape:  (11462, 7)


In [6]:
# Initialize the LightGBM Regressor model
# These parameters are a good starting point.
lgbm = lgb.LGBMRegressor(
    objective='regression_l1', # Use L1 loss, which is robust for this kind of data
    n_estimators=1000,         # Number of "decision trees" to build
    learning_rate=0.05,
    num_leaves=31,
    max_depth=-1,
    random_state=42,           # For reproducible results
    n_jobs=-1                  # Use all available CPU cores
)

# Train the model!
print("🚀 Training the LightGBM model...")
lgbm.fit(X_train, y_train)
print("✅ Model training complete.")


🚀 Training the LightGBM model...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001058 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 357
[LightGBM] [Info] Number of data points in the train set: 54358, number of used features: 7
[LightGBM] [Info] Start training from score 3723.500000
✅ Model training complete.


In [7]:
# Make predictions on the test data (year 2025)
predictions = lgbm.predict(X_test)

# Calculate performance metrics
mae = mean_absolute_error(y_test, predictions)
r2 = r2_score(y_test, predictions)

print(f"--- Model Performance on 2025 Data ---")
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"R-squared (R²): {r2:.2f}")

# MAE: This means our model's predictions are, on average, off by about {mae:.0f} ranks.
# R²: Our model can explain about {r2:.0%} of the variation in the closing ranks.


--- Model Performance on 2025 Data ---
Mean Absolute Error (MAE): 4348.61
R-squared (R²): 0.61


In [8]:
# Create a scatter plot of actual vs. predicted ranks
plt.figure(figsize=(10, 10))
sns.scatterplot(x=y_test, y=predictions, alpha=0.3)
plt.plot([0, y_test.max()], [0, y_test.max()], color='red', linestyle='--', lw=2) # The perfect prediction line
plt.title('Actual vs. Predicted Closing Ranks for 2025', fontsize=16)
plt.xlabel('Actual Closing Rank', fontsize=12)
plt.ylabel('Predicted Closing Rank', fontsize=12)
plt.show()

NameError: name 'sns' is not defined

<Figure size 1000x1000 with 0 Axes>

In [9]:
# Define the path to save the model
MODELS_DIR = '../models/'
if not os.path.exists(MODELS_DIR):
    os.makedirs(MODELS_DIR)

MODEL_PATH = os.path.join(MODELS_DIR, 'opening_rank_model.joblib')

# Save the model
joblib.dump(lgbm, MODEL_PATH)

print(f"🎉 Model saved successfully to '{MODEL_PATH}'")

🎉 Model saved successfully to '../models/opening_rank_model.joblib'
