In [18]:
import pandas as pd
import os
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error



# Load consolidated file

In [20]:
# Single source of truth for the workbook path, so the status message below
# always names the file that was actually read.
DATA_FILE = 'nirf_engineering_2021_2024.xlsx'

filtered_data = pd.read_excel(DATA_FILE)
colleges_all_years = filtered_data['college_id'].unique()
print(f"Loaded consolidated dataset from {DATA_FILE}")
# NOTE(review): this counts distinct college_id values only; it assumes the
# workbook was already filtered to colleges present in all 4 years - confirm.
print(f"Number of colleges present in all 4 years: {len(colleges_all_years)}")

Loaded consolidated dataset from data.xlsx
Number of colleges present in all 4 years: 81


In [22]:
def estimate_rank(predicted_score, data_2024):
    """Return the rank of the row whose Overall_Score is closest to a score.

    Rows of ``data_2024`` are ordered by ``Overall_Score`` from highest to
    lowest and numbered from 1; the number of the row with the smallest
    absolute difference to ``predicted_score`` is returned. The input frame
    itself is not modified.

    Parameters
    ----------
    predicted_score : float
        Score to place among the existing scores.
    data_2024 : pandas.DataFrame
        Frame with an ``Overall_Score`` column.

    Returns
    -------
    int
        1-based position of the closest-scoring row.
    """
    ranked = (
        data_2024
        .sort_values('Overall_Score', ascending=False)
        .reset_index(drop=True)
        .assign(Rank=lambda frame: frame.index + 1)
    )
    # Absolute gap between every ranked score and the predicted one.
    gap = (ranked['Overall_Score'] - predicted_score).abs()
    return ranked.loc[gap.idxmin(), 'Rank']

In [24]:
# Feature matrix: the five parameter scores plus Delta_Overall, with missing
# values replaced by 0; the target is the overall score.
feature_columns = ['TLR', 'RPC', 'GO', 'OI', 'PR', 'Delta_Overall']
X = filtered_data[feature_columns].fillna(0)
y = filtered_data['Overall_Score']

# Hold out 20% of the rows for evaluation (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit a 100-tree random forest on the training portion.
model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Report the root-mean-squared error on the held-out rows.
y_pred = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"Model RMSE: {rmse:.2f}")

Model RMSE: 1.67


In [56]:
def predict_score_rank(college_id=None, tlr=None, rpc=None, go=None, oi=None, pr=None, delta_overall=None):
    """Predict an overall score and an approximate rank.

    Two modes are supported; ``college_id`` takes precedence when both are
    supplied:

    * ``college_id`` given: the college's 2024 parameter values are projected
      one year ahead with a per-parameter linear trend and passed to ``model``.
    * all of ``tlr``, ``rpc``, ``go``, ``oi`` and ``pr`` given: those values
      are passed to ``model`` directly.

    Relies on the notebook-level names ``filtered_data``, ``model`` and
    ``estimate_rank``. Prints the parameters used for the prediction.

    Parameters
    ----------
    college_id : str, optional
        Identifier matched against ``filtered_data['college_id']``.
    tlr, rpc, go, oi, pr : float, optional
        Custom parameter scores; all five are required in custom mode.
    delta_overall : float, optional
        Custom-mode ``Delta_Overall``; defaults to the mean 2024 value.

    Returns
    -------
    tuple
        ``(predicted_score, estimated_rank, label)`` on success, where
        ``label`` is the institution name or ``"Custom Parameters"``.
        ``(None, None, message)`` when the input is invalid, the college is
        not in the dataset, or the college has no 2024 row.
    """
    params = ['TLR', 'RPC', 'GO', 'OI', 'PR']
    feature_columns = params + ['Delta_Overall']
    has_custom_params = all(v is not None for v in [tlr, rpc, go, oi, pr])

    # Reject unusable input before touching the dataset or the model.
    if not college_id and not has_custom_params:
        return None, None, "Invalid input: Provide either college_id or all parameters (tlr, rpc, go, oi, pr)"

    data_2024 = filtered_data[filtered_data['Year'] == 2024]

    if college_id:
        college_data = filtered_data[filtered_data['college_id'] == college_id]
        # Without these guards the `.values[0]` lookups below raise IndexError;
        # callers expect the (None, None, message) convention instead.
        if college_data.empty:
            return None, None, f"College ID {college_id} not found in the dataset"

        college_2024 = college_data[college_data['Year'] == 2024]
        if college_2024.empty:
            return None, None, f"No 2024 data found for {college_id}"

        avg_delta = college_data['Delta_Overall'].mean()
        print(f"Average Delta_Overall for {college_id} 2021-2024: {avg_delta:.2f}")

        # Yearly trend = (last - first) / 3, i.e. three intervals across four
        # yearly rows. Colleges without exactly four rows get a flat trend.
        history = college_data.sort_values('Year')
        if len(history) == 4:
            trends = {
                param: (history[param].values[-1] - history[param].values[0]) / 3
                for param in params
            }
        else:
            trends = dict.fromkeys(params, 0)

        params_2024 = college_2024[params].values[0]
        params_2025 = [value + trends[param] for param, value in zip(params, params_2024)]

        # Prefer the 2024 delta, then the college's average delta. If both are
        # missing use 0, matching the fillna(0) applied to the training features.
        delta_2024 = college_2024['Delta_Overall'].values[0]
        if pd.isna(delta_2024):
            delta_2024 = avg_delta
        if pd.isna(delta_2024):
            delta_2024 = 0.0

        input_2025 = pd.DataFrame([params_2025 + [delta_2024]], columns=feature_columns)
        predicted_score = model.predict(input_2025)[0]
        estimated_rank = estimate_rank(predicted_score, data_2024)
        institution = college_2024['Institution'].iloc[0]

        print(f"\nDetailed Prediction for {college_id} ({institution}):")
        print("Projected 2025 Parameters:")
        for param, value in zip(params, params_2025):
            print(f"  {param}: {value:.2f}")
        print(f"  Delta_Overall (used): {delta_2024:.2f}")

        return predicted_score, estimated_rank, institution

    # Custom-parameter mode: all five scores were supplied by the caller.
    delta_used = delta_overall if delta_overall is not None else data_2024['Delta_Overall'].mean()

    input_params = pd.DataFrame([[tlr, rpc, go, oi, pr, delta_used]], columns=feature_columns)
    predicted_score = model.predict(input_params)[0]
    estimated_rank = estimate_rank(predicted_score, data_2024)

    print(f"\nDetailed Prediction for Custom Parameters:")
    print("Input Parameters:")
    print(f"  TLR: {tlr:.2f}")
    print(f"  RPC: {rpc:.2f}")
    print(f"  GO: {go:.2f}")
    print(f"  OI: {oi:.2f}")
    print(f"  PR: {pr:.2f}")

    return predicted_score, estimated_rank, "Custom Parameters"


In [58]:
# College to predict for, identified by its college_id.
college_id_manual = 'IR-E-U-0105'
score, rank, institution = predict_score_rank(college_id=college_id_manual)

# When no score comes back, the third element carries the message to show.
if score is None:
    print(institution)
else:
    print(f"Predicted 2025 Score for {college_id_manual} ({institution}): {score:.2f}")
    print(f"Estimated 2025 Rank: {rank}")

Average Delta_Overall for IR-E-U-0105 2021-2024: 0.49

Detailed Prediction for IR-E-U-0105 (Indraprastha Institute of Information Technology):
Projected 2025 Parameters:
  TLR: 54.14
  RPC: 31.63
  GO: 72.95
  OI: 45.99
  PR: 19.01
  Delta_Overall (used): -0.25
Predicted 2025 Score for IR-E-U-0105 (Indraprastha Institute of Information Technology): 47.27
Estimated 2025 Rank: 72


# Predict for manually defined parameter scores

In [62]:
# Manually chosen parameter scores to feed to the model.
tlr_manual = 68.33
rpc_manual = 31.67
go_manual = 71.72
oi_manual = 44.34
pr_manual = 17.57
delta_manual = 0.49

score, rank, institution = predict_score_rank(
    tlr=tlr_manual,
    rpc=rpc_manual,
    go=go_manual,
    oi=oi_manual,
    pr=pr_manual,
    delta_overall=delta_manual,
)

if score is not None:
    print(f"Predicted Score for provided parameters: {score:.2f}")
    print(f"Rank: {rank}")
else:
    # Surface the failure message (third element) instead of dropping it,
    # matching how the college-id prediction cell reports failures.
    print(institution)



Detailed Prediction for Custom Parameters:
Input Parameters:
  TLR: 68.33
  RPC: 31.67
  GO: 71.72
  OI: 44.34
  PR: 17.57
Predicted Score for provided parameters: 49.82
Rank: 63
