In [1]:
import pandas as pd
import numpy as np
import os
import joblib

In [2]:
# Define file paths
PROCESSED_DATA_PATH = '../data/processed/'
COMBINED_DATA_FILE = os.path.join(PROCESSED_DATA_PATH, 'combined_counseling_data.csv')

# Load the dataset.
# low_memory=False makes pandas infer every column's dtype from the whole file
# instead of chunk by chunk, so a column cannot end up holding a mix of parsed
# numbers and raw strings.
# NOTE(review): the recorded output of this cell echoes the read_csv line, which
# looks like the tail of a mixed-type DtypeWarning -- confirm on the next run
# that the warning is gone.
df = pd.read_csv(COMBINED_DATA_FILE, low_memory=False)

  df = pd.read_csv(COMBINED_DATA_FILE)


In [3]:
# Report how many rows the combined dataset contains.
loaded_row_count = len(df)
print(f"Loaded combined dataset with {loaded_row_count} rows.")

Loaded combined dataset with 424060 rows.


In [4]:
# A "seat" is one distinct combination of the columns below.
grouping_cols = [
    'Year',
    'Institute',
    'Academic Program Name',
    'Quota',
    'Seat Type',
    'Gender',
]

# Within every seat group, idxmax gives the index label of the row whose
# 'Round' value is largest (the first such row if several tie).
rounds_by_seat = df.groupby(grouping_cols)['Round']
idx_final_round = rounds_by_seat.idxmax()

# Pull exactly those rows out of the original frame; .copy() detaches the
# result so later column assignments do not touch `df`.
df_final = df.loc[idx_final_round].copy()

final_row_count = len(df_final)
print(f"Data filtered to final rounds only. We now have {final_row_count} rows.")

Data filtered to final rounds only. We now have 67369 rows.


In [6]:
# 1. Correct Data Types
# Turn both rank columns into integers.
rank_cols = ['Opening Rank', 'Closing Rank']

# Anything that cannot be parsed as a number becomes NaN (errors='coerce').
for rank_col in rank_cols:
    df_final[rank_col] = pd.to_numeric(df_final[rank_col], errors='coerce')

# Rows left with a NaN in either rank column are removed from df_final in place.
df_final.dropna(subset=rank_cols, inplace=True)

# With the NaNs gone, the columns can be cast to int.
df_final[rank_cols] = df_final[rank_cols].astype(int)


In [7]:
# 2. Engineer New Features 🧠
# a. Create Institute Type
def get_institute_type(institute):
    """Classify an institute name as 'IIT', 'NIT', 'IIIT' or 'GFTI'.

    The name is checked, in order, for a fixed set of case-sensitive
    substrings; the label of the first substring found is returned.
    Names matching none of them are labelled 'GFTI'.

    Args:
        institute: Institute name as a string.

    Returns:
        One of the strings 'IIT', 'NIT', 'IIIT' or 'GFTI'.
    """
    # (substring to look for, label to return) -- checked top to bottom.
    name_markers = (
        ('Indian Institute of Technology', 'IIT'),
        ('National Institute of Technology', 'NIT'),
        ('Indian Institute of Information Technology', 'IIIT'),
    )
    for marker, label in name_markers:
        if marker in institute:
            return label
    return 'GFTI'

# Label every row with the institute category derived from its name.
df_final['Institute_Type'] = df_final['Institute'].map(get_institute_type)

# b. Binary flag: 1 when the Gender text contains 'Female-only', otherwise 0.
df_final['Is_Female_Only'] = df_final['Gender'].map(
    lambda gender: int('Female-only' in gender)
)


In [8]:
# Confirm the new columns exist and show them next to their source columns.
print("✅ New features 'Institute_Type' and 'Is_Female_Only' created.")
preview_cols = ['Institute', 'Institute_Type', 'Gender', 'Is_Female_Only']
display(df_final.loc[:, preview_cols].head())

✅ New features 'Institute_Type' and 'Is_Female_Only' created.


Unnamed: 0,Institute,Institute_Type,Gender,Is_Female_Only
103215,"Assam University, Silchar",GFTI,Female-only (including Supernumerary),1
103214,"Assam University, Silchar",GFTI,Gender-Neutral,0
103213,"Assam University, Silchar",GFTI,Female-only (including Supernumerary),1
103212,"Assam University, Silchar",GFTI,Gender-Neutral,0
96505,"Assam University, Silchar",GFTI,Gender-Neutral,0


In [9]:
from sklearn.preprocessing import LabelEncoder

# Text columns that get an integer-coded companion column.
categorical_cols = [
    'Institute',
    'Academic Program Name',
    'Quota',
    'Seat Type',
    'Institute_Type',
]

# Maps each column name to its encoder so the same mapping can be reused later.
encoders = dict()

print("Encoding categorical features...")

Encoding categorical features...


In [10]:
for col in categorical_cols:
    # One independent encoder per column: fit it on the column's values and
    # store the resulting integer codes in '<column>_encoded'.
    le = LabelEncoder()
    encoded_values = le.fit_transform(df_final[col])
    df_final[f"{col}_encoded"] = encoded_values

    # Keep the fitted encoder, keyed by the original column name.
    encoders[col] = le
    print(f"  - '{col}' encoded successfully.")

  - 'Institute' encoded successfully.
  - 'Academic Program Name' encoded successfully.
  - 'Quota' encoded successfully.
  - 'Seat Type' encoded successfully.
  - 'Institute_Type' encoded successfully.


In [11]:
# Persist the fitted encoders next to the processed data so the same
# label-to-integer mapping can be applied to new inputs at prediction time.
ENCODERS_PATH = os.path.join(PROCESSED_DATA_PATH, 'encoders.joblib')
joblib.dump(value=encoders, filename=ENCODERS_PATH)

encoders_saved_message = f"\n✅ All encoders saved to '{ENCODERS_PATH}'"
print(encoders_saved_message)


✅ All encoders saved to '../data/processed/encoders.joblib'


In [12]:
# Columns kept for modelling: the numeric fields plus the integer-coded
# versions of the text columns (the original text columns are left out).
numeric_feature_cols = ['Year', 'Opening Rank', 'Closing Rank', 'Is_Female_Only']
encoded_feature_cols = [
    'Institute_encoded',
    'Academic Program Name_encoded',
    'Quota_encoded',
    'Seat Type_encoded',
    'Institute_Type_encoded',
]
model_cols = numeric_feature_cols + encoded_feature_cols
df_model_ready = df_final[model_cols]

In [13]:
# Write the model-ready frame to CSV (without the index column) and preview it.
PREPROCESSED_DATA_FILE = os.path.join(PROCESSED_DATA_PATH, 'preprocessed_data.csv')
df_model_ready.to_csv(path_or_buf=PREPROCESSED_DATA_FILE, index=False)

completion_message = f"\n🎉 Preprocessing complete! Model-ready data saved to '{PREPROCESSED_DATA_FILE}'"
print(completion_message)
display(df_model_ready.head())


🎉 Preprocessing complete! Model-ready data saved to '../data/processed/preprocessed_data.csv'


Unnamed: 0,Year,Opening Rank,Closing Rank,Is_Female_Only,Institute_encoded,Academic Program Name_encoded,Quota_encoded,Seat Type_encoded,Institute_Type_encoded
103215,2018,20319,20319,1,0,3,0,2,0
103214,2018,15903,17411,0,0,3,0,2,0
103213,2018,54981,56345,1,0,3,0,4,0
103212,2018,44634,57812,0,0,3,0,4,0
96505,2018,2247,2247,0,0,3,0,5,0
