In [1]:
import pandas as pd
import numpy as np

### STEP 1: Setup + Load Feature Data

In [2]:
# Read the pre-built feature/target CSVs for both modelling tasks.
# Paths are relative, so the files must sit in the notebook's working directory.

# Placement task: features + target
X_placement, y_placement = (
    pd.read_csv("placement_features.csv"),
    pd.read_csv("placement_target.csv"),
)

# Salary task: features + target
X_salary, y_salary = (
    pd.read_csv("salary_features.csv"),
    pd.read_csv("salary_target.csv"),
)

# Quick sanity check: the feature frame and target frame of each task
# are expected to report the same number of rows.
print("Placement data shape:", X_placement.shape, y_placement.shape)
print("Salary data shape:", X_salary.shape, y_salary.shape)


Placement data shape: (10000, 12) (10000, 1)
Salary data shape: (570, 13) (570, 1)


### STEP 2: Train–Test Split + Preprocessing Pipeline

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# ---------- SPLIT ----------

# Placement split: stratified on the target so the train and test partitions
# keep the same class proportions.
X_train_p, X_test_p, y_train_p, y_test_p = train_test_split(
    X_placement, y_placement, test_size=0.2, random_state=42, stratify=y_placement
)

# Salary split: plain random 80/20 split (no stratification is requested).
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(
    X_salary, y_salary, test_size=0.2, random_state=42
)

# ---------- PREPROCESSING ----------


def split_columns_by_dtype(frame):
    """Partition the columns of ``frame`` by dtype.

    Parameters
    ----------
    frame : pandas.DataFrame
        Feature table to inspect.

    Returns
    -------
    tuple[list, list]
        ``(categorical_cols, numeric_cols)``: columns whose dtype is
        ``object``/``category``, and every remaining column, respectively.
    """
    categorical = frame.select_dtypes(include=['object', 'category']).columns.tolist()
    numeric = frame.select_dtypes(exclude=['object', 'category']).columns.tolist()
    return categorical, numeric


def build_preprocessor(numeric_cols, categorical_cols):
    """Build an unfitted ColumnTransformer for the given column lists.

    Numeric columns go through ``StandardScaler``; categorical columns go
    through ``OneHotEncoder(handle_unknown='ignore')``, so categories that
    were not present at fit time do not raise at transform time.
    """
    return ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numeric_cols),
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
        ]
    )


# Identify column types automatically (from the training partition only)
categorical_cols_p, numeric_cols_p = split_columns_by_dtype(X_train_p)
categorical_cols_s, numeric_cols_s = split_columns_by_dtype(X_train_s)

# NOTE(review): in the recorded run the salary categorical columns include
# 'name' and 'college_name'. These look like identifier-style, high-cardinality
# fields; confirm they are meant to be one-hot encoded as model features.

# Preprocessors (one per task, both unfitted at this point)
preprocess_p = build_preprocessor(numeric_cols_p, categorical_cols_p)
preprocess_s = build_preprocessor(numeric_cols_s, categorical_cols_s)

print("Placement split:", X_train_p.shape, X_test_p.shape)
print("Salary split:", X_train_s.shape, X_test_s.shape)
print("Placement categorical cols:", categorical_cols_p)
print("Salary categorical cols:", categorical_cols_s)


Placement split: (8000, 12) (2000, 12)
Salary split: (456, 13) (114, 13)
Placement categorical cols: ['internship_experience', 'cgpa_bucket']
Salary categorical cols: ['name', 'gender', 'degree', 'stream', 'college_name', 'placement_status']


### STEP 3: Placement Model – Baseline (Logistic Regression)

In [4]:
from sklearn.base import clone
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score

# ---------- PIPELINE WITH LOGISTIC REGRESSION ----------

# Pipeline stores the step objects it is given without copying them, so passing
# `preprocess_p` directly would make every pipeline built from it share (and
# refit) one transformer instance. clone() gives this baseline its own
# unfitted copy with identical parameters.
placement_model = Pipeline(steps=[
    ('preprocess', clone(preprocess_p)),
    ('model', LogisticRegression(max_iter=1000, class_weight='balanced'))
])

# Train model (the target is a one-column DataFrame, so flatten it to 1-D)
placement_model.fit(X_train_p, y_train_p.values.ravel())

# Predictions: hard labels, plus the probability column at index 1 for ROC-AUC
y_pred_p = placement_model.predict(X_test_p)
y_prob_p = placement_model.predict_proba(X_test_p)[:, 1]

# Evaluation on the held-out placement test split
print("Accuracy:", accuracy_score(y_test_p, y_pred_p))
print("\nClassification Report:\n", classification_report(y_test_p, y_pred_p))
print("ROC-AUC:", roc_auc_score(y_test_p, y_prob_p))


Accuracy: 0.8785

Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.88      0.92      1668
           1       0.59      0.87      0.70       332

    accuracy                           0.88      2000
   macro avg       0.78      0.88      0.81      2000
weighted avg       0.91      0.88      0.89      2000

ROC-AUC: 0.9512564647077519


### STEP 4: Advanced Placement Model (Random Forest)

In [5]:
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier

# Use a fresh, unfitted copy of the placement preprocessor. Pipeline does not
# copy its steps, so reusing `preprocess_p` itself would make this pipeline and
# any other pipeline built from it share a single transformer instance that is
# refit whenever either pipeline is fitted.
rf_model = Pipeline(steps=[
    ('preprocess', clone(preprocess_p)),
    ('model', RandomForestClassifier(
        n_estimators=200,
        max_depth=10,
        random_state=42,
        class_weight='balanced'
    ))
])

# Train (the target is a one-column DataFrame, so flatten it to 1-D)
rf_model.fit(X_train_p, y_train_p.values.ravel())

# Predict: hard labels, plus the probability column at index 1 for ROC-AUC
y_pred_rf = rf_model.predict(X_test_p)
y_prob_rf = rf_model.predict_proba(X_test_p)[:, 1]

# Evaluate
# NOTE(review): the recorded run scores ~1.0 on accuracy and ROC-AUC for this
# model. Scores that close to perfect are worth a target-leakage check on the
# feature columns before the model is relied on.
print("Random Forest Accuracy:", accuracy_score(y_test_p, y_pred_rf))
print("\nClassification Report:\n", classification_report(y_test_p, y_pred_rf))
print("ROC-AUC:", roc_auc_score(y_test_p, y_prob_rf))


Random Forest Accuracy: 0.9985

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      1668
           1       1.00      0.99      1.00       332

    accuracy                           1.00      2000
   macro avg       1.00      1.00      1.00      2000
weighted avg       1.00      1.00      1.00      2000

ROC-AUC: 1.0


### STEP 5: Salary Model – Baseline (Linear Regression)

In [6]:
from sklearn.base import clone
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

# ---------- LINEAR REGRESSION PIPELINE ----------

# Pipeline keeps the exact step objects it receives, so handing it
# `preprocess_s` directly would share one transformer instance with every
# other pipeline built from it. clone() yields an independent unfitted copy.
salary_lr_model = Pipeline(steps=[
    ('preprocess', clone(preprocess_s)),
    ('model', LinearRegression())
])

# Train (the target is a one-column DataFrame, so flatten it to 1-D)
salary_lr_model.fit(X_train_s, y_train_s.values.ravel())

# Predict
y_pred_s = salary_lr_model.predict(X_test_s)

# Evaluation: RMSE is taken as the square root of the mean squared error
rmse = np.sqrt(mean_squared_error(y_test_s, y_pred_s))
mae = mean_absolute_error(y_test_s, y_pred_s)
r2 = r2_score(y_test_s, y_pred_s)

print("Salary Model (Linear Regression)")
print("RMSE:", rmse)
print("MAE:", mae)
print("R² Score:", r2)


Salary Model (Linear Regression)
RMSE: 39.191261016263866
MAE: 6.2596126137645465
R² Score: 0.9996982924925436


### STEP 6: Advanced Salary Model (Random Forest Regressor)

In [7]:
from sklearn.base import clone
from sklearn.ensemble import RandomForestRegressor

# Give this pipeline its own unfitted copy of the salary preprocessor.
# Pipeline does not copy its steps, so reusing `preprocess_s` itself would
# leave this pipeline and any other pipeline built from it pointing at the
# same transformer object, which is refit whenever either one is fitted.
salary_rf_model = Pipeline(steps=[
    ('preprocess', clone(preprocess_s)),
    ('model', RandomForestRegressor(
        n_estimators=200,
        max_depth=12,
        random_state=42
    ))
])

# Train (the target is a one-column DataFrame, so flatten it to 1-D)
salary_rf_model.fit(X_train_s, y_train_s.values.ravel())

# Predict
y_pred_rf_s = salary_rf_model.predict(X_test_s)

# Evaluation: same three metrics as the linear baseline, for direct comparison
rmse_rf = np.sqrt(mean_squared_error(y_test_s, y_pred_rf_s))
mae_rf = mean_absolute_error(y_test_s, y_pred_rf_s)
r2_rf = r2_score(y_test_s, y_pred_rf_s)

print("Salary Model (Random Forest)")
print("RMSE:", rmse_rf)
print("MAE:", mae_rf)
print("R² Score:", r2_rf)


Salary Model (Random Forest)
RMSE: 100.68515282801134
MAE: 9.517543859649123
R² Score: 0.9980086916763652


### STEP 7: Save Final Models

In [11]:
import os
import joblib

# Create models directory in Colab local storage
MODELS_PATH = "/content/models/"
os.makedirs(MODELS_PATH, exist_ok=True)

# Build the target file paths with os.path.join so they stay correct whether
# or not MODELS_PATH ends with a path separator.
placement_model_file = os.path.join(MODELS_PATH, "placement_model.pkl")
salary_model_file = os.path.join(MODELS_PATH, "salary_model.pkl")

# Save models locally in Colab
# NOTE(review): this persists `placement_model` (the logistic-regression
# baseline) and `salary_rf_model` (the random-forest regressor); `rf_model`
# and `salary_lr_model` are not saved. Confirm this is the intended final pair.
joblib.dump(placement_model, placement_model_file)
joblib.dump(salary_rf_model, salary_model_file)

print("Models saved successfully!")
# Derive the reported directory from MODELS_PATH so the message cannot drift
# from the location actually written to.
print(f"Files in {MODELS_PATH.rstrip('/') or MODELS_PATH}:")
print(os.listdir(MODELS_PATH))


Models saved successfully!
Files in /content/models:
['salary_model.pkl', 'placement_model.pkl']
