# **MACHINE LEARNING**

# **ASSEIGNMENT NO 02**

**Student Name:** KALE CHAITANYA PRASAD

**Student ID:** 2025AA05377

**Date:** 10.02.2026


# Student Academic Placement Performance Dataset

In [55]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import (
    accuracy_score, roc_auc_score, precision_score,
    recall_score, f1_score, matthews_corrcoef
)

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

from xgboost import XGBClassifier

In [56]:
df = pd.read_csv("E:\\Chaitanya Personal Data\\M.Tech Material\\Study Material\\ML\\student_academic_placement_performance_dataset.csv",index_col=0)
print("Shape of Data Set is:", df.shape)
df.head()

Shape of Data Set is: (5000, 17)


Unnamed: 0_level_0,gender,ssc_percentage,hsc_percentage,degree_percentage,cgpa,entrance_exam_score,technical_skill_score,soft_skill_score,internship_count,live_projects,work_experience_months,certifications,attendance_percentage,backlogs,extracurricular_activities,placement_status,salary_package_lpa
student_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1,Male,53,79,56,8.87,50,92,90,1,3,23,4,91,4,Yes,0,0.0
2,Female,56,54,59,6.78,61,51,99,1,0,6,5,87,3,No,0,0.0
3,Male,94,83,83,7.92,91,93,84,1,1,10,2,81,2,No,1,6.92
4,Male,84,71,87,6.57,85,60,72,4,2,14,5,87,3,No,0,0.0
5,Male,58,88,74,9.01,73,52,88,1,2,20,0,60,1,No,0,0.0


In [57]:
le = LabelEncoder()
for col in df.select_dtypes(include="object").columns:
    df[col] = le.fit_transform(df[col])


In [58]:
X1 = df.drop(
    columns=["placement_status", "salary_package_lpa"],
    errors="ignore"
)

y1 = df["placement_status"]

scaler = StandardScaler()
X1 = scaler.fit_transform(X1)

X1_train, X1_test, y1_train, y1_test = train_test_split(
    X1, y1,
    test_size=0.2,
    random_state=42,
    stratify=y1
)

In [59]:
## SALARY RANGE CLASSIFICATION (PLACED ONLY)
placed_df = df[df["placement_status"] == 1].copy()

# Define salary bins (LPA)
salary_bins = [0, 3, 6, 10, np.inf]

placed_df["salary_range"] = pd.cut(
    placed_df["salary_package_lpa"],
    bins=salary_bins
)

# Re-encode salary classes to 0,1,2,...
salary_encoder = LabelEncoder()
placed_df["salary_class"] = salary_encoder.fit_transform(
    placed_df["salary_range"]
)

X2 = placed_df.drop(
    columns=[
        "placement_status",
        "salary_package_lpa",
        "salary_range",
        "salary_class"
    ]
)

y2 = placed_df["salary_class"]

X2 = scaler.fit_transform(X2)

X2_train, X2_test, y2_train, y2_test = train_test_split(
    X2, y2,
    test_size=0.2,
    random_state=42,
    stratify=y2
)

In [60]:
## MODEL DEFINITIONS
binary_models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "KNN": KNeighborsClassifier(n_neighbors=5),
    "Naive Bayes": GaussianNB(),
    "Random Forest": RandomForestClassifier(
        n_estimators=200,
        random_state=42
    ),
    "XGBoost": XGBClassifier(
        objective="binary:logistic",
        eval_metric="logloss",
        random_state=42
    )
}

In [61]:
multiclass_models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, solver="lbfgs"),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "KNN": KNeighborsClassifier(n_neighbors=5),
    "Naive Bayes": GaussianNB(),
    "Random Forest": RandomForestClassifier(
        n_estimators=200,
        random_state=42
    ),
    "XGBoost": XGBClassifier(
        objective="multi:softprob",
        num_class=len(np.unique(y2)),
        eval_metric="mlogloss",
        random_state=42
    )
}

In [62]:
## EVALUATION FUNCTION
def evaluate_models(models, X_train, X_test, y_train, y_test, task_type):
    results = []

    for name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        if task_type == "binary":
            y_prob = model.predict_proba(X_test)[:, 1]

            results.append({
                "Model": name,
                "Accuracy": accuracy_score(y_test, y_pred),
                "AUC": roc_auc_score(y_test, y_prob),
                "Precision": precision_score(y_test, y_pred),
                "Recall": recall_score(y_test, y_pred),
                "F1 Score": f1_score(y_test, y_pred),
                "MCC": matthews_corrcoef(y_test, y_pred)
            })

        else:  # multiclass
            y_prob = model.predict_proba(X_test)

            results.append({
                "Model": name,
                "Accuracy": accuracy_score(y_test, y_pred),
                "AUC": roc_auc_score(
                    y_test, y_prob, multi_class="ovr"
                ),
                "Precision": precision_score(
                    y_test, y_pred, average="weighted"
                ),
                "Recall": recall_score(
                    y_test, y_pred, average="weighted"
                ),
                "F1 Score": f1_score(
                    y_test, y_pred, average="weighted"
                ),
                "MCC": matthews_corrcoef(y_test, y_pred)
            })

    return pd.DataFrame(results)

In [63]:
placement_results = evaluate_models(
    binary_models,
    X1_train, X1_test,
    y1_train, y1_test,
    task_type="binary"
)

salary_results = evaluate_models(
    multiclass_models,
    X2_train, X2_test,
    y2_train, y2_test,
    task_type="multiclass"
)

In [64]:
print("\nPLACEMENT STATUS PREDICTION RESULTS\n")
print(placement_results.sort_values("F1 Score", ascending=False))

print("\nSALARY RANGE PREDICTION RESULTS\n")
print(salary_results.sort_values("F1 Score", ascending=False))


PLACEMENT STATUS PREDICTION RESULTS

                 Model  Accuracy       AUC  Precision    Recall  F1 Score  \
1        Decision Tree     1.000  1.000000   1.000000  1.000000  1.000000   
5              XGBoost     1.000  1.000000   1.000000  1.000000  1.000000   
4        Random Forest     1.000  1.000000   1.000000  1.000000  1.000000   
3          Naive Bayes     0.937  0.984707   0.958333  0.664740  0.784983   
2                  KNN     0.895  0.905271   0.750000  0.589595  0.660194   
0  Logistic Regression     0.891  0.935473   0.719178  0.606936  0.658307   

        MCC  
1  1.000000  
5  1.000000  
4  1.000000  
3  0.766704  
2  0.605219  
0  0.597043  

SALARY RANGE PREDICTION RESULTS

                 Model  Accuracy       AUC  Precision    Recall  F1 Score  \
1        Decision Tree  0.379310  0.521191   0.380300  0.379310  0.379667   
4        Random Forest  0.431034  0.529938   0.456586  0.431034  0.370561   
5              XGBoost  0.362069  0.505555   0.350607  0.36