# Phase 3: Data Preprocessing & Feature Engineering

## Purpose:
To clean, transform, encode, and prepare the dataset for machine learning model training.

## Goals:
- Prepare numerical features
- Encode categorical labels
- Scale features
- Split data into train/test sets
- Build ML-ready dataset


In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler


In [4]:
df = pd.read_csv("data/students_dataset.csv")
df.head()

Unnamed: 0,CGPA,Backlogs,Internship_Count,Coding_Rating,Aptitude_Score,Target_Class
0,9.45,0,0,1880,38,Unplaced
1,9.29,0,2,1154,87,Mass_Recruiter
2,5.65,0,1,1812,92,Unplaced
3,6.51,0,3,1967,43,Unplaced
4,6.85,0,0,1364,60,Mass_Recruiter


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   CGPA              1000 non-null   float64
 1   Backlogs          1000 non-null   int64  
 2   Internship_Count  1000 non-null   int64  
 3   Coding_Rating     1000 non-null   int64  
 4   Aptitude_Score    1000 non-null   int64  
 5   Target_Class      1000 non-null   object 
dtypes: float64(1), int64(4), object(1)
memory usage: 47.0+ KB


In [10]:
df.isnull().sum()

CGPA                0
Backlogs            0
Internship_Count    0
Coding_Rating       0
Aptitude_Score      0
Target_Class        0
dtype: int64

In [12]:
X = df.drop("Target_Class", axis=1)
y = df["Target_Class"]

X.head(), y.head()

(   CGPA  Backlogs  Internship_Count  Coding_Rating  Aptitude_Score
 0  9.45         0                 0           1880              38
 1  9.29         0                 2           1154              87
 2  5.65         0                 1           1812              92
 3  6.51         0                 3           1967              43
 4  6.85         0                 0           1364              60,
 0          Unplaced
 1    Mass_Recruiter
 2          Unplaced
 3          Unplaced
 4    Mass_Recruiter
 Name: Target_Class, dtype: object)

In [14]:
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

class_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
class_mapping


{'Mass_Recruiter': 0, 'Tier_1': 1, 'Tier_2': 2, 'Unplaced': 3}

In [16]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_scaled[:5]

array([[ 1.4704326 , -0.43318409, -1.06363933,  0.98300502, -1.55778222],
       [ 1.35365108, -0.43318409,  0.98971845, -0.81638089,  1.04454046],
       [-1.30312848, -0.43318409, -0.03696044,  0.8144675 ,  1.31008359],
       [-0.67542781, -0.43318409,  2.01639735,  1.19863391, -1.29223909],
       [-0.42726708, -0.43318409, -1.06363933, -0.29589736, -0.38939245]])

In [18]:
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y_encoded,
    test_size=0.2,
    random_state=42,
    stratify=y_encoded
)

X_train.shape, X_test.shape


((800, 5), (200, 5))

In [20]:
import joblib

joblib.dump(scaler, "models/scaler.pkl")
joblib.dump(label_encoder, "models/label_encoder.pkl")

print("Scaler and Label Encoder saved.")


Scaler and Label Encoder saved.


In [22]:
np.save("data/X_train.npy", X_train)
np.save("data/X_test.npy", X_test)
np.save("data/y_train.npy", y_train)
np.save("data/y_test.npy", y_test)

print("Processed datasets saved.")


Processed datasets saved.
