In [2]:
import pandas as pd
import numpy as np


### STEP 1: Feature Engineering for Career Path Recommender

In [3]:
# Load cleaned career dataset
career_df = pd.read_csv("career_path_cleaned.csv")

print("Dataset Shape:", career_df.shape)
print("\nColumns:", career_df.columns.tolist())

print("\nSample Data:")
career_df.head()

Dataset Shape: (570, 4)

Columns: ['placed', 'salary', 'salary_level', 'career_path']

Sample Data:


Unnamed: 0,placed,salary,salary_level,career_path
0,0,60000.0,Low,Startup
1,0,65000.0,Medium,Startup
2,0,58000.0,Low,Startup
3,0,62000.0,Low,Startup
4,1,63000.0,Low,Higher Studies


In [4]:
# =========================================
# Cell 2: Encode Features & Target
# =========================================

from sklearn.preprocessing import LabelEncoder

# Encode salary_level
salary_level_encoder = LabelEncoder()
career_df["salary_level_enc"] = salary_level_encoder.fit_transform(
    career_df["salary_level"]
)

# Encode target career_path
career_path_encoder = LabelEncoder()
career_df["career_path_enc"] = career_path_encoder.fit_transform(
    career_df["career_path"]
)

print("Salary Level Encoding:")
print(dict(zip(
    salary_level_encoder.classes_,
    salary_level_encoder.transform(salary_level_encoder.classes_)
)))

print("\nCareer Path Encoding:")
print(dict(zip(
    career_path_encoder.classes_,
    career_path_encoder.transform(career_path_encoder.classes_)
)))

# Prepare feature matrix and target
X = career_df[["placed", "salary", "salary_level_enc"]]
y = career_df["career_path_enc"]

print("\nFeature Matrix Shape:", X.shape)
print("Target Shape:", y.shape)


Salary Level Encoding:
{'High': np.int64(0), 'Low': np.int64(1), 'Medium': np.int64(2)}

Career Path Encoding:
{'Higher Studies': np.int64(0), 'Job': np.int64(1), 'Startup': np.int64(2)}

Feature Matrix Shape: (570, 3)
Target Shape: (570,)


In [5]:
# =========================================
# Cell 3: Train-Test Split
# =========================================

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train distribution:")
print(y_train.value_counts())

print("\ny_test distribution:")
print(y_test.value_counts())


X_train shape: (456, 3)
X_test shape: (114, 3)
y_train distribution:
career_path_enc
2    384
1     46
0     26
Name: count, dtype: int64

y_test distribution:
career_path_enc
2    96
1    11
0     7
Name: count, dtype: int64


In [6]:
# =========================================
# Save features for model notebook
# =========================================

import os

os.makedirs("/content/data/processed", exist_ok=True)

X_train.to_csv("/content/data/processed/career_X_train.csv", index=False)
X_test.to_csv("/content/data/processed/career_X_test.csv", index=False)
y_train.to_csv("/content/data/processed/career_y_train.csv", index=False)
y_test.to_csv("/content/data/processed/career_y_test.csv", index=False)

print("✅ Career path features saved successfully")


✅ Career path features saved successfully
