In [1]:
import pandas as pd
import numpy as np


### Load data again & recreate Career dataset

In [2]:
# Placement target: read it and give its column a fixed, known name.
# Assigning a one-element list assumes the CSV holds exactly one column.
placement_target = pd.read_csv("placement_target.csv")
placement_target.columns = ["placed"]

# Salary target: same treatment as the placement frame.
salary_target = pd.read_csv("salary_target.csv")
salary_target.columns = ["salary"]

# Thresholds for the three salary buckets: the 33% and 66% quantiles of
# salary, kept as a two-element array (index 0 = lower cut, 1 = upper cut).
salary_bins = salary_target["salary"].quantile(q=[0.33, 0.66]).to_numpy()

def salary_level(s, bins=None):
    """Bucket a salary into "Low", "Medium" or "High".

    Parameters
    ----------
    s : scalar
        Salary value to classify.
    bins : sequence of two numbers, optional
        Thresholds as ``(low_cut, high_cut)``. When omitted, the
        module-level ``salary_bins`` is used, so existing calls such as
        ``series.apply(salary_level)`` keep working unchanged.

    Returns
    -------
    str or float
        "Low" if ``s <= low_cut``, "Medium" if ``s <= high_cut``, otherwise
        "High". A missing salary (NaN / None / pd.NA) yields NaN.
    """
    # A missing salary fails both comparisons below and would otherwise be
    # silently labelled "High"; propagate the missingness instead.
    if pd.isna(s):
        return float("nan")

    cuts = salary_bins if bins is None else bins
    if s <= cuts[0]:
        return "Low"
    if s <= cuts[1]:
        return "Medium"
    return "High"

# Label every salary with its bucket ("Low" / "Medium" / "High").
salary_target["salary_level"] = salary_target["salary"].apply(salary_level)

# Start the career frame from the salary data with a clean 0..n-1 index.
career_df = salary_target.copy().reset_index(drop=True)

# Draw one placement value per salary row. Sampling without replacement
# raises ValueError when placement_target has fewer rows than career_df,
# so fall back to sampling with replacement only in that case; otherwise
# the draw is the same as before (fixed seed, no replacement).
# NOTE(review): rows are paired at random, so `placed` is not tied to the
# salary on the same row - confirm this synthetic pairing is intended.
n_rows = len(career_df)
sampled_placement = placement_target.sample(
    n=n_rows,
    replace=len(placement_target) < n_rows,
    random_state=42,
)
# Drop the sampled index so values line up positionally with career_df.
career_df["placed"] = sampled_placement["placed"].reset_index(drop=True)

# Define career path rules
def career_path(row):
    """Derive a career-path label from one row's placement and salary bucket.

    ``row`` is any mapping-like object indexable by "placed" and
    "salary_level" (e.g. a DataFrame row).

    Rules:
      * placed == 1 and salary_level == "Low" -> "Higher Studies"
      * placed == 1 with any other level      -> "Job"
      * placed == 0                           -> "Startup"
      * any other ``placed`` value            -> "Job"
    """
    placed = row["placed"]
    if placed == 1:
        # Only a low salary sends a placed student to higher studies.
        return "Higher Studies" if row["salary_level"] == "Low" else "Job"
    return "Startup" if placed == 0 else "Job"

# Compute the career-path label row by row, then attach it as a column.
path_labels = career_df.apply(career_path, axis=1)
career_df["career_path"] = path_labels

# Restrict the frame to the final columns, in their output order.
final_columns = ["placed", "salary", "salary_level", "career_path"]
career_df = career_df.loc[:, final_columns]

# Quick sanity report: dimensions, per-column null counts, label balance.
null_counts = career_df.isna().sum()
path_counts = career_df["career_path"].value_counts()

print("Shape:", career_df.shape)
print("\nMissing Values:", null_counts, sep="\n")
print("\nCareer Path Distribution:", path_counts, sep="\n")

Shape: (570, 4)

Missing Values:
placed          0
salary          0
salary_level    0
career_path     0
dtype: int64

Career Path Distribution:
career_path
Startup           480
Job                57
Higher Studies     33
Name: count, dtype: int64


### Save Cleaned Career Dataset

In [3]:
# =========================================
# Save Cleaned Career Dataset
# =========================================

import os

# Build the output location once so the directory that is created, the file
# that is written and the path that is reported can never drift apart.
# NOTE(review): absolute path (looks Colab-specific) - confirm it is valid
# in the environment where this notebook runs.
output_dir = "/content/data/cleaned"
output_path = os.path.join(output_dir, "career_path_cleaned.csv")

# Create the directory (and any missing parents); no error if it exists.
os.makedirs(output_dir, exist_ok=True)

# Write the dataset without the DataFrame index column.
career_df.to_csv(output_path, index=False)

print("✅ Career Path cleaned dataset saved successfully!")
print(f"Saved at: {output_path}")


✅ Career Path cleaned dataset saved successfully!
Saved at: /content/data/cleaned/career_path_cleaned.csv
