In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

### CELL 1: Data Load & Scope Understanding (EDA Start)

In [3]:


# Cleaned placement tables: feature matrix and its target column
placement_features, placement_target = (
    pd.read_csv(csv_name)
    for csv_name in ("placement_features.csv", "placement_target.csv")
)

# Cleaned salary tables: feature matrix and its target column
salary_features, salary_target = (
    pd.read_csv(csv_name)
    for csv_name in ("salary_features.csv", "salary_target.csv")
)

# Report the (rows, columns) shape of every loaded table
for label, frame in (
    ("Placement Features:", placement_features),
    ("Placement Target:", placement_target),
    ("\nSalary Features:", salary_features),
    ("Salary Target:", salary_target),
):
    print(label, frame.shape)


Placement Features: (10000, 12)
Placement Target: (10000, 1)

Salary Features: (570, 13)
Salary Target: (570, 1)


### CELL 2: Career Signals Creation (EDA – Logic Building)

In [4]:
# =========================================
# Cell 2: Create Career Decision Signals
# =========================================

# Placement signal.
# rename() is used instead of assigning to `.columns` so this cell stays
# re-runnable: a bare `.columns = [...]` raises a length-mismatch ValueError
# once a frame has gained extra columns (e.g. `salary_level` further down).
placement_target = placement_target.rename(
    columns={placement_target.columns[0]: "placed"}
)

print("Placement Distribution:")
print(placement_target["placed"].value_counts())

# Salary signal (same idempotent rename of the first column)
salary_target = salary_target.rename(
    columns={salary_target.columns[0]: "salary"}
)

print("\nSalary Statistics:")
print(salary_target["salary"].describe())

# Create salary level buckets (EDA step): cut-offs at the 33rd and 66th
# percentile of the observed salaries
salary_bins = salary_target["salary"].quantile([0.33, 0.66]).values

def salary_level(s, bins=None):
    """Bucket a salary into "Low", "Medium" or "High".

    Parameters
    ----------
    s : float
        Salary value to classify.
    bins : sequence of two numbers, optional
        Ascending ``(low_cutoff, high_cutoff)`` thresholds. When omitted, the
        notebook-level ``salary_bins`` array is used.

    Returns
    -------
    str or float
        "Low" if ``s <= bins[0]``, "Medium" if ``s <= bins[1]``, otherwise
        "High". NaN is returned when the salary itself is missing.
    """
    # A missing salary fails every `<=` comparison and would otherwise fall
    # through to "High"; keep it missing instead of inventing a label.
    if pd.isna(s):
        return np.nan

    cutoffs = salary_bins if bins is None else bins
    if s <= cutoffs[0]:
        return "Low"
    if s <= cutoffs[1]:
        return "Medium"
    return "High"

# Label every salary with its bucket and store it next to the raw value
level_labels = salary_target["salary"].apply(salary_level)
salary_target["salary_level"] = level_labels

# Show how many rows landed in each bucket
level_counts = salary_target["salary_level"].value_counts()
print("\nSalary Level Distribution:")
print(level_counts)


Placement Distribution:
placed
0    8341
1    1659
Name: count, dtype: int64

Salary Statistics:
count      570.000000
mean     64442.105263
std       2255.471564
min      57000.000000
25%      63000.000000
50%      65000.000000
75%      66000.000000
max      68000.000000
Name: salary, dtype: float64

Salary Level Distribution:
salary_level
Medium    272
Low       194
High      104
Name: count, dtype: int64


### CELL 3: Define Career Path Logic (EDA → Label Design)

In [5]:
# =========================================
# Cell 3: Define Career Path (Rule-Based)
# =========================================

# Base table: the salary rows (salary + salary_level) on a fresh 0..n-1 index
career_df = salary_target.reset_index(drop=True).copy()

# Draw as many placement rows as there are salary rows (fixed seed for
# reproducibility) and attach their `placed` flag by position.
# NOTE(review): the two tables are paired purely by row position after a
# random draw; no shared student key is visible here — confirm this pairing
# is intended before interpreting the labels.
placement_sample = placement_target.sample(n=len(career_df), random_state=42)
career_df["placed"] = placement_sample["placed"].reset_index(drop=True)

def career_path(row):
    """Map one row's placement flag and salary bucket to a career path label.

    Rules:
      * ``placed == 0``                          -> "Startup"
      * ``placed == 1`` and salary level "Low"   -> "Higher Studies"
      * everything else (incl. placed + "High"
        and placed + "Medium")                   -> "Job"

    ``row`` must support ``row["placed"]`` and, for placed rows,
    ``row["salary_level"]``.
    """
    # Not placed -> Startup, regardless of salary bucket
    if row["placed"] == 0:
        return "Startup"

    # Placed on a low salary -> Higher Studies; every remaining case -> Job
    placed_on_low_salary = row["placed"] == 1 and row["salary_level"] == "Low"
    return "Higher Studies" if placed_on_low_salary else "Job"

# Evaluate the rule set row by row and store the resulting label
path_labels = career_df.apply(career_path, axis=1)
career_df["career_path"] = path_labels

# Show how many rows received each label
path_counts = career_df["career_path"].value_counts()
print("Career Path Distribution:")
print(path_counts)


Career Path Distribution:
career_path
Startup           480
Job                57
Higher Studies     33
Name: count, dtype: int64


### CELL 4: Career Path vs Signals (EDA Validation)

In [6]:
# =========================================
# Cell 4: Validate Career Logic (EDA)
# =========================================

# Cross-tabulate the derived label against each input signal to check that
# the rule set produced the intended combinations
for heading, signal in (
    ("Career Path vs Placement:", "placed"),
    ("\nCareer Path vs Salary Level:", "salary_level"),
):
    print(heading)
    print(pd.crosstab(career_df["career_path"], career_df[signal]))


Career Path vs Placement:
placed            0   1
career_path            
Higher Studies    0  33
Job               0  57
Startup         480   0

Career Path vs Salary Level:
salary_level    High  Low  Medium
career_path                      
Higher Studies     0   33       0
Job               16    0      41
Startup           88  161     231


### CELL 5: Finalize Career Dataset (EDA → Ready for Cleaning)

In [7]:
# =========================================
# Cell 5: Final Career Dataset (EDA Output)
# =========================================

# Keep only the decision signals and the derived label.
# .copy() makes the result an independent frame, so later cleaning steps can
# assign into it without SettingWithCopyWarning and without any tie back to
# career_df.
final_columns = ["placed", "salary", "salary_level", "career_path"]
final_career_df = career_df[final_columns].copy()

print("Final Career Dataset Shape:", final_career_df.shape)
print("\nSample Rows:")
final_career_df.head()


Final Career Dataset Shape: (570, 4)

Sample Rows:


Unnamed: 0,placed,salary,salary_level,career_path
0,0,60000.0,Low,Startup
1,0,65000.0,Medium,Startup
2,0,58000.0,Low,Startup
3,0,62000.0,Low,Startup
4,1,63000.0,Low,Higher Studies
