In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

### STEP 1: Setup + Load Cleaned Data

In [2]:
# Load cleaned datasets for feature engineering

df_college_students = pd.read_csv("college_students_clean.csv")
df_job_placement    = pd.read_csv("job_placement_clean.csv")
df_placement_basic  = pd.read_csv("placement_basic_clean.csv")
salary_job_df       = pd.read_csv("salary_job_clean.csv")
salary_basic_df     = pd.read_csv("salary_basic_clean.csv")

print("Datasets loaded for feature engineering.")
print(df_college_students.shape, df_job_placement.shape)


Datasets loaded for feature engineering.
(10000, 10) (700, 10)


### STEP 2: Placement Features Creation (Core ML Features)

In [3]:
# ---------- Placement Feature Engineering ----------

# 1️⃣ CGPA bucket (helps non-linear models + explainability)
df_college_students['cgpa_bucket'] = pd.cut(
    df_college_students['cgpa'],
    bins=[0, 6, 7.5, 10],
    labels=['low', 'medium', 'high']
)

# 2️⃣ Internship experience → binary flag
df_college_students['has_internship'] = df_college_students['internship_experience'].map(
    {'Yes': 1, 'No': 0}
)

# 3️⃣ Academic strength score (simple composite feature)
df_college_students['academic_score'] = (
    df_college_students['prev_sem_result'] * 0.4 +
    df_college_students['academic_performance'] * 0.6
)

# 4️⃣ Skill intensity feature
df_college_students['skill_score'] = (
    df_college_students['communication_skills'] +
    df_college_students['projects_completed'] +
    df_college_students['extra_curricular_score']
)

# Quick check
df_college_students[['cgpa', 'cgpa_bucket', 'has_internship',
                      'academic_score', 'skill_score']].head()


Unnamed: 0,cgpa,cgpa_bucket,has_internship,academic_score,skill_score
0,6.28,medium,0,7.444,20
1,5.37,low,0,7.008,15
2,5.83,low,0,7.544,5
3,5.75,low,1,5.788,8
4,7.69,high,0,7.364,20


### STEP 3: Job Placement + Experience Features

In [4]:
# ---------- Job Placement Feature Engineering ----------

# 1️⃣ Experience level bucket
df_job_placement['experience_level'] = pd.cut(
    df_job_placement['years_of_experience'],
    bins=[-1, 0, 2, 5, 20],
    labels=['fresher', 'junior', 'mid', 'senior']
)

# 2️⃣ Fresher flag
df_job_placement['is_fresher'] = df_job_placement['years_of_experience'].apply(
    lambda x: 1 if x == 0 else 0
)

# 3️⃣ Education level encoding (ordinal idea, not final encoding)
education_map = {
    'High School': 0,
    'Diploma': 1,
    'Bachelor': 2,
    'Masters': 3,
    'PhD': 4
}
if 'degree' in df_job_placement.columns:
    df_job_placement['education_level'] = df_job_placement['degree'].map(education_map)

# Quick check
df_job_placement[['years_of_experience', 'experience_level',
                  'is_fresher', 'education_level']].head()


Unnamed: 0,years_of_experience,experience_level,is_fresher,education_level
0,2.0,junior,0,
1,1.0,junior,0,
2,3.0,mid,0,
3,2.0,junior,0,
4,2.0,junior,0,


### STEP 4: Salary-Focused Feature Engineering

In [5]:
# ---------- Salary Feature Engineering ----------

# 1️⃣ Log salary (reduce skewness)
salary_job_df['log_salary'] = np.log1p(salary_job_df['salary'])

# 2️⃣ Experience-salary interaction feature
salary_job_df['exp_salary_interaction'] = (
    salary_job_df['years_of_experience'] * salary_job_df['salary']
)

# 3️⃣ Fresher flag for salary dataset
salary_job_df['is_fresher'] = salary_job_df['years_of_experience'].apply(
    lambda x: 1 if x == 0 else 0
)

# Quick check
salary_job_df[['salary', 'log_salary',
               'years_of_experience', 'exp_salary_interaction',
               'is_fresher']].head()


Unnamed: 0,salary,log_salary,years_of_experience,exp_salary_interaction,is_fresher
0,60000.0,11.002117,2.0,120000.0,0
1,65000.0,11.082158,1.0,65000.0,0
2,58000.0,10.968216,3.0,174000.0,0
3,62000.0,11.034906,2.0,124000.0,0
4,63000.0,11.050906,1.0,63000.0,0


### STEP 5: Final Feature Selection + Save Feature Datasets

In [7]:
# ---------- FINAL FEATURE SELECTION ----------
import os
os.mkdir("cleaned")

# ========== PLACEMENT FEATURES ==========
placement_features = df_college_students.drop(columns=[
    'college_id'  # ID column, no predictive value
])

placement_target = placement_features['placement']
placement_features = placement_features.drop(columns=['placement'])

# ========== SALARY FEATURES ==========
salary_features = salary_job_df.drop(columns=[
    'salary'  # target
])

salary_target = salary_job_df['salary']

# Save final feature datasets
placement_features.to_csv("cleaned/placement_features.csv", index=False)
placement_target.to_csv("cleaned/placement_target.csv", index=False)

salary_features.to_csv("cleaned/salary_features.csv", index=False)
salary_target.to_csv("cleaned/salary_target.csv", index=False)

print("Final feature datasets saved successfully.")
print("Placement features shape:", placement_features.shape)
print("Salary features shape:", salary_features.shape)


Final feature datasets saved successfully.
Placement features shape: (10000, 12)
Salary features shape: (570, 13)
