In [3]:
import pandas as pd
import numpy as np


In [4]:
df_college_students = pd.read_csv("college_student_placement_dataset.csv")
df_job_placement    = pd.read_csv("job_placement.csv")
df_placement_basic  = pd.read_csv("Placement.csv")
df_placementdata    = pd.read_csv("placementdata.csv")
df_ds_salary        = pd.read_csv("data_scientists_salaries_from_reddit.csv")


02_data_cleaning.ipynb – STEP 1

- Google Drive mounted successfully
- Raw datasets loaded from Drive
- No cleaning applied yet


## STEP 2: Missing Values Handling (Placement Datasets)

In [5]:
placement_datasets = {
    "college_students": df_college_students,
    "placement_basic": df_placement_basic,
    "placementdata": df_placementdata,
    "job_placement": df_job_placement
}

for name, df in placement_datasets.items():
    print(f"\n{name} missing values:")
    print(df.isnull().sum())



college_students missing values:
College_ID                0
IQ                        0
Prev_Sem_Result           0
CGPA                      0
Academic_Performance      0
Internship_Experience     0
Extra_Curricular_Score    0
Communication_Skills      0
Projects_Completed        0
Placement                 0
dtype: int64

placement_basic missing values:
Student_ID          0
CGPA                0
Internships         0
Placed              0
Salary (INR LPA)    0
dtype: int64

placementdata missing values:
StudentID                    0
CGPA                         0
Internships                  0
Projects                     0
Workshops/Certifications     0
AptitudeTestScore            0
SoftSkillsRating             0
ExtracurricularActivities    0
PlacementTraining            0
SSC_Marks                    0
HSC_Marks                    0
PlacementStatus              0
dtype: int64

job_placement missing values:
id                     0
name                   0
gender              

In [6]:
# college_students
df_college_students['Prev_Sem_Result'].fillna(
    df_college_students['Prev_Sem_Result'].median(), inplace=True
)
df_college_students['CGPA'].fillna(
    df_college_students['CGPA'].median(), inplace=True
)

# job_placement
df_job_placement['gpa'].fillna(
    df_job_placement['gpa'].median(), inplace=True
)
df_job_placement['years_of_experience'].fillna(
    df_job_placement['years_of_experience'].median(), inplace=True
)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_college_students['Prev_Sem_Result'].fillna(
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_college_students['CGPA'].fillna(
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always beh

In [7]:
# job_placement categorical
df_job_placement['gender'].fillna(
    df_job_placement['gender'].mode()[0], inplace=True
)
df_job_placement['degree'].fillna(
    df_job_placement['degree'].mode()[0], inplace=True
)
df_job_placement['stream'].fillna(
    df_job_placement['stream'].mode()[0], inplace=True
)

# placementdata categorical
df_placementdata['ExtracurricularActivities'].fillna(
    "Unknown", inplace=True
)
df_placementdata['PlacementTraining'].fillna(
    "No", inplace=True
)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_job_placement['gender'].fillna(
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_job_placement['degree'].fillna(
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a cop

In [8]:
print(df_college_students['Placement'].isnull().sum())
print(df_placement_basic['Placed'].isnull().sum())
print(df_placementdata['PlacementStatus'].isnull().sum())
print(df_job_placement['placement_status'].isnull().sum())


0
0
0
0


02_data_cleaning – STEP 2

- Identified missing values in placement datasets
- Filled numerical columns using median
- Filled categorical columns using mode / default values
- Ensured target columns have no missing values

### STEP 3: Salary Cleaning

In [10]:
# Check salary related columns and basic stats (only loaded datasets)

print("job_placement salary:")
print(df_job_placement['salary'].describe())

print("\nplacement_basic salary:")
print(df_placement_basic['Salary (INR LPA)'].describe())



job_placement salary:
count      700.000000
mean     52474.285714
std      25160.331005
min          0.000000
25%      61000.000000
50%      64000.000000
75%      66000.000000
max      68000.000000
Name: salary, dtype: float64

placement_basic salary:
count    1000.000000
mean       11.795580
std        10.105559
min         0.000000
25%         0.000000
50%        11.715000
75%        20.935000
max        29.900000
Name: Salary (INR LPA), dtype: float64


In [11]:
# Replace zero salaries with NaN (do NOT drop yet)

df_job_placement['salary'] = df_job_placement['salary'].replace(0, np.nan)
df_placement_basic['Salary (INR LPA)'] = df_placement_basic['Salary (INR LPA)'].replace(0, np.nan)

# Verify
print("job_placement salary after zero handling:")
print(df_job_placement['salary'].isnull().sum())

print("\nplacement_basic salary after zero handling:")
print(df_placement_basic['Salary (INR LPA)'].isnull().sum())


job_placement salary after zero handling:
130

placement_basic salary after zero handling:
310


In [12]:
# Create salary-only datasets (drop rows where salary is NaN)

salary_job_df = df_job_placement.dropna(subset=['salary']).copy()
salary_basic_df = df_placement_basic.dropna(subset=['Salary (INR LPA)']).copy()

# Check shapes
print("salary_job_df shape:", salary_job_df.shape)
print("salary_basic_df shape:", salary_basic_df.shape)


salary_job_df shape: (570, 11)
salary_basic_df shape: (690, 5)


### STEP 4: Duplicate Removal + Column Cleanup

In [13]:
# Remove duplicate rows (all datasets used so far)

df_college_students.drop_duplicates(inplace=True)
df_job_placement.drop_duplicates(inplace=True)
df_placement_basic.drop_duplicates(inplace=True)
df_placementdata.drop_duplicates(inplace=True)
salary_job_df.drop_duplicates(inplace=True)
salary_basic_df.drop_duplicates(inplace=True)

# Remove useless index-like columns if present
for df in [df_placementdata, df_job_placement]:
    for col in ['Unnamed: 0', 'id']:
        if col in df.columns:
            df.drop(columns=[col], inplace=True)

print("Duplicate removal & column cleanup done.")


Duplicate removal & column cleanup done.


### STEP 5: Column Name Standardization

In [14]:
def standardize_columns(df):
    df.columns = (
        df.columns
        .str.strip()
        .str.lower()
        .str.replace(" ", "_")
        .str.replace("(", "", regex=False)
        .str.replace(")", "", regex=False)
    )
    return df

# Apply to all relevant datasets
df_college_students = standardize_columns(df_college_students)
df_job_placement    = standardize_columns(df_job_placement)
df_placement_basic  = standardize_columns(df_placement_basic)
df_placementdata    = standardize_columns(df_placementdata)
salary_job_df       = standardize_columns(salary_job_df)
salary_basic_df     = standardize_columns(salary_basic_df)

# Check one example
print(df_college_students.columns)


Index(['college_id', 'iq', 'prev_sem_result', 'cgpa', 'academic_performance',
       'internship_experience', 'extra_curricular_score',
       'communication_skills', 'projects_completed', 'placement'],
      dtype='object')


### STEP 6: Placement Labels Normalization (Yes / No → 1 / 0)

In [15]:
# Normalize placement labels to binary (1 = Placed, 0 = Not Placed)

# college_students
df_college_students['placement'] = df_college_students['placement'].map(
    {'Yes': 1, 'No': 0}
)

# placement_basic
df_placement_basic['placed'] = df_placement_basic['placed'].map(
    {'Yes': 1, 'No': 0}
)

# placementdata
df_placementdata['placementstatus'] = df_placementdata['placementstatus'].map(
    {'Placed': 1, 'Not Placed': 0}
)

# job_placement
df_job_placement['placement_status'] = df_job_placement['placement_status'].map(
    {'Placed': 1, 'Not Placed': 0}
)

# Verify
print(df_college_students['placement'].value_counts())
print(df_placement_basic['placed'].value_counts())
print(df_placementdata['placementstatus'].value_counts())
print(df_job_placement['placement_status'].value_counts())


placement
0    8341
1    1659
Name: count, dtype: int64
placed
1    690
0    310
Name: count, dtype: int64
placementstatus
1.0    4197
Name: count, dtype: int64
placement_status
1    570
0    130
Name: count, dtype: int64


### STEP 7: Save Final Cleaned Datasets

In [18]:
import os
os.mkdir("cleaned")

# Save cleaned placement datasets
df_college_students.to_csv("cleaned/college_students_clean.csv", index=False)
df_job_placement.to_csv("cleaned/job_placement_clean.csv", index=False)
df_placement_basic.to_csv("cleaned/placement_basic_clean.csv", index=False)
df_placementdata.to_csv("cleaned/placementdata_clean.csv", index=False)

# Save salary datasets
salary_job_df.to_csv("cleaned/salary_job_clean.csv", index=False)
salary_basic_df.to_csv("cleaned/salary_basic_clean.csv", index=False)

print("All cleaned datasets saved successfully.")


All cleaned datasets saved successfully.
