In [1]:
# Import libraries
import pandas as pd
import os


In [3]:
# Read the raw resume dataset (path is relative to the working directory)
# into a DataFrame, then print a confirmation and the first five rows.
file_path = "custom_resume_dataset.csv"
df = pd.read_csv(filepath_or_buffer=file_path)

print("✅ Original Data Loaded")
print(df.head(5))

✅ Original Data Loaded
  resume_id                                        resume_text  \
0     R0001  As a Python Developer, I have hands-on experie...   
1     R0002  As a Python Developer, I have hands-on experie...   
2     R0003  As a Python Developer, I have hands-on experie...   
3     R0004  As a Python Developer, I have hands-on experie...   
4     R0005  As a Python Developer, I have hands-on experie...   

       job_category                                    job_description  \
0  Python Developer  This role requires expertise in Python, Flask,...   
1  Python Developer  This role requires expertise in Python, Flask,...   
2  Python Developer  This role requires expertise in Python, Flask,...   
3  Python Developer  This role requires expertise in Python, Flask,...   
4  Python Developer  This role requires expertise in Python, Flask,...   

   suitability_score                                           feedback  \
0                100  Resume is strong and matches well with

In [6]:
# Basic Cleaning
# 1. Remove rows that are exact duplicates across all columns.
df.drop_duplicates(inplace=True)

# 2. Handle missing values (if any).
# Only non-numeric columns receive "" as a placeholder. Filling numeric
# columns (e.g. suitability_score) with "" would convert them to object dtype
# and make later numeric operations such as .clip() raise a TypeError, so
# missing numeric values are left as NaN.
non_numeric_cols = df.select_dtypes(exclude="number").columns
df[non_numeric_cols] = df[non_numeric_cols].fillna("")

# 3. Strip leading/trailing whitespace from the text columns.
for col in ["resume_text", "job_description", "feedback", "match_skills", "missing_skills"]:
    df[col] = df[col].astype(str).str.strip()

# 4-5. Normalise the two free-text columns for NLP tasks: lowercase first,
# then drop every character that is not a letter, digit, whitespace, "." or ",".
# NOTE(review): this filter also removes "+", "#", "-" and "/", so tokens such
# as "c++" collapse to "c" -- confirm that is acceptable for skill matching.
for col in ["resume_text", "job_description"]:
    df[col] = df[col].str.lower().str.replace(r"[^a-zA-Z0-9\s.,]", "", regex=True)

In [7]:
# Sanity-check the suitability score: values are clamped into the 0-100
# range (scores are not recomputed, only bounded).
df["suitability_score"] = df["suitability_score"].clip(lower=0, upper=100)

In [8]:
# Write the cleaned DataFrame to CSV (relative to the working directory),
# omitting the row index from the file.
clean_file_path = "custom_resume_dataset_cleaned.csv"
df.to_csv(path_or_buf=clean_file_path, index=False)

# Report the output location and the final (rows, columns) shape.
print(f"✅ Cleaned dataset saved at: {clean_file_path}")
print("Shape:", df.shape)

✅ Cleaned dataset saved at: custom_resume_dataset_cleaned.csv
Shape: (990, 8)
