In [2]:
import pandas as pd

In [14]:
# Read every O*NET workbook in one pass. Targets and paths are paired by
# position, so keep both sequences in the same order when editing.
(
    occupation,
    abilities,
    edu,
    interests,
    knowledge,
    Emerging,
    skills,
    work_activities,
    work_context,
    work_styles,
) = (
    pd.read_excel(workbook_path)
    for workbook_path in (
        "datasets/onet_datasets/Occupation Data.xlsx",
        "datasets/onet_datasets/Abilities.xlsx",
        "datasets/onet_datasets/Education, Training, and Experience.xlsx",
        "datasets/onet_datasets/Interests.xlsx",
        "datasets/onet_datasets/Knowledge.xlsx",
        "datasets/onet_datasets/Emerging Tasks.xlsx",
        "datasets/onet_datasets/Skills.xlsx",
        "datasets/onet_datasets/Work Activities.xlsx",
        "datasets/onet_datasets/Work Context.xlsx",
        "datasets/onet_datasets/Work Styles.xlsx",
    )
)

In [15]:
# Helper: aggregate text fields per occupation
def aggregate_field(df, col_key, col_value, new_col_name):
    """Collapse one column into a per-key list of unique, normalized strings.

    Rows are grouped by ``col_key``. Within each group the non-null values of
    ``col_value`` are converted to ``str``, stripped, lower-cased and
    de-duplicated.

    Parameters
    ----------
    df : pandas.DataFrame
        Source table; it is not modified.
    col_key : str
        Column to group by.
    col_value : str
        Column whose values are collected.
    new_col_name : str
        Name given to the list-valued column in the result.

    Returns
    -------
    pandas.DataFrame
        One row per distinct ``col_key`` with two columns: ``col_key`` and
        ``new_col_name`` (a list of strings in first-seen order).
    """
    def _unique_normalized(values):
        normalized = (str(v).strip().lower() for v in values.dropna())
        # dict.fromkeys de-duplicates while keeping first-seen order. A plain
        # set() would reorder the strings differently on every kernel start
        # (string hashing is randomized), making the exported data
        # non-reproducible.
        return list(dict.fromkeys(normalized))

    return (
        df.groupby(col_key)[col_value]
        .apply(_unique_normalized)
        .reset_index(name=new_col_name)
    )


In [None]:
# Aggregate each dataset into one row per occupation, keyed by the SOC code.
SOC_CODE_COL = "O*NET-SOC Code"

abilities_grouped = aggregate_field(abilities, SOC_CODE_COL, "Element Name", "Abilities")
edu_grouped = aggregate_field(edu, SOC_CODE_COL, "Category", "EducationReq")
interests_grouped = aggregate_field(interests, SOC_CODE_COL, "Element Name", "Interests")
knowledge_grouped = aggregate_field(knowledge, SOC_CODE_COL, "Element Name", "KnowledgeAreas")
skills_grouped = aggregate_field(skills, SOC_CODE_COL, "Element Name", "Skills")
Emerging_grouped = aggregate_field(Emerging, SOC_CODE_COL, "Task", "Emerging")
work_activities_grouped = aggregate_field(work_activities, SOC_CODE_COL, "Element Name", "WorkActivities")
work_context_grouped = aggregate_field(work_context, SOC_CODE_COL, "Element Name", "WorkContext")
work_styles_grouped = aggregate_field(work_styles, SOC_CODE_COL, "Element Name", "WorkStyles")

# The check mark was stored as mojibake (UTF-8 bytes decoded as cp1252).
print("✅ Aggregated all fields")


✅ Aggregated all fields


In [22]:
# Merge everything into one dataframe.
# Left joins keep every occupation even when a source table has no rows for
# it; those cells come out as NaN.
df = occupation
for grouped_frame in [
    abilities_grouped,
    edu_grouped,
    interests_grouped,
    knowledge_grouped,
    skills_grouped,
    Emerging_grouped,
    work_activities_grouped,
    work_context_grouped,
    work_styles_grouped,
]:
    df = df.merge(grouped_frame, on="O*NET-SOC Code", how="left")

# The check mark was stored as mojibake (UTF-8 bytes decoded as cp1252).
print("✅ Final merged shape:", df.shape)


✅ Final merged shape: (1016, 12)


In [23]:
# Clean NaN -> empty list, so every aggregated cell is a list.
def _ensure_list(cell):
    """Return ``cell`` unchanged when it is a list, otherwise a new empty list."""
    return cell if isinstance(cell, list) else []


list_columns = [
    "Abilities", "EducationReq", "Interests", "KnowledgeAreas",
    "Skills", "Emerging", "WorkActivities", "WorkContext", "WorkStyles",
]
for col in list_columns:
    df[col] = df[col].apply(_ensure_list)

# The check mark was stored as mojibake (UTF-8 bytes decoded as cp1252).
print("✅ Cleaned null values")


✅ Cleaned null values


In [24]:
# Export final dataset.
# It is written under datasets/ because the notebook reloads it from
# "datasets/onet_clean.csv". Writing to the working directory left that read
# dependent on a manually copied (possibly stale) file.
df.to_csv("datasets/onet_clean.csv", index=False)
print("cleaned onet dataset")


cleaned onet dataset


In [26]:
# Reload the O*NET export from disk.
# NOTE(review): the list-valued columns were serialized through CSV, so they
# presumably come back as plain strings rather than lists - confirm that is
# acceptable for the text-building steps that follow.
onet = pd.read_csv("datasets/onet_clean.csv")

# Raw Naukri job postings.
naukri = pd.read_csv("datasets/naukri_dataset.csv")

In [27]:
# Normalize column labels: trim, lower-case, spaces -> underscores.
onet.columns = onet.columns.str.strip().str.lower().str.replace(" ", "_")
# Save back to the file this frame was loaded from, so the normalized headers
# live with the other artifacts under datasets/ instead of a stray copy in the
# working directory.
onet.to_csv("datasets/onet_clean.csv", index=False)


In [28]:
# Normalize column labels: trim, lower-case, spaces -> underscores.
naukri.columns = naukri.columns.str.strip().str.lower().str.replace(" ", "_")

# Example cleaning: drop duplicates
naukri = naukri.drop_duplicates()

# Keep the path in one place so the status message cannot drift from the file
# actually written (it previously named "naukri_clean.csv").
naukri_clean_path = "datasets/naukri_clean_data.csv"
naukri.to_csv(naukri_clean_path, index=False)
print(f" Naukri dataset cleaned and saved as {naukri_clean_path}")

 Naukri dataset cleaned and saved as naukri_clean.csv


In [29]:
# Load the student dataset; the file is read as semicolon-delimited (sep=";").
students = pd.read_csv("datasets/student/student-mat.csv", sep=";")


In [30]:
# Trim, lower-case and underscore the student column labels.
normalized_labels = students.columns.str.strip().str.lower().str.replace(" ", "_")
students.columns = normalized_labels

In [31]:
# Coerce the listed columns to numeric dtype; values that cannot be parsed
# become NaN (errors="coerce"). Columns absent from the frame are skipped.
numeric_cols = ["age", "absences", "g1", "g2", "g3"]
for col in [name for name in numeric_cols if name in students.columns]:
    students[col] = pd.to_numeric(students[col], errors="coerce")


In [33]:
# Keep the path in one place so the status message cannot drift from the file
# actually written (it previously named "students_clean.csv").
students_clean_path = "datasets/students_clean_data.csv"
students.to_csv(students_clean_path, index=False)
print(f" Student dataset cleaned and saved as {students_clean_path}")

 Student dataset cleaned and saved as students_clean.csv


In [None]:
def prepare_for_rag(df, text_cols, id_col, source):
    """
    Build a three-column RAG table (``id``, ``rag_text``, ``source``) from ``df``.

    The input frame is left untouched; a new DataFrame is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Source records.
    text_cols : list of str
        Columns to concatenate into ``rag_text``. Names that are not present
        in ``df`` are silently skipped.
    id_col : str or None
        Column to use as the record id. When it is not a column of ``df``,
        ids are generated as ``"<source>_<index label>"``.
    source : str
        Tag written to the ``source`` column of every row.

    Returns
    -------
    pandas.DataFrame
        Columns ``["id", "rag_text", "source"]``, one row per row of ``df``.
    """
    # Ensure columns exist
    valid_cols = [col for col in text_cols if col in df.columns]

    # Build combined text field
    if valid_cols:
        parts = df[valid_cols]
        present = parts.notna()
        # Blank out missing cells so the literal text "nan" never ends up in
        # the document, then join only the non-empty pieces.
        parts = parts.astype(str).where(present, "")
        rag_text = [
            " ".join(piece for piece in row if piece)
            for row in parts.itertuples(index=False, name=None)
        ]
    else:
        rag_text = [""] * len(df)

    # Handle ID column
    if id_col in df.columns:
        # Plain array: avoids index alignment and keeps the original dtype.
        ids = df[id_col].to_numpy()
        index = df.index
    else:
        # Use the index labels directly instead of reset_index(), which only
        # yields a column called "index" when the index is unnamed and the
        # frame has no "index" column already.
        ids = [f"{source}_{label}" for label in df.index]
        index = pd.RangeIndex(len(df))

    # Assembling a fresh frame means the caller's frame gains no columns and a
    # pre-existing "id" column cannot produce duplicate column names.
    return pd.DataFrame(
        {"id": ids, "rag_text": rag_text, "source": source},
        index=index,
    )



In [45]:
# RAG records for O*NET occupations, identified by their SOC code column.
onet_rag = prepare_for_rag(
    df=onet,
    text_cols=[
        "title",
        "description",
        "abilities",
        "educationreq",
        "interests",
        "knowledgeareas",
        "skills",
        "workactivities",
        "workcontext",
        "workstyles",
    ],
    id_col="o*net-soc_code",
    source="onet",
)

In [46]:
# RAG records for the Naukri postings; "jobid" is used as the id when that
# column exists, otherwise prepare_for_rag generates ids.
naukri_rag = prepare_for_rag(
    df=naukri,
    text_cols=["jobtitle", "jobdescription", "skills", "education", "experience"],
    id_col="jobid",
    source="naukri",
)

In [47]:
# RAG records for the student dataset.
# "school" is a descriptive attribute (it is also one of the text columns),
# not a per-student key - presumably many rows share the same value, which
# would give the master table duplicate ids (verify against the data).
# Passing id_col=None selects prepare_for_rag's fallback, which generates
# "students_<row label>" ids instead.
students_rag = prepare_for_rag(
    students,
    ["school", "age", "studytime", "activities", "higher", "g1", "g2", "g3"],
    None,
    "students"
)


In [49]:
# Stack the three per-source tables into one frame with a fresh 0..n-1 index,
# then persist it.
rag_parts = [onet_rag, naukri_rag, students_rag]
rag_master = pd.concat(rag_parts, ignore_index=True)
rag_master.to_csv(
    "datasets/career_compass_master_datasets_1.csv",
    index=False,
)


In [50]:
# Report where the master file was written and how many rows each source
# contributed. The message now names the path actually used by the export (it
# previously said "career_compass_master.csv"), and the emoji, which was
# stored as mojibake, is restored.
print("🎯 Final RAG dataset saved as datasets/career_compass_master_datasets_1.csv")
print("ONET records:", onet_rag.shape[0])
print("Naukri records:", naukri_rag.shape[0])
print("Student records:", students_rag.shape[0])

🎯 Final RAG dataset saved as career_compass_master.csv
ONET records: 1016
Naukri records: 22000
Student records: 395


In [None]:
#25034