In [10]:
import pandas as pd


# Job Cleaning

In [11]:
# -----------------------
# Utility: clean URLs, emails, phone numbers
# -----------------------

def clean_text(text: str) -> str:
    # Remove web links
    text = re.sub(r"http\S+|www\.\S+", " ", text)
    # Remove email addresses
    text = re.sub(r"\S+@\S+", " ", text)
    # Remove phone numbers (digits, dashes/spaces, length ≥9)
    text = re.sub(r"\+?\d[\d\-\s]{7,}\d", " ", text)
    # Collapse all consecutive whitespace into single spaces
    text = re.sub(r"\s+", " ", text)
    return text.strip()



In [12]:
# -----------------------
# Utility: load & prepare raw job data
# -----------------------

def load_and_prepare_jobs(data_path: str) -> pd.DataFrame: 
    # Read CSV into DataFrame
    df = pd.read_csv(data_path)
    # Reset index to ensure continuous integer IDs
    df = df.reset_index(drop=True)
    # Rename Job-Title column to Job_title
    df = df.rename(columns={'Job-Title': 'Job_title'})
    # Drop any rows missing essential fields
    df = df.dropna(subset=["Job_title", "Description", "Location"]).copy()
    # Create a unique job_id based on the DataFrame index
    df["job_id"] = df.index.astype(int)

    df["title_clean"] = df["Job_title"].apply(clean_text)
    df["desc_clean"] = df["Description"].apply(clean_text)
    df["text"] = df["title_clean"] + " " + df["desc_clean"]
    return df



In [13]:

# Store the job data
df = load_and_prepare_jobs(JOB_DATA_PATH)
df.to_csv(r"C:\Yousuf\DEPI\Technical\Mega Projects\Job_Recommendation_System\data\Processed\cleaned_data\job_cleaned.csv", index=False)
df

Unnamed: 0,Job_title,Date-Posted,Company,Job-Type,Salary,Location,Description,job_id,title_clean,desc_clean,text
0,"Bachelor of Science, Nursing Instructor Part...",2023-04-25,Red Deer College,Full-time,,"Red Deer, AB",Position Information\n\nPosition Title\n\nBach...,0,"Bachelor of Science, Nursing Instructor Part T...",Position Information Position Title Bachelor o...,"Bachelor of Science, Nursing Instructor Part T..."
1,PHD Graduate Student,2023-04-19,IDOBE Research Group,Full-time,,"Edmonton, AB",The IDOBE (Intelligent Design & Operation for ...,1,PHD Graduate Student,The IDOBE (Intelligent Design & Operation for ...,PHD Graduate Student The IDOBE (Intelligent De...
2,Central Graduate Advisor,2023-04-19,University of Alberta,Full-time,"$47,210–$63,741 a year","Edmonton, AB",FGSR Grad Studies & Rsrch Admi\n\nCompetition ...,2,Central Graduate Advisor,FGSR Grad Studies & Rsrch Admi Competition No....,Central Graduate Advisor FGSR Grad Studies & R...
3,Student/Graduate Application,2023-04-20,Eagle Builders LP,Internship,,"Blackfalds, AB",Eagle Builders welcomes and offers work experi...,3,Student/Graduate Application,Eagle Builders welcomes and offers work experi...,Student/Graduate Application Eagle Builders we...
4,"Advisor, Graduate Student Experience, Haskayne...",2023-04-12,University of Calgary,Full-time,,"Calgary, AB",University of Calgary\nRSS Jump to Headline Ho...,4,"Advisor, Graduate Student Experience, Haskayne...",University of Calgary RSS Jump to Headline Hom...,"Advisor, Graduate Student Experience, Haskayne..."
...,...,...,...,...,...,...,...,...,...,...,...
40056,Bike Centre Volunteer (Fall 2022),,Bike Centre,,,"Waterloo, ON",Service Description: The Bike Centre is a do-i...,40056,Bike Centre Volunteer (Fall 2022),Service Description: The Bike Centre is a do-i...,Bike Centre Volunteer (Fall 2022) Service Desc...
40057,Volunteer Program Coordinator,2023-04-05,The Food Bank of Waterloo Region,Full-time,,"Kitchener, ON",Volunteer Program Coordinator\n\nAdministratio...,40057,Volunteer Program Coordinator,Volunteer Program Coordinator Administration K...,Volunteer Program Coordinator Volunteer Progra...
40058,Volunteer Coordinator,,Alzheimer Society Waterloo Wellington,Part-time,,"Kitchener, ON",About the job\n\nJOB SUMMARY\n\nThis position ...,40058,Volunteer Coordinator,About the job JOB SUMMARY This position manage...,Volunteer Coordinator About the job JOB SUMMAR...
40059,Summer Camp Volunteer,,Little Medical School - Region of Waterloo,Part-time,,"Kitchener, ON",Do you love working and interacting with child...,40059,Summer Camp Volunteer,Do you love working and interacting with child...,Summer Camp Volunteer Do you love working and ...


# User Data Cleaning

In [16]:
user_df = pd.read_csv('C:\Yousuf\DEPI\Technical\Mega Projects\Job_Recommendation_System\data\Raw\skill2vec_50K.csv')
user_df

  user_df = pd.read_csv('C:\Yousuf\DEPI\Technical\Mega Projects\Job_Recommendation_System\data\Raw\skill2vec_50K.csv')


Unnamed: 0,HR Executive,screening,selection,Interview,HR,Recruiter,IT Recruiter,Sourcing,recruitment executive,onboarding,...,Unnamed: 950,Unnamed: 951,Unnamed: 952,Unnamed: 953,Unnamed: 954,Unnamed: 955,Unnamed: 956,Unnamed: 957,Unnamed: 958,Unnamed: 959
0,Special Teacher,Teaching,Education,,,,,,,,...,,,,,,,,,,
1,consulting,fresher,IT helpdesk,Techincal Troubleshooting,international voice,international BPO,technical support,outsourcing,call center,BBA fresher,...,,,,,,,,,,
2,diploma,machining,cnc m,mould,conventional machines,die making,knowledge,tool,cipet,assembly,...,,,,,,,,,,
3,Compensation,Benefits,HR Functions,Alm,Payroll,ESS,Core HR,QC,QA,SQL,...,,,,,,,,,,
4,Storage Administrator,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49994,Chief Engineer,,,,,,,,,,...,,,,,,,,,,
49995,Receptionist Activities,Front Desk,front office,front desk executive,front office executive,receptionist,reception,,,,...,,,,,,,,,,
49996,SQL Queries,Log Analysis,Hardware Networking,People Leadership,Technical Skills,Unix,Oracle,Service Delivery Management,Database Administration,Continuous Improvement,...,,,,,,,,,,
49997,Quality Analyst,,,,,,,,,,...,,,,,,,,,,


## Produce a list of skills for each profile

In [14]:
def prepare_user_skill_profiles(df: pd.DataFrame):
    # Fill NaNs with empty strings and convert all to string
    df_clean = df.fillna("").astype(str)
    
    # Combine each row's values into a cleaned list of skills
    user_skills = []
    for _, row in df_clean.iterrows():
        skills = [skill.strip().lower() for skill in row if skill.strip()]
        skills = list(dict.fromkeys(skills))  # optional: remove duplicates, preserve order
        user_skills.append(skills)
    
    return user_skills

In [17]:
# Apply the transformation
user_skill_lists = prepare_user_skill_profiles(user_df)

# Show example output
user_skill_lists[:5]

[['special teacher', 'teaching', 'education'],
 ['consulting',
  'fresher',
  'it helpdesk',
  'techincal troubleshooting',
  'international voice',
  'international bpo',
  'technical support',
  'outsourcing',
  'call center',
  'bba fresher',
  'bcom fresher',
  'tech support',
  'voice calling',
  'bpo',
  'sme',
  'bca fresher',
  'mba fresher'],
 ['diploma',
  'machining',
  'cnc m',
  'mould',
  'conventional machines',
  'die making',
  'knowledge',
  'tool',
  'cipet',
  'assembly'],
 ['compensation',
  'benefits',
  'hr functions',
  'alm',
  'payroll',
  'ess',
  'core hr',
  'qc',
  'qa',
  'sql'],
 ['storage administrator']]

In [None]:
# Store the job data
user_skill_lists.to_csv(r"C:\Yousuf\DEPI\Technical\Mega Projects\Job_Recommendation_System\data\Processed\cleaned_data\skill_list.csv", index=False)