In [41]:
import pandas as pd
import re

resume1 = pd.read_csv("resume1.csv")
resume2 = pd.read_csv("resume2.csv")

df = pd.concat([resume1, resume2], ignore_index=True)
df.columns = df.columns.str.lower().str.strip()

print(df.columns.tolist())


['role', 'resume', 'category']


In [42]:
df['role'] = df.get('role', '').fillna('').astype(str)
df['category'] = df.get('category', '').fillna('').astype(str)

df['career_label'] = (
    df['role'].str.lower().str.strip() + ' ' +
    df['category'].str.lower().str.strip()
).str.strip()


In [43]:
career_map = {
    'software engineer it': 'software_engineering',
    'software developer it': 'software_engineering',
    'data analyst it': 'data_analytics',
    'data scientist it': 'data_science',
    'chef culinary': 'culinary',
    'cook culinary': 'culinary',
    'doctor healthcare': 'healthcare',
    'nurse healthcare': 'healthcare'
}

df['career_label'] = df['career_label'].replace(career_map)


In [44]:
def clean_resume(text):
    text = str(text).lower()
    text = re.sub(r'<.*?>', ' ', text)
    text = re.sub(r'[^a-z0-9\s]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

df['resume_clean'] = df['resume'].apply(clean_resume)


In [45]:
SKILL_LIST = [
    'python','java','sql','excel','machine learning','deep learning',
    'data analysis','statistics','communication','management',
    'project management','aws','azure','docker','tensorflow','pytorch'
]

def extract_skills(text):
    found = [skill for skill in SKILL_LIST if skill in text]
    return ', '.join(found)

df['skills'] = df['resume_clean'].apply(extract_skills)


In [46]:
def extract_experience_years(text):
    matches = re.findall(r'(\d+)\+?\s+years?', text)
    if matches:
        return int(max(matches))
    return 0

df['experience_years'] = df['resume_clean'].apply(extract_experience_years)


In [47]:
EDU_LEVELS = {
    'phd': 'doctorate',
    'doctorate': 'doctorate',
    'master': 'master',
    'masters': 'master',
    'degree': 'degree',
    'bachelor': 'degree',
    'diploma': 'diploma',
    'foundation': 'foundation'
}

def extract_education(text):
    for key, value in EDU_LEVELS.items():
        if key in text:
            return value
    return 'unknown'

df['education_level'] = df['resume_clean'].apply(extract_education)


In [48]:
final_resume_dataset = df[
    ['career_label', 'skills', 'experience_years', 'education_level']
]


In [49]:
final_resume_dataset.sample(5)


Unnamed: 0,career_label,skills,experience_years,education_level
5019,machine learning engineer,"python, java, machine learning, deep learning,...",0,master
5255,human resources specialist,management,5,degree
140,robotics engineer,"python, management",5,master
5336,database administrator,"python, sql, management",8,degree
8448,software engineer,"python, java, machine learning",3,degree


In [50]:
final_resume_dataset.to_csv(
    "combined_resume_cleaned.csv",
    index=False,
    encoding="utf-8"
)


In [51]:
import pandas as pd
import re
import html
import unicodedata

jobstreet = pd.read_csv("jobstreet_50k_jobs copy.csv")
linkedin = pd.read_csv("linkedin_job_postings copy.csv")


In [61]:
jobs_df = pd.concat([jobstreet, linkedin], ignore_index=True)
jobs_df.columns = jobs_df.columns.str.lower().str.strip()

print("Original columns:", jobs_df.columns.tolist())


Original columns: ['job_title', 'description', 'requirements', 'title']


In [62]:
text_source_cols = [
    col for col in jobs_df.columns
    if any(key in col for key in ['description', 'requirement', 'responsibil'])
]

if not text_source_cols:
    raise ValueError("No description / requirement columns found")

print("Text source columns:", text_source_cols)

jobs_df['job_text'] = jobs_df[text_source_cols].astype(str).agg(' '.join, axis=1)
jobs_df['job_text'] = jobs_df['job_text'].fillna('').astype(str)


Text source columns: ['description', 'requirements']


In [63]:
if 'role' not in jobs_df.columns:
    for col in jobs_df.columns:
        if any(k in col for k in ['role', 'title', 'position']):
            jobs_df['role'] = jobs_df[col]
            break
    else:
        jobs_df['role'] = ''

jobs_df['role'] = jobs_df['role'].fillna('').astype(str)


In [64]:
if 'category' not in jobs_df.columns:
    jobs_df['category'] = ''

jobs_df['category'] = jobs_df['category'].fillna('').astype(str)


In [65]:
jobs_df = jobs_df[jobs_df['job_text'].apply(lambda x: x.strip() != '')]


In [66]:
jobs_df['career_label'] = (
    jobs_df['role'].str.lower().str.strip() + ' ' +
    jobs_df['category'].str.lower().str.strip()
).str.strip()


In [67]:
career_map = {
    'software engineer it': 'software_engineering',
    'software developer it': 'software_engineering',
    'data analyst it': 'data_analytics',
    'data scientist it': 'data_science',
    'chef culinary': 'culinary',
    'cook culinary': 'culinary',
    'doctor healthcare': 'healthcare',
    'nurse healthcare': 'healthcare'
}

jobs_df['career_label'] = jobs_df['career_label'].replace(career_map)


In [68]:
def clean_job_text(text):
    if not isinstance(text, str):
        return ""

    text = html.unescape(text)
    text = re.sub(r'<.*?>', ' ', text)
    text = unicodedata.normalize("NFKD", text)
    text = text.encode("ascii", "ignore").decode("ascii")
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)
    text = text.lower()
    text = re.sub(r'[!@#$%^&*()\-_+=\[\]{}|\\:;"\'<>,.?/~`]', ' ', text)
    text = re.sub(r'(.)\1{2,}', r'\1\1', text)
    text = re.sub(r'\d+', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()

    return text

jobs_df['job_text_clean'] = jobs_df['job_text'].apply(clean_job_text)


In [69]:
SKILL_LIST = [
    'python','java','sql','excel','machine learning','deep learning',
    'data analysis','statistics','communication','management',
    'project management','aws','azure','docker','tensorflow','pytorch'
]

def extract_skills(text):
    return ', '.join([skill for skill in SKILL_LIST if skill in text])

jobs_df['skills'] = jobs_df['job_text_clean'].apply(extract_skills)


In [70]:
def extract_experience_years(text):
    matches = re.findall(r'(\d+)\+?\s+years?', text)
    return int(max(matches)) if matches else 0

jobs_df['experience_years'] = jobs_df['job_text_clean'].apply(extract_experience_years)


In [71]:
final_jobs_dataset = jobs_df[
    ['career_label', 'skills', 'experience_years']
]


In [72]:
final_jobs_dataset.sample(5)


Unnamed: 0,career_label,skills,experience_years
39370,,"excel, communication, management, aws",0
8246,quantity surveyor,management,0
27099,,"excel, communication, management",0
45744,,management,0
23745,,communication,0


In [74]:
final_jobs_dataset.to_csv(
    "combined_jobs_cleaned.csv",
    index=False,
    encoding="utf-8"
)


In [75]:
import pandas as pd
import re
import html
import unicodedata


In [76]:
coursera = pd.read_csv("coursera_data.csv")
coursera.columns = coursera.columns.str.lower().str.strip()

print("Original columns:", coursera.columns.tolist())


Original columns: ['course_title', 'course_organization', 'course_certificate_type', 'course_rating', 'course_difficulty']


In [77]:
text_source_cols = [
    col for col in coursera.columns
    if any(key in col for key in [
        'title', 'name', 'description', 'about',
        'skill', 'syllabus', 'content', 'overview'
    ])
]

if not text_source_cols:
    raise ValueError("No text columns found in Coursera dataset")

print("Text source columns:", text_source_cols)


Text source columns: ['course_title']


In [78]:
coursera['course_text'] = (
    coursera[text_source_cols]
    .astype(str)
    .agg(' '.join, axis=1)
)

coursera['course_text'] = coursera['course_text'].fillna('').astype(str)


In [79]:
coursera = coursera[coursera['course_text'].apply(lambda x: x.strip() != '')]


In [80]:
def clean_course_text(text):
    if not isinstance(text, str):
        return ""

    text = html.unescape(text)
    text = re.sub(r'<.*?>', ' ', text)
    text = unicodedata.normalize("NFKD", text)
    text = text.encode("ascii", "ignore").decode("ascii")
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)
    text = text.lower()
    text = re.sub(r'[!@#$%^&*()\-_+=\[\]{}|\\:;"\'<>,.?/~`]', ' ', text)
    text = re.sub(r'(.)\1{2,}', r'\1\1', text)
    text = re.sub(r'\d+', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()

    return text

coursera['course_text_clean'] = coursera['course_text'].apply(clean_course_text)


In [81]:
def keep_english_only(text):
    return re.sub(r'[^a-z\s]', ' ', text)

coursera['course_text_clean'] = (
    coursera['course_text_clean']
    .apply(keep_english_only)
    .apply(lambda x: re.sub(r'\s+', ' ', x).strip())
)


In [82]:
final_coursera_dataset = coursera[
    [col for col in coursera.columns if col not in ['course_text']]
    + ['course_text_clean']
]


In [84]:
final_coursera_dataset.to_csv(
    "coursera_cleaned.csv",
    index=False,
    encoding="utf-8"
)
