In [28]:
import pandas as pd
from collections import Counter
import ast
from itertools import chain
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [29]:
# Load the Wuzzuf job postings CSV into the working DataFrame used by every cell below.
df=pd.read_csv("full_wuzzuf_jobs.csv")

In [30]:
# Remove rows that repeat an earlier row in every column, keeping the first occurrence.
df = df.drop_duplicates(keep="first")

In [31]:
# Gender labels that can show up inside the Salary column.
gender_labels = ["Female", "Male", "Females Preferred", "Males Preferred"]

# Build Gender from Salary: keep a value only when it is one of the labels above,
# and leave NaN everywhere else.
df["Gender"] = df["Salary"].where(df["Salary"].isin(gender_labels))


In [32]:
# Salary entries that are really gender labels are not salaries: replace them with NaN.
salary_is_gender_label = df["Salary"].isin(["Female", "Male", "Females Preferred", "Males Preferred"])
df["Salary"] = df["Salary"].mask(salary_is_gender_label)

In [33]:
# Postings without an explicit gender label are treated as open to "both".
df["Gender"] = df["Gender"].fillna(value="both")

In [34]:
# The only recognised work arrangements. A gender label such as "Male" is not a
# work arrangement, so it must not be whitelisted here: it falls back to the default.
valid_working_places = ("hybrid", "remote", "on-site")

# Anything that is not a recognised arrangement (including missing values) defaults to "on-site".
df["Working_Place"] = df["Working_Place"].apply(
    lambda place: place if place in valid_working_places else "on-site"
)

In [35]:
# Number of postings per job category, used as the primary sort key.
category_sizes = df['Job_Category'].value_counts()

# Attach the per-row category size, fill missing company names, then order the rows:
# largest categories first, ties broken alphabetically by category and then by title.
df = (
    df.assign(
        Job_Category_Count=df['Job_Category'].map(category_sizes),
        Company=df['Company'].fillna('Not Specified'),
    )
    .sort_values(
        by=['Job_Category_Count', 'Job_Category', 'Title'],
        ascending=[False, True, True],
    )
)

In [36]:
def convert_to_days(x, days_per_month=28):
    """Convert a relative posting age (e.g. "Posted 3 days ago") to a number of days.

    The count is read from the second space-separated token of the text.

    Parameters
    ----------
    x : str or missing value
        Relative age text. Non-string input (NaN, None, ...) is treated as missing.
    days_per_month : int, default 28
        Number of days one month is counted as.

    Returns
    -------
    int or None
        ``count * days_per_month`` for month-based ages, ``count`` for day-based
        ages, ``1`` for hour-based ages, and ``None`` when the value is missing
        or mentions none of those units.

    Raises
    ------
    ValueError, IndexError
        If a month/day text has no integer as its second token.
    """
    # Missing values arrive as float NaN / None; `'month' in x` would raise on them.
    if not isinstance(x, str):
        return None
    if 'month' in x:
        return int(x.split(' ')[1]) * days_per_month
    if 'day' in x:
        return int(x.split(' ')[1])
    if 'hour' in x:
        # Anything posted within the last day counts as one day old.
        return 1
    return None

# Replace the relative posting-age text with its age in days, element by element.
df['Post_Date'] = df['Post_Date'].map(convert_to_days)

In [37]:
def _experience_bounds(label):
    """Parse one Experience label into ``(min_years, max_years)``.

    Recognised shapes:
    - contains "More Than": the number after it is the minimum, no maximum;
    - "Not Specified" or a missing / non-string value: no minimum, no maximum;
    - contains " To ": the leading number of each side is the minimum / maximum;
    - anything else: the leading number is both minimum and maximum.

    Raises ValueError if an expected number cannot be parsed as an integer.
    """
    # Missing values (NaN) are not strings; treat them like "Not Specified"
    # instead of failing on the substring checks below.
    if not isinstance(label, str) or label == "Not Specified":
        return (None, None)
    if "More Than" in label:
        lower = int(label.split("More Than")[1].strip().split(" ")[0])
        return (lower, None)
    if " To " in label:
        sides = label.split(" To ")
        return (int(sides[0].split(" ")[0]), int(sides[1].split(" ")[0]))
    years = int(label.split(" ")[0])
    return (years, years)


# Parse each distinct label once, then look the result up per row.
experience_lookup = {
    label: _experience_bounds(label)
    for label in df['Experience'].dropna().unique()
}
row_bounds = [experience_lookup.get(label, (None, None)) for label in df['Experience']]

# dtype=object keeps plain ints alongside None (no float upcast), so the
# columns hold whole numbers where a bound exists and None where it does not.
df['min_experience'] = pd.Series([low for low, _ in row_bounds], index=df.index, dtype=object)
df['max_experience'] = pd.Series([high for _, high in row_bounds], index=df.index, dtype=object)

In [38]:
# The raw Experience text is no longer needed once min/max experience exist.
df = df.drop(columns=['Experience'])

In [39]:
# Keep only the first line of each Location value. The vectorised .str accessor
# passes missing values through as NaN instead of raising on a float.
df["Location"] = df["Location"].str.split("\n").str[0]

In [40]:
# Blank out every Location that has no comma. na=False makes missing values count
# as "no comma" so the mask is purely boolean (a NaN in the mask breaks the `~`);
# regex=False matches the comma literally.
has_comma = df["Location"].str.contains(",", na=False, regex=False)
df.loc[~has_comma, "Location"] = ""

In [41]:
# Split on the LAST comma only, so every row yields at most two parts. With an
# unbounded split + expand=True the number of produced columns depends on the data,
# and the two-column assignment fails as soon as one location has an extra comma
# (or when no location has a comma at all).
location_parts = df['Location'].str.rsplit(',', n=1)

# Strip surrounding whitespace so a value written as "City, Country" does not
# leave a leading blank on Country. Rows without a comma get a missing Country.
df['City'] = location_parts.str[0].str.strip()
df['Country'] = location_parts.str[1].str.strip()

In [43]:
# The "Be the First to Apply" placeholder is recorded as the number 1.
df["Number_of_Applicants"] = df["Number_of_Applicants"].replace(
    to_replace='Be the First to Apply',
    value=1,
)

In [44]:
# Missing position counts default to one opening; work on the text form from here on.
positions_text = df["Number_of_Positions"].fillna("1").astype(str)

# Drop thousands separators and keep the first whitespace-separated token.
leading_token = positions_text.str.replace(",", "").str.split().str[0]

# Store the result as a nullable integer column.
df["Number_of_Positions"] = leading_token.astype("Int64")

In [45]:
# Values that cannot be read as numbers become missing, then the column is
# stored as a nullable integer.
applicants_numeric = pd.to_numeric(df["Number_of_Applicants"], errors="coerce")
df["Number_of_Applicants"] = applicants_numeric.astype("Int64")

In [46]:
# Whole number of applicants per open position (floor division).
df['Applicant_per_position'] = df['Number_of_Applicants'].floordiv(df['Number_of_Positions'])

In [47]:
def _parse_skill_list(raw):
    """Turn the text form of a skills list into a Python object.

    Only strings are parsed. Values that are already parsed (lists) or missing
    (NaN) are returned unchanged, so this cell survives missing data and can be
    re-run without `ast.literal_eval` failing on a non-string.
    """
    return ast.literal_eval(raw) if isinstance(raw, str) else raw


df["Skills"] = df["Skills"].apply(_parse_skill_list)

# Flatten every row's skills into one list; rows without a skill collection
# (e.g. missing values) contribute nothing.
all_skills = [
    skill
    for skills_list in df["Skills"]
    if isinstance(skills_list, (list, tuple, set))
    for skill in skills_list
]

# Rank skills by how often they occur and show the 200 most frequent names.
skill_frequency = Counter(all_skills)
top_200_skills = skill_frequency.most_common(200)
top_200_skill_names = [skill for skill, _ in top_200_skills]
print(top_200_skill_names)

['Communication', '', 'Engineering', 'Sales', 'English', 'sales skills', 'Customer Service', 'Management', 'Microsoft Office', 'Communication skills', 'Sales Target', 'Marketing', 'Accounting', 'Administration', 'Finance', 'Information Technology (IT)', 'Computer Science', 'Design', 'Social Media', 'AutoCAD', 'Customer Support', 'Financial Analysis', 'business', 'Business Administration', 'Mechanical Engineering', 'Customer Care', 'Human Resources (HR)', 'Software', 'Negotiation', 'Skills', 'Construction', 'media', 'Software Development', 'Supply Chain', 'Logistics', 'Project Management', 'excel', 'Financial Management', 'Recruitment', 'Electrical Engineering', 'Digital Marketing', 'MS Office', 'ERP', 'CRM', 'Mechanical', 'Architecture', 'Manufacturing', 'Office management', 'Business Development', 'Real Estate', 'E-Marketing', 'Electrical', 'Adobe Photoshop', 'Planning', 'Civil Engineering', 'HR', 'Education', 'Human Resources', 'Maintenance', 'Operations', 'Analysis', 'Quality Contro

In [48]:
# Abbreviation -> expanded form. Looked up with the whole lower-cased skill as the key.
standardization_map = {
    # office tooling
    "ms office": "microsoft office",
    "ms excel": "microsoft excel",
    # business / IT acronyms
    "erp": "enterprise resource planning",
    "crm": "customer relationship management",
    "seo": "search engine optimization",
    "hr": "human resources",
    "b2b": "business-to-business",
    "pmp": "project management professional",
    "qa": "quality assurance",
    "it": "information technology",
    "sql": "structured query language",
    # finance certifications
    "cpa": "certified public accountant",
    "cma": "certified management accountant",
    # engineering / industry
    "hvac": "heating, ventilation, and air conditioning",
    "fmcg": "fast-moving consumer goods",
    "bim": "building information modeling",
    "cad": "computer-aided design",
}

# Words stripped out of every skill after the abbreviation lookup.
words_to_remove = {
    "skills",
    "experience",
    "field",
    "technical",
    "office",
    "management",
    "assistant",
}


def clean_skill(skill):
    """Normalise one skill name.

    The skill is lower-cased and trimmed, swapped for its expanded form when the
    whole text is a key of ``standardization_map``, and then every word listed in
    ``words_to_remove`` is dropped.

    Returns the remaining words joined by single spaces, or ``None`` when no
    word is left.
    """
    normalized = skill.lower().strip()
    if normalized in standardization_map:
        normalized = standardization_map[normalized]

    kept_words = []
    for word in normalized.split():
        if word in words_to_remove:
            continue
        kept_words.append(word)

    if not kept_words:
        return None
    return " ".join(kept_words)

def process_skills(skills):
    """Return the cleaned skill names for one row.

    ``skills`` may be a list, or a string holding the Python literal of a list.
    Anything else, an unparsable string, or a string whose literal is not a
    list yields an empty list. Each entry goes through ``clean_skill`` and
    entries that clean down to nothing are dropped.
    """
    if isinstance(skills, list):
        candidates = skills
    elif isinstance(skills, str):
        try:
            candidates = ast.literal_eval(skills)
        except (SyntaxError, ValueError):
            return []
        if not isinstance(candidates, list):
            return []
    else:
        return []

    cleaned = []
    for raw_skill in candidates:
        tidy = clean_skill(raw_skill)
        if tidy:
            cleaned.append(tidy)
    return cleaned

# Discard rows that have no Skills value, then replace each remaining row's
# skills with their cleaned form.
df = df.dropna(axis=0, subset=["Skills"])
df = df.assign(Skills=df["Skills"].apply(process_skills))


In [49]:
# Write the cleaned DataFrame to disk; index=False leaves the row index out of the file.
df.to_csv("Final_cleaned_wuzzuf_file.csv", index=False)