### Overview

In [None]:
import pandas as pd

In [None]:
df = pd.read_csv('/content/wuzzuf_jobs_final.csv', on_bad_lines='skip', engine='python')

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12355 entries, 0 to 12354
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Title              12355 non-null  object
 1   Company            11916 non-null  object
 2   Date               11733 non-null  object
 3   Job Type           11651 non-null  object
 4   Work Setting       7495 non-null   object
 5   Location           12335 non-null  object
 6   Experience Needed  12344 non-null  object
 7   Career Level       12344 non-null  object
 8   Education Level    12344 non-null  object
 9   Salary             12344 non-null  object
 10  Job Categories     12344 non-null  object
 11  Skills             12019 non-null  object
 12  Job Description    12338 non-null  object
 13  Job Requirements   8402 non-null   object
 14  Link               12355 non-null  object
dtypes: object(15)
memory usage: 1.4+ MB


In [None]:
df.isnull().sum()

Unnamed: 0,0
Title,0
Company,439
Date,622
Job Type,704
Work Setting,4860
Location,20
Experience Needed,11
Career Level,11
Education Level,11
Salary,11


In [None]:
print(f"\nTotal rows: {len(df)}")


Total rows: 12355


### Title

In [None]:
print(f"Null values: {df['Title'].isnull().sum()}")

Null values: 0


In [None]:
empty_strings = (df['Title'].astype(str).str.strip() == '').sum()
print(f"Empty strings: {empty_strings}")

Empty strings: 0


In [None]:
not_specified_mask = df['Title'].astype(str).str.lower().str.contains(
    'not specified|not specific|n/a|na |none|confidential',
    na=False
)
not_specified_count = not_specified_mask.sum()
print(f"Contains nulls:  {not_specified_count}")

Contains nulls:  1


In [None]:
df[not_specified_mask]

Unnamed: 0,Title,Company,Date,Job Type,Work Setting,Location,Experience Needed,Career Level,Education Level,Salary,Job Categories,Skills,Job Description,Job Requirements,Link
11632,N/A - Cybersecurity - Products,Tata Communications\n-,posted 10 days ago,Full Time,,United Arab Emirates,Not Specified,Entry Level (Junior Level / Fresh Grad),Not Specified,Confidential,IT/Software Development,"information technology (it), computer science,...",- About The CompanyTata Communications Redefin...,,https://wuzzuf.net/jobs/p/g/n4lq6plnys2a--cybe...


In [None]:
short_titles = df['Title'].astype(str).str.len() < 4
short_count = short_titles.sum()
print(f"Titles with less than 3 characters: {short_count}")

Titles with less than 3 characters: 4


In [None]:
print(df[short_titles]['Title'])

1687     CFO
2975     CFO
4783     CFO
10500    CFO
Name: Title, dtype: object


In [None]:
print(f"Unique titles: {df['Title'].nunique()}")

Unique titles: 8264


In [None]:
def has_encoding_issues(text):
    if pd.isna(text):
        return False
    text = str(text)
    issues = ['�', 'â€', 'Ã', '\x00']
    return any(char in text for char in issues)

In [None]:
encoding_problems = df['Title'].apply(has_encoding_issues).sum()
print(f"Titles with potential encoding issues: {encoding_problems}")

Titles with potential encoding issues: 0


### Company

In [None]:
df['Company'].isnull().sum()

np.int64(439)

In [None]:
(df['Company'].astype(str).str.strip() == '').sum()


np.int64(0)

In [None]:
problematic_mask = df['Company'].astype(str).str.lower().str.contains(
    'not specified|not specific|n/a|confidential|undisclosed|anonymous|hidden',
    na=False,
    regex=True
)
problematic_count = problematic_mask.sum()
print(f"Contains problematic phrases: {problematic_count}")

if problematic_count > 0:
    print("\nExamples of problematic companies:")
    print(df[problematic_mask]['Company'].value_counts().head(10))


Contains problematic phrases: 877

Examples of problematic companies:
Company
Confidential Company\n-       875
Confidential Government\n-      2
Name: count, dtype: int64


In [None]:
df['Company'] = df['Company'].str.replace('Confidential Company\n-', 'Confidential', regex=False)
df['Company'] = df['Company'].str.replace('Confidential Government\n-', 'Confidential - Government', regex=False)

In [None]:
df['Company'] = df['Company'].fillna('Confidential')

In [None]:
print(f"Unique companies: {df['Company'].nunique()}")

Unique companies: 3362


In [None]:
df['Company'].apply(has_encoding_issues).sum()

np.int64(0)

In [None]:
import re

In [None]:
def contains_arabic(text):
    arabic_pattern = re.compile(r'[\u0600-\u06FF]')
    return bool(arabic_pattern.search(str(text)))


In [None]:
arabic_mask = df['Company'].apply(contains_arabic)
arabic_count = arabic_mask.sum()
print(arabic_count)

103


In [None]:
df['Company'].isnull().sum()

np.int64(0)

In [None]:
df.drop(columns='Date', inplace=True)

### Job Type

In [None]:
df['Job Type'].isnull().sum()

np.int64(704)

In [None]:
(df['Job Type'].astype(str).str.strip() == '').sum()

np.int64(0)

In [None]:
print(df['Job Type'].value_counts())

Job Type
Full Time              11156
Freelance / Project      239
Internship               120
Part Time                105
دوام كامل                 21
Shift Based                9
تدريب عملي                 1
Name: count, dtype: int64


In [None]:
df['Job Type'] = df['Job Type'].replace({'دوام كامل': 'Full Time', 'تدريب عملي': 'Internship'
})

In [None]:
career_job_mapping = df.groupby('Career Level')['Job Type'].agg(lambda x: x.mode()[0] if not x.mode().empty else 'Full Time').to_dict()

In [None]:
def impute_job_type(row):

    if pd.notna(row['Job Type']):
        return row['Job Type']

    title = str(row['Title']).lower()
    career_level = row['Career Level']

    if 'intern' in title:
        return 'Internship'

    if 'freelance' in title or 'freelancer' in title:
        return 'Freelance / Project'

    if 'part time' in title or 'part-time' in title:
        return 'Part Time'

    if 'contract' in title or 'temporary' in title or 'temp ' in title:
        return 'Freelance / Project'

    if pd.notna(career_level) and career_level in career_job_mapping:
        return career_job_mapping[career_level]

    return 'Full Time'


In [None]:
df['Job Type'] = df.apply(impute_job_type, axis=1)

In [None]:
df['Job Type'].isnull().sum()

np.int64(0)

In [None]:
df['Job Type'].astype(str).str.lower().str.contains(
    'not specified|not specific|n/a|none|unknown',
    na=False
).sum()


np.int64(0)

### Work Setting

In [None]:
df['Work Setting'].isnull().sum()

np.int64(4860)

In [None]:
(df['Work Setting'].astype(str).str.strip() == '').sum()


np.int64(0)

In [None]:
print(df['Work Setting'].value_counts())

Work Setting
On-site              6506
Hybrid                516
Remote                457
عمل من مقر الشركة      14
عمل عن بُعد             1
عمل هجين                1
Name: count, dtype: int64


In [None]:
df['Work Setting'] = df['Work Setting'].replace({
    'عمل من مقر الشركة': 'On-site',
    'عمل عن بُعد': 'Remote',
    'عمل هجين': 'Hybrid'
})

In [None]:
df['Work Setting'] = df['Work Setting'].fillna('Not Specified')

### Location

In [None]:
df['Location'].isnull().sum()


np.int64(20)

In [None]:
(df['Location'].astype(str).str.strip() == '').sum()

np.int64(0)

In [None]:
df['Location'].nunique()

69

In [None]:
df['Location'].astype(str).str.lower().str.contains(
    'not specified|not specific|n/a|none|unknown|unspecified',
    na=False
).sum()

np.int64(0)

In [None]:
print(df[df['Location'].apply(contains_arabic)]['Location'].value_counts())

Location
القاهرة                     7
مصر                         6
الإمارات العربية المتحدة    4
الجيزة                      2
المملكة العربية السعودية    2
البحر الأحمر                1
الشرقية                     1
Name: count, dtype: int64


In [None]:
df['Location'] = df['Location'].replace({
    'القاهرة': 'Cairo',
    'مصر': 'Egypt',
    'الإمارات العربية المتحدة': 'United Arab Emirates',
    'الجيزة': 'Giza',
    'المملكة العربية السعودية': 'Saudi Arabia',
    'البحر الأحمر': 'Red Sea',
    'الشرقية': 'Sharqia'
})

In [None]:
df['Location'] = df['Location'].fillna('Not Specified')

### Work Experience

In [None]:
df['Experience Needed'].isnull().sum()

np.int64(11)

In [None]:
(df['Experience Needed'].astype(str).str.strip() == '').sum()


np.int64(0)

In [None]:
df['Experience Needed'].value_counts()

Unnamed: 0_level_0,count
Experience Needed,Unnamed: 1_level_1
Not Specified,4098
3 To 5 Years,833
1 To 3 Years,752
2 To 5 Years,550
2 To 4 Years,385
...,...
0 To 11 Years,1
10 To 14 Years,1
15 To 17 Years,1
5 To 18 Years,1


In [None]:
df['Experience Needed'].apply(contains_arabic).sum()

np.int64(0)

In [None]:
df['Experience Needed'] = df['Experience Needed'].fillna('Not Specified')

### Career Level

In [None]:
df['Career Level'].isnull().sum()

np.int64(11)

In [None]:
(df['Career Level'].astype(str).str.strip() == '').sum()

np.int64(0)

In [None]:
print(df['Career Level'].value_counts())

Career Level
Experienced (Non-Mager)                      6682
Entry Level (Junior Level / Fresh Grad)      3616
Mager                                        1245
Education Level:                              325
Senior Magement (CEO, GM, Director, Head)     248
Not Specified                                 153
Student (Undergrad / Postgrad)                 54
ذو خبرة (غير إداري)                            12
مستوى مبتدئ (مبتدئ / خريج جديد)                 5
مدير                                            3
طالب (طالب جامعي / دراسات عُليا)                1
Name: count, dtype: int64


In [None]:
df = df[df['Career Level'] != 'Education Level:']

In [None]:
df['Career Level'] = df['Career Level'].replace('Mager', 'Manager')

In [None]:
df['Career Level'] = df['Career Level'].replace({
    'ذو خبرة (غير إداري)': 'Experienced (Non-Manager)',
    'مستوى مبتدئ (مبتدئ / خريج جديد)': 'Entry Level (Junior Level / Fresh Grad)',
    'مدير': 'Manager',
    'طالب (طالب جامعي / دراسات عُليا)': 'Student (Undergrad / Postgrad)'
})

In [None]:
df['Career Level'] = df['Career Level'].fillna('Not Specified')

In [None]:
df['Career Level'] = df['Career Level'].replace('Experienced (Non-Mager)', 'Experienced (Non-Manager)')

In [None]:
print(df['Career Level'].value_counts())

Career Level
Experienced (Non-Manager)                    6694
Entry Level (Junior Level / Fresh Grad)      3621
Manager                                      1248
Senior Magement (CEO, GM, Director, Head)     248
Not Specified                                 164
Student (Undergrad / Postgrad)                 55
Name: count, dtype: int64


### Education Level

In [None]:
df['Education Level'].isnull().sum()

np.int64(11)

In [None]:
(df['Education Level'].astype(str).str.strip() == '').sum()


np.int64(0)

In [None]:
print(df['Education Level'].value_counts())

Education Level
Not Specified                  6283
Bachelor's Degree              5651
High School (Or Equivalent)      28
Master's Degree                  19
درجة البكالوريوس                 12
غير محدد                          9
Diploma                           9
MBA                               6
Doctorate                         2
Name: count, dtype: int64


In [None]:
df['Education Level']=df['Education Level'].replace({
    "غير محدد": "Not Specified",
    "درجة البكالوريوس": "Bachelor's Degree",
})

In [None]:
df['Education Level']=df['Education Level'].fillna('Not Specified')

In [None]:
print(df['Education Level'].value_counts())

Education Level
Not Specified                  6303
Bachelor's Degree              5663
High School (Or Equivalent)      28
Master's Degree                  19
Diploma                           9
MBA                               6
Doctorate                         2
Name: count, dtype: int64


In [None]:
df.columns

Index(['Title', 'Company', 'Job Type', 'Work Setting', 'Location',
       'Experience Needed', 'Career Level', 'Education Level', 'Salary',
       'Job Categories', 'Skills', 'Job Description', 'Job Requirements',
       'Link'],
      dtype='object')

### Salary

In [None]:
df['Salary'].isnull().sum()

np.int64(11)

In [None]:
df['Salary'].value_counts().head(100)

Unnamed: 0_level_0,count
Salary,Unnamed: 1_level_1
Confidential,9706
Paid,70
"Confidential, Commission",42
"Confidential, Bonus",37
8000 To 12000 EGP Per Month,36
...,...
"Confidential, Social Insurance Medical Insurance Transportation In Work",3
400 To 500 USD Per Month,3
"Confidential, Device & Headset Will Be Provided.",3
15000 To 20000 SAR Per Month,3


In [None]:
df.drop(columns='Salary', inplace=True)

### Job Categories

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 12030 entries, 0 to 12354
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Title              12030 non-null  object
 1   Company            12030 non-null  object
 2   Job Type           12030 non-null  object
 3   Work Setting       12030 non-null  object
 4   Location           12030 non-null  object
 5   Experience Needed  12030 non-null  object
 6   Career Level       12030 non-null  object
 7   Education Level    12030 non-null  object
 8   Job Categories     12019 non-null  object
 9   Skills             12019 non-null  object
 10  Job Description    12030 non-null  object
 11  Job Requirements   8198 non-null   object
 12  Link               12030 non-null  object
dtypes: object(13)
memory usage: 1.3+ MB


In [None]:
df['Job Categories'].isnull().sum()

np.int64(11)

In [None]:
print(df['Job Categories'].nunique())

49


In [None]:
(df['Job Categories'].astype(str).str.strip() == '').sum()

np.int64(0)

In [None]:
df['Job Categories'].astype(str).str.lower().str.contains(
    'not specified|not specific|n/a|none|unknown',
    na=False
).sum()


np.int64(604)

In [None]:
df['Job Categories'].apply(contains_arabic).sum()

np.int64(21)

In [None]:
print(df[df['Job Categories'].apply(contains_arabic)]['Job Categories'].value_counts().head(100))


Job Categories
الهندسة - البناء/المدنية/المعمارية     6
التعليم/التدريس                        2
تكنولوجيا المعلومات/تطوير البرمجيات    2
الإدارة                                2
المبيعات/التجزئة                       2
التسويق/العلاقات العامة/الإعلان        2
العمليات/الإدارة                       1
خدمة العملاء/الدعم                     1
المحاسبة/المالية                       1
الطب/الرعاية الصحية                    1
الكتابة/التحرير                        1
Name: count, dtype: int64


In [None]:
arabic_translations = {
    'الهندسة - البناء/المدنية/المعمارية': 'Engineering - Construction/Civil/Architecture',
    'التعليم/التدريس': 'Education/Teaching',
    'تكنولوجيا المعلومات/تطوير البرمجيات': 'IT/Software Development',
    'الإدارة': 'Administration',
    'المبيعات/التجزئة': 'Sales/Retail',
    'التسويق/العلاقات العامة/الإعلان': 'Marketing/PR/Advertising',
    'العمليات/الإدارة': 'Operations/Management',
    'خدمة العملاء/الدعم': 'Customer Service/Support',
    'المحاسبة/المالية': 'Accounting/Finance',
    'الطب/الرعاية الصحية': 'Medical/Healthcare',
    'الكتابة/التحرير': 'Writing/Editorial'
}

df['Job Categories'] = df['Job Categories'].replace(arabic_translations)


In [None]:
df=df.dropna(subset=['Job Categories'])

In [None]:
corrections = {
    "Accounting/Fince": "Accounting / Finance",
    "Accounting/Finance": "Accounting / Finance",
    "Installation/Maintence/Repair": "Installation / Maintenance / Repair",
    "Operations/Magement": "Operations / Management",
    "Operations/Management": "Operations / Management",
    "Project/Program Magement": "Project / Program Management",
    "Alyst/Research": "Analyst / Research",
    "Media/Jourlism/Publishing": "Media / Journalism / Publishing",
    "Engineering - Oil & Gas/Energy": "Engineering - Oil & Gas / Energy",
}


In [None]:
df["Job Categories"] = (
    df["Job Categories"]
    .replace(corrections)
    .str.replace(r"\s*/\s*", " / ", regex=True)
    .str.replace(r"\s*-\s*", " - ", regex=True)
    .str.strip()
    .str.title()
)

### Skills

In [None]:
df['Skills'].isnull().sum()


np.int64(0)

In [None]:
(df['Skills'].astype(str).str.strip() == '').sum()


np.int64(0)

In [None]:
df['Skills'].nunique()

10069

In [None]:
df['Skills'].value_counts().head()

Unnamed: 0_level_0,count
Skills,Unnamed: 1_level_1
"Business Development, Sales, Sales Skills, Sales Target, Marketing, Customer Service, Customer Support, Customer Care, Market Research, Magement",33
"Sales, Sales Skills, Sales Target, Customer Service, Customer Support, Customer Care, Microsoft Office, Marketing, Business Development, Outdoor Sales",32
"Sales, Sales Skills, Customer Service, Sales Target, Customer Support, Customer Care, Account Magement, Marketing, Business Development, Information Technology (IT)",29
"Microsoft Office, Sales, Administration, Magement, Sales Skills, Office Magement, Customer Service, Accounting, Fince, Marketing",27
"Sales, Sales Skills, Sales Target, Customer Service, Magement, Customer Support, Customer Care, Marketing, Business Development, Microsoft Office",24


In [None]:
df['Skills'].apply(contains_arabic).sum()

np.int64(3)

In [None]:
df[df['Skills'].apply(contains_arabic)]['Skills'].value_counts().head(15)

Unnamed: 0_level_0,count
Skills,Unnamed: 1_level_1
"التفاوض, التواصل الفعال, إعداد التقارير, تحليل البيانات, إدارة الموردين, استخدام برامج ERP, حل المشكلات, العمل الجماعي",1
"مدخل بيانات, محاسبة",1
"محاسب, السعودية, هيئة الزكاة و الضريبة و الجمارك",1


In [None]:
df['Skills'] = df['Skills'].replace({
    "التفاوض, التواصل الفعال, إعداد التقارير, تحليل البيانات, إدارة الموردين, استخدام برامج ERP, حل المشكلات, العمل الجماعي":
        "Negotiation, Effective communication, Report preparation, Data analysis, Supplier management, Using ERP software, Problem solving, Teamwork",

    "مدخل بيانات, محاسبة":
        "Data entry, Accounting",

    "محاسب, السعودية, هيئة الزكاة و الضريبة و الجمارك":
        "Accountant, Saudi Arabia, Zakat, Tax and Customs Authority"
})


In [None]:
df['Skills'] = df['Skills'].replace({
    'Magement': 'Management',
    'Fince': 'Finance',
    'Fincial': 'Financial',
    'Alysis': 'Analysis',
    'Alytical': 'Analytical',
    'Organizatiol': 'Organizational',
    'Persol': 'Personal',
    'Coordition': 'Coordination',
    'Maintence': 'Maintenance',
    'Telesales': 'Tele Sales'
})

In [None]:
skill_map = {
        'ms office': 'Microsoft Office',
        'ms excel': 'Microsoft Excel',
        'ms word': 'Microsoft Word',
        'powerpoint': 'Microsoft PowerPoint',
        'excel': 'Microsoft Excel',
        'word': 'Microsoft Word',
        'admin': 'Administration',
        'admin work': 'Administration',
        'admin assistant': 'Administrative Assistant',
        'hr': 'Human Resources',
        'it': 'Information Technology',
        'qa': 'Quality Assurance',
        'qc': 'Quality Control',
        'f&b': 'Food & Beverage',
        'crm': 'CRM Software',
        'erp': 'ERP Systems',
        'pmp': 'Project Management Professional',
        'seo': 'Search Engine Optimization',
        'css': 'CSS',
        'html': 'HTML',
        'javascript': 'JavaScript',
        'jquery': 'jQuery',
    }
df['Skills'] = df['Skills'].replace(skill_map)

In [None]:
print(df['Skills'].sample(10))

170     Civil Engineering, Tendering, Estimation, Saud...
7549    ELA teacher, Social studies teacher, American ...
4004    Video Editing, Adobe After Effects, Adobe Phot...
2056    Talent Acquisition, Candidate Sourcing, Recrui...
6699    Accounting, Fince, Microsoft Office, Sales Ski...
9901              Sales Target, Indoor Sales, Sales Field
7325    sales, retail, sales skills, customer service,...
4275             Sales, Customer Service, Computer Skills
912     Engineering, Civil Engineering, Swimming Pools...
5972    English Teaching, English Education, Kids Teac...
Name: Skills, dtype: object


In [None]:
def clean_skills(skills_str):

    skills = [s.strip() for s in skills_str.split(',')]
    skills = [s for s in skills if s]

    cleaned_skills = []
    for skill in skills:
        skill = skill.title()
        skill_lower = skill.lower()
        skill = skill_map.get(skill_lower, skill)
        cleaned_skills.append(skill)

    seen = set()
    unique_skills = []
    for skill in cleaned_skills:
        if skill.lower() not in seen:
            seen.add(skill.title())
            unique_skills.append(skill)

    skills_str = ', '.join(unique_skills)
    return skills_str


In [None]:
df['Skills'] = df['Skills'].apply(clean_skills)

In [None]:
print(df['Skills'].head())

0    Administration, Office Magement, Microsoft Off...
1    Autocad, Mechanical Design, Shop Drawings Prep...
2    Administration, Office Magement, Microsoft Off...
3    Lean Manufacturing, Problem-Solving, Productio...
4    Graphic Design, Adobe Photoshop, Adobe Illustr...
Name: Skills, dtype: object


### Job Description and Requirements

In [None]:
import re

In [None]:
def clean_text_basic(text):
    if pd.isna(text) or not text:
        return ""

    text = str(text).strip()

    text = re.sub(r'[ \t]{2,}', ' ', text)

    text = re.sub(r'\n{3,}', '\n\n', text)

    lines = [line.strip() for line in text.split('\n')]
    text = '\n'.join(lines)

    return text.strip()


In [None]:
def extract_requirements_from_description(description):
    if pd.isna(description) or not description:
        return "", ""

    desc = str(description)

    req_patterns = [
        r'(?:^|\n)\s*(?:-\s*)?(?:qualifications?\s*(?:&|and)?\s*requirements?|requirements?\s*(?:&|and)?\s*qualifications?)\s*:?\s*(?:\n|$)',
        r'(?:^|\n)\s*(?:-\s*)?requirements?\s*:?\s*(?:\n|$)',
        r'(?:^|\n)\s*(?:-\s*)?qualifications?\s*:?\s*(?:\n|$)',
        r'(?:^|\n)\s*(?:-\s*)?job\s+requirements?\s*:?\s*(?:\n|$)',
        r'(?:^|\n)\s*(?:-\s*)?minimum\s+(?:requirements?|qualifications?)\s*:?\s*(?:\n|$)',
        r'(?:^|\n)\s*(?:-\s*)?required\s+(?:skills?|qualifications?|experience)\s*:?\s*(?:\n|$)',

        r'(?:^|\n)\s*(?:-\s*)?what\s+we(?:\'re| are)\s+looking\s+for\s*:?\s*(?:\n|$)',
        r'(?:^|\n)\s*(?:-\s*)?we(?:\'re| are)\s+looking\s+for\s+(?:someone|a\s+(?:candidate|professional|person))\s+(?:with|who)\s*:?\s*',
        r'(?:^|\n)\s*(?:-\s*)?looking\s+for\s+(?:someone|a\s+(?:candidate|professional|person))\s+(?:with|who)\s*:?\s*',
        r'(?:^|\n)\s*(?:-\s*)?we\s+(?:are\s+)?looking\s+for\s*:?\s*(?:\n|$)',
        r'(?:^|\n)\s*(?:-\s*)?we\s+(?:are\s+)?seeking\s+(?:someone|a\s+(?:candidate|professional|person))\s+(?:with|who)\s*:?\s*',
        r'(?:^|\n)\s*(?:-\s*)?seeking\s+(?:someone|a\s+(?:candidate|professional|person))\s+(?:with|who)\s*:?\s*',

        r'(?:^|\n)\s*(?:-\s*)?(?:ideal|successful|perfect)\s+candidate\s*:?\s*(?:\n|$)',
        r'(?:^|\n)\s*(?:-\s*)?(?:the\s+)?(?:ideal|successful|perfect)\s+candidate\s+(?:will|should|must)\s+have\s*:?\s*',
        r'(?:^|\n)\s*(?:-\s*)?you\s+should\s+have\s*:?\s*(?:\n|$)',
        r'(?:^|\n)\s*(?:-\s*)?you\s+(?:will\s+)?need\s*:?\s*(?:\n|$)',
        r'(?:^|\n)\s*(?:-\s*)?candidate\s+(?:should|must|will)\s*:?\s*(?:\n|$)',

        r'(?:^|\n)\s*(?:-\s*)?skills?\s+(?:&|and)\s+(?:experience|qualifications?)\s*:?\s*(?:\n|$)',
        r'(?:^|\n)\s*(?:-\s*)?experience\s+(?:&|and)\s+(?:skills?|qualifications?)\s*:?\s*(?:\n|$)',
        r'(?:^|\n)\s*(?:-\s*)?required\s+skills?\s*:?\s*(?:\n|$)',
        r'(?:^|\n)\s*(?:-\s*)?key\s+skills?\s*:?\s*(?:\n|$)',

        r'(?:^|\n)\s*(?:-\s*)?education(?:al)?\s+(?:&|and)?\s*experience\s*:?\s*(?:\n|$)',
        r'(?:^|\n)\s*(?:-\s*)?must\s+have\s*:?\s*(?:\n|$)',
        r'(?:^|\n)\s*(?:-\s*)?(?:essential|preferred)\s+(?:skills?|qualifications?|experience)\s*:?\s*(?:\n|$)',

        r'(?:^|\n)\s*(?:-\s*)?to\s+be\s+successful\s*,?\s*you\s+(?:will|should|must)\s*:?\s*',
        r'(?:^|\n)\s*(?:-\s*)?to\s+succeed\s+in\s+this\s+role\s*,?\s*you\s+(?:will|should|must)\s*:?\s*',
    ]

    best_match = None
    best_position = len(desc)

    for pattern in req_patterns:
        match = re.search(pattern, desc, re.IGNORECASE | re.MULTILINE)
        if match and match.start() < best_position:
            best_match = match
            best_position = match.start()

    if best_match:
        split_point = best_match.start()

        description_part = desc[:split_point].strip()

        requirements_part = desc[split_point:].strip()

        requirements_part = requirements_part[best_match.end() - best_match.start():].strip()

        if len(requirements_part) > 50:
            return description_part, requirements_part

    return desc, ""


In [None]:
def clean_job_description(description):
    if pd.isna(description) or not description:
        return ""

    if contains_arabic(description):
        description = re.sub(r'[\u0600-\u06FF]+', '', str(description))

    cleaned = clean_text_basic(description)

    cleaned = cleaned.replace('Magement', 'Management')
    cleaned = cleaned.replace('Fince', 'Finance')
    cleaned = cleaned.replace('Fincial', 'Financial')
    cleaned = cleaned.replace('Alysis', 'Analysis')
    cleaned = cleaned.replace('Alytical', 'Analytical')
    cleaned = cleaned.replace('Organizatiol', 'Organizational')
    cleaned = cleaned.replace('Persol', 'Personal')
    cleaned = cleaned.replace('Coordition', 'Coordination')
    cleaned = cleaned.replace('Maintence', 'Maintenance')

    return cleaned


In [None]:
def clean_job_requirements(requirements):
    if pd.isna(requirements) or not requirements:
        return ""

    cleaned = str(requirements)

    if contains_arabic(cleaned):
        cleaned = re.sub(r'[\u0600-\u06FF]+', '', cleaned)

    cleaned = clean_text_basic(cleaned)

    corrections = {
        'Magement': 'Management',
        'Fince': 'Finance',
        'Fincial': 'Financial',
        'Alysis': 'Analysis',
        'Alytical': 'Analytical'
    }

    for wrong, right in corrections.items():
        cleaned = cleaned.replace(wrong, right)

    return cleaned


In [None]:
def process_descriptions_and_requirements(df):

    df['Desc_Clean'] = ""
    df['Req_Clean'] = ""
    df['Req_Extracted'] = False
    for idx in df.index:
        description = df.loc[idx, 'Job Description']
        requirements = df.loc[idx, 'Job Requirements']

        if pd.isna(requirements) or str(requirements).strip() == '':
            desc_clean, req_extracted = extract_requirements_from_description(description)

            if req_extracted:
                df.loc[idx, 'Desc_Clean'] = clean_job_description(desc_clean)
                df.loc[idx, 'Req_Clean'] = clean_job_requirements(req_extracted)
                df.loc[idx, 'Req_Extracted'] = True
            else:
                df.loc[idx, 'Desc_Clean'] = clean_job_description(description)
                df.loc[idx, 'Req_Clean'] = "Please refer to the job description for requirements."
                df.loc[idx, 'Req_Extracted'] = False
        else:
            df.loc[idx, 'Desc_Clean'] = clean_job_description(description)
            df.loc[idx, 'Req_Clean'] = clean_job_requirements(requirements)
            df.loc[idx, 'Req_Extracted'] = False

    extracted_count = df['Req_Extracted'].sum()
    still_null = (df['Req_Clean'] == "Please refer to the job description for requirements.").sum()

    df['Job Description'] = df['Desc_Clean']
    df['Job Requirements'] = df['Req_Clean']

    df.drop(['Desc_Clean', 'Req_Clean'], axis=1, inplace=True)

    return df


In [None]:
df = process_descriptions_and_requirements(df)

In [None]:
pd.set_option('display.max_colwidth', None)

### Grammar Fixes

In [None]:
import pandas as pd
import re
from collections import Counter


def extract_all_words(df, columns=None):
    """
    Extract all unique words from specified columns
    """
    if columns is None:
        columns = ['Title', 'Job Description', 'Job Requirements', 'Skills']

    columns = [col for col in columns if col in df.columns]

    print(f"Extracting words from: {columns}\n")

    all_words = []

    for col in columns:
        print(f"Processing {col}...", end=' ')
        texts = df[col].dropna().astype(str)

        for text in texts:
            words = re.findall(r'\b[a-zA-Z]+\b', text)
            all_words.extend(words)

        print(f"✓ ({len(texts)} rows)")

    return all_words


def analyze_words(words, min_frequency=5):
    """
    Analyze words and find suspicious patterns
    """
    print(f"\nAnalyzing {len(words)} total words...")

    word_counts = Counter(words)
    print(f"Found {len(word_counts)} unique words\n")

    frequent_words = {word: count for word, count in word_counts.items()
                     if count >= min_frequency}

    print(f"{'='*80}")
    print(f"WORDS APPEARING {min_frequency}+ TIMES: {len(frequent_words)}")
    print(f"{'='*80}\n")

    return word_counts, frequent_words



def find_suspicious_patterns(word_counts, min_frequency=5):
    """
    Find words with suspicious patterns that are likely typos
    """
    suspicious = []

    patterns = {
        'double letters': r'(.)\1{2,}',
        'iol ending': r'\w+iol$',
        'ence ending': r'\w+ence$',
        'ition ending': r'\w+ition$',
        'ger ending': r'\w+ger$',
        'nce/nance': r'\w+nce$',
        'ment/gement': r'\w+gement$',
        'lysis': r'\w*[Aa]lysis$',
        'lytical': r'\w*[Aa]lytical$',
    }

    for word, count in word_counts.items():
        if count < min_frequency:
            continue

        if len(word) < 4:
            continue

        for pattern_name, pattern in patterns.items():
            if re.search(pattern, word, re.IGNORECASE):
                suspicious.append({
                    'word': word,
                    'count': count,
                    'pattern': pattern_name
                })
                break

    return suspicious



def find_similar_word_pairs(word_counts, min_frequency=5):
    """
    Find pairs of similar words that might be typo variations
    """
    pairs = []
    words = [w for w, c in word_counts.items() if c >= min_frequency]

    print("Finding similar word pairs...")

    for i, word1 in enumerate(words):
        for word2 in words[i+1:]:
            if len(word1) < 4 or len(word2) < 4:
                continue

            if abs(len(word1) - len(word2)) > 2:
                continue

            if (word1.lower() in word2.lower() or
                word2.lower() in word1.lower() or
                levenshtein_distance(word1.lower(), word2.lower()) <= 2):

                count1 = word_counts[word1]
                count2 = word_counts[word2]

                if max(count1, count2) / min(count1, count2) > 3:
                    pairs.append({
                        'word1': word1,
                        'count1': count1,
                        'word2': word2,
                        'count2': count2,
                        'distance': levenshtein_distance(word1.lower(), word2.lower())
                    })

    return pairs


def levenshtein_distance(s1, s2):
    """Calculate edit distance between two strings"""
    if len(s1) < len(s2):
        return levenshtein_distance(s2, s1)

    if len(s2) == 0:
        return len(s1)

    previous_row = range(len(s2) + 1)
    for i, c1 in enumerate(s1):
        current_row = [i + 1]
        for j, c2 in enumerate(s2):
            insertions = previous_row[j + 1] + 1
            deletions = current_row[j] + 1
            substitutions = previous_row[j] + (c1 != c2)
            current_row.append(min(insertions, deletions, substitutions))
        previous_row = current_row

    return previous_row[-1]



def display_top_words(word_counts, n=100):
    """Display most frequent words"""
    print(f"\n{'='*80}")
    print(f"TOP {n} MOST FREQUENT WORDS")
    print(f"{'='*80}\n")

    sorted_words = sorted(word_counts.items(), key=lambda x: x[1], reverse=True)

    print(f"{'Word':<30} {'Count':<10}")
    print("-" * 80)

    for word, count in sorted_words[:n]:
        print(f"{word:<30} {count:<10}")


def display_suspicious_words(suspicious):
    """Display words with suspicious patterns"""
    print(f"\n{'='*80}")
    print(f"SUSPICIOUS WORDS (Likely Typos)")
    print(f"{'='*80}\n")

    if not suspicious:
        print("✓ No suspicious patterns found!")
        return

    suspicious.sort(key=lambda x: x['count'], reverse=True)

    print(f"{'Word':<30} {'Count':<10} {'Pattern'}")
    print("-" * 80)

    for item in suspicious:
        print(f"{item['word']:<30} {item['count']:<10} {item['pattern']}")


def display_similar_pairs(pairs):
    """Display similar word pairs"""
    print(f"\n{'='*80}")
    print(f"SIMILAR WORD PAIRS (Potential Typo Variations)")
    print(f"{'='*80}\n")

    if not pairs:
        print("✓ No suspicious pairs found!")
        return

    pairs.sort(key=lambda x: max(x['count1'], x['count2']), reverse=True)

    print(f"{'Word 1':<25} {'Count':<8} {'Word 2':<25} {'Count':<8} {'Distance'}")
    print("-" * 80)

    for pair in pairs[:50]:
        print(f"{pair['word1']:<25} {pair['count1']:<8} "
              f"{pair['word2']:<25} {pair['count2']:<8} {pair['distance']}")


def save_to_csv(word_counts, suspicious, pairs):
    """Save results to CSV files for manual review"""

    df_words = pd.DataFrame(list(word_counts.items()), columns=['word', 'count'])
    df_words = df_words.sort_values('count', ascending=False)
    df_words.to_csv('all_words_frequency.csv', index=False)
    print(f"\n✓ Saved all words to 'all_words_frequency.csv'")

    # Suspicious words
    if suspicious:
        df_suspicious = pd.DataFrame(suspicious)
        df_suspicious.to_csv('suspicious_words.csv', index=False)
        print(f"✓ Saved suspicious words to 'suspicious_words.csv'")

    # Similar pairs
    if pairs:
        df_pairs = pd.DataFrame(pairs)
        df_pairs.to_csv('similar_word_pairs.csv', index=False)
        print(f"✓ Saved similar pairs to 'similar_word_pairs.csv'")



def review_suspicious_words(suspicious):
    """
    Interactively review suspicious words
    """
    corrections = {}

    print(f"\n{'='*80}")
    print("REVIEW SUSPICIOUS WORDS")
    print(f"{'='*80}")
    print("\nFor each word:")
    print("  y = It's a typo (then type correct spelling)")
    print("  n = Not a typo (skip)")
    print("  q = Quit review\n")

    for i, item in enumerate(suspicious, 1):
        word = item['word']
        count = item['count']
        pattern = item['pattern']

        print(f"\n[{i}/{len(suspicious)}] '{word}' - {pattern} (appears {count} times)")

        choice = input("Is this a typo? (y/n/q): ").lower().strip()

        if choice == 'y':
            correct = input(f"  Correct spelling of '{word}': ").strip()
            if correct:
                corrections[word] = correct
                print(f"  ✓ Added: {word} → {correct}")
        elif choice == 'q':
            break
        elif choice == 'n':
            print("  ⊘ Skipped")

    return corrections


def review_word_pairs(pairs):
    """
    Review similar word pairs
    """
    corrections = {}

    print(f"\n{'='*80}")
    print("REVIEW SIMILAR WORD PAIRS")
    print(f"{'='*80}")
    print("\nFor each pair, which is the typo?")
    print("  1 = First word is typo")
    print("  2 = Second word is typo")
    print("  n = Neither is typo")
    print("  q = Quit\n")

    for i, pair in enumerate(pairs[:30], 1):
        word1 = pair['word1']
        count1 = pair['count1']
        word2 = pair['word2']
        count2 = pair['count2']

        print(f"\n[{i}] '{word1}' ({count1}) vs '{word2}' ({count2})")

        choice = input("Which is typo? (1/2/n/q): ").strip()

        if choice == '1':
            corrections[word1] = word2
            print(f"  ✓ Added: {word1} → {word2}")
        elif choice == '2':
            corrections[word2] = word1
            print(f"  ✓ Added: {word2} → {word1}")
        elif choice == 'q':
            break

    return corrections



def generate_typo_dict_code(corrections):
    """
    Generate Python code for the typo dictionary
    """
    if not corrections:
        print("\n⚠️  No corrections to generate")
        return

    print(f"\n{'='*80}")
    print("GENERATED TYPO DICTIONARY CODE")
    print(f"{'='*80}\n")

    print("KNOWN_TYPOS = {")
    for typo, correction in sorted(corrections.items()):
        print(f"    '{typo}': '{correction}',")
    print("}")

    with open('typo_dictionary.txt', 'w') as f:
        f.write("KNOWN_TYPOS = {\n")
        for typo, correction in sorted(corrections.items()):
            f.write(f"    '{typo}': '{correction}',\n")
        f.write("}\n")

    print(f"\n✓ Saved to 'typo_dictionary.txt'")
    print(f"✓ Total corrections: {len(corrections)}")


def manual_typo_detection_workflow(df, interactive=True):
    """
    Complete manual typo detection workflow
    """
    print("="*80)
    print("MANUAL TYPO DETECTION WORKFLOW")
    print("="*80 + "\n")

    print("STEP 1: Extracting words...")
    all_words = extract_all_words(df)

    print("\nSTEP 2: Analyzing word patterns...")
    word_counts, frequent_words = analyze_words(all_words, min_frequency=5)

    print("\nSTEP 3: Finding suspicious patterns...")
    suspicious = find_suspicious_patterns(word_counts, min_frequency=5)

    print("\nSTEP 4: Finding similar word pairs...")
    pairs = find_similar_word_pairs(word_counts, min_frequency=10)

    print("\nSTEP 5: Displaying results...")
    display_top_words(word_counts, n=50)
    display_suspicious_words(suspicious)
    display_similar_pairs(pairs)

    print("\nSTEP 6: Saving results...")
    save_to_csv(word_counts, suspicious, pairs)

    corrections = {}
    if interactive:
        print("\n" + "="*80)
        review = input("\nStart interactive review? (y/n): ").lower()

        if review == 'y':
            print("\nReviewing suspicious words...")
            corrections1 = review_suspicious_words(suspicious)
            corrections.update(corrections1)

            if pairs:
                print("\n\nReviewing word pairs...")
                corrections2 = review_word_pairs(pairs)
                corrections.update(corrections2)

    if corrections:
        generate_typo_dict_code(corrections)

    print(f"\n{'='*80}")
    print("✓ WORKFLOW COMPLETE!")
    print(f"  Total unique words: {len(word_counts)}")
    print(f"  Suspicious words found: {len(suspicious)}")
    print(f"  Similar pairs found: {len(pairs)}")
    print(f"  Corrections created: {len(corrections)}")
    print(f"{'='*80}\n")

    return word_counts, suspicious, pairs, corrections



def quick_manual_detection(df):
    """
    Quick analysis without interaction - just generate CSVs to review
    """
    all_words = extract_all_words(df)
    word_counts, frequent_words = analyze_words(all_words, min_frequency=5)
    suspicious = find_suspicious_patterns(word_counts, min_frequency=5)
    pairs = find_similar_word_pairs(word_counts, min_frequency=10)

    display_suspicious_words(suspicious)
    display_similar_pairs(pairs)
    save_to_csv(word_counts, suspicious, pairs)

    print(f"\n{'='*80}")
    print("NEXT STEPS:")
    print(f"{'='*80}")
    print("1. Open 'all_words_frequency.csv' and look for typos")
    print("2. Review 'suspicious_words.csv' for likely typos")
    print("3. Check 'similar_word_pairs.csv' for typo variations")
    print("4. Add corrections to KNOWN_TYPOS dictionary")
    print(f"{'='*80}\n")

    return word_counts, suspicious, pairs





In [None]:
"""
# Option 1: Quick analysis (generates CSVs, no interaction)
word_counts, suspicious, pairs = quick_manual_detection(df)

# Option 2: Full workflow with interactive review
word_counts, suspicious, pairs, corrections = manual_typo_detection_workflow(df, interactive=True)

# Option 3: Just analyze and display (no files)
all_words = extract_all_words(df)
word_counts, frequent_words = analyze_words(all_words)
suspicious = find_suspicious_patterns(word_counts)
display_suspicious_words(suspicious)
"""

In [None]:
word_counts, suspicious, pairs, corrections = manual_typo_detection_workflow(df, interactive=True)

MANUAL TYPO DETECTION WORKFLOW

STEP 1: Extracting words...
Extracting words from: ['Title', 'Job Description', 'Job Requirements', 'Skills']

Processing Title... ✓ (12019 rows)
Processing Job Description... ✓ (12019 rows)
Processing Job Requirements... ✓ (12019 rows)
Processing Skills... ✓ (12019 rows)

STEP 2: Analyzing word patterns...

Analyzing 3637003 total words...
Found 57956 unique words

WORDS APPEARING 5+ TIMES: 17802


STEP 3: Finding suspicious patterns...

STEP 4: Finding similar word pairs...
Finding similar word pairs...

STEP 5: Displaying results...

TOP 50 MOST FREQUENT WORDS

Word                           Count     
--------------------------------------------------------------------------------
and                            243692    
to                             103980    
the                            84876     
of                             70593     
in                             68716     
with                           54961     
a                     

### Converting to RAG format / Finetuning

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 12019 entries, 0 to 12354
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Title              12019 non-null  object
 1   Company            12019 non-null  object
 2   Job Type           12019 non-null  object
 3   Work Setting       12019 non-null  object
 4   Location           12019 non-null  object
 5   Experience Needed  12019 non-null  object
 6   Career Level       12019 non-null  object
 7   Education Level    12019 non-null  object
 8   Job Categories     12019 non-null  object
 9   Skills             12019 non-null  object
 10  Job Description    12019 non-null  object
 11  Job Requirements   12019 non-null  object
 12  Link               12019 non-null  object
 13  Req_Extracted      12019 non-null  bool  
dtypes: bool(1), object(13)
memory usage: 1.5+ MB


In [None]:
df.drop(columns=['Req_Extracted'], inplace=True)

In [None]:
df['Link'].duplicated().sum()

np.int64(388)

In [None]:
df = df.drop_duplicates(subset=["Link"])

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 11631 entries, 0 to 12354
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Title              11631 non-null  object
 1   Company            11631 non-null  object
 2   Job Type           11631 non-null  object
 3   Work Setting       11631 non-null  object
 4   Location           11631 non-null  object
 5   Experience Needed  11631 non-null  object
 6   Career Level       11631 non-null  object
 7   Education Level    11631 non-null  object
 8   Job Categories     11631 non-null  object
 9   Skills             11631 non-null  object
 10  Job Description    11631 non-null  object
 11  Job Requirements   11631 non-null  object
 12  Link               11631 non-null  object
dtypes: object(13)
memory usage: 1.2+ MB


In [None]:
df.to_csv('wuzzuf_jobs_cleaned.csv', index=False)

In [None]:
# RAG Formatting:
import json
def format_for_rag(row):

    content_parts = []

    if pd.notna(row['Job Description']) and str(row['Job Description']).strip():
        content_parts.append(str(row['Job Description']))

    if pd.notna(row['Job Requirements']) and str(row['Job Requirements']).strip():
        content_parts.append("\n\nRequirements:\n" + str(row['Job Requirements']))

    full_content = "\n\n".join(content_parts).strip()

    return {
        'id': int(row.name),
        'title': str(row['Title']),
        'company': str(row['Company']),
        'skills': str(row['Skills']) if pd.notna(row['Skills']) else "",
        'career_level': str(row['Career Level']),
        'experience_needed': str(row['Experience Needed']),
        'location': str(row['Location']),
        'job_type': str(row['Job Type']),
        'work_setting': str(row['Work Setting']),
        'job_category': str(row['Job Categories']),
        'education_level': str(row['Education Level']),

        'content': full_content,

        'source_url': str(row['Link'])
    }


In [None]:
rag_data = df.apply(format_for_rag, axis=1).tolist()

with open('wuzzuf_for_rag.json', 'w', encoding='utf-8') as f:
    json.dump(rag_data, f, ensure_ascii=False, indent=2)


In [None]:
print(json.dumps(rag_data[0], indent=2, ensure_ascii=False))

{
  "id": 0,
  "title": "Administrative Mager",
  "company": "OKX\n-",
  "skills": "Administration, Office Magement, Microsoft Office, Secretary, Administration, Business Administration, Customer Service, Magement, English, Administrative Assistant",
  "career_level": "Experienced (Non-Manager)",
  "experience_needed": "Not Specified",
  "location": "United Arab Emirates",
  "job_type": "Freelance / Project",
  "work_setting": "Not Specified",
  "job_category": "Administration",
  "education_level": "Not Specified",
  "content": "- Who We AreAt OKX, we believe that the future will be reshaped by crypto, and ultimately contribute to every individual's freedom.OKX is a leading crypto exchange, and the developer of OKX Wallet, giving millions access to crypto trading and decentralized crypto applications (dApps). OKX is also a trusted brand by hundreds of large institutions seeking access to crypto markets. We are safe and reliable, backed by our Proof of Reserves.Across our multiple offi

In [None]:
# Finetuning Format:
def format_for_finetuning(row):

    output_parts = []

    if pd.notna(row['Job Description']) and str(row['Job Description']).strip():
        output_parts.append(str(row['Job Description']))

    if pd.notna(row['Job Requirements']) and str(row['Job Requirements']).strip():
        output_parts.append("\n\nRequirements:\n" + str(row['Job Requirements']))

    ideal_output = "\n\n".join(output_parts).strip()

    instruction = f"""Write a professional job posting for the following position:

Job Title: {row['Title']}
Career Level: {row['Career Level']}
Required Skills: {row['Skills']}
Experience Needed: {row['Experience Needed']}
Location: {row['Location']}
Job Type: {row['Job Type']}
Work Setting: {row['Work Setting']}

Generate a complete job description with responsibilities and requirements."""

    return {
        "instruction": instruction,
        "response": ideal_output,

        "metadata": {
            "job_category": str(row['Job Categories']),
            "career_level": str(row['Career Level']),
            "location": str(row['Location'])
        }
    }




In [None]:
finetuning_data = df.apply(format_for_finetuning, axis=1).tolist()

with open('wuzzuf_for_finetuning.jsonl', 'w', encoding='utf-8') as f:
    for item in finetuning_data:
        f.write(json.dumps(item, ensure_ascii=False) + '\n')
