# Wuzzuf Web Scraping and Data Cleaning

### Import Library

In [None]:
import requests #to import HTML Code from Website
from bs4 import BeautifulSoup
import pandas as pd
import warnings
# Ignore all warnings for this cell
warnings.filterwarnings('ignore')
import re

### Scraping

In [None]:
# Start from an empty DataFrame that has one column per scraped field
COLUMN_NAMES = [
    "job_name",
    "company_name",
    "location",
    "city_name",
    "job_type",
    "expriance_level",
    "year_expriance",
    "skils",
]
df = pd.DataFrame(columns=COLUMN_NAMES)
df

In [None]:
# Get Data
PAGE_COUNT = 644      # number of search-result pages to request
REQUEST_TIMEOUT = 30  # seconds to wait for a page before skipping it


def _field(parts, position):
    """Return parts[position], or "" when the text had fewer pieces than expected."""
    return parts[position] if position < len(parts) else ""


def _parse_job_card(job):
    """Return the eight column values of one job card, or None if an element is missing."""
    title = job.find("h2", class_="css-m604qf")
    company = job.find("div", class_="css-d7j1kk")
    place = job.find("span", class_="css-5wys0k")
    kind = job.find("span", class_="css-1ve4b75 eoyjyou0")
    marker = job.find("div", class_="css-1lh32fc")
    # Level, years and skills all live in the node that follows the marker div.
    details = marker.next_sibling if marker is not None else None
    if any(node is None for node in (title, company, place, kind, details)):
        return None

    place_parts = place.text.split(", ")
    detail_parts = details.text.split(" · ")
    return [
        title.text,
        company.text.split(" - ")[0],
        place_parts[0],
        _field(place_parts, 1),   # city_name; "" when the location has no ", "
        kind.text,
        detail_parts[0],
        _field(detail_parts, 1),  # year_expriance; "" when absent
        " ".join(detail_parts[2:]),
    ]


# Collect rows in a plain list and build the frame once at the end; growing a
# DataFrame one row at a time gets slower with every row added.
rows = []
for i in range(PAGE_COUNT):
    try:
        response = requests.get(
            f"https://wuzzuf.net/search/jobs/?a=hpb&q=&start={i}",
            timeout=REQUEST_TIMEOUT,
        )
    except requests.RequestException as error:
        # One unreachable page should not throw away everything scraped so far.
        print(f"Skipping page {i}: {error}")
        continue

    soup = BeautifulSoup(response.content, "html.parser")
    for job in soup.find_all("div", class_="css-1gatmva e1v1l3u10"):
        row = _parse_job_card(job)
        if row is not None:
            rows.append(row)

    # Progress marker every 50 pages
    if i % 50 == 0:
        print("Done")

scraped = pd.DataFrame(rows, columns=df.columns)
df = scraped if df.empty else pd.concat([df, scraped], ignore_index=True)

In [None]:
# Display the DataFrame filled by the scraping loop
df

### Data Cleaning

In [None]:
# Some information about the dataset (columns, non-null counts, dtypes)
df.info()

In [None]:
# Look at the first 50 distinct job titles
df['job_name'].unique()[:50]

Oh, we have a big task: titles like [Customer Support Representative - Commerce ( Accounting )] force us to keep only the first part and drop the other parts.

In [None]:
# Frequency of the first 20 entries of the job-title counts
df['job_name'].value_counts()[:20]

Let's do that with our simple `clean_jop_title` function.

In [None]:
def clean_jop_title(jop_name):
    """
    Reduce a raw job title to its leading part.

    The title is cut at the first separator and surrounding whitespace is
    removed. Separators are "/", "(", "|", ":" and a hyphen used as a dash,
    i.e. one with whitespace (or the start/end of the string) on at least one
    side. A hyphen inside a word is kept, so "Front-End Developer" is not
    truncated to "Front".

    Args:
        jop_name (str): the job title string to be cleaned.

    Returns:
        str: the cleaned job title string.
    """
    # (?<!\S)-  : hyphen at the start of the string or right after whitespace
    # -(?!\S)   : hyphen at the end of the string or right before whitespace
    # [/(|:]    : the remaining single-character separators
    head = re.split(r"(?<!\S)-|-(?!\S)|[/(|:]", jop_name, maxsplit=1)[0]
    return head.strip()

# Clean every job title, then look at the first 50 distinct results
cleaned_titles = df['job_name'].apply(clean_jop_title)
df['job_name'] = cleaned_titles
df['job_name'].unique()[:50]

In [None]:
# Job-title counts again, now on the cleaned titles (first 20 entries)
df['job_name'].value_counts()[:20]

The job titles are now significantly more consistent, which is a notable improvement. The noticeable rise in the frequency of each job title indicates this positive development.

=======================================================================================================================

In [None]:
# List the distinct company names
df['company_name'].unique()

In [None]:
# Company-name counts (first 20 entries)
df['company_name'].value_counts()[:20]

I don't believe there is a problem with this column.

=======================================================================================================================

In [None]:
# List the distinct raw values of the experience-years column
df['year_expriance'].unique()

I believe there are a lot of issues with this column.

- First, we must remove the "Yrs of Exp" portion from each value.
- Remove values such as " " and "-". Next, remove any text values.
- Perform certain actions to create a brief unique list.

Get rid of this part in "year_expriance"

In [None]:
def extract_numeric_and_hyphen(input_string):
    """
    Keep only the ASCII digits and hyphens of a string, in their original order.

    Args:
        input_string (str): The text to filter.

    Returns:
        str: The characters of input_string that are "0"-"9" or "-";
        every other character is dropped.
    """
    allowed = "0123456789-"
    return "".join(char for char in input_string if char in allowed)

# Strip everything except digits and hyphens, then inspect what is left
digits_only = df['year_expriance'].apply(extract_numeric_and_hyphen)
df['year_expriance'] = digits_only
df['year_expriance'].unique()

Get rid of values such as " " and "-"

In [None]:
# Drop the rows whose "year_expriance" is an empty string ('') or a lone hyphen ('-').
unusable = df['year_expriance'].isin(['', '-'])
df = df[~unusable]

df['year_expriance'].unique()

Turn values like '1-1', '8-8', '20-20' into '1', '8', '20'

In [None]:
def clean_expriance(exp):
    """
    Collapse a degenerate range such as "8-8" into the single value "8".

    Args:
        exp (str): The 'Experience years' value to be cleaned.

    Returns:
        str: One side of the range when exp consists of exactly two
        hyphen-separated parts that are equal; otherwise exp unchanged.
    """
    # Anything that is not exactly "<part>-<part>" is left alone
    if exp.count('-') != 1:
        return exp

    low, high = exp.split('-')
    return low if low == high else exp
    
# Collapse equal-sided ranges in every row, then inspect the distinct values
collapsed = df['year_expriance'].apply(clean_expriance)
df['year_expriance'] = collapsed
df['year_expriance'].unique()

Convert single values into range format

In [None]:
def convert_single_to_range(data, *, sort=False):
    """
    Replace single values with the existing range that contains them.

    A single value (an item without "-") is replaced by the first range in
    ``data`` (by order of first appearance) whose bounds include it. Items
    that match no range, and items that cannot be parsed, are passed through
    unchanged.

    The result has the same length and order as ``data``, so it can be
    assigned straight back to the DataFrame column it came from. The input
    list is not modified, and equal inputs always map to the same range.

    Args:
        data (list): Strings that are either single values ("4") or ranges
            ("3-5").
        sort (bool): Keyword-only. When True, the result is additionally
            sorted by the integer before the first hyphen. A sorted result no
            longer lines up with the rows the values came from.

    Returns:
        list: A new list with single values converted to ranges.

    Raises:
        ValueError: Only with ``sort=True``, when an item does not start with
            an integer.
    """
    distinct_items = list(dict.fromkeys(data))

    # Parse every distinct range once, keeping first-appearance order.
    bounds = {}
    for item in distinct_items:
        if '-' not in item:
            continue
        try:
            low, high = map(int, item.split('-'))
        except ValueError:
            continue  # not a plain "low-high" pair; cannot act as a bucket
        bounds[item] = (low, high)

    # Resolve each distinct single value once instead of once per row.
    lookup = {}
    for item in distinct_items:
        if '-' in item:
            continue
        try:
            value = int(item)
        except ValueError:
            continue  # not a number; leave it as it is
        for label, (low, high) in bounds.items():
            if low <= value <= high:
                lookup[item] = label
                break

    result = [lookup.get(item, item) for item in data]
    if sort:
        result.sort(key=lambda x: int(x.split('-')[0]))
    return result

In [None]:
# Run the whole 'year_expriance' column through convert_single_to_range and
# store the result. Assigning a list to a column is positional, so this relies
# on the helper returning exactly one value per row, in row order.
df['year_expriance'] = convert_single_to_range(df['year_expriance'].tolist())

In [None]:
# Distinct experience values after the single-to-range conversion
df['year_expriance'].unique()

In [None]:
# Preview the first rows of the DataFrame
df.head()

=======================================================================================================================

In [None]:
# List the distinct job types
df['job_type'].unique()

Replace 'Freelance / Project' with 'Freelance'

In [None]:
def clean_job_type(job_type):
    """
    Keep only the part of a job type that precedes the first forward slash.

    Args:
        job_type (str): The job type string to be cleaned.

    Returns:
        str: The text before the first "/", with surrounding whitespace
        removed; the whole stripped string when there is no "/".
    """
    head, _, _ = job_type.partition("/")
    return head.strip()

# Clean every job type, then list the distinct results
cleaned_types = df['job_type'].apply(clean_job_type)
df['job_type'] = cleaned_types
df['job_type'].unique()

=======================================================================================================================

In [None]:
# List the distinct raw location values
df['location'].unique()

In [None]:
# Normalise locations in one pass: lowercase first, then trim leading and
# trailing whitespace.
df['location'] = df['location'].str.lower().str.strip()

In [None]:
# Distinct locations after lowercasing and trimming
df['location'].unique()

=======================================================================================================================

In [None]:
# List the distinct raw skills strings
df['skils'].unique()

In [None]:
def clean_skils(skils):
    """
    Split a skills string into pieces at every forward slash.

    Args:
        skils (str): The skills string to split.

    Returns:
        list: The substrings of skils between "/" characters; a one-element
        list holding skils itself when it contains no "/".
    """
    # NOTE(review): this returns a list of pieces, not a trimmed string. The
    # sibling helpers for job title and job type keep only the first piece;
    # confirm which behaviour is intended for skills.
    return skils.split('/')

# Split the skills text of every row
split_skils = df['skils'].apply(clean_skils)
df['skils'] = split_skils


In [None]:
# Preview the first rows of the cleaned DataFrame
df.head()

### Finish