# Data Cleaning Pipeline

This notebook implements the data cleaning pipeline for the Wuzzuf job postings dataset.
It includes functions for loading, cleaning, and preprocessing the raw data.

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

print("Libraries imported successfully")

Libraries imported successfully


## Task 2.1: Data Loading and Initial Cleaning Functions

In [None]:
def load_csv(file_path):
    """
    Load a CSV file, trying UTF-8 first and falling back to latin-1.

    Any failure (missing file, empty/malformed CSV, undecodable bytes) is
    reported with a printed message and signalled by returning None, so
    callers can guard with ``if df is not None``.

    Args:
        file_path (str): Path to the CSV file

    Returns:
        pd.DataFrame or None: Loaded dataframe, or None if the file could
        not be loaded with either encoding
    """
    df = None

    # (encoding passed to pandas, label used in the success message)
    for encoding, label in (('utf-8', 'UTF-8'), ('latin-1', 'latin-1')):
        try:
            df = pd.read_csv(file_path, encoding=encoding)
        except UnicodeDecodeError:
            # Wrong encoding guess: move on to the next candidate
            continue
        except Exception as e:
            # Not an encoding problem (e.g. missing or malformed file);
            # retrying with another encoding would not help
            print(f"Error loading file: {e}")
            return None

        print(f"Successfully loaded {file_path} with {label} encoding")
        break

    if df is None:
        print(f"Error loading file: could not decode {file_path} with UTF-8 or latin-1")
        return None

    print(f"Dataset shape: {df.shape}")
    print(f"Columns: {list(df.columns)}")

    return df

In [None]:
def remove_unnecessary_columns(df):
    """
    Remove unnecessary columns like 'Unnamed: 0' and other index columns.

    Two kinds of columns are dropped:
      * any column whose label starts with 'Unnamed:' (the header pandas
        assigns to an unlabeled CSV column, typically a saved index);
      * the first column, if it holds exactly the row numbers
        0..len(df)-1 (in any order), i.e. it is a leftover row index.

    The input dataframe is not modified.

    Args:
        df (pd.DataFrame): Input dataframe

    Returns:
        pd.DataFrame: Dataframe with unnecessary columns removed
    """
    original_cols = len(df.columns)

    # str() so that non-string labels (e.g. integer headers) do not raise
    cols_to_remove = [col for col in df.columns if str(col).startswith('Unnamed:')]

    if original_cols > 0:
        first_col = df.columns[0]
        # Positional access keeps this a Series even with duplicate labels
        if first_col not in cols_to_remove and _is_row_index_column(df.iloc[:, 0]):
            cols_to_remove.append(first_col)

    if cols_to_remove:
        df = df.drop(columns=cols_to_remove)
        print(f"Removed {len(cols_to_remove)} unnecessary columns: {cols_to_remove}")
    else:
        print("No unnecessary columns found to remove")

    print(f"Columns reduced from {original_cols} to {len(df.columns)}")

    return df


def _is_row_index_column(series):
    """Return True when `series` contains exactly the integers 0..len-1, each once."""
    n_rows = len(series)
    if n_rows == 0:
        return False

    # Only integer/float columns qualify; this deliberately excludes booleans
    is_numeric = (
        pd.api.types.is_integer_dtype(series.dtype)
        or pd.api.types.is_float_dtype(series.dtype)
    )
    if not is_numeric or series.isnull().any():
        return False

    # Float columns must hold whole numbers to count as an index
    if not (series % 1 == 0).all():
        return False

    # Unique whole numbers with min 0 and max n-1 are a permutation of 0..n-1
    return bool(
        series.is_unique
        and series.min() == 0
        and series.max() == n_rows - 1
    )

In [None]:
def remove_duplicates(df, id_column='Job Posting ID'):
    """
    Drop rows whose identifier has already appeared earlier in the frame.

    The first occurrence of each identifier is kept. If the identifier
    column is missing, a warning is printed and the input is returned as is.

    Args:
        df (pd.DataFrame): Input dataframe
        id_column (str): Column name to check for duplicates

    Returns:
        pd.DataFrame: Dataframe with duplicates removed (a copy when rows
        were dropped, otherwise the input dataframe itself)
    """
    # Guard clause: nothing to de-duplicate on
    if id_column not in df.columns:
        print(f"Warning: {id_column} column not found. Available columns: {list(df.columns)}")
        return df

    rows_before = len(df)

    # True for every repeat of an identifier after its first appearance
    is_repeat = df.duplicated(subset=[id_column], keep='first')
    n_repeats = is_repeat.sum()

    if not n_repeats > 0:
        print(f"No duplicates found based on {id_column}")
    else:
        df = df[~is_repeat].copy()
        print(f"Removed {n_repeats} duplicate records based on {id_column}")

    print(f"Rows reduced from {rows_before} to {len(df)}")

    return df

In [None]:
def parse_dates(df, date_column='Job Posting Date'):
    """
    Parse Job Posting Date into datetime format and extract year and month columns.

    Values that cannot be parsed become NaT. Two columns are added:
    'posting_year' and 'posting_month' (NaN where the date is missing).
    The input dataframe is left untouched; a modified copy is returned.
    A short parsing summary is printed.

    Args:
        df (pd.DataFrame): Input dataframe
        date_column (str): Column name containing dates

    Returns:
        pd.DataFrame: Dataframe with parsed dates and extracted year/month
        columns. If `date_column` is missing, a warning is printed and the
        input dataframe is returned unchanged.
    """
    if date_column not in df.columns:
        print(f"Warning: {date_column} column not found. Available columns: {list(df.columns)}")
        return df

    total_rows = len(df)
    original_nulls = df[date_column].isnull().sum()

    # errors='coerce' turns unparseable values into NaT instead of raising
    parsed = pd.to_datetime(df[date_column], errors='coerce')

    # Failures are only the values that were present before but are NaT now
    nulls_after = parsed.isnull().sum()
    parsing_failures = nulls_after - original_nulls
    successful_parses = total_rows - nulls_after

    # Work on a copy so the caller's dataframe is not mutated as a side effect
    df = df.copy()
    df[date_column] = parsed
    df['posting_year'] = parsed.dt.year
    df['posting_month'] = parsed.dt.month

    # Avoid a 0/0 "nan%" rate when the dataframe has no rows
    success_rate = successful_parses / total_rows * 100 if total_rows else 0.0

    print("Date parsing results:")
    print(f"  - Successfully parsed: {successful_parses} dates")
    print(f"  - Parsing failures: {parsing_failures} dates")
    print(f"  - Success rate: {success_rate:.1f}%")

    if successful_parses > 0:
        # .tolist() yields plain Python ints so the list prints as [2020, 2021]
        years = sorted(df['posting_year'].dropna().unique().astype(int).tolist())
        print(f"  - Date range: {parsed.min()} to {parsed.max()}")
        print(f"  - Years covered: {years}")

    return df

## Load and Process Data

In [None]:
# Read the raw postings file; load_csv may return None on failure
raw_data_path = '../data/raw/Wuzzuf-Jobs-Posting.csv'
df = load_csv(raw_data_path)

# Preview the data only when loading succeeded
if df is not None:
    print("\nFirst few rows:", df.head(), sep="\n")

In [None]:
# Drop index-like columns, then list what is left
if df is not None:
    df = remove_unnecessary_columns(df)
    print("\nColumns after removal:", list(df.columns), sep="\n")

In [None]:
# De-duplicate on the default ID column and report the resulting shape
if df is not None:
    df = remove_duplicates(df)
    print("\nFinal dataset shape after duplicate removal: {}".format(df.shape))

In [None]:
# Parse dates
if df is not None:
    df = parse_dates(df)
    print("\nDataset info after date parsing:")
    # DataFrame.info() prints its summary itself and returns None, so it must
    # not be wrapped in print() (that would emit a stray "None" line)
    df.info()

## Data Quality Report

In [None]:
def generate_data_quality_report(df):
    """
    Print a data quality report for a dataframe.

    The report covers: row/column counts, per-column missing-value counts
    and percentages, the share of non-null 'Job Posting Date' values, and
    unique counts for 'Job Posting ID', 'Company Name' and 'Job Title'.
    Sections for columns that are not present are skipped. Nothing is
    returned; the report is written to stdout.

    Args:
        df (pd.DataFrame): Input dataframe
    """
    total_rows = len(df)

    def pct_of_rows(count):
        # Guard the empty-frame case, which would otherwise print "nan%"
        return float(count) / total_rows * 100 if total_rows else 0.0

    print("=" * 50)
    print("DATA QUALITY REPORT")
    print("=" * 50)

    print("\nDataset Overview:")
    print(f"  - Total rows: {total_rows:,}")
    print(f"  - Total columns: {len(df.columns)}")

    print("\nMissing Data Summary:")
    missing_data = df.isnull().sum()
    # Boolean filtering (rather than per-label lookup) also copes with
    # duplicate column names
    cols_with_missing = missing_data[missing_data > 0]

    if cols_with_missing.empty:
        # Say so explicitly instead of leaving the section blank
        print("  - No missing values found")
    for col, n_missing in cols_with_missing.items():
        print(f"  - {col}: {n_missing:,} ({round(pct_of_rows(n_missing), 2)}%)")

    if 'Job Posting Date' in df.columns:
        valid_dates = df['Job Posting Date'].notna().sum()
        print("\nDate Parsing Success:")
        print(f"  - Valid dates: {valid_dates:,} ({pct_of_rows(valid_dates):.1f}%)")

    print("\nKey Statistics:")
    # (column to count distinct values in, label shown in the report)
    unique_count_sections = (
        ('Job Posting ID', 'Unique job postings'),
        ('Company Name', 'Unique companies'),
        ('Job Title', 'Unique job titles'),
    )
    for column, label in unique_count_sections:
        if column in df.columns:
            print(f"  - {label}: {df[column].nunique():,}")

# Generate the report for the current dataframe.
# Skipped when df is None, which is what load_csv returns on a failed load.
if df is not None:
    generate_data_quality_report(df)