# LinkedIn Job Data Cleaning Pipeline
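This notebook loads raw LinkedIn job postings, normalises dates, translates non-English titles and descriptions to English, extracts structured features (tools, education, experience, job type, work arrangement, country), keeps only analytics-focused roles, and exports the cleaned dataset.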

In [None]:
# Import Libraries
import pandas as pd
import re
from langdetect import detect, LangDetectException
from tqdm.notebook import tqdm
from googletrans import Translator
import warnings
warnings.filterwarnings('ignore')

# Initialize translator
# One module-level googletrans client, reused by detect_and_translate for every row.
translator = Translator()

In [None]:
# Load Data
# The path is relative, so it resolves against the notebook's working directory.
df = pd.read_excel("../data/raw/Final Dataset.xlsx")
print(f"Loaded {len(df)} rows")
# Preview the first rows to eyeball the raw columns.
df.head(10)

In [None]:
df.info()

In [None]:
# 1. Make all column names lowercase
# Later cells address columns by lowercase name ('date', 'title', 'description', 'place').
df.columns = df.columns.str.lower()

In [None]:
import pandas as pd
import numpy as np

# Clean the column: cast every value to str, trim surrounding whitespace and
# drop a trailing comma (plus any whitespace after it).
# Missing values become the string 'nan' here and end up as NaT below.
s = df['date'].astype(str).str.strip().str.replace(r',\s*$', '', regex=True)

# Try exact dd/mm/yyyy format first; anything that does not match becomes NaT
parsed = pd.to_datetime(s, format='%d/%m/%Y', errors='coerce')

# For any remaining values (like 2024-09-09 00:00:00), do a flexible parse
# NOTE(review): this fallback passes no format/dayfirst, so pandas decides how to
# read each value — confirm the result for any ambiguous day/month strings.
mask = parsed.isna()
parsed[mask] = pd.to_datetime(s[mask], errors='coerce')

# Assign back (the column now holds parsed datetimes, NaT where both passes failed)
df['date'] = parsed

# Report how many rows parsed vs. were left as NaT
print(f"Parsed: {df['date'].notna().sum()} | Failed: {df['date'].isna().sum()}")


In [None]:
df

In [None]:
df.info()

## Text Processing Functions

In [None]:
# Language Detection and Translation
def detect_and_translate(text):
    """
    Return *text* in English, translating it when another language is detected.

    Uses langdetect's ``detect`` for detection and the module-level
    googletrans ``translator`` for translation.

    Args:
        text: Raw cell value. Anything that is not a non-blank string is
            treated as missing.

    Returns:
        str: The original text when it is detected as English, the translated
        text otherwise, or the sentinel ``'Not Provided'`` when the input is
        missing/blank or translation fails.
    """
    # Handle empty or non-string values before touching any external service
    if not isinstance(text, str) or text.strip() == '':
        return 'Not Provided'

    try:
        detected_lang = detect(text)

        # English text is returned untouched
        if detected_lang == 'en':
            return text

        translated = translator.translate(text, src=detected_lang, dest='en')
        return translated.text

    except LangDetectException:
        # If language detection fails, try translation anyway and let the
        # translator pick the source language itself
        try:
            translated = translator.translate(text, dest='en')
            return translated.text
        except Exception:
            # Catch Exception (not a bare except) so KeyboardInterrupt and
            # SystemExit still stop a long-running translation loop
            return 'Not Provided'
    except Exception as e:
        # NOTE(review): a failed translation discards the original text;
        # confirm that is preferred over returning it untranslated.
        print(f"Translation error: {e}")
        return 'Not Provided'

In [None]:
# Feature Extraction Functions
def clean_html(text):
    """Strip HTML markup from *text* and normalise its whitespace.

    Args:
        text: Raw description value; non-string values are treated as missing.

    Returns:
        str: The text with tags removed, HTML entities decoded and runs of
        whitespace collapsed to single spaces, or ``'Not Provided'`` when
        *text* is not a string.
    """
    import html  # stdlib entity decoder; imported locally so the cell stays self-contained

    if not isinstance(text, str):
        return 'Not Provided'

    # Replace tags with a space (not '') so words separated only by markup,
    # e.g. "</p><p>" or "<br>", do not fuse into one token.
    text = re.sub(r'<[^>]+>', ' ', text)

    # Decode entities in one pass, after tag removal: an escaped "&lt;b&gt;"
    # must not be mistaken for a tag, and "&amp;lt;" must decode only once.
    text = html.unescape(text)

    # Collapse whitespace last so spaces produced by decoding (including the
    # non-breaking space from &nbsp;) are normalised too.
    return re.sub(r'\s+', ' ', text).strip()


def clean_title(text):
    """Clean a job title by removing salary info and trailing extras.

    Args:
        text: Raw (translated) job title; non-string values are treated as
            missing.

    Returns:
        str: The title cut at the first salary amount or separator and
        stripped of surrounding whitespace, or ``'Not Provided'`` when *text*
        is not a string.
    """
    if not isinstance(text, str):
        return 'Not Provided'

    # Remove salary information: a currency symbol followed by digits, and
    # everything after it
    text = re.sub(r'[\$£€₦₹]\s?[\d,]+.*', '', text)

    # Remove content after a separator. A pipe or en/em dash always separates;
    # an ASCII hyphen only does so when it touches whitespace (or ends the
    # title), so hyphenated words such as "Mid-Level" or "Full-Stack" survive.
    text = re.sub(r'\s*[|\u2013\u2014].*|\s+-.*|-\s+.*|-\s*$', '', text)

    return text.strip()


# Tools looked for by extract_tools, in the order they are reported.
_TOOL_NAMES = (
    'Excel', 'SQL', 'Python', 'R', 'Tableau', 'Power BI', 'Looker',
    'SAS', 'SPSS', 'Jupyter', 'Pandas', 'NumPy', 'Matplotlib',
    'Seaborn', 'Snowflake', 'AWS', 'Google Analytics', 'BigQuery',
    'Hadoop', 'Spark', 'Django', 'Flask', 'Git', 'GitHub', 'Docker',
    'Kubernetes', 'PySpark', 'Airflow', 'Azure', 'GCP', 'Redshift',
    'DAX', 'ETL', 'API', 'PostgreSQL', 'MySQL', 'MongoDB',
)

# Compiled once instead of on every row. "R" is a single letter, so a plain
# \b match also fires on "R&D" / "H&R"; it additionally refuses an adjacent '&'.
_TOOL_PATTERNS = tuple(
    (
        tool,
        re.compile(
            r'(?<![\w&])R(?![\w&])' if tool == 'R'
            else r'\b' + re.escape(tool) + r'\b',
            re.IGNORECASE,
        ),
    )
    for tool in _TOOL_NAMES
)


def extract_tools(text):
    """Extract technical tools and technologies mentioned in *text*.

    Matching is case-insensitive and whole-word; results use the canonical
    spelling from the tool list and keep that list's order.

    Args:
        text: Text to scan; non-string values are treated as missing.

    Returns:
        str: Comma-separated tool names, or ``'Not Provided'`` when *text* is
        not a string or no tool is found.
    """
    if not isinstance(text, str):
        return 'Not Provided'

    found = [tool for tool, pattern in _TOOL_PATTERNS if pattern.search(text)]

    return ', '.join(found) if found else 'Not Provided'


def extract_education(text):
    """Extract the education levels mentioned in *text*.

    Args:
        text: Text to scan; non-string values are treated as missing.

    Returns:
        str: Alphabetically sorted, comma-separated labels drawn from
        "Associate Degree", "Bachelor's", "Diploma", "Master's" and "PhD",
        or ``'Not Provided'`` when *text* is not a string or nothing matches.
    """
    if not isinstance(text, str):
        return 'Not Provided'

    # Every pattern is anchored on word boundaries so fragments inside other
    # words ("subscription" -> bsc, "diplomatic" -> diploma, "mastery") no
    # longer count. ['\u2019] accepts both straight and curly apostrophes.
    education_keywords = {
        r"\b(?:bachelor(?:['\u2019]?s)?|b\.?s\.?c|undergraduates?)\b": "Bachelor's",
        r"\b(?:master(?:['\u2019]?s)?|m\.?s\.?c|postgraduates?)\b": "Master's",
        r"\b(?:ph\.?d|doctorates?|doctoral)\b": "PhD",
        # Require "degree" so staff called "associates" are not read as a
        # qualification.
        r"\bassociate(?:['\u2019]?s)?\s+degree\b": "Associate Degree",
        r"\b(?:diplomas?|certifications?)\b": "Diploma",
    }

    found_levels = {
        label
        for pattern, label in education_keywords.items()
        if re.search(pattern, text, re.IGNORECASE)
    }

    return ', '.join(sorted(found_levels)) if found_levels else 'Not Provided'


def extract_experience_years(text, max_years=15):
    """Extract the years of experience required from *text*.

    Args:
        text: Text to scan; non-string values are treated as missing.
        max_years: Largest value accepted as a plausible requirement. Bigger
            numbers (e.g. "100 years of experience serving clients") are
            skipped.

    Returns:
        str: The first plausible year count as a string (the lower bound for
        ranges such as "3-5 years"), or ``'Not Provided'`` when *text* is not
        a string or nothing plausible is found.
    """
    if not isinstance(text, str):
        return 'Not Provided'

    patterns = [
        # Ranges are tried first so "3-5 years of experience" gives the
        # minimum (3) rather than the "5 years of experience" tail.
        r'(\d+)\s*(?:to|[-\u2013])\s*(\d+)\s*(?:years?|yrs?)',
        r'(\d+)\+?\s*(?:years?|yrs?)(?:\s+of)?\s+experience',
        r'minimum\s+(?:of\s+)?(\d+)\s*(?:years?|yrs?)',
        r'at\s+least\s+(\d+)\s*(?:years?|yrs?)',
    ]

    for pattern in patterns:
        # Look at every occurrence, not just the first, so one implausible
        # number does not hide a valid requirement later in the text.
        for match in re.finditer(pattern, text, re.IGNORECASE):
            years = int(match.group(1))  # \d+ always converts cleanly
            if years <= max_years:
                return str(years)

    return 'Not Provided'


def extract_job_type(text):
    """Extract the job type (Full Time, Part Time, etc.) from *text*.

    Patterns are tried in a fixed priority order and the first hit wins.

    Args:
        text: Text to scan; non-string values are treated as missing.

    Returns:
        str: One of 'Full Time', 'Part Time', 'Contract', 'Internship',
        'Freelance', 'Temporary', or ``'Not Provided'`` when *text* is not a
        string or nothing matches.
    """
    if not isinstance(text, str):
        return 'Not Provided'

    # Word boundaries stop fragments from matching inside unrelated words
    # ("international"/"internal" -> intern, "template"/"attempt" -> temp).
    job_types = (
        (r'\bfull[-\s]?time\b', 'Full Time'),
        (r'\bpart[-\s]?time\b', 'Part Time'),
        (r'\bcontract(?:ual|ing|ors?|s)?\b', 'Contract'),
        (r'\bintern(?:s|ships?)?\b', 'Internship'),
        (r'\bfreelance(?:rs?)?\b', 'Freelance'),
        (r'\b(?:temporary|temp)\b', 'Temporary'),
    )

    for pattern, job_type in job_types:
        if re.search(pattern, text, re.IGNORECASE):
            return job_type

    return 'Not Provided'


def extract_country(place):
    """Return the country part of a comma-separated location string.

    The country is taken to be whatever follows the last comma (or the whole
    string when there is no comma), trimmed of whitespace. Non-string, blank,
    or comma-terminated inputs yield ``'Not Provided'``.
    """
    if isinstance(place, str) and place.strip() != '':
        # rpartition leaves the text after the final comma in slot 2
        last_segment = place.rpartition(',')[2].strip()
        if last_segment:
            return last_segment

    return 'Not Provided'


def extract_remote_onsite(text):
    """Determine whether a job is remote, hybrid, or onsite.

    Checks are made in priority order: remote, then hybrid, then onsite.

    Args:
        text: Text to scan; non-string values are treated as missing.

    Returns:
        str: 'Remote', 'Hybrid', 'Onsite', or ``'Not Provided'`` when *text*
        is not a string or no arrangement keyword is found.
    """
    if not isinstance(text, str):
        return 'Not Provided'

    if re.search(r'\bremote\b', text, re.IGNORECASE):
        return 'Remote'
    if re.search(r'\bhybrid\b', text, re.IGNORECASE):
        return 'Hybrid'

    # Word boundaries keep "construction site" and "officer" from counting as
    # onsite; the lookarounds skip the software product ("Microsoft Office",
    # "MS Office", "Office 365"), which says nothing about the workplace.
    onsite_pattern = (
        r'\bon[-\s]?site\b'
        r'|(?<!microsoft\s)(?<!ms\s)\boffice\b(?!\s*365)'
    )
    if re.search(onsite_pattern, text, re.IGNORECASE):
        return 'Onsite'

    return 'Not Provided'


def extract_experience_level(text):
    """Classify the seniority implied by *text*.

    Rules are evaluated in priority order (Senior, Mid-Level, Junior, Intern)
    with case-insensitive whole-word matching; the first rule that hits
    decides the label. Non-string input or no hit gives ``'Not Provided'``.
    """
    if not isinstance(text, str):
        return 'Not Provided'

    # (label, regex) pairs, highest priority first
    ordered_rules = (
        ('Senior', r'\b(senior|lead|principal|staff)\b'),
        ('Mid-Level', r'\b(mid[-\s]?level|intermediate)\b'),
        ('Junior', r'\b(junior|entry[-\s]?level|associate)\b'),
        ('Intern', r'\b(intern|graduate|trainee)\b'),
    )

    return next(
        (
            label
            for label, rule in ordered_rules
            if re.search(rule, text, re.IGNORECASE)
        ),
        'Not Provided',
    )

## Apply Cleaning Functions

In [None]:
# Prepare Text Columns
# Blank out missing values and force plain strings so the text helpers
# always receive str input.
for text_col in ('title', 'description', 'place'):
    df[text_col] = df[text_col].fillna('').astype(str)

print("Text columns prepared")

## Translate Non-English Text

In [None]:
from tqdm import tqdm
import pandas as pd
import time

# Safe translation wrapper
def safe_translate(text):
    """Translate *text* via detect_and_translate, falling back to the input.

    Args:
        text: Value to translate.

    Returns:
        Whatever detect_and_translate returns, or *text* unchanged if it
        raises an ordinary exception.
    """
    try:
        return detect_and_translate(text)
    except Exception:
        # Not a bare except: KeyboardInterrupt must still be able to stop the
        # row-by-row translation loop instead of being swallowed per row.
        return text

def translate_columns_in_batches_manual(df, columns, batch_size=500, sleep_sec=1):
    """Translate the given columns row by row, pausing between batches.

    For each name in *columns* a new ``<name>_translated`` column is added to
    *df* (modified in place) holding safe_translate's result for every row.
    Rows are processed in slices of *batch_size* with a tqdm progress bar per
    slice, and the function sleeps *sleep_sec* seconds after each slice.

    Returns:
        The same DataFrame object that was passed in.
    """
    for col in columns:
        print(f"Translating '{col}' in batches...")
        row_count = len(df)
        results = []

        for lo in range(0, row_count, batch_size):
            hi = min(lo + batch_size, row_count)
            chunk = df[col].iloc[lo:hi]

            # One progress bar per slice, translating a row at a time
            progress = tqdm(chunk, desc=f"{col} rows {lo}-{hi}")
            results += [safe_translate(value) for value in progress]

            time.sleep(sleep_sec)  # avoid API rate limits

        df[col + '_translated'] = pd.Series(results, index=df.index)

    print("All translations complete!")
    return df

# Usage
# Adds 'title_translated' and 'description_translated' columns to df.
df = translate_columns_in_batches_manual(df, columns=['title', 'description'], batch_size=500, sleep_sec=1)


In [None]:
df

In [None]:
# Save the DataFrame to a CSV file
# Written to the current working directory, without the index column.
# Presumably a checkpoint so the slow translation step need not be re-run — confirm.
df.to_csv("translated_jobs.csv", index=False)

print("Data saved to 'translated_jobs.csv'")

In [None]:
# Clean HTML from translated description
# Result goes to a new 'description_clean' column; 'description_translated' is kept.
print("Cleaning HTML from descriptions...")
df['description_clean'] = df['description_translated'].apply(clean_html)

print("HTML cleaning complete")

In [None]:
# Clean Title (use translated version)
# clean_title strips salary amounts and trailing separator-delimited suffixes.
df['title_clean'] = df['title_translated'].apply(clean_title)
print("Titles cleaned")
# Compare raw, translated and cleaned titles side by side
df[['title', 'title_translated', 'title_clean']].head()

In [None]:
# Combine Text (use translated and cleaned text)
# Uses the full translated title (not title_clean) plus the HTML-free description;
# this combined string is the input for most of the feature extractors below.
df['combined_text'] = df['title_translated'] + ' ' + df['description_clean']
print("Combined text created")
df['combined_text'].head()

## Extract Features

In [None]:
# 'tools' holds a comma-separated list of matched tool names, or 'Not Provided'.
print("Extracting tools...")
df['tools'] = df['combined_text'].apply(extract_tools)
df[['tools']].head()

In [None]:
# 'education' holds a sorted, comma-separated list of education labels, or 'Not Provided'.
print("Extracting education requirements...")
df['education'] = df['combined_text'].apply(extract_education)
df[['education']].head()

In [None]:
# 'experience_years' is stored as a string (a number of years, or 'Not Provided').
print("Extracting experience years...")
df['experience_years'] = df['combined_text'].apply(extract_experience_years)
df[['experience_years']].head()

In [None]:
# 'experience_level' is one of Senior / Mid-Level / Junior / Intern, or 'Not Provided'.
print("Extracting experience level...")
df['experience_level'] = df['combined_text'].apply(extract_experience_level)
df[['experience_level']].head()

In [None]:
# 'job_type' holds a single label (the first pattern that matches), or 'Not Provided'.
print("Extracting job type...")
df['job_type'] = df['combined_text'].apply(extract_job_type)
df[['job_type']].head()

In [None]:
# 'remote_onsite' is Remote / Hybrid / Onsite, or 'Not Provided'.
print("Extracting work arrangement...")
df['remote_onsite'] = df['combined_text'].apply(extract_remote_onsite)
df[['remote_onsite']].head()

In [None]:
# Country comes from the raw 'place' column (text after the last comma), not from combined_text.
print("Extracting country...")
df['country'] = df['place'].apply(extract_country)
df[['place', 'country']].head()

In [None]:
print("Feature extraction complete!")
df.head()

## Filter for Analytics Roles

In [None]:
def keep_analytics_roles(df, title_column='title_clean'):
    """
    Filter the dataset down to analytics-focused job titles.

    A title is kept when it contains at least one include term (analyst,
    analytics, BI specialist, SQL developer, ...) and either contains a
    whitelisted role or contains none of the exclude terms.

    Args:
        df: DataFrame holding the postings. It is not modified.
        title_column: Name of the column with the job titles.

    Returns:
        DataFrame: A copy of the matching rows, with *title_column*
        lower-cased. Rows whose title is not a string are dropped.
    """
    # Include if title contains any of these (substring match, so plurals
    # such as "analysts" also count)
    must_include = (
        'analyst', 'analytics',
        'analytics engineer', 'sql developer', 'bi developer',
        'bi specialist', 'bi analyst', 'bi consultant', 'bi engineer',
        'business intelligence',
    )

    # Exclude these unless in whitelist
    must_exclude = (
        'machine learning', 'ml engineer', 'ai engineer',
        'data scientist', 'solution architect', 'cloud engineer',
        'devops', 'support engineer', 'qa', 'system admin',
        'game', 'unity', 'security', 'salesforce', 'frontend',
        'backend', 'web developer', 'test engineer', 'researcher',
    )

    # Whitelist exceptions
    allowed_roles = (
        'sql developer', 'analytics engineer', 'bi developer',
        'bi analyst', 'bi specialist', 'bi consultant', 'bi engineer',
        'data analyst', 'business analyst', 'reporting analyst',
    )

    # Exclude terms must match as whole words (optionally plural); plain
    # substring tests would drop "community analyst" ('unity') or any title
    # mentioning "qatar" ('qa').
    exclude_re = re.compile(
        r'\b(?:' + '|'.join(re.escape(term) for term in must_exclude) + r')s?\b'
    )

    def is_valid_title(title):
        # Missing / non-string titles cannot be classified
        if not isinstance(title, str):
            return False
        # Must match at least one include term
        if not any(kw in title for kw in must_include):
            return False
        # A whitelisted role overrides every exclude term
        if any(role in title for role in allowed_roles):
            return True
        return exclude_re.search(title) is None

    # Lower-case a separate Series so the caller's DataFrame is left untouched
    titles = df[title_column].str.lower()
    keep = titles.apply(is_valid_title).astype(bool)

    filtered_df = df[keep].copy()
    # Assign by position so a duplicated index cannot break the alignment
    filtered_df[title_column] = titles[keep].to_numpy()
    print(f"Filtered from {len(df)} to {len(filtered_df)} analytics roles")

    return filtered_df

# Keep only analytics roles; df_final is what the export and summary cells below use.
df_final = keep_analytics_roles(df, title_column='title_clean')

In [None]:
# View sample of filtered data
df_final[['title_clean', 'experience_level', 'tools']].head(10)

## Export Results

In [None]:
# Select relevant columns for export (this is also the column order in the output file)
columns_to_export = [
    'job_id', 'title', 'title_clean', 'company', 'place', 'country',
    'date', 'description_clean', 'link', 'tools', 'education', 
    'experience_years', 'experience_level', 'job_type', 'remote_onsite'
]

# Only keep columns that exist, so a column missing from the raw data does not raise KeyError
columns_to_export = [col for col in columns_to_export if col in df_final.columns]

# Copy so df_export is independent of df_final
df_export = df_final[columns_to_export].copy()

# Export to Excel (path is relative to the working directory; index is not written)
df_export.to_excel('../data/processed/linkedin_data_final.xlsx', index=False)
print(f"Exported {len(df_export)} rows to linkedin_data_final.xlsx")

In [None]:
# Display summary statistics
print("\n=== Dataset Summary ===")
print(f"Total rows: {len(df_export)}")
# NOTE(review): 'date' is only in df_export if it existed in df_final — this line raises KeyError otherwise.
print(f"Date range: {df_export['date'].min()} to {df_export['date'].max()}")
print(f"\nTop 10 Tools:")
# Split the comma-separated tool strings into one row per tool before counting.
# NOTE(review): rows with no detected tools hold 'Not Provided', which is counted here like a tool.
print(df_export['tools'].str.split(', ').explode().value_counts().head(10))
print(f"\nJob Type Distribution:")
print(df_export['job_type'].value_counts())
print(f"\nExperience Level Distribution:")
print(df_export['experience_level'].value_counts())

In [None]:
# Display final dataset
df_export.head()