In [18]:
import re
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [21]:
# Fetch the NLTK resources used by the preprocessing code below
# (stop-word list, tokenizer models, WordNet data).
for _resource in ('stopwords', 'punkt', 'wordnet', 'punkt_tab'):
    nltk.download(_resource)

# Shared, module-level helpers read by clean_text
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\agamj\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\agamj\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\agamj\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\agamj\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


In [22]:


# Preprocessing function
def clean_text(text):
    """Normalize one free-text value into a space-joined string of lemmatized tokens.

    Steps, in order: lowercase the text, replace every non-word character with a
    space, tokenize with ``word_tokenize``, drop tokens present in the
    module-level ``stop_words`` set, and lemmatize each remaining token with the
    module-level ``lemmatizer``.

    Args:
        text: The raw value to clean. Missing values (``None``, ``NaN``,
            ``pd.NA``) yield an empty string. Non-string scalars (for example
            a numeric cell) are converted with ``str`` before cleaning.

    Returns:
        str: The surviving tokens joined by single spaces; "" for missing input.
    """
    if pd.isnull(text):  # Handle missing values
        return ""

    # A numeric or otherwise non-string cell has no .lower(); coerce it so a
    # stray non-text value does not abort a whole DataFrame.apply() run.
    if not isinstance(text, str):
        text = str(text)

    # Lowercase, then turn every non-word character (punctuation, symbols,
    # whitespace) into a space so the tokenizer only sees word characters.
    normalized = re.sub(r'\W', ' ', text.lower())

    # Tokenization: split the normalized sentence into individual word tokens.
    tokens = word_tokenize(normalized)

    # Drop stop words and lemmatize the rest in a single pass.
    # NOTE: lemmatize() is called without a part-of-speech tag, so it reduces
    # plural nouns ("applications" -> "application") but leaves verb forms such
    # as "running" or "serving" unchanged.
    kept = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]

    return " ".join(kept)

# Function to clean dataset
def preprocess_dataset(file_path):
    """Load a job-postings CSV and return a cleaned DataFrame with text features.

    Processing steps:
      1. Read ``file_path`` with ``pd.read_csv``.
      2. Drop rows whose ``job_id`` or ``description`` is missing.
      3. Fill missing categorical/text columns with placeholder strings.
      4. Fill missing values in each salary column with that column's median.
      5. Drop metadata columns that are not needed (absent ones are ignored).
      6. Build ``combined_description`` (description plus the posting's real
         skills text, if any) and ``clean_text`` (``clean_text`` applied to it).

    Args:
        file_path: Location of the CSV; passed straight to ``pd.read_csv``.

    Returns:
        pandas.DataFrame: The cleaned rows, with the extra columns
        ``combined_description`` and ``clean_text``.

    Raises:
        KeyError: If a column that is filled or combined here is absent from
            the CSV.
    """
    df = pd.read_csv(file_path)

    # Drop rows where critical columns are missing
    df = df.dropna(subset=['job_id', 'description'])

    # Capture the real skills text BEFORE the placeholder is written below.
    # Otherwise "No skills provided" would be appended to every posting that
    # lacks skills and its words would end up as tokens in clean_text.
    skills_suffix = df['skills_desc'].map(
        lambda skills: "" if pd.isnull(skills) else " " + str(skills)
    )

    # Fill categorical missing values
    placeholders = {
        'company_name': "Unknown",
        'location': "Unknown",
        'formatted_experience_level': "Unknown",
        'skills_desc': "No skills provided",
    }
    for column, placeholder in placeholders.items():
        df[column] = df[column].fillna(placeholder)

    # Fill numerical missing values with the median of each salary column
    for column in ['max_salary', 'min_salary', 'med_salary', 'normalized_salary']:
        df[column] = df[column].fillna(df[column].median())

    # Drop unnecessary columns (errors='ignore' tolerates columns that are absent)
    df = df.drop(columns=['pay_period', 'views', 'company_id', 'applies', 'remote_allowed',
                          'job_posting_url', 'application_url', 'expiry', 'closed_time',
                          'listed_time', 'posting_domain', 'sponsored'], errors='ignore')

    # Combine job description and the genuine skills description; postings
    # without skills contribute only their description.
    df['combined_description'] = df['description'].fillna('') + skills_suffix

    # Apply text preprocessing on the combined text
    df['clean_text'] = df['combined_description'].apply(clean_text)

    return df

In [23]:
if __name__ == "__main__":
    # Smoke-test the pipeline end to end on the sample dataset
    sample_path = '../datasets/job_description.csv'
    df_cleaned = preprocess_dataset(sample_path)

    # Show the raw combined text next to its cleaned counterpart
    preview = df_cleaned[['combined_description', 'clean_text']].head()
    print("\n✅ Sample of cleaned data:")
    print(preview)


✅ Sample of cleaned data:
                                combined_description  \
0  Job descriptionA leading real estate firm in N...   
1  At Aspen Therapy and Wellness , we are committ...   
2  The National Exemplar is accepting application...   
3  Senior Associate Attorney - Elder Law / Trusts...   
4  Looking for HVAC service tech with experience ...   

                                          clean_text  
0  job descriptiona leading real estate firm new ...  
1  aspen therapy wellness committed serving clien...  
2  national exemplar accepting application assist...  
3  senior associate attorney elder law trust esta...  
4  looking hvac service tech experience commerica...  
