# Data Cleaning, Pre-processing, and Feature Engineering
## Detecting Gender Bias in Job Descriptions

# Job Description Analysis

This notebook performs data cleaning and feature extraction on job descriptions, focusing on detecting gender bias through communal and agentic language.


In [None]:
# Import necessary libraries
import pandas as pd
import nltk
import pycountry
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import Counter
import matplotlib.pyplot as plt
import re
from nltk.stem import WordNetLemmatizer
from textblob import TextBlob
from textstat import textstat


## Download NLTK Data

Download necessary NLTK data for tokenization, stopwords, and lemmatization.


In [None]:
# Download the NLTK resources used by this notebook.
# 'punkt_tab' is requested alongside 'punkt' because newer NLTK releases load
# the word_tokenize models from it; without it tokenization raises LookupError.
# nltk.download reports a failed download and returns False instead of
# raising, so requesting a resource that is unavailable does not stop the run.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)


## Load and Inspect Data

Load the CSV file into a Pandas DataFrame and inspect the first few rows and general information.


In [None]:
# Load the CSV file into a Pandas DataFrame
df = pd.read_csv('job_title_des.csv')

# Inspect the data
print(df.head())
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would emit an extra "None" line.
df.info()


## Define Helper Functions

Define functions for cleaning text, counting and detecting communal and agentic words, extracting seniority levels, and extracting salary information. (Lemmatization is currently disabled.)


In [None]:
# English stop words from NLTK, unpacked into a set for membership tests
stop_words = {*stopwords.words('english')}

# Lemmatization is switched off for now (testing its removal):
# lemmatizer = WordNetLemmatizer()

# Clean text function
def clean_text(text):
    """Tokenize ``text`` into lowercase alphabetic tokens with stop words removed.

    Args:
        text: Raw text to tokenize. Any value that is not a ``str`` (for
            example the NaN pandas stores for a missing cell) is treated as
            empty text.

    Returns:
        list[str]: The lowercased tokens in their original order. Tokens that
        contain any non-alphabetic character (digits, punctuation, hyphenated
        compounds) and tokens found in the module-level ``stop_words`` set
        are dropped.
    """
    if not isinstance(text, str):
        return []
    # Lowercase once per token, then filter against the stop-word set.
    lowered = (word.lower() for word in word_tokenize(text) if word.isalpha())
    return [word for word in lowered if word not in stop_words]

# Lemmatize text function -- disabled while lemmatization removal is tested.
# The whole function is commented out: with only the ``def`` line commented,
# the indented body would attach to clean_text as unreachable code after its
# ``return``.
# def lemmatize_text(text):
#     words = word_tokenize(text)
#     lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
#     return ' '.join(lemmatized_words)

# Define communal and agentic words
# Each term is listed exactly once, in its original order of first appearance.
# NOTE(review): multi-word entries ('emotional intelligence',
# 'independent thinking') can never equal a single whitespace-separated token,
# and hyphenated entries cannot match text produced by clean_text, which keeps
# only purely alphabetic tokens. They are kept here for phrase-level matching
# on the raw text -- TODO confirm whether that is wanted.
communal_words = [
    'empathy', 'supportive', 'collaborative', 'kind', 'warm', 'compassionate',
    'nurturing', 'cooperative', 'helpful', 'understanding', 'friendly', 'patient',
    'approachable', 'loyal', 'trustworthy', 'caring', 'empathetic', 'sympathetic',
    'considerate', 'tolerant', 'generous', 'amicable', 'benevolent', 'cordial',
    'gentle', 'forgiving', 'inclusive', 'reliable', 'sympathizing', 'responsive',
    'supporting', 'altruistic', 'dedicated', 'personable', 'sociable', 'neighborly',
    'thoughtful', 'cohesive', 'agreeable', 'emotional intelligence', 'community-oriented',
    'kind-hearted', 'empathic', 'charitable', 'helping', 'humane', 'cross-functional',
    'teamwork', 'collaborate', 'partner', 'team', 'community', 'share',
    'support'
]

agentic_words = [
    'ambition', 'independence', 'assertiveness', 'confident', 'competitive',
    'leadership', 'proactive', 'self-reliant', 'dominant', 'persistent', 'decisive',
    'driven', 'strategic', 'innovative', 'analytical', 'resilient', 'determined',
    'goal-oriented', 'ambitious', 'tenacious', 'self-confident', 'entrepreneurial',
    'resourceful', 'problem-solving', 'visionary', 'dynamic', 'risk-taking',
    'decisiveness', 'assertive', 'result-oriented', 'influential', 'high-achieving',
    'challenger', 'independent thinking', 'self-starter', 'self-assured',
    'executive', 'autonomous', 'enterprising', 'bold', 'driving', 'impactful',
    'asserting', 'purposeful', 'motivated', 'authority', 'control',
    'mastery', 'strong', 'expertise', 'disciplined',
    'dominance', 'governance', 'command', 'ascendancy',
    'vigor', 'supremacy', 'rule'
]

# Count words function
def count_words(tokens, word_list):
    """Count how many tokens are members of ``word_list``.

    Args:
        tokens: An iterable of word tokens, or a single string. A string is
            split on whitespace first; iterating it directly would compare
            individual characters with the lexicon, so multi-letter words
            could never be counted.
        word_list: Collection of words to look for.

    Returns:
        int: Number of tokens, repeats included, that appear in ``word_list``.
    """
    if isinstance(tokens, str):
        tokens = tokens.split()
    # Build the lookup once so each membership test is a hash probe.
    lexicon = set(word_list)
    return sum(1 for token in tokens if token in lexicon)

# Detect agentic words function
def detect_agentic_words(description):
    """Return the agentic words found in ``description`` as one string.

    The description is split on whitespace; every piece whose lowercase form
    is in the module-level ``agentic_words`` list is kept, in order and with
    its original casing, and the kept pieces are joined with single spaces.
    """
    hits = []
    for candidate in description.split():
        if candidate.lower() in agentic_words:
            hits.append(candidate)
    return ' '.join(hits)


# Detect communal words function
def detect_communal_words(description):
    """Return the communal words found in ``description`` as one string.

    The description is split on whitespace; every piece whose lowercase form
    is in the module-level ``communal_words`` list is kept, in order and with
    its original casing, and the kept pieces are joined with single spaces.
    """
    hits = []
    for candidate in description.split():
        if candidate.lower() in communal_words:
            hits.append(candidate)
    return ' '.join(hits)

# Extract seniority level from job titles
def extract_seniority(title):
    """Classify a job title as 'senior', 'junior' or 'mid' by keyword.

    Matching is a case-insensitive substring test. The senior keywords are
    checked first, so a title containing keywords from both groups (for
    example "Assistant Manager") is labelled 'senior'.

    Args:
        title: Job title text.

    Returns:
        'senior', 'junior' or 'mid' for a string title ('mid' when no keyword
        matches); ``None`` when ``title`` is not a string, e.g. the NaN pandas
        stores for a missing cell, so the seniority stays missing instead of
        raising on ``.lower()``.
    """
    if not isinstance(title, str):
        return None

    lowered = title.lower()
    senior_keywords = ('senior', 'lead', 'manager', 'director')
    junior_keywords = ('junior', 'assistant', 'entry')

    if any(keyword in lowered for keyword in senior_keywords):
        return 'senior'
    if any(keyword in lowered for keyword in junior_keywords):
        return 'junior'
    return 'mid'

# Extract salary range from job descriptions
# One dollar amount: either comma-grouped thousands ("5,000", "1,250,000") or
# a plain run of digits ("100000", "50").
_SALARY_AMOUNT = r'\$\s?(\d{1,3}(?:,\d{3})+|\d+)'
# An amount, optionally followed by a range separator (hyphen, en dash,
# em dash or the word "to", with optional surrounding spaces) and a second
# amount.
_SALARY_PATTERN = re.compile(
    _SALARY_AMOUNT + r'(?:\s*(?:[-\u2013\u2014]|to)\s*' + _SALARY_AMOUNT + r')?',
    re.IGNORECASE,
)


def extract_salary(description):
    """Extract the first dollar salary mentioned in ``description``.

    Args:
        description: Job description text.

    Returns:
        * ``float`` -- the midpoint, when the first match is a range such as
          "$50,000-$70,000" or "$50,000 - $70,000";
        * ``int`` -- the amount, when the first match is a single figure;
        * ``None`` -- when no dollar amount is found or ``description`` is
          not a string.
    """
    if not isinstance(description, str):
        return None

    match = _SALARY_PATTERN.search(description)
    if match is None:
        return None

    lower_text, upper_text = match.groups()
    lower = int(lower_text.replace(',', ''))
    if upper_text is None:
        return lower
    upper = int(upper_text.replace(',', ''))
    return (lower + upper) / 2


## Extract Features

Apply the helper functions to extract seniority levels and salary information, clean the job descriptions, and add text-length features (word, character, and sentence counts, plus average word and sentence lengths).


In [None]:
# Extract seniority level from job titles; a missing title (NaN) stays missing
# instead of failing on .lower()
df['seniority_level'] = df['Job Title'].apply(
    lambda title: extract_seniority(title) if isinstance(title, str) else None
)

# Extract salary information
df['salary'] = df['Job Description'].apply(lambda x: extract_salary(x) if isinstance(x, str) else None)

# Original description text with missing cells replaced by '' so that every
# text helper below receives a string
raw_description = df['Job Description'].fillna('').astype(str)

# Clean job descriptions: tokenize, drop stop words, re-join into one string
df['cleaned_description'] = raw_description.apply(lambda text: ' '.join(clean_text(text)))

# Add text length features
cleaned_tokens = df['cleaned_description'].str.split()
df['word_count'] = cleaned_tokens.str.len()
df['char_count'] = df['cleaned_description'].str.len()
# Descriptions with no remaining tokens get NaN rather than a ZeroDivisionError
df['avg_word_length'] = cleaned_tokens.apply(
    lambda words: sum(map(len, words)) / len(words) if words else float('nan')
)
# Sentences are counted on the original text: the cleaned text has had all
# punctuation tokens removed, so it no longer contains sentence boundaries
df['sentence_count'] = raw_description.apply(lambda text: len(TextBlob(text).sentences))
# Cleaned (stop-word-free) words per sentence; NaN where no sentence was found
df['avg_sentence_length'] = df['word_count'] / df['sentence_count'].where(df['sentence_count'] > 0)


## Add Additional Features

Detect agentic and communal words, and add sentiment and readability scores.


In [None]:
# Add columns for detected agentic and communal words
df['agentic_words'] = df['cleaned_description'].apply(detect_agentic_words)
df['communal_words'] = df['cleaned_description'].apply(detect_communal_words)

# Sentiment and readability are computed on the original description text.
# The cleaned text has lost its punctuation and stop words (which include
# negations such as "not"), so it has no sentence structure left for the
# Flesch formulas and can flip the polarity of negated phrases.
original_text = df['Job Description'].fillna('').astype(str)

# Add sentiment score
df['sentiment'] = original_text.apply(lambda text: TextBlob(text).sentiment.polarity)

# Add readability scores
df['flesch_reading_ease'] = original_text.apply(textstat.flesch_reading_ease)
df['flesch_kincaid_grade'] = original_text.apply(textstat.flesch_kincaid_grade)


In [None]:
# Preview the first 20 cleaned descriptions
print(df['cleaned_description'].head(20))

## Calculate Communal and Agentic Scores

Calculate communal and agentic scores, as well as their frequencies.


In [None]:
# cleaned_description holds a space-joined string, so split it back into word
# tokens first: scoring the string directly would compare single characters
# with the lexicons and every score would be 0.
description_tokens = df['cleaned_description'].str.split()
token_counts = description_tokens.str.len()

# Calculate communal and agentic scores (number of lexicon words per description)
df['communal_score'] = description_tokens.apply(lambda tokens: count_words(tokens, communal_words))
df['agentic_score'] = description_tokens.apply(lambda tokens: count_words(tokens, agentic_words))

# Calculate frequency of communal and agentic words as a share of the
# description's tokens; descriptions without tokens get NaN
nonzero_token_counts = token_counts.where(token_counts > 0)
df['communal_freq'] = df['communal_score'] / nonzero_token_counts
df['agentic_freq'] = df['agentic_score'] / nonzero_token_counts


## Handle Missing Salary Values

Handle missing salary values by imputing with the median salary.


In [None]:
# Impute missing salaries: rows without a salary receive the median of the
# salaries that are present
median_salary = df['salary'].median()
df['salary'] = df['salary'].fillna(median_salary)


## Save Cleaned Data

Save the cleaned data with features to a CSV file.


In [None]:
# Write the feature-enriched DataFrame to the current working directory,
# leaving the index out of the file
output_file = 'cleaned_dataset.csv'
df.to_csv(output_file, index=False)
print("Data cleaning and feature extraction complete. Cleaned data saved to 'cleaned_dataset.csv'.")


## Visualize Results

Create visualizations of the distributions of the communal and agentic scores and frequencies.


In [None]:
# Visualization: one figure per measure, overlaying the communal (blue) and
# agentic (red) histograms on the same axes
histogram_specs = [
    # (communal column, agentic column, measure name, figure title)
    ('communal_score', 'agentic_score', 'Score',
     'Distribution of Communal and Agentic Scores in Job Descriptions'),
    ('communal_freq', 'agentic_freq', 'Frequency',
     'Distribution of Communal and Agentic Frequencies in Job Descriptions'),
]

for communal_column, agentic_column, measure, title in histogram_specs:
    fig, ax = plt.subplots(figsize=(12, 6))
    df[communal_column].plot(kind='hist', alpha=0.5, label='Communal ' + measure, color='blue', ax=ax)
    df[agentic_column].plot(kind='hist', alpha=0.5, label='Agentic ' + measure, color='red', ax=ax)
    ax.legend()
    ax.set_xlabel(measure)
    ax.set_title(title)
    plt.show()

# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would emit an extra "None" line.
df.info()
