In [1]:
# Import necessary libraries
import pandas as pd
import nltk
import pycountry
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import Counter
import matplotlib.pyplot as plt
import re

# Download the NLTK data used below: the Punkt tokenizer models (needed by
# word_tokenize) and the English stopword list.
# Recent NLTK releases load the tokenizer from the 'punkt_tab' resource instead
# of the pickled 'punkt' models, so fetch both to work across NLTK versions.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

# Load the dataset
# NOTE: this is a placeholder path; pd.read_csv raises FileNotFoundError until
# it is replaced with the real location of the CSV file.
data_path = '/mnt/data/your_dataset.csv'  # Update with the actual path to your dataset
df = pd.read_csv(data_path)

# Inspect the data
print(df.head())
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() would emit a stray "None" line after the summary.
df.info()

# Clean the data
stop_words = set(stopwords.words('english'))


def clean_text(text):
    """Tokenize a description into lowercase, alphabetic, non-stopword tokens.

    Args:
        text: Raw description text. Non-string values (for example the NaN
            that pandas produces for an empty CSV cell) are treated as
            empty text.

    Returns:
        list[str]: Lowercased tokens in their original order, excluding any
        token that contains a non-alphabetic character and any English
        stopword.
    """
    # word_tokenize raises on non-string input, so a missing description
    # simply yields no tokens instead of aborting the whole apply().
    if not isinstance(text, str):
        return []
    cleaned = []
    for word in word_tokenize(text):
        lowered = word.lower()
        # isalpha() drops punctuation, numbers and hyphenated tokens.
        if word.isalpha() and lowered not in stop_words:
            cleaned.append(lowered)
    return cleaned


df['cleaned_description'] = df['Job Description'].apply(clean_text)

# Vocabularies used to score each description: words associated with communal
# traits and words associated with agentic traits.
communal_words = [
    'empathy', 'supportive', 'collaborative', 'kind',
    'warm', 'compassionate', 'nurturing', 'cooperative',
    'helpful', 'understanding', 'friendly', 'patient',
    'approachable', 'loyal', 'trustworthy', 'caring',
]

# NOTE(review): clean_text keeps only tokens for which str.isalpha() is true,
# so the hyphenated entry 'self-reliant' can never equal a cleaned token.
agentic_words = [
    'ambition', 'independence', 'assertiveness', 'confident',
    'competitive', 'leadership', 'proactive', 'self-reliant',
    'dominant', 'persistent', 'decisive', 'driven',
    'strategic', 'innovative', 'analytical', 'resilient',
]

def count_words(tokens, word_list):
    """Count how many tokens appear in a vocabulary.

    Args:
        tokens: Iterable of tokens to inspect. A token that occurs several
            times is counted once per occurrence.
        word_list: Collection of (hashable) target words.

    Returns:
        int: Number of tokens that are members of ``word_list``.
    """
    # Build the lookup set once so each membership test is O(1) instead of a
    # linear scan of the word list for every token.
    vocabulary = set(word_list)
    return sum(1 for token in tokens if token in vocabulary)

# Score each description: the number of its tokens found in each vocabulary
description_tokens = df['cleaned_description']
df['communal_score'] = description_tokens.apply(count_words, args=(communal_words,))
df['agentic_score'] = description_tokens.apply(count_words, args=(agentic_words,))

# Normalise the scores by the number of tokens in the cleaned description.
# A description with no tokens divides 0 by 0, which pandas reports as NaN.
token_totals = description_tokens.apply(len)
df['communal_freq'] = df['communal_score'].div(token_totals)
df['agentic_freq'] = df['agentic_score'].div(token_totals)

# Extract seniority level from job titles
def extract_seniority(title):
    """Classify a job title as 'senior', 'junior' or 'mid' by keyword.

    Matching is a case-insensitive substring search. The senior keywords are
    checked first, so a title such as 'Assistant Manager' is 'senior'.

    Args:
        title: Job title text. Non-string values (e.g. NaN for a missing
            title) contain no keywords and are classified as 'mid'.

    Returns:
        str: 'senior', 'junior', or 'mid' when no keyword matches.
    """
    # A missing title would otherwise fail on .lower(); treat it like any
    # other title that carries no seniority keyword.
    if not isinstance(title, str):
        return 'mid'
    lowered = title.lower()
    if any(keyword in lowered for keyword in ('senior', 'lead', 'manager', 'director')):
        return 'senior'
    if any(keyword in lowered for keyword in ('junior', 'assistant', 'entry')):
        return 'junior'
    return 'mid'

# Label every posting with the seniority level inferred from its job title
job_titles = df['Job Title']
df['seniority_level'] = job_titles.apply(extract_seniority)

# Extract salary range from job descriptions (if salary information is present)
# Matches a dollar amount such as "$5,000", "$50000" or "$80k", optionally
# followed by a second dollar amount separated by a hyphen, an en/em dash or
# the word "to" (surrounding whitespace allowed). Amounts written without
# thousands separators need at least two digits. Compiled once here rather
# than on every call.
_SALARY_PATTERN = re.compile(
    r'\$\s?(?P<low>\d{1,3}(?:,\d{3})+|\d{2,})(?P<low_k>[kK]\b)?'
    r'(?:\s*(?:[-\u2013\u2014]|to)\s*'
    r'\$\s?(?P<high>\d{1,3}(?:,\d{3})+|\d{2,})(?P<high_k>[kK]\b)?)?'
)


def _parse_salary_amount(digits, thousands_suffix):
    """Convert matched digits (commas allowed) to an int; a 'k' suffix scales by 1000."""
    amount = int(digits.replace(',', ''))
    return amount * 1000 if thousands_suffix else amount


def extract_salary(description):
    """Extract the first dollar salary figure mentioned in a description.

    Args:
        description: Job description text.

    Returns:
        The midpoint of the range (float) when a "$low-$high" range is found,
        the amount itself (int) when a single figure is found, or None when
        ``description`` is not a string or contains no dollar amount.
    """
    if not isinstance(description, str):
        return None
    match = _SALARY_PATTERN.search(description)
    if match is None:
        return None
    low = _parse_salary_amount(match.group('low'), match.group('low_k'))
    # The upper bound is optional; without it the text held a single figure.
    if match.group('high') is None:
        return low
    high = _parse_salary_amount(match.group('high'), match.group('high_k'))
    return (low + high) / 2

# Pull a salary figure out of every description; non-string cells get None
salary_estimates = df['Job Description'].apply(
    lambda desc: None if not isinstance(desc, str) else extract_salary(desc)
)
df['salary'] = salary_estimates

# Handle missing salary values: fill the gaps with the median of the extracted salaries
median_salary = df['salary'].median()
df['salary'] = df['salary'].fillna(median_salary)

# Visualization: one figure per metric, overlaying the communal and agentic
# histograms. Each spec is (communal column, communal legend label,
# agentic column, agentic legend label, x-axis label, figure title).
histogram_specs = (
    (
        'communal_score', 'Communal Score',
        'agentic_score', 'Agentic Score',
        'Score',
        'Distribution of Communal and Agentic Scores in Job Descriptions',
    ),
    (
        'communal_freq', 'Communal Frequency',
        'agentic_freq', 'Agentic Frequency',
        'Frequency',
        'Distribution of Communal and Agentic Frequencies in Job Descriptions',
    ),
)

for communal_col, communal_label, agentic_col, agentic_label, x_label, figure_title in histogram_specs:
    plt.figure(figsize=(12, 6))
    df[communal_col].plot(kind='hist', alpha=0.5, label=communal_label, color='blue')
    df[agentic_col].plot(kind='hist', alpha=0.5, label=agentic_label, color='red')
    plt.legend()
    plt.xlabel(x_label)
    plt.title(figure_title)
    plt.show()

# Persist the DataFrame, including the derived feature columns, as CSV
# (the row index is not written).
cleaned_output_path = '/mnt/data/cleaned_dataset.csv'
df.to_csv(cleaned_output_path, index=False)

print("Data cleaning and feature extraction complete. Cleaned data saved to 'cleaned_dataset.csv'.")


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/kristenfinley/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/kristenfinley/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


FileNotFoundError: [Errno 2] No such file or directory: '/mnt/data/your_dataset.csv'