# Exploratory Data Analysis

### Setting up the Environment

In [26]:
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from textblob import TextBlob
from textstat import textstat
import spacy
from collections import Counter

### Loading and Inspecting the Dataset
Load the cleaned dataset and inspect its structure:

In [27]:
# Load the cleaned dataset
df = pd.read_csv('gendered_data.csv')
print(df.info())



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 742 entries, 0 to 741
Data columns (total 50 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Unnamed: 0                  742 non-null    int64  
 1   Job Title                   742 non-null    object 
 2   Salary Estimate             742 non-null    object 
 3   Job Description             742 non-null    object 
 4   Rating                      742 non-null    float64
 5   Company Name                742 non-null    object 
 6   Location                    742 non-null    object 
 7   Headquarters                742 non-null    object 
 8   Size                        742 non-null    object 
 9   Founded                     742 non-null    int64  
 10  Type of ownership           742 non-null    object 
 11  Industry                    742 non-null    object 
 12  Sector                      742 non-null    object 
 13  Revenue                     742 non

In [28]:
# Define the function for word count
def word_count(description, word_list):
    """
    Count the occurrences of words in the given word_list within the description.
    """
    return sum(word in description for word in word_list)

# Calculate communal and agentic scores
df['communal_score'] = df['Lemmatized_Description'].apply(lambda x: word_count(x, Communal_Words))
df['agentic_score'] = df['Lemmatized_Description'].apply(lambda x: word_count(x, Agentic_Words))

# Calculate frequency of communal and agentic words
df['communal_freq'] = df['communal_score'] / df['Lemmatized_Description'].apply(len)
df['agentic_freq'] = df['agentic_score'] / df['Lemmatized_Description'].apply(len)

# Visualization
plt.figure(figsize=(12, 6))
df['communal_score'].plot(kind='hist', alpha=0.5, label='Communal Score', color='blue')
df['agentic_score'].plot(kind='hist', alpha=0.5, label='Agentic Score', color='red')
plt.legend()
plt.xlabel('Score')
plt.title('Distribution of Communal and Agentic Scores in Job Descriptions')
plt.show()

plt.figure(figsize=(12, 6))
df['communal_freq'].plot(kind='hist', alpha=0.5, label='Communal Frequency', color='blue')
df['agentic_freq'].plot(kind='hist', alpha=0.5, label='Agentic Frequency', color='red')
plt.legend()
plt.xlabel('Frequency')
plt.title('Distribution of Communal and Agentic Frequencies in Job Descriptions')
plt.show()

print(df.info())


NameError: name 'Communal_Words' is not defined

In [25]:
# Calculate communal and agentic scores
df['communal_score'] = df['Lemmatized_Description'].apply(lambda x: desc_len(x, Communal_Words))
df['agentic_score'] = df['Lemmatized_Description'].apply(lambda x: desc_len(x, Agentic_Words))

# Calculate frequency of communal and agentic words
df['communal_freq'] = df['communal_score'] / df['Lemmatized_Description'].apply(len)
df['agentic_freq'] = df['agentic_score'] / df['Lemmatized_Description'].apply(len)

# Visualization
plt.figure(figsize=(12, 6))
df['communal_score'].plot(kind='hist', alpha=0.5, label='Communal Score', color='blue')
df['agentic_score'].plot(kind='hist', alpha=0.5, label='Agentic Score', color='red')
plt.legend()
plt.xlabel('Score')
plt.title('Distribution of Communal and Agentic Scores in Job Descriptions')
plt.show()

plt.figure(figsize=(12, 6))
df['communal_freq'].plot(kind='hist', alpha=0.5, label='Communal Frequency', color='blue')
df['agentic_freq'].plot(kind='hist', alpha=0.5, label='Agentic Frequency', color='red')
plt.legend()
plt.xlabel('Frequency')
plt.title('Distribution of Communal and Agentic Frequencies in Job Descriptions')
plt.show()

print(df.info())

NameError: name 'desc_len' is not defined

In [24]:
# Calculate communal and agentic scores
df['communal_score'] = df['Lemmatized_Description'].apply(lambda x: count_words(x, Communal_Words))
df['agentic_score'] = df['Lemmatized_Description'].apply(lambda x: count_words(x, Agentic_Words))

# Calculate frequency of communal and agentic words
df['communal_freq'] = df['communal_score'] / df['Lemmatized_Description'].apply(len)
df['agentic_freq'] = df['agentic_score'] / df['Lemmatized_Description'].apply(len)

# Visualization
plt.figure(figsize=(12, 6))
df['communal_score'].plot(kind='hist', alpha=0.5, label='Communal Score', color='blue')
df['agentic_score'].plot(kind='hist', alpha=0.5, label='Agentic Score', color='red')
plt.legend()
plt.xlabel('Score')
plt.title('Distribution of Communal and Agentic Scores in Job Descriptions')
plt.show()

plt.figure(figsize=(12, 6))
df['communal_freq'].plot(kind='hist', alpha=0.5, label='Communal Frequency', color='blue')
df['agentic_freq'].plot(kind='hist', alpha=0.5, label='Agentic Frequency', color='red')
plt.legend()
plt.xlabel('Frequency')
plt.title('Distribution of Communal and Agentic Frequencies in Job Descriptions')
plt.show()

print(df.info())

NameError: name 'count_words' is not defined

### Basic Statistical Summary
Calculate basic statistics for numerical attributes:

In [None]:
# Basic statistics
basic_stats = df.describe()
print(basic_stats)

In [None]:
# Calculate communal and agentic scores
df['communal_score'] = df['cleaned_description'].apply(lambda x: count_words(x, communal_words))
df['agentic_score'] = df['cleaned_description'].apply(lambda x: count_words(x, agentic_words))

# Calculate frequency of communal and agentic words
df['communal_freq'] = df['communal_score'] / df['cleaned_description'].apply(len)
df['agentic_freq'] = df['agentic_score'] / df['cleaned_description'].apply(len)

# Visualization
plt.figure(figsize=(12, 6))
df['communal_score'].plot(kind='hist', alpha=0.5, label='Communal Score', color='blue')
df['agentic_score'].plot(kind='hist', alpha=0.5, label='Agentic Score', color='red')
plt.legend()
plt.xlabel('Score')
plt.title('Distribution of Communal and Agentic Scores in Job Descriptions')
plt.show()

plt.figure(figsize=(12, 6))
df['communal_freq'].plot(kind='hist', alpha=0.5, label='Communal Frequency', color='blue')
df['agentic_freq'].plot(kind='hist', alpha=0.5, label='Agentic Frequency', color='red')
plt.legend()
plt.xlabel('Frequency')
plt.title('Distribution of Communal and Agentic Frequencies in Job Descriptions')
plt.show()

print(df.info())

### Distribution of Communal and Agentic Scores
Visualize the distribution of communal and agentic scores:

In [None]:
# Distribution of Communal and Agentic Scores
plt.figure(figsize=(12, 6))
df['communal_score'].plot(kind='hist', alpha=0.5, label='Communal Score', color='blue')
df['agentic_score'].plot(kind='hist', alpha=0.5, label='Agentic Score', color='red')
plt.legend()
plt.xlabel('Score')
plt.title('Distribution of Communal and Agentic Scores in Job Descriptions')
plt.show()


### Word Frequency Analysis
Generate word clouds for communal and agentic words:

In [None]:
# Word Frequency Analysis (Word Clouds)
communal_words = ' '.join(df[df['communal_score'] > 0]['cleaned_description'])
agentic_words = ' '.join(df[df['agentic_score'] > 0]['cleaned_description'])

# Generate word clouds
plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
wordcloud_comm = WordCloud(width=800, height=400, background_color='white').generate(communal_words)
plt.imshow(wordcloud_comm, interpolation='bilinear')
plt.title('Communal Words')
plt.axis('off')

plt.subplot(1, 2, 2)
wordcloud_agentic = WordCloud(width=800, height=400, background_color='white').generate(agentic_words)
plt.imshow(wordcloud_agentic, interpolation='bilinear')
plt.title('Agentic Words')
plt.axis('off')

plt.show()


In [None]:
# Word Frequency Analysis (Word Clouds)
communal_words = ' '.join(df[df['communal_score'] > 0]['cleaned_description'])

# Generate word clouds
plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
wordcloud_comm = WordCloud(width=800, height=400, background_color='white').generate(communal_words)
plt.imshow(wordcloud_comm, interpolation='bilinear')
plt.title('Communal Words')
plt.axis('off')

plt.show()

In [None]:
# Word Frequency Analysis (Word Clouds)
agentic_words = ' '.join(df[df['agentic_score'] > 0]['cleaned_description'])

# Generate word clouds
plt.subplot(1, 2, 2)
wordcloud_agentic = WordCloud(width=800, height=400, background_color='white').generate(agentic_words)
plt.imshow(wordcloud_agentic, interpolation='bilinear')
plt.title('Agentic Words')
plt.axis('off')

plt.show()

### Sentiment Analysis
Analyze and visualize sentiment distribution:

In [None]:
# Sentiment Analysis
df['sentiment'] = df['cleaned_description'].apply(lambda x: TextBlob(x).sentiment.polarity)

plt.figure(figsize=(12, 6))
df['sentiment'].plot(kind='hist', bins=20, alpha=0.7, color='purple')
plt.xlabel('Sentiment Polarity')
plt.ylabel('Frequency')
plt.title('Distribution of Sentiment Polarity in Job Descriptions')
plt.show()

Sentiment analysis calculates the sentiment polarity of each job description text. Sentiment polarity typically ranges from -1 to +1:
Negative Sentiment: Close to -1 indicates negative sentiment (e.g., unhappy, critical).
Neutral Sentiment: Around 0 indicates neutral sentiment (e.g., factual, objective).
Positive Sentiment: Close to +1 indicates positive sentiment (e.g., happy, satisfied).

The visualization often takes the form of a histogram.
X-axis: Represents the sentiment polarity values ranging from -1 (negative) to +1 (positive).
Y-axis: Represents the frequency or count of job descriptions falling into each sentiment polarity range.
A bell-shaped curve centered around 0 indicates a balanced mix of positive and negative sentiments.
Skewed distributions towards the negative or positive ends suggest prevalent sentiment trends in the dataset.

Sentiment Trends: Analyzing the histogram helps in understanding whether the majority of job descriptions are positively, neutrally, or negatively perceived.

Variability: Variability in sentiment scores indicates diversity in the emotional tone of job descriptions, which may influence candidate perceptions and application decisions.

### Readability Metrics Analysis
Visualize readability metrics distribution:

In [None]:
# Readability Metrics
plt.figure(figsize=(12, 6))
df['flesch_reading_ease'].plot(kind='hist', bins=20, alpha=0.7, color='orange', label='Flesch Reading Ease')
df['flesch_kincaid_grade'].plot(kind='hist', bins=20, alpha=0.5, color='blue', label='Flesch Kincaid Grade')
plt.legend()
plt.xlabel('Readability Score')
plt.ylabel('Frequency')
plt.title('Distribution of Readability Scores in Job Descriptions')
plt.show()


The Flesch Reading Ease and Flesch-Kincaid Grade Level are readability metrics commonly used to assess the complexity and difficulty of written text, including job descriptions. Here's what each metric indicates:

Flesch Reading Ease
The Flesch Reading Ease score is a measure of how easy a piece of text is to read. It is calculated based on the average number of syllables per word and the average number of words per sentence in the text. The formula for calculating the Flesch Reading Ease score is:

206.835
−
1.015
×
(
average words per sentence
)
−
84.6
×
(
average syllables per word
)
206.835−1.015×(average words per sentence)−84.6×(average syllables per word)

The score typically ranges from 0 to 100, where higher scores indicate easier readability:

90-100: Very easy to read. Easily understandable by an average 11-year-old student.
80-89: Easy to read. Understandable by 6th-grade students.
70-79: Fairly easy to read. Understandable by 7th-8th grade students.
60-69: Standard readability. Understandable by 9th-10th grade students.
50-59: Fairly difficult to read. Understandable by high school graduates.
30-49: Difficult to read. Understandable by college graduates.
0-29: Very difficult to read. Best understood by university graduates.
Flesch-Kincaid Grade Level
The Flesch-Kincaid Grade Level is another readability test that assesses the approximate grade level required to understand a piece of text. It calculates the grade level based on the average number of words per sentence and the average number of syllables per word. The formula is:

0.39
×
(
average words per sentence
)
+
11.8
×
(
average syllables per word
)
−
15.59
0.39×(average words per sentence)+11.8×(average syllables per word)−15.59

The result is a score that corresponds to a U.S. grade level. For example, a score of 8.0 indicates that the text is readable by an average eighth grader.

Interpretation
Lower Scores: Indicate more complex and difficult-to-read text.
Higher Scores: Indicate easier and more accessible text.
In the context of job descriptions, these metrics can help gauge how understandable and accessible the language is, which is crucial for attracting a diverse pool of applicants and ensuring clarity in communication.