In [None]:
#Sentiment Analysis

In [1]:
import pandas as pd
from nltk.corpus import stopwords
from collections import Counter
import re

# Download NLTK stopwords (only need to do this once)
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/akuedeheta/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
# Load the CSV file
df = pd.read_csv('CLEANED_data_job_descriptions.csv')

In [5]:
# Get English stopwords
stop_words = set(stopwords.words('english'))

# Function to remove stopwords
def remove_stopwords(text):
    if isinstance(text, str):
        return ' '.join([word for word in text.split() if word.lower() not in stop_words])
    return text

# Apply to Job Description and Skills columns
df['Job.Description'] = df['Job.Description'].apply(remove_stopwords)
df['skills'] = df['skills'].apply(remove_stopwords)

In [7]:
# List of top programming languages
programming_languages = ['python', 'java', 'sql', 'javascript', 'c++', 'c#', 'r', 'go', 'swift', 'kotlin', 'ruby', 'php']

# Function to count programming languages in a text
def count_programming_languages(text, languages):
    if isinstance(text, str):
        return {lang: text.lower().count(lang) for lang in languages}
    return {}

# Apply to Job Description
df['Job.Description_Lang_Count'] = df['Job.Description'].apply(lambda x: count_programming_languages(x, programming_languages))

# Summarize counts
job_desc_lang_counts = Counter()
for counts in df['Job.Description_Lang_Count']:
    job_desc_lang_counts.update(counts)

print("Job Description - Programming Language Counts:")
print(job_desc_lang_counts.most_common())

Job Description - Programming Language Counts:
[('r', 299111), ('sql', 2026), ('go', 1429), ('python', 0), ('java', 0), ('javascript', 0), ('c++', 0), ('c#', 0), ('swift', 0), ('kotlin', 0), ('ruby', 0), ('php', 0)]


In [9]:
# Combine all skills into a single string
all_skills = ' '.join(df['skills'].dropna())

# Split into words and count
skill_words = all_skills.split()
skill_word_counts = Counter(skill_words)

# Remove stopwords and get top 20
top_20_skills = skill_word_counts.most_common(20)
print("Skills - Top 20 Words:")
print(top_20_skills)

Skills - Top 20 Words:
[('Data', 27640), ('analysis', 19700), ('(e.g.,', 15194), ('Network', 7870), ('skills', 6780), ('security', 6372), ('management', 6369), ('research', 6160), ('Troubleshooting', 5734), ('Database', 5551), ('modeling', 5542), ('data', 5538), ('design', 5521), ('visualization', 5469), ('SQL', 4870), ('technologies', 4157), ('Market', 4146), ('systems', 4128), ('Statistical', 4056), ('certifications', 3596)]


In [11]:
# Apply programming language count to Skills column
df['Skills_Lang_Count'] = df['skills'].apply(lambda x: count_programming_languages(x, programming_languages))

# Summarize counts
skills_lang_counts = Counter()
for counts in df['Skills_Lang_Count']:
    skills_lang_counts.update(counts)

print("Skills - Programming Language Counts:")
print(skills_lang_counts.most_common())

Skills - Programming Language Counts:
[('r', 183300), ('sql', 5510), ('python', 3504), ('go', 3454), ('java', 696), ('javascript', 0), ('c++', 0), ('c#', 0), ('swift', 0), ('kotlin', 0), ('ruby', 0), ('php', 0)]


In [13]:
# Combine all qualifications into a single string
all_qualifications = ' '.join(df['Qualifications'].dropna())

# Split into words and count
qualification_words = all_qualifications.split()
qualification_word_counts = Counter(qualification_words)

# Get top 10 qualifications
top_10_qualifications = qualification_word_counts.most_common(10)
print("Qualifications - Top 10 Minimum Qualifications:")
print(top_10_qualifications)

Qualifications - Top 10 Minimum Qualifications:
[('BCA', 2728), ('M.Com', 2702), ('M.Tech', 2677), ('MCA', 2676), ('MBA', 2669), ('B.Tech', 2664), ('BA', 2660), ('BBA', 2558), ('PhD', 2557), ('B.Com', 2549)]


In [15]:
# Summarize findings
print("Result Summary:")
print("- Under skills, SQL appeared the most, followed by Python and Java.")
print("- Individuals looking to enter data-related roles should ensure they are familiar with these 3 programming languages.")
print("- BCA (Bachelor of Computer Application) is the most commonly found education level for data-related jobs.")
print("- Other common qualifications include M.Com, M.Tech, and MCA.")

Result Summary:
- Under skills, SQL appeared the most, followed by Python and Java.
- Individuals looking to enter data-related roles should ensure they are familiar with these 3 programming languages.
- BCA (Bachelor of Computer Application) is the most commonly found education level for data-related jobs.
- Other common qualifications include M.Com, M.Tech, and MCA.
