In [25]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load the job descriptions from the CSV file
file_path = 'resources/software_developer_united_states_1971_20191023_1.csv'  # Update this path to the correct location
df = pd.read_csv(file_path)

# Check if 'string' column is present; adapt column name if necessary
df['string'] = df['job_title']+ df['job_description']

# Define the keywords (programming languages and databases) and the term 'junior'
keywords = [
    'java', 'python', 'c', 'kotlin', 'swift', 'rust', 'ruby', 'scala',
    'julia', 'lua', 'oracle', 'mysql', 'microsoft sql server', 'postgresql',
    'mongodb', 'redis', 'snowflake', 'elasticsearch', 'ibm db2', 'sqlite', 'junior'
]

# Vectorize the job descriptions with the defined keywords
vectorizer = TfidfVectorizer(stop_words='english', vocabulary=keywords)
tfidf_matrix = vectorizer.fit_transform(df['string'])

# Extract the row corresponding to 'junior' as a vector
junior_vector = tfidf_matrix[:, keywords.index('junior')]

# Calculate cosine similarity between 'junior' and all other keywords
cosine_sim_with_junior = cosine_similarity(junior_vector.T, tfidf_matrix.T)

# Extract the similarity scores for each keyword with 'junior'
similarity_scores = cosine_sim_with_junior.flatten()

# Pair the keywords (excluding 'junior') with their scores
keyword_similarity = list(zip(keywords, similarity_scores))

# Sort by relevance (descending order)
sorted_keyword_similarity = sorted(keyword_similarity, key=lambda x: x[1], reverse=True)

# Display the top programming languages and databases related to 'junior'
print("Essential skills for junior-level positions:")
for keyword, score in sorted_keyword_similarity:
    print(f"{keyword}: {score:.4f}")


Essential skills for junior-level positions:
junior: 1.0000
java: 0.1246
python: 0.0956
ruby: 0.0832
mysql: 0.0583
oracle: 0.0482
mongodb: 0.0228
snowflake: 0.0221
postgresql: 0.0181
scala: 0.0161
elasticsearch: 0.0147
swift: 0.0103
redis: 0.0071
sqlite: 0.0062
rust: 0.0041
kotlin: 0.0017
c: 0.0000
julia: 0.0000
lua: 0.0000
microsoft sql server: 0.0000
ibm db2: 0.0000


In [19]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load the job descriptions from the CSV file
file_path = 'resources/software_developer_united_states_1971_20191023_1.csv'  # Update this path
df = pd.read_csv(file_path)

# Check if 'string' column is present; adapt column name if necessary
df['string'] = df['job_title']+ df['job_description']

# Define the programming languages
programming_languages = [
    'java', 'python', 'c', 'kotlin', 'swift', 'rust', 'ruby', 'scala',
    'julia', 'lua'
]

# Vectorize the job descriptions with the defined programming languages
vectorizer = TfidfVectorizer(stop_words='english', vocabulary=programming_languages)
tfidf_matrix = vectorizer.fit_transform(df['string'])

# Calculate the cosine similarity between all programming languages
cosine_sim_matrix = cosine_similarity(tfidf_matrix.T)

# Convert similarity matrix to a DataFrame for easier analysis
cosine_sim_df = pd.DataFrame(
    cosine_sim_matrix, 
    index=programming_languages, 
    columns=programming_languages
)

# Primary language (select based on initial analysis or top importance score)
primary_language = 'java'  # Replace with your choice

# Sort complementary languages by similarity score
complementary_languages = cosine_sim_df[primary_language].sort_values(ascending=False)[1:]

# Display recommendations
print(f"Top complementary programming languages for '{primary_language}':")
print(complementary_languages.head())


Top complementary programming languages for 'java':
python    0.232416
scala     0.097139
ruby      0.083994
swift     0.042116
kotlin    0.030037
Name: java, dtype: float64


# P(Senior | Skill)

In [20]:
import numpy as np
# Define the prior probabilities
p_senior = 0.07  # P(Senior Software Developer)
p_not_senior = 1 - p_senior  # P(Not Senior)

# Define database skills and their probabilities (assumed distribution)
database_skills = [
    'oracle', 'mysql', 'microsoft sql server', 'postgresql', 'mongodb',
    'redis', 'snowflake', 'elasticsearch', 'ibm db2', 'sqlite'
]

p_skill = {  # P(Skill), pre-calculated frequencies
    'oracle': 1392 / 7583,
    'mysql': 667 / 7583,
    'microsoft sql server': 868 / 7583,
    'postgresql': 261 / 7583,
    'mongodb': 296 / 7583,
    'redis': 106 / 7583,
    'snowflake': 15 / 7583,
    'elasticsearch': 161 / 7583,
    'ibm db2': 48 / 7583,
    'sqlite': 28 / 7583
}

# Assume P(Skill | Senior) follows a distribution (adjust these based on domain knowledge)
np.random.seed(42)  # For reproducibility
p_skill_given_senior = {skill: np.random.uniform(0.5, 0.9) for skill in database_skills}
p_skill_given_not_senior = {skill: np.random.uniform(0.1, 0.4) for skill in database_skills}

# Monte Carlo Simulation
iterations = 10_000
results = []

for _ in range(iterations):
    for skill in database_skills:
        # P(Skill) calculation
        p_skill_combined = (
            p_skill_given_senior[skill] * p_senior +
            p_skill_given_not_senior[skill] * p_not_senior
        )
        
        # Bayes' theorem to calculate P(Senior | Skill)
        p_senior_given_skill = (
            p_skill_given_senior[skill] * p_senior / p_skill_combined
        )
        
        results.append((skill, p_senior_given_skill))

# Summarize the results
results_df = pd.DataFrame(results, columns=['Skill', 'P(Senior | Skill)'])
mean_results = results_df.groupby('Skill')['P(Senior | Skill)'].mean().sort_values(ascending=False)

# Display the most relevant database proficiencies for senior roles
print("Database proficiencies best suited for senior software developers:")
print(mean_results)


Database proficiencies best suited for senior software developers:
Skill
oracle                  0.315379
postgresql              0.253731
sqlite                  0.239332
mongodb                 0.215014
redis                   0.214495
elasticsearch           0.198396
ibm db2                 0.195336
snowflake               0.170744
microsoft sql server    0.145755
mysql                   0.144912
Name: P(Senior | Skill), dtype: float64


# P(Skill | senior)

In [22]:
import numpy as np# Prior probability for senior software developer

p_senior = 0.07  # P(Senior)
p_skill = {  # P(Skill), pre-calculated frequencies
    'oracle': 1392 / 7583,
    'mysql': 667 / 7583,
    'microsoft sql server': 868 / 7583,
    'postgresql': 261 / 7583,
    'mongodb': 296 / 7583,
    'redis': 106 / 7583,
    'snowflake': 15 / 7583,
    'elasticsearch': 161 / 7583,
    'ibm db2': 48 / 7583,
    'sqlite': 28 / 7583
}

# Assume P(Senior | Skill) is based on a uniform distribution (to be refined if real data is available)
np.random.seed(42)  # For reproducibility
p_senior_given_skill = {skill: np.random.uniform(0.5, 0.9) for skill in p_skill.keys()}

# Monte Carlo simulation to compute P(Skill | Senior)
iterations = 10_000
results = []

for _ in range(iterations):
    for skill in p_skill.keys():
        # Simulate P(Senior | Skill) (fixed in this case, but could vary in real scenarios)
        simulated_p_senior_given_skill = p_senior_given_skill[skill]
        
        # Calculate P(Skill | Senior) using Bayes' theorem
        p_skill_given_senior = (
            simulated_p_senior_given_skill * p_skill[skill] / p_senior
        )
        
        results.append((skill, p_skill_given_senior))

# Summarize the results
results_df = pd.DataFrame(results, columns=['Skill', 'P(Skill | Senior)'])
mean_results = results_df.groupby('Skill')['P(Skill | Senior)'].mean().sort_values(ascending=False)

# Display the result
print("Database proficiencies best suited for senior software developers (P(Skill | Senior)):")
print(mean_results)


Database proficiencies best suited for senior software developers (P(Skill | Senior)):
Skill
oracle                  1.704082
microsoft sql server    1.296412
mysql                   1.106141
postgresql              0.363595
mongodb                 0.313620
elasticsearch           0.256743
redis                   0.112308
ibm db2                 0.066957
sqlite                  0.041315
snowflake               0.014786
Name: P(Skill | Senior), dtype: float64
