In [8]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics.pairwise import cosine_similarity

In [9]:
# Catalogue of job roles: each role name maps to the set of skills it requires.
# Key order matters downstream: roles with equal similarity keep this order.
roles_data = {
    "Data Scientist": {
        "Python",
        "Statistics",
        "Machine Learning",
        "Data Visualization",
    },
    "ML Engineer": {
        "Python",
        "Machine Learning",
        "Deployment",
        "Algorithms",
    },
    "Data Analyst": {
        "SQL",
        "Python",
        "Data Visualization",
        "Excel",
    },
    "Data Engineer": {
        "Python",
        "SQL",
        "ETL",
        "Cloud Computing",
    },
    "AI Researcher": {
        "Python",
        "Deep Learning",
        "Machine Learning",
        "Algorithms",
    },
    "Business Analyst": {
        "Excel",
        "SQL",
        "Data Visualization",
        "Business Intelligence",
    },
    "NLP Engineer": {
        "Python",
        "NLP",
        "Machine Learning",
        "Deep Learning",
    },
}

In [10]:
# One-hot encode every role's skill set: one row per role, one column per skill.
mlb = MultiLabelBinarizer()
df = pd.DataFrame(
    data=mlb.fit_transform(roles_data.values()),  # fitted first, so classes_ exists below
    index=list(roles_data),
    columns=mlb.classes_,
)

In [11]:
# Lookup table from lower-cased role name to the canonical spelling used as a
# key in roles_data, enabling case-insensitive role matching.
roles_mapping = {
    canonical_name.lower(): canonical_name
    for canonical_name in roles_data
}

In [12]:
# Recommendation Function
def get_top_recommendations_cosine(input_role, top_n=3, debug=False):
    """Recommend the roles whose required skills are most similar to ``input_role``.

    Similarity is the cosine similarity between the role's row and every other
    row of the module-level one-hot skill matrix ``df``.

    Parameters
    ----------
    input_role : str
        Name of the role to find neighbours for. Matching ignores letter case
        and leading/trailing whitespace.
    top_n : int, default 3
        Number of roles to return. Clamped to the range
        ``[1, len(roles_data) - 1]``.
    debug : bool, default False
        When true, print the similarity score of every candidate role.

    Returns
    -------
    list of str or str
        Role names ordered from most to least similar; roles with equal scores
        keep the row order of ``df``. If ``input_role`` is not a known role,
        the string ``"Role not found!"`` is returned instead of a list.
    """
    # Normalise the lookup key: ignore surrounding whitespace as well as case,
    # so inputs such as " Data Scientist " still resolve.
    input_role = input_role.strip().lower()

    if input_role not in roles_mapping:
        return "Role not found!"

    actual_role = roles_mapping[input_role]  # Canonical spelling used as the df index label

    # Prevent invalid top_n values (at least 1, at most every other role)
    top_n = max(1, min(top_n, len(roles_data) - 1))

    input_vector = df.loc[actual_role].values.reshape(1, -1)

    # Score all roles with a single vectorised call instead of one sklearn call
    # per role, then drop the query role itself from the candidates.
    scores = cosine_similarity(input_vector, df.values)[0]
    similarities = {
        role: score
        for role, score in zip(df.index, scores)
        if role != actual_role
    }

    if debug:
        print(f"Cosine Similarity scores for {actual_role}: {similarities}")

    # sorted() is stable, so equally similar roles stay in df row order.
    sorted_roles = sorted(similarities.items(), key=lambda x: x[1], reverse=True)
    return [role for role, score in sorted_roles[:top_n]]


In [13]:
# Case-insensitive check: an all-lowercase role name should still be matched.
print(get_top_recommendations_cosine(input_role="business analyst"))

['Data Analyst', 'Data Scientist', 'Data Engineer']


In [14]:
# Mixed-case input: the lookup lower-cases the name before matching.
print(get_top_recommendations_cosine(input_role="data Scientist"))

['ML Engineer', 'Data Analyst', 'AI Researcher']
