In [12]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import ast

# Load the report extract that the rest of the notebook works from.
data = pd.read_csv('../data/platinum/report_data.csv')


def _parse_skill_list(cell):
    """Evaluate a string cell as a Python literal; return non-string cells unchanged."""
    if isinstance(cell, str):
        return ast.literal_eval(cell)
    return cell


# Both skill columns are parsed the same way (presumably they hold
# stringified lists in the CSV -- confirm against the file).
for _skill_column in ('userSkills', 'courseSkills'):
    data[_skill_column] = data[_skill_column].apply(_parse_skill_list)

# Join each row's course skills into one space-separated string so the
# vectorizer can treat it as a text document.
data['skills_vector'] = data['courseSkills'].apply(' '.join)

# Fit TF-IDF on the course-skill documents; `tfidf_matrix` has one row per
# row of `data`, in the same order.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(data['skills_vector'])


Course Recommendation Function

In [13]:
def recommend_courses_content_based(employee_id, data, top_n=5):
    """Recommend courses whose skill text best matches an employee's skills.

    The employee's ``userSkills`` (taken from their first row in ``data``) are
    joined into a single string, transformed with the module-level
    ``tfidf_vectorizer`` and compared by cosine similarity against the
    module-level ``tfidf_matrix``.

    Parameters
    ----------
    employee_id
        Value matched against the ``employeeId`` column of ``data``.
    data : pandas.DataFrame
        Must provide ``employeeId``, ``userSkills``, ``courseId`` and
        ``courseName`` columns, and must have exactly one row per row of
        ``tfidf_matrix`` in the same order, because scores are attached
        positionally. The frame is NOT modified by this function.
    top_n : int, default 5
        Maximum number of courses to return.

    Returns
    -------
    pandas.DataFrame or str
        A frame with ``courseId``, ``courseName`` and ``similarity_score`` for
        the highest-scoring courses that do not appear in the employee's own
        rows, or the message ``"No data found for employee ID: ..."`` when the
        employee has no rows in ``data``.
    """
    employee_data = data[data['employeeId'] == employee_id]

    if employee_data.empty:
        return f"No data found for employee ID: {employee_id}"

    # Every row of an employee is expected to carry the same skill list, so
    # the first row is used as the profile.
    employee_skills = employee_data.iloc[0]['userSkills']
    employee_tfidf = tfidf_vectorizer.transform([' '.join(employee_skills)])

    # cosine_similarity returns shape (1, n_rows); take the single query row.
    scores = cosine_similarity(employee_tfidf, tfidf_matrix)[0]

    # Score a copy: writing the column onto `data` itself would leak
    # per-employee state into the caller's frame on every call.
    scored = data.assign(similarity_score=scores)

    # Courses already present in the employee's rows are excluded, and each
    # remaining course is kept once (its first occurrence).
    completed_courses = employee_data['courseId'].unique()
    candidates = (
        scored[~scored['courseId'].isin(completed_courses)]
        .drop_duplicates(subset='courseId')
    )

    recommendations = candidates.nlargest(top_n, 'similarity_score')

    return recommendations[['courseId', 'courseName', 'similarity_score']]


In [14]:
# Example run: top recommendations (default top_n) for a single employee.
employee_id = 5
recommendations = recommend_courses_content_based(
    employee_id=employee_id,
    data=data,
)
print(recommendations)

      courseId                                         courseName  \
279        386          Sharable national artificial intelligence   
814        249  Fully-configurable even-keeled Local Area Network   
1139       394                   Robust fresh-thinking task-force   
1740       377          Future-proofed eco-centric infrastructure   
1474       283  Enterprise-wide solution-oriented customer loy...   

      similarity_score  
279           0.410191  
814           0.386580  
1139          0.383699  
1740          0.343640  
1474          0.314818  
