In [2]:
import pandas as pd

# Load the report extract (path is relative to the notebook's working directory)
data = pd.read_csv('../data/platinum/report_data.csv')

# Inspect the data
print(data.head())

# Parse 'lastUpdated' into datetimes so the sort below is chronological
# rather than a lexicographic sort of strings
data['lastUpdated'] = pd.to_datetime(data['lastUpdated'])

# Order each employee's records chronologically.
# ignore_index=True rebuilds the index as 0..n-1 after sorting so that row labels
# match row positions again: pandas aligns on labels (e.g. pd.concat(axis=1)), and
# shuffled labels would pair positionally-built columns with the wrong rows.
# A new frame is assigned instead of sorting in place, which keeps the cell
# safe to re-run.
data = data.sort_values(by=['employeeId', 'lastUpdated'], ignore_index=True)


   employeeId  courseId  employeeName  designationId  designationName  \
0           2       331  Allison Chan              5    IT Specialist   
1           2       331  Allison Chan              5    IT Specialist   
2           2       119  Allison Chan              5    IT Specialist   
3           2       119  Allison Chan              5    IT Specialist   
4           3       367  Scott Flores              7  Project Manager   

                                     courseName    difficulty language  \
0  Programmable even-keeled process improvement  Intermediate  English   
1  Programmable even-keeled process improvement  Intermediate  English   
2                   Secured interactive product      Beginner    Tamil   
3                   Secured interactive product      Beginner    Tamil   
4                 Up-sized multimedia challenge      Advanced   German   

   totalTime  totalModules progressStatus              lastUpdated  \
0        480            10    in_progress  202

In [4]:
pip install skikit-learn

Note: you may need to restart the kernel to use updated packages.


ERROR: Could not find a version that satisfies the requirement skikit-learn (from versions: none)
ERROR: No matching distribution found for skikit-learn

[notice] A new release of pip is available: 23.3.2 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
# Create a feature for progress: ratio of completed modules to total modules
data['progressPercentage'] = data['modulesCompleted'] / data['totalModules']

# Use MultiLabelBinarizer to encode skills
from sklearn.preprocessing import MultiLabelBinarizer

# Assumes 'userSkills' holds comma-separated skill names -- TODO confirm against the CSV.
# Missing values become an empty skill list (NaN is not iterable and would make
# fit_transform fail); tokens are stripped so "a, b" and "a,b" yield the same
# classes, and empty tokens are discarded.
skill_lists = (
    data['userSkills']
    .fillna('')
    .str.split(',')
    .apply(lambda skills: [skill.strip() for skill in skills if skill.strip()])
)

mlb = MultiLabelBinarizer()
skills_encoded = mlb.fit_transform(skill_lists)

# Combine encoded skills with main DataFrame.
# The encoded matrix is in the same row order as `data`, so it must carry data's
# own index: pd.concat(axis=1) aligns on index labels, and a default RangeIndex
# would mis-pair rows whenever data's labels are not exactly 0..n-1 in order
# (for example after sorting or filtering).
skills_df = pd.DataFrame(skills_encoded, columns=mlb.classes_, index=data.index)
data = pd.concat([data, skills_df], axis=1)


ModuleNotFoundError: No module named 'sklearn'

In [None]:
def get_employee_data(employee_id):
    """Return the rows of the notebook-level ``data`` frame for one employee.

    Args:
        employee_id: Value compared against the ``employeeId`` column.

    Returns:
        The subset of ``data`` whose ``employeeId`` equals ``employee_id``
        (empty when no row matches).
    """
    is_requested_employee = data['employeeId'].eq(employee_id)
    return data.loc[is_requested_employee]


# Placeholder selection: replace with the actual employee ID
employee_id = 0
employee_data = get_employee_data(employee_id)

In [None]:
# Keep only the selected employee's records whose status is 'completed',
# then show the course name, progress ratio and skills for those records.
completed_courses = employee_data.loc[employee_data['progressStatus'].eq('completed')]
print(completed_courses.loc[:, ['courseName', 'progressPercentage', 'userSkills']])

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder

# Prepare features and target variable.
# X keeps every non-identifier column; the pipeline below decides which of them
# actually reach the classifier, so callers can pass the same raw frame to
# model.predict() without encoding anything themselves.
X = data.drop(['employeeId', 'courseId', 'employeeName', 'courseName', 'progressStatus', 'lastUpdated'], axis=1)
y = (data['progressStatus'] == 'completed').astype(int)  # Binary target: 1 = completed

# NOTE(review): 'modulesCompleted' and 'progressPercentage' presumably describe the
# same progress that 'progressStatus' summarises, so feeding them to the model would
# leak the label -- confirm against the data definition. They are kept out of the
# model inputs here.
leakage_columns = {'modulesCompleted', 'progressPercentage'}

# Low-cardinality text columns are one-hot encoded; the forest cannot consume raw
# strings (fit() fails when converting them to float). Categories not seen during
# training are encoded as all-zeros at prediction time.
categorical_features = [
    column for column in ['designationName', 'difficulty', 'language'] if column in X.columns
]
# Every numeric column (including the encoded skill indicators) is passed through.
numeric_features = [
    column for column in X.select_dtypes(include='number').columns
    if column not in leakage_columns
]

# Remaining columns (free-text skill strings, leakage columns) are dropped.
preprocessor = ColumnTransformer(
    transformers=[
        ('categorical', OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ('numeric', 'passthrough', numeric_features),
    ],
    remainder='drop',
)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model. `model` is a Pipeline (preprocessing + random forest) exposing the
# same fit/predict interface as the bare classifier; the forest is seeded so that a
# Restart & Run All reproduces the same model.
model = make_pipeline(preprocessor, RandomForestClassifier(random_state=42))
model.fit(X_train, y_train)

In [None]:
def recommend_courses(employee_id):
    """Recommend courses the employee has no record for and that the model predicts as completed.

    Args:
        employee_id: Value matched against the ``employeeId`` column of ``data``.

    Returns:
        DataFrame with the columns ``courseName``, ``difficulty``, ``totalTime`` and
        ``courseSkills``, one row per distinct recommended course. Empty when there
        are no candidate rows or none is predicted as completed.

    NOTE(review): candidate rows are other employees' course records, so the
    prediction is made with *their* attributes rather than this employee's --
    confirm this is the intended recommendation logic.
    """
    employee_records = get_employee_data(employee_id)
    # Every course the employee already has a record for, whatever its status
    taken_course_ids = set(employee_records['courseId'])

    # Rows for courses the employee has not taken. copy() makes this an independent
    # frame, so adding the prediction column below does not write into a slice of
    # `data` (which triggers SettingWithCopyWarning).
    candidates = data[~data['courseId'].isin(taken_course_ids)].copy()

    output_columns = ['courseName', 'difficulty', 'totalTime', 'courseSkills']
    if candidates.empty:
        # model.predict() rejects zero-row input; nothing to recommend
        return candidates[output_columns]

    # Prepare features for prediction (same column drop as used for training)
    features = candidates.drop(['employeeId', 'courseId', 'employeeName', 'courseName', 'progressStatus', 'lastUpdated'], axis=1)

    # Predict which courses are likely to be completed
    candidates['predicted_completion'] = model.predict(features)

    # Recommend courses with predicted completion = 1; a course can appear in many
    # records, so identical output rows are collapsed to one.
    recommended_courses = candidates[candidates['predicted_completion'] == 1]
    return recommended_courses[output_columns].drop_duplicates()

# Get recommendations for the given employee
recommended = recommend_courses(employee_id)
print(recommended)
