In [8]:
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors

# Step 1: Load the dataset
df = pd.read_csv("moocuserfinal.csv")

# Step 2: Clean course history into lists
df["Previous Learning History (Courses Completed)"] = df["Previous Learning History (Courses Completed)"].fillna("")
df["Previous Learning History (Courses Completed)"] = df["Previous Learning History (Courses Completed)"].apply(lambda x: [i.strip() for i in x.split(",") if i.strip()])

# Step 3: Get all unique courses
all_courses = sorted(set(course for sublist in df["Previous Learning History (Courses Completed)"] for course in sublist))

# Step 4: Create user-course interaction matrix
user_course_matrix = pd.DataFrame(0, index=df["User_ID"].unique(), columns=all_courses)

# Step 5: Fill matrix safely
for i, row in df.iterrows():
    user_id = row["User_ID"]
    for course in row["Previous Learning History (Courses Completed)"]:
        if course in user_course_matrix.columns:
            user_course_matrix.at[user_id, course] = 1

# Step 6: Fit NearestNeighbors model
model = NearestNeighbors(metric='jaccard', algorithm='brute')
model.fit(user_course_matrix)

# Step 7: Choose a target user
target_user_id = df["User_ID"].iloc[0]  # Can change this to test others
target_vector = user_course_matrix.loc[target_user_id].values.reshape(1, -1)

# Step 8: Find 100 nearest neighbors (excluding self)
distances, indices = model.kneighbors(target_vector, n_neighbors=101)
neighbor_indices = indices.flatten()[1:]  # exclude self

# Step 9: Aggregate neighbor course completions
neighbors = user_course_matrix.iloc[neighbor_indices]
neighbor_courses_sum = neighbors.sum(axis=0)

# Step 10: Recommend courses not already taken
user_courses = user_course_matrix.loc[target_user_id]
unseen_courses = neighbor_courses_sum[user_courses == 0]

# Step 11: Get Top 10 Recommendations
recommended_courses = unseen_courses.sort_values(ascending=False).head(10).index.tolist()

# Output
print(f"Top 10 recommended courses for user {target_user_id}:")
for i, course in enumerate(recommended_courses, 1):
    print(f"{i}. {course}")




Top 10 recommended courses for user U00001:
1. Analyzing Big Data with SQL
2. Analyze NPS Survey Data in Google Sheets
3. Excel Basics for Data Analysis
4. Competitive Programmer's Core Skills
5. Understanding Deepfakes with Keras
6. 000 Women
7. Technical Writing
8. Business Application of Machine Learning and Artificial Intelligence in Healthcare
9. Analyzing Video with OpenCV and NumPy
10. Understanding Korean Politics


In [7]:
import pandas as pd

# Read the CSV file
df = pd.read_csv('moocuser.csv')

# Keep only the first 10,000 rows
df = df.head(10000)

# Save the updated dataframe back to a new CSV file
df.to_csv('moocuserfinal.csv', index=False)
