In [4]:
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors

# Step 1: Load the dataset
df = pd.read_csv("moocuser.csv")

# Step 2: Preprocess the "Previous Learning History (Courses Completed)" column
df["Previous Learning History (Courses Completed)"] = df["Previous Learning History (Courses Completed)"].fillna("")
df["Previous Learning History (Courses Completed)"] = df["Previous Learning History (Courses Completed)"].apply(lambda x: [i.strip() for i in x.split(",") if i.strip()])

# Step 3: Create a user-course interaction matrix (binary: 1 if completed, 0 otherwise)
all_courses = sorted(set(course for sublist in df["Previous Learning History (Courses Completed)"] for course in sublist))
user_course_matrix = pd.DataFrame(0, index=df["User_ID"], columns=all_courses)

for i, row in df.iterrows():
    user_course_matrix.loc[row["User_ID"], row["Previous Learning History (Courses Completed)"]] = 1

# Step 4: Use Nearest Neighbors (Jaccard distance)
model = NearestNeighbors(metric='jaccard', algorithm='brute')
model.fit(user_course_matrix)

# Step 5: Choose a target user
target_user_id = df["User_ID"].iloc[0]  # You can change this to any other User_ID
target_vector = user_course_matrix.loc[target_user_id].values.reshape(1, -1)

# Step 6: Find 100 nearest neighbors (excluding self)
distances, indices = model.kneighbors(target_vector, n_neighbors=101)
neighbor_indices = indices.flatten()[1:]  # Exclude the target user

# Step 7: Get courses from neighbors
neighbors = user_course_matrix.iloc[neighbor_indices]
neighbor_courses_sum = neighbors.sum(axis=0)

# Step 8: Exclude courses already taken by the user
user_courses = user_course_matrix.loc[target_user_id]
unseen_courses = neighbor_courses_sum[user_courses == 0]

# Step 9: Recommend top 10 unseen courses
recommended_courses = unseen_courses.sort_values(ascending=False).head(10).index.tolist()

print(f"Top 10 recommended courses for user {target_user_id}:")
for idx, course in enumerate(recommended_courses, 1):
    print(f"{idx}. {course}")


ValueError: cannot reindex on an axis with duplicate labels

In [8]:
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors

# Step 1: Load the dataset
df = pd.read_csv("moocuserfinal.csv")

# Step 2: Clean course history into lists
df["Previous Learning History (Courses Completed)"] = df["Previous Learning History (Courses Completed)"].fillna("")
df["Previous Learning History (Courses Completed)"] = df["Previous Learning History (Courses Completed)"].apply(lambda x: [i.strip() for i in x.split(",") if i.strip()])

# Step 3: Get all unique courses
all_courses = sorted(set(course for sublist in df["Previous Learning History (Courses Completed)"] for course in sublist))

# Step 4: Create user-course interaction matrix
user_course_matrix = pd.DataFrame(0, index=df["User_ID"].unique(), columns=all_courses)

# Step 5: Fill matrix safely
for i, row in df.iterrows():
    user_id = row["User_ID"]
    for course in row["Previous Learning History (Courses Completed)"]:
        if course in user_course_matrix.columns:
            user_course_matrix.at[user_id, course] = 1

# Step 6: Fit NearestNeighbors model
model = NearestNeighbors(metric='jaccard', algorithm='brute')
model.fit(user_course_matrix)

# Step 7: Choose a target user
target_user_id = df["User_ID"].iloc[0]  # Can change this to test others
target_vector = user_course_matrix.loc[target_user_id].values.reshape(1, -1)

# Step 8: Find 100 nearest neighbors (excluding self)
distances, indices = model.kneighbors(target_vector, n_neighbors=101)
neighbor_indices = indices.flatten()[1:]  # exclude self

# Step 9: Aggregate neighbor course completions
neighbors = user_course_matrix.iloc[neighbor_indices]
neighbor_courses_sum = neighbors.sum(axis=0)

# Step 10: Recommend courses not already taken
user_courses = user_course_matrix.loc[target_user_id]
unseen_courses = neighbor_courses_sum[user_courses == 0]

# Step 11: Get Top 10 Recommendations
recommended_courses = unseen_courses.sort_values(ascending=False).head(10).index.tolist()

# Output
print(f"Top 10 recommended courses for user {target_user_id}:")
for i, course in enumerate(recommended_courses, 1):
    print(f"{i}. {course}")




Top 10 recommended courses for user U00001:
1. Analyzing Big Data with SQL
2. Analyze NPS Survey Data in Google Sheets
3. Excel Basics for Data Analysis
4. Competitive Programmer's Core Skills
5. Understanding Deepfakes with Keras
6. 000 Women
7. Technical Writing
8. Business Application of Machine Learning and Artificial Intelligence in Healthcare
9. Analyzing Video with OpenCV and NumPy
10. Understanding Korean Politics


In [1]:
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors

# Step 1: Load datasets
df_existing = pd.read_csv("moocuserfinal.csv")  # existing user data
df_new = pd.read_excel("mooc_user_data.ods", engine="odf")  # new user data from .ods

# Step 2: Clean course history into lists
df_existing["Previous Learning History (Courses Completed)"] = df_existing["Previous Learning History (Courses Completed)"].fillna("")
df_existing["Previous Learning History (Courses Completed)"] = df_existing["Previous Learning History (Courses Completed)"].apply(lambda x: [i.strip() for i in x.split(",") if i.strip()])

df_new["Previous Learning History (Courses Completed)"] = df_new["Previous Learning History (Courses Completed)"].fillna("")
df_new["Previous Learning History (Courses Completed)"] = df_new["Previous Learning History (Courses Completed)"].apply(lambda x: [i.strip() for i in x.split(",") if i.strip()])

# Step 3: Get all unique courses from both datasets
all_courses = sorted(set(course for sublist in pd.concat([df_existing["Previous Learning History (Courses Completed)"], df_new["Previous Learning History (Courses Completed)"]]) for course in sublist))

# Step 4: Create user-course interaction matrix for existing users
user_course_matrix = pd.DataFrame(0, index=df_existing["User_ID"].unique(), columns=all_courses)
for i, row in df_existing.iterrows():
    for course in row["Previous Learning History (Courses Completed)"]:
        if course in user_course_matrix.columns:
            user_course_matrix.at[row["User_ID"], course] = 1

# Step 5: Fit NearestNeighbors model
model = NearestNeighbors(metric='jaccard', algorithm='brute')
model.fit(user_course_matrix)

# Step 6: Recommend for each new user
for idx, row in df_new.iterrows():
    new_user_id = row["User_ID"]
    new_user_courses = row["Previous Learning History (Courses Completed)"]

    # Create a binary vector for the new user
    new_user_vector = np.array([1 if course in new_user_courses else 0 for course in all_courses]).reshape(1, -1)

    # Get nearest neighbors from existing users
    distances, indices = model.kneighbors(new_user_vector, n_neighbors=101)
    neighbor_indices = indices.flatten()

    # Aggregate courses from neighbors
    neighbors = user_course_matrix.iloc[neighbor_indices]
    neighbor_courses_sum = neighbors.sum(axis=0)

    # Recommend courses the new user hasn't taken
    new_user_taken = pd.Series(new_user_vector.flatten(), index=all_courses)
    unseen_courses = neighbor_courses_sum[new_user_taken == 0]
    recommended_courses = unseen_courses.sort_values(ascending=False).head(10).index.tolist()

    # Output recommendations
    print(f"\nTop 10 recommended courses for NEW user {new_user_id}:")
    for i, course in enumerate(recommended_courses, 1):
        print(f"{i}. {course}")



KeyError: 'User_ID'

In [4]:

import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors

# Step 1: Load datasets
df_existing = pd.read_csv("moocuserfinal.csv")  # existing user data
df_new = pd.read_excel("mooc_user_data.ods", engine="odf")  # new login user data

# Step 2: Clean course history into lists
df_existing["Previous Learning History (Courses Completed)"] = df_existing["Previous Learning History (Courses Completed)"].fillna("")
df_existing["Previous Learning History (Courses Completed)"] = df_existing["Previous Learning History (Courses Completed)"].apply(lambda x: [i.strip() for i in x.split(",") if i.strip()])

df_new["Previous Learning History (Courses Completed)"] = df_new["Previous Learning History (Courses Completed)"].fillna("")
df_new["Previous Learning History (Courses Completed)"] = df_new["Previous Learning History (Courses Completed)"].apply(lambda x: [i.strip() for i in x.split(",") if i.strip()])

# Step 3: Get all unique courses
all_courses = sorted(set(course for sublist in pd.concat([df_existing["Previous Learning History (Courses Completed)"], df_new["Previous Learning History (Courses Completed)"]]) for course in sublist))

# Step 4: Create user-course matrix for existing users
user_course_matrix = pd.DataFrame(0, index=df_existing["User_ID"].unique(), columns=all_courses)
for i, row in df_existing.iterrows():
    for course in row["Previous Learning History (Courses Completed)"]:
        if course in user_course_matrix.columns:
            user_course_matrix.at[row["User_ID"], course] = 1

# Step 5: Fit NearestNeighbors model
model = NearestNeighbors(metric='jaccard', algorithm='brute')
model.fit(user_course_matrix)

# Step 6: Get target login user ID
login_user_id = input("Enter logged-in User_ID: ").strip()

# Check if user exists in df_new
if login_user_id not in df_new["User ID"].values:
    print(f"User {login_user_id} not found in mooc_user_data.ods.")
else:
    user_row = df_new[df_new["User ID"] == login_user_id].iloc[0]
    new_user_courses = user_row["Previous Learning History (Courses Completed)"]

    # Create binary vector for login user
    new_user_vector = np.array([1 if course in new_user_courses else 0 for course in all_courses]).reshape(1, -1)

    # Get nearest neighbors from existing users
    distances, indices = model.kneighbors(new_user_vector, n_neighbors=101)
    neighbor_indices = indices.flatten()

    neighbors = user_course_matrix.iloc[neighbor_indices]
    neighbor_courses_sum = neighbors.sum(axis=0)

    # Recommend courses not taken by login user
    new_user_taken = pd.Series(new_user_vector.flatten(), index=all_courses)
    unseen_courses = neighbor_courses_sum[new_user_taken == 0]
    recommended_courses = unseen_courses.sort_values(ascending=False).head(10).index.tolist()

    # Output recommendations
    print(f"\nTop 10 recommended courses for user {login_user_id}:")
    for i, course in enumerate(recommended_courses, 1):
        print(f"{i}. {course}")



Enter logged-in User_ID:  buhb



Top 10 recommended courses for user buhb:
1. Machine Learning
2. Applying Data Structures to Manipulate Cleansed UN Data
3. Genomic Data Science and Clustering (Bioinformatics V)
4. and Deep Learning
5. 000 Women
6. Private Equity and Venture Capital
7. Building Conversational Experiences with Dialogflow
8. Positive Psychiatry and Mental Health
9. Introduction to TensorFlow for Artificial Intelligence
10. Java Programming: Arrays


