# Library

In [1]:
import pandas as pd
import numpy as np
from faker import Faker
import random

# Data Generation

In [2]:
# Initialize Faker
fake = Faker()

In [3]:
# Generate synthetic student data
students = []
for i in range(1000):
    student = {
        'student_id': fake.uuid4(),
        'course': fake.random_element(elements=['CS', 'ME', 'EE', 'CE']),
        'year': fake.random_int(min=1, max=4),
        'interests': random.sample(['AI', 'Blockchain', 'Networks', 'ML', 'NLP'], 2),
        'avg_quiz_score': fake.random_int(min=50, max=100)
    }
    students.append(student)
student_df = pd.DataFrame(students)

In [4]:
student_df.head()

Unnamed: 0,student_id,course,year,interests,avg_quiz_score
0,35a31a87-bd59-4d2b-9fe7-1fbf984f9711,ME,2,"[Networks, NLP]",73
1,31af156c-a6e6-4f6b-a656-53849a48cb44,CS,2,"[NLP, AI]",77
2,c7a32dc4-9131-40e0-a105-b2bafe11456f,CS,2,"[Blockchain, ML]",87
3,a7ccf185-bed6-44fb-899d-4b14cd8da5d8,CE,3,"[AI, ML]",68
4,507b57cb-c663-4062-86c1-8539e80b3347,ME,1,"[ML, NLP]",90


In [5]:
student_df.shape

(1000, 5)

In [6]:
# Generate synthetic material data
materials = []
for _ in range(500):
    material = {
        'material_id': fake.uuid4(),
        'subject': fake.random_element(elements=['AI', 'Blockchain', 'Networks', 'ML', 'NLP']),
        'difficulty': fake.random_int(min=1, max=5),
        'popularity': fake.random_int(min=1, max=100),
        'content_length': fake.random_int(min=5, max=60)
    }
    materials.append(material)
material_df = pd.DataFrame(materials)

In [7]:
material_df.head()

Unnamed: 0,material_id,subject,difficulty,popularity,content_length
0,abfd36a0-a427-4052-ada7-368cb961e76f,Networks,2,97,60
1,2954b08a-8ebd-436b-bb63-f673bfc73441,ML,3,30,11
2,b2c5e704-b1d4-4f12-ae7f-e869d0ac4350,Networks,5,25,8
3,ae29e9f7-0f8c-41da-9da6-63e4ab866565,NLP,5,78,5
4,20d41807-f51a-4ca7-9eea-fa090db00503,AI,3,55,43


In [8]:
material_df.shape

(500, 5)

In [9]:
# Generate synthetic engagement data
engagements = []
for i in range(2000):
    engagement = {
        'student_id': random.choice(student_df['student_id']),
        'material_id': random.choice(material_df['material_id']),
        'rating': fake.random_int(min=1, max=5)
    }
    engagements.append(engagement)
engagement_df = pd.DataFrame(engagements)

In [10]:
engagement_df.head()

Unnamed: 0,student_id,material_id,rating
0,b4778631-df1e-4477-b03a-3b242d15d807,dd2cfc20-e3ab-4657-a400-274bfa6aeb79,3
1,f3124958-2bec-45be-b92f-336520a6a935,164b6d54-8496-4f6c-8844-33d736d760f9,2
2,005cabb6-0724-47f2-8a6d-5584a5315d8c,41979f31-939f-4078-881f-dd734dea6e92,5
3,b9be3df2-c4f5-4fa1-ab54-c0040c0bd267,4870d187-ce1b-4a7a-89b3-0ca22f73ccd2,2
4,c0a3fa21-daa9-41da-81d5-8053139548a3,a2b22710-2f93-409e-bede-57bc40f0b053,3


In [11]:
engagement_df.shape

(2000, 3)

In [12]:
student_df.to_csv('students.csv', index=False)
material_df.to_csv('materials.csv', index=False)
engagement_df.to_csv('engagements.csv', index=False)

In [13]:
student_df = pd.read_csv('students.csv')
student_df.head()

Unnamed: 0,student_id,course,year,interests,avg_quiz_score
0,35a31a87-bd59-4d2b-9fe7-1fbf984f9711,ME,2,"['Networks', 'NLP']",73
1,31af156c-a6e6-4f6b-a656-53849a48cb44,CS,2,"['NLP', 'AI']",77
2,c7a32dc4-9131-40e0-a105-b2bafe11456f,CS,2,"['Blockchain', 'ML']",87
3,a7ccf185-bed6-44fb-899d-4b14cd8da5d8,CE,3,"['AI', 'ML']",68
4,507b57cb-c663-4062-86c1-8539e80b3347,ME,1,"['ML', 'NLP']",90


In [14]:
material_df = pd.read_csv('materials.csv')
material_df.head()

Unnamed: 0,material_id,subject,difficulty,popularity,content_length
0,abfd36a0-a427-4052-ada7-368cb961e76f,Networks,2,97,60
1,2954b08a-8ebd-436b-bb63-f673bfc73441,ML,3,30,11
2,b2c5e704-b1d4-4f12-ae7f-e869d0ac4350,Networks,5,25,8
3,ae29e9f7-0f8c-41da-9da6-63e4ab866565,NLP,5,78,5
4,20d41807-f51a-4ca7-9eea-fa090db00503,AI,3,55,43


In [15]:
engagement_df = pd.read_csv('engagements.csv')
engagement_df.head()

Unnamed: 0,student_id,material_id,rating
0,b4778631-df1e-4477-b03a-3b242d15d807,dd2cfc20-e3ab-4657-a400-274bfa6aeb79,3
1,f3124958-2bec-45be-b92f-336520a6a935,164b6d54-8496-4f6c-8844-33d736d760f9,2
2,005cabb6-0724-47f2-8a6d-5584a5315d8c,41979f31-939f-4078-881f-dd734dea6e92,5
3,b9be3df2-c4f5-4fa1-ab54-c0040c0bd267,4870d187-ce1b-4a7a-89b3-0ca22f73ccd2,2
4,c0a3fa21-daa9-41da-81d5-8053139548a3,a2b22710-2f93-409e-bede-57bc40f0b053,3


# Combining the dataset

In [16]:
Dataset = pd.merge(engagement_df,student_df,on='student_id',how='left')

In [17]:
Dataset.head()

Unnamed: 0,student_id,material_id,rating,course,year,interests,avg_quiz_score
0,b4778631-df1e-4477-b03a-3b242d15d807,dd2cfc20-e3ab-4657-a400-274bfa6aeb79,3,CS,3,"['Networks', 'NLP']",54
1,f3124958-2bec-45be-b92f-336520a6a935,164b6d54-8496-4f6c-8844-33d736d760f9,2,EE,1,"['Blockchain', 'AI']",54
2,005cabb6-0724-47f2-8a6d-5584a5315d8c,41979f31-939f-4078-881f-dd734dea6e92,5,EE,3,"['AI', 'Networks']",82
3,b9be3df2-c4f5-4fa1-ab54-c0040c0bd267,4870d187-ce1b-4a7a-89b3-0ca22f73ccd2,2,CS,2,"['AI', 'ML']",65
4,c0a3fa21-daa9-41da-81d5-8053139548a3,a2b22710-2f93-409e-bede-57bc40f0b053,3,CE,4,"['NLP', 'Blockchain']",86


In [18]:
Dataset1 = pd.merge(engagement_df,material_df,on='material_id',how='left')

In [19]:
Dataset1.head()

Unnamed: 0,student_id,material_id,rating,subject,difficulty,popularity,content_length
0,b4778631-df1e-4477-b03a-3b242d15d807,dd2cfc20-e3ab-4657-a400-274bfa6aeb79,3,ML,2,30,16
1,f3124958-2bec-45be-b92f-336520a6a935,164b6d54-8496-4f6c-8844-33d736d760f9,2,Networks,2,36,53
2,005cabb6-0724-47f2-8a6d-5584a5315d8c,41979f31-939f-4078-881f-dd734dea6e92,5,NLP,1,28,57
3,b9be3df2-c4f5-4fa1-ab54-c0040c0bd267,4870d187-ce1b-4a7a-89b3-0ca22f73ccd2,2,AI,4,85,35
4,c0a3fa21-daa9-41da-81d5-8053139548a3,a2b22710-2f93-409e-bede-57bc40f0b053,3,Blockchain,1,25,50


## Saving the dataset

In [20]:
Dataset.to_csv('student_info.csv',index=False)
Dataset1.to_csv('material_info.csv',index=False)

# Reading Datasets

In [21]:
studentdf = pd.read_csv('student_info.csv')
studentdf.head()

Unnamed: 0,student_id,material_id,rating,course,year,interests,avg_quiz_score
0,b4778631-df1e-4477-b03a-3b242d15d807,dd2cfc20-e3ab-4657-a400-274bfa6aeb79,3,CS,3,"['Networks', 'NLP']",54
1,f3124958-2bec-45be-b92f-336520a6a935,164b6d54-8496-4f6c-8844-33d736d760f9,2,EE,1,"['Blockchain', 'AI']",54
2,005cabb6-0724-47f2-8a6d-5584a5315d8c,41979f31-939f-4078-881f-dd734dea6e92,5,EE,3,"['AI', 'Networks']",82
3,b9be3df2-c4f5-4fa1-ab54-c0040c0bd267,4870d187-ce1b-4a7a-89b3-0ca22f73ccd2,2,CS,2,"['AI', 'ML']",65
4,c0a3fa21-daa9-41da-81d5-8053139548a3,a2b22710-2f93-409e-bede-57bc40f0b053,3,CE,4,"['NLP', 'Blockchain']",86


In [22]:
materialdf = pd.read_csv('material_info.csv')
materialdf.head()

Unnamed: 0,student_id,material_id,rating,subject,difficulty,popularity,content_length
0,b4778631-df1e-4477-b03a-3b242d15d807,dd2cfc20-e3ab-4657-a400-274bfa6aeb79,3,ML,2,30,16
1,f3124958-2bec-45be-b92f-336520a6a935,164b6d54-8496-4f6c-8844-33d736d760f9,2,Networks,2,36,53
2,005cabb6-0724-47f2-8a6d-5584a5315d8c,41979f31-939f-4078-881f-dd734dea6e92,5,NLP,1,28,57
3,b9be3df2-c4f5-4fa1-ab54-c0040c0bd267,4870d187-ce1b-4a7a-89b3-0ca22f73ccd2,2,AI,4,85,35
4,c0a3fa21-daa9-41da-81d5-8053139548a3,a2b22710-2f93-409e-bede-57bc40f0b053,3,Blockchain,1,25,50


# Data cleaning

## Null values

In [23]:
studentdf.isnull().sum()

student_id        0
material_id       0
rating            0
course            0
year              0
interests         0
avg_quiz_score    0
dtype: int64

In [24]:
materialdf.isnull().sum()

student_id        0
material_id       0
rating            0
subject           0
difficulty        0
popularity        0
content_length    0
dtype: int64

In [25]:
studentdf['interests']

0              ['Networks', 'NLP']
1             ['Blockchain', 'AI']
2               ['AI', 'Networks']
3                     ['AI', 'ML']
4            ['NLP', 'Blockchain']
                   ...            
1995          ['Blockchain', 'AI']
1996    ['Networks', 'Blockchain']
1997                  ['AI', 'ML']
1998         ['NLP', 'Blockchain']
1999                  ['ML', 'AI']
Name: interests, Length: 2000, dtype: object

In [26]:
import ast
studentdf['interests'] = studentdf['interests'].apply(ast.literal_eval)
interests = pd.DataFrame(studentdf['interests'].tolist(), index=studentdf.index, columns=['Interest1','Interest2'])
print(interests)

       Interest1   Interest2
0       Networks         NLP
1     Blockchain          AI
2             AI    Networks
3             AI          ML
4            NLP  Blockchain
...          ...         ...
1995  Blockchain          AI
1996    Networks  Blockchain
1997          AI          ML
1998         NLP  Blockchain
1999          ML          AI

[2000 rows x 2 columns]


In [27]:
studentdf = pd.concat([studentdf,interests],axis=1)

In [28]:
studentdf.head()

Unnamed: 0,student_id,material_id,rating,course,year,interests,avg_quiz_score,Interest1,Interest2
0,b4778631-df1e-4477-b03a-3b242d15d807,dd2cfc20-e3ab-4657-a400-274bfa6aeb79,3,CS,3,"[Networks, NLP]",54,Networks,NLP
1,f3124958-2bec-45be-b92f-336520a6a935,164b6d54-8496-4f6c-8844-33d736d760f9,2,EE,1,"[Blockchain, AI]",54,Blockchain,AI
2,005cabb6-0724-47f2-8a6d-5584a5315d8c,41979f31-939f-4078-881f-dd734dea6e92,5,EE,3,"[AI, Networks]",82,AI,Networks
3,b9be3df2-c4f5-4fa1-ab54-c0040c0bd267,4870d187-ce1b-4a7a-89b3-0ca22f73ccd2,2,CS,2,"[AI, ML]",65,AI,ML
4,c0a3fa21-daa9-41da-81d5-8053139548a3,a2b22710-2f93-409e-bede-57bc40f0b053,3,CE,4,"[NLP, Blockchain]",86,NLP,Blockchain


In [29]:
studentdf.drop(columns='interests',axis=1)

Unnamed: 0,student_id,material_id,rating,course,year,avg_quiz_score,Interest1,Interest2
0,b4778631-df1e-4477-b03a-3b242d15d807,dd2cfc20-e3ab-4657-a400-274bfa6aeb79,3,CS,3,54,Networks,NLP
1,f3124958-2bec-45be-b92f-336520a6a935,164b6d54-8496-4f6c-8844-33d736d760f9,2,EE,1,54,Blockchain,AI
2,005cabb6-0724-47f2-8a6d-5584a5315d8c,41979f31-939f-4078-881f-dd734dea6e92,5,EE,3,82,AI,Networks
3,b9be3df2-c4f5-4fa1-ab54-c0040c0bd267,4870d187-ce1b-4a7a-89b3-0ca22f73ccd2,2,CS,2,65,AI,ML
4,c0a3fa21-daa9-41da-81d5-8053139548a3,a2b22710-2f93-409e-bede-57bc40f0b053,3,CE,4,86,NLP,Blockchain
...,...,...,...,...,...,...,...,...
1995,305df75f-e475-459c-b1b3-42fe6eae734d,6ec36222-2efb-4a47-a91f-b580ad886762,1,EE,4,78,Blockchain,AI
1996,4f20acdd-010c-4ad1-886d-819bfcdf6131,b6fdb657-7a38-4015-93aa-62a1d281659e,4,EE,1,56,Networks,Blockchain
1997,65ace651-3a69-4504-8e73-7c693c2ae62c,8377fecf-2133-4473-8f10-e428c8e5dfd0,3,EE,3,71,AI,ML
1998,3b3403a1-5ec7-49b6-abde-17a1791bfe23,28d94e1b-58b9-42d3-9667-927f0ec7776e,5,CE,1,65,NLP,Blockchain


In [30]:
studentdf.isnull().sum()

student_id        0
material_id       0
rating            0
course            0
year              0
interests         0
avg_quiz_score    0
Interest1         0
Interest2         0
dtype: int64

In [31]:
interest1 = studentdf['Interest1'].str.get_dummies()
interest1

Unnamed: 0,AI,Blockchain,ML,NLP,Networks
0,0,0,0,0,1
1,0,1,0,0,0
2,1,0,0,0,0
3,1,0,0,0,0
4,0,0,0,1,0
...,...,...,...,...,...
1995,0,1,0,0,0
1996,0,0,0,0,1
1997,1,0,0,0,0
1998,0,0,0,1,0


In [32]:
interest2 = studentdf['Interest2'].str.get_dummies()
interest2

Unnamed: 0,AI,Blockchain,ML,NLP,Networks
0,0,0,0,1,0
1,1,0,0,0,0
2,0,0,0,0,1
3,0,0,1,0,0
4,0,1,0,0,0
...,...,...,...,...,...
1995,1,0,0,0,0
1996,0,1,0,0,0
1997,0,0,1,0,0
1998,0,1,0,0,0


In [33]:
print("Interest1 columns:", interest1.columns)
print("Interest2 columns:", interest2.columns)

Interest1 columns: Index(['AI', 'Blockchain', 'ML', 'NLP', 'Networks'], dtype='object')
Interest2 columns: Index(['AI', 'Blockchain', 'ML', 'NLP', 'Networks'], dtype='object')


In [34]:
# Align interest1 to have the same columns as interest2
interest1 = interest1.reindex(columns=interest2.columns, fill_value=0)
# Replace 0 in interest1 with 1 where interest2 has 1
interest1[interest2 == 1] = 1

In [35]:
interest1

Unnamed: 0,AI,Blockchain,ML,NLP,Networks
0,0,0,0,1,1
1,1,1,0,0,0
2,1,0,0,0,1
3,1,0,1,0,0
4,0,1,0,1,0
...,...,...,...,...,...
1995,1,1,0,0,0
1996,0,1,0,0,1
1997,1,0,1,0,0
1998,0,1,0,1,0


In [36]:
for i in interest1:
    studentdf[i] = interest1[i]

In [37]:
studentdf.head()

Unnamed: 0,student_id,material_id,rating,course,year,interests,avg_quiz_score,Interest1,Interest2,AI,Blockchain,ML,NLP,Networks
0,b4778631-df1e-4477-b03a-3b242d15d807,dd2cfc20-e3ab-4657-a400-274bfa6aeb79,3,CS,3,"[Networks, NLP]",54,Networks,NLP,0,0,0,1,1
1,f3124958-2bec-45be-b92f-336520a6a935,164b6d54-8496-4f6c-8844-33d736d760f9,2,EE,1,"[Blockchain, AI]",54,Blockchain,AI,1,1,0,0,0
2,005cabb6-0724-47f2-8a6d-5584a5315d8c,41979f31-939f-4078-881f-dd734dea6e92,5,EE,3,"[AI, Networks]",82,AI,Networks,1,0,0,0,1
3,b9be3df2-c4f5-4fa1-ab54-c0040c0bd267,4870d187-ce1b-4a7a-89b3-0ca22f73ccd2,2,CS,2,"[AI, ML]",65,AI,ML,1,0,1,0,0
4,c0a3fa21-daa9-41da-81d5-8053139548a3,a2b22710-2f93-409e-bede-57bc40f0b053,3,CE,4,"[NLP, Blockchain]",86,NLP,Blockchain,0,1,0,1,0


In [38]:
studentdf.drop(columns=['interests','Interest1','Interest2'],axis =1 ,inplace=True)

In [39]:
studentdf.head()

Unnamed: 0,student_id,material_id,rating,course,year,avg_quiz_score,AI,Blockchain,ML,NLP,Networks
0,b4778631-df1e-4477-b03a-3b242d15d807,dd2cfc20-e3ab-4657-a400-274bfa6aeb79,3,CS,3,54,0,0,0,1,1
1,f3124958-2bec-45be-b92f-336520a6a935,164b6d54-8496-4f6c-8844-33d736d760f9,2,EE,1,54,1,1,0,0,0
2,005cabb6-0724-47f2-8a6d-5584a5315d8c,41979f31-939f-4078-881f-dd734dea6e92,5,EE,3,82,1,0,0,0,1
3,b9be3df2-c4f5-4fa1-ab54-c0040c0bd267,4870d187-ce1b-4a7a-89b3-0ca22f73ccd2,2,CS,2,65,1,0,1,0,0
4,c0a3fa21-daa9-41da-81d5-8053139548a3,a2b22710-2f93-409e-bede-57bc40f0b053,3,CE,4,86,0,1,0,1,0


In [40]:
material_df.head()

Unnamed: 0,material_id,subject,difficulty,popularity,content_length
0,abfd36a0-a427-4052-ada7-368cb961e76f,Networks,2,97,60
1,2954b08a-8ebd-436b-bb63-f673bfc73441,ML,3,30,11
2,b2c5e704-b1d4-4f12-ae7f-e869d0ac4350,Networks,5,25,8
3,ae29e9f7-0f8c-41da-9da6-63e4ab866565,NLP,5,78,5
4,20d41807-f51a-4ca7-9eea-fa090db00503,AI,3,55,43


In [41]:
merged_data = pd.merge(studentdf, materialdf, on='material_id')

In [42]:
merged_data.head()

Unnamed: 0,student_id_x,material_id,rating_x,course,year,avg_quiz_score,AI,Blockchain,ML,NLP,Networks,student_id_y,rating_y,subject,difficulty,popularity,content_length
0,b4778631-df1e-4477-b03a-3b242d15d807,dd2cfc20-e3ab-4657-a400-274bfa6aeb79,3,CS,3,54,0,0,0,1,1,b4778631-df1e-4477-b03a-3b242d15d807,3,ML,2,30,16
1,b4778631-df1e-4477-b03a-3b242d15d807,dd2cfc20-e3ab-4657-a400-274bfa6aeb79,3,CS,3,54,0,0,0,1,1,746fad1c-ac7f-4662-be08-cf1a072a7baf,3,ML,2,30,16
2,b4778631-df1e-4477-b03a-3b242d15d807,dd2cfc20-e3ab-4657-a400-274bfa6aeb79,3,CS,3,54,0,0,0,1,1,5c6a608d-dc41-4833-a670-b01a0b6ae808,5,ML,2,30,16
3,b4778631-df1e-4477-b03a-3b242d15d807,dd2cfc20-e3ab-4657-a400-274bfa6aeb79,3,CS,3,54,0,0,0,1,1,0fbb05d0-f3f3-4a0e-889d-125cd7e238a0,1,ML,2,30,16
4,b4778631-df1e-4477-b03a-3b242d15d807,dd2cfc20-e3ab-4657-a400-274bfa6aeb79,3,CS,3,54,0,0,0,1,1,8bd04ff9-3ec6-4e4b-8e2b-290c1b1fbedc,4,ML,2,30,16


In [43]:
merged_data.drop(columns=['student_id_y','rating_y'],inplace=True)

In [44]:
merged_data.head()

Unnamed: 0,student_id_x,material_id,rating_x,course,year,avg_quiz_score,AI,Blockchain,ML,NLP,Networks,subject,difficulty,popularity,content_length
0,b4778631-df1e-4477-b03a-3b242d15d807,dd2cfc20-e3ab-4657-a400-274bfa6aeb79,3,CS,3,54,0,0,0,1,1,ML,2,30,16
1,b4778631-df1e-4477-b03a-3b242d15d807,dd2cfc20-e3ab-4657-a400-274bfa6aeb79,3,CS,3,54,0,0,0,1,1,ML,2,30,16
2,b4778631-df1e-4477-b03a-3b242d15d807,dd2cfc20-e3ab-4657-a400-274bfa6aeb79,3,CS,3,54,0,0,0,1,1,ML,2,30,16
3,b4778631-df1e-4477-b03a-3b242d15d807,dd2cfc20-e3ab-4657-a400-274bfa6aeb79,3,CS,3,54,0,0,0,1,1,ML,2,30,16
4,b4778631-df1e-4477-b03a-3b242d15d807,dd2cfc20-e3ab-4657-a400-274bfa6aeb79,3,CS,3,54,0,0,0,1,1,ML,2,30,16


In [45]:
merged_data.rename(columns={'student_id_x':'student_id','rating_x':'rating'},inplace=True)

In [46]:
merged_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9952 entries, 0 to 9951
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   student_id      9952 non-null   object
 1   material_id     9952 non-null   object
 2   rating          9952 non-null   int64 
 3   course          9952 non-null   object
 4   year            9952 non-null   int64 
 5   avg_quiz_score  9952 non-null   int64 
 6   AI              9952 non-null   int64 
 7   Blockchain      9952 non-null   int64 
 8   ML              9952 non-null   int64 
 9   NLP             9952 non-null   int64 
 10  Networks        9952 non-null   int64 
 11  subject         9952 non-null   object
 12  difficulty      9952 non-null   int64 
 13  popularity      9952 non-null   int64 
 14  content_length  9952 non-null   int64 
dtypes: int64(11), object(4)
memory usage: 1.1+ MB


# Recomendation Build

In [47]:
from sklearn.model_selection import train_test_split
from sklearn.decomposition import NMF

In [48]:
user_item_matrix = merged_data.pivot_table(index='student_id', columns='material_id', values='rating', fill_value=0)

In [49]:
user_item_matrix

material_id,004ad398-5d79-46f4-a10b-13dbfa3b5f9a,00f86a9c-11b8-4c9d-b253-108afded7768,01d5a701-91ce-4818-931b-491adad535da,0227e34b-343e-419d-9f52-f4e1540c6f36,027cbfb6-e50c-4154-9662-bf60185d0bd5,03ac55a7-5f5b-473e-b278-bd46ba09bc59,03d13f30-941d-4aeb-b160-382e57b01137,03f03478-f352-4620-bda7-47cc9c0ffbf5,04f3a3ea-1d89-4bdb-a277-9efac6f5fa0f,0506bc4b-9b2e-4dfc-8252-5cde50768493,...,fbc6eb60-8db1-4696-9e33-090fa4a277a7,fbd177e8-8b77-4d0b-94a6-2852e8bca8b5,fbe559f7-c9db-48ba-acdd-addd855a8e84,fc14987c-1ac0-4f71-9567-2986c0a43d88,fc9cb500-deb9-4622-8bcf-a35d2700273b,fd53ce06-183e-47b1-853f-e249c23f13e4,fe7b7fbd-e49a-4891-8e30-95e9fdad03bb,fe9e0419-34ec-402c-a655-284388770152,fef05f68-9338-47ee-ac06-ba2f8d14d9c3,ff67d01d-f940-421b-84c5-3c4a84aee080
student_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
003d0bca-5ea6-495b-b522-fd0f8622593c,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
005cabb6-0724-47f2-8a6d-5584a5315d8c,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0060ecad-c890-4035-b628-c818b32d0a15,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
00bb38a6-2c72-49a3-8921-6c83cdb48915,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0118b9fe-0c27-4357-8fb3-c46bcf3bf449,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ff0f2bc8-3727-480d-b69a-6921f2f9e1e6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ff75f27b-a94b-4875-b2c2-ba2aeb137d5b,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ff78336b-ae36-470a-8612-23dad354c728,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ffa30bd2-83ed-48a4-ac83-3c5f0d3d1634,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [50]:
train_data, test_data = train_test_split(user_item_matrix, test_size=0.2, random_state=42)

In [51]:
nmf = NMF(n_components=20, init='random', random_state=0)
user_features = nmf.fit_transform(train_data)
item_features = nmf.components_

In [52]:
user_features

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.00557836, 0.00931308, ..., 0.00662616, 0.        ,
        0.00657513],
       [0.        , 0.        , 0.19290187, ..., 0.        , 0.        ,
        0.00241168],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.00409934,
        0.00149833],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.01790288],
       [0.47648757, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [53]:
item_features

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.43326427, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.06631461, 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.20462409, 0.        ,
        0.        ]])

In [54]:
predicted_ratings = np.dot(user_features, item_features)

In [55]:
predicted_ratings

array([[0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 3.96168173e-04, 2.81365831e-03],
       [0.00000000e+00, 2.43038582e-03, 0.00000000e+00, ...,
        9.81170023e-03, 1.15619510e-04, 0.00000000e+00],
       [1.51412615e-03, 6.98753493e-04, 9.11160970e-04, ...,
        6.46839650e-04, 2.89792508e-03, 1.70314364e-03],
       ...,
       [1.24986769e-04, 0.00000000e+00, 7.42039569e-06, ...,
        5.78441183e-04, 9.66390608e-05, 1.27845530e-05],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        3.66335951e-03, 0.00000000e+00, 6.80922840e-02],
       [1.33262920e-03, 0.00000000e+00, 5.75583528e-02, ...,
        3.80405054e-06, 0.00000000e+00, 0.00000000e+00]])

In [56]:
train_data_array = train_data.to_numpy()
predicted_ratings_array = predicted_ratings

In [57]:
print(train_data_array)
print("'------------------")
print(predicted_ratings_array)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
'------------------
[[0.00000000e+00 0.00000000e+00 0.00000000e+00 ... 0.00000000e+00
  3.96168173e-04 2.81365831e-03]
 [0.00000000e+00 2.43038582e-03 0.00000000e+00 ... 9.81170023e-03
  1.15619510e-04 0.00000000e+00]
 [1.51412615e-03 6.98753493e-04 9.11160970e-04 ... 6.46839650e-04
  2.89792508e-03 1.70314364e-03]
 ...
 [1.24986769e-04 0.00000000e+00 7.42039569e-06 ... 5.78441183e-04
  9.66390608e-05 1.27845530e-05]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 ... 3.66335951e-03
  0.00000000e+00 6.80922840e-02]
 [1.33262920e-03 0.00000000e+00 5.75583528e-02 ... 3.80405054e-06
  0.00000000e+00 0.00000000e+00]]


In [58]:
# Mask zero entries in the original matrix
from sklearn.metrics import mean_squared_error
mask = train_data_array != 0
mse = mean_squared_error(train_data_array[mask], predicted_ratings_array[mask])
rmse = np.sqrt(mse)
print(f'RMSE: {rmse}')


RMSE: 2.9678975705422617


In [59]:
def recommend_materials(student_id, user_features, item_features, material_df, n=5):
    # Calculate predicted ratings for the student
    student_index = train_data.index.get_loc(student_id)
    predicted_ratings = np.dot(user_features[student_index, :], item_features)
    recommended_material_indices = np.argsort(predicted_ratings)[::-1][:n]
    recommended_material_ids = user_item_matrix.columns[recommended_material_indices]
    recommendations = material_df[material_df['material_id'].isin(recommended_material_ids)]
    
    return recommendations

# Generating recommendations
sample_student_id = user_item_matrix.index[0]
recommendations = recommend_materials(sample_student_id, user_features, item_features, material_df)
print(recommendations)


                              material_id   subject  difficulty  popularity  \
187  38d53ef1-13d6-48d3-8081-0f14bde8921c       NLP           5          55   
297  af0ee7de-16cb-42be-af3c-e143505f65ec  Networks           1          71   
396  41eb82e7-1f5a-479f-b738-bb27aac7dce2        ML           3          84   
434  f81cb996-7500-40f0-993b-0bc6cd536da5       NLP           2          48   
442  4bc8012d-cfb8-42c7-b707-e509c6258619        ML           5          77   

     content_length  
187               8  
297              22  
396              41  
434              39  
442               9  
