In [1]:
import numpy as np
import pandas as pd

In [2]:
interest_df = pd.read_csv('interest.csv')
ratings_df = pd.read_csv('ratings.csv')

In [3]:
interest_df.head()

Unnamed: 0,interestId,title,course
0,1,Building,Ordinary Diploma in Civil Engineering|Ordinary...
1,2,Research,Ordinary Diploma in Civil Engineering|Ordinary...
2,3,Health,Ordinary Diploma in Biotechnology|Ordinary Dip...
3,4,Technology,Ordinary Diploma in Computer Engineering|Ordin...
4,5,Computer,Ordinary Diploma in Computer Engineering|Ordin...


In [4]:
interest_df['title'] = interest_df['title'].apply(lambda x: x.strip())

In [5]:
interest_df.head()

Unnamed: 0,interestId,title,course
0,1,Building,Ordinary Diploma in Civil Engineering|Ordinary...
1,2,Research,Ordinary Diploma in Civil Engineering|Ordinary...
2,3,Health,Ordinary Diploma in Biotechnology|Ordinary Dip...
3,4,Technology,Ordinary Diploma in Computer Engineering|Ordin...
4,5,Computer,Ordinary Diploma in Computer Engineering|Ordin...


In [6]:
#Every genre is separated by a | so we simply have to call the split function on |
interest_df['course'] = interest_df.course.str.split('|')
interest_df.head()

Unnamed: 0,interestId,title,course
0,1,Building,"[Ordinary Diploma in Civil Engineering, Ordina..."
1,2,Research,"[Ordinary Diploma in Civil Engineering, Ordina..."
2,3,Health,"[Ordinary Diploma in Biotechnology, Ordinary D..."
3,4,Technology,"[Ordinary Diploma in Computer Engineering, Ord..."
4,5,Computer,"[Ordinary Diploma in Computer Engineering, Ord..."


In [7]:
#Copying the movie dataframe into a new one since we won't need to use the genre information in our first case.
interestWithCourse_df = interest_df.copy()

#For every row in the dataframe, iterate through the list of genres and place a 1 into the corresponding column
for index, row in interest_df.iterrows():
    for course in row['course']:
        interestWithCourse_df.at[index, course] = 1
        
#Filling in the NaN values with 0 to show that a movie doesn't have that column's genre
interestWithCourse_df = interestWithCourse_df.fillna(0)
interestWithCourse_df.head()

Unnamed: 0,interestId,title,course,Ordinary Diploma in Civil Engineering,Ordinary Diploma in Electrical Engineering,Ordinary Diploma in Computer Engineering,Ordinary Diploma in Biotechnology,Ordinary Diploma Renewable Energy Technology,Ordinary Diploma in Food Science and Technology,Ordinary diploma in Biomedical Equipment Engineering,Ordinary Diploma in Information Technology,Ordinary Diploma Multimedia and Film Technology,Ordinary Diploma in Electronics and Telecommunications Engineering,Unnamed: 14,Ordinary Diploma in Mining Engineering,Ordinary Diploma in Mechanical Engineering,Ordinary Diploma in Science and Laboratory Technology
0,1,Building,"[Ordinary Diploma in Civil Engineering, Ordina...",1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,Research,"[Ordinary Diploma in Civil Engineering, Ordina...",1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,Health,"[Ordinary Diploma in Biotechnology, Ordinary D...",0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,Technology,"[Ordinary Diploma in Computer Engineering, Ord...",0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0
4,5,Computer,"[Ordinary Diploma in Computer Engineering, Ord...",0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0


In [8]:
ratings_df.head()

Unnamed: 0,userId,interestId,rating
0,1,1,4.0
1,1,2,4.0
2,1,3,4.0
3,1,4,5.0
4,1,5,5.0


In [8]:
userInput = [
            {'title':'Health', 'rating':5},
            {'title':'Computer', 'rating':3.5},
            {'title':'Laboratory', 'rating':2},
            {'title':"Medicine", 'rating':5},
            {'title':'Programming', 'rating':4.5}
         ] 
inputInterest = pd.DataFrame(userInput)
inputInterest

Unnamed: 0,title,rating
0,Health,5.0
1,Computer,3.5
2,Laboratory,2.0
3,Medicine,5.0
4,Programming,4.5


In [9]:
#Filtering out the movies by title
inputId = interest_df[interest_df['title'].isin(inputInterest['title'].tolist())]

#Then merging it so we can get the movieId. It's implicitly merging it by title.
inputInterest = pd.merge(inputId, inputInterest)

#Dropping information we won't use from the input dataframe
inputInterest = inputInterest.drop('course', 1)

inputInterest

  


Unnamed: 0,interestId,title,rating
0,3,Health,5.0
1,5,Computer,3.5
2,13,Medicine,5.0
3,14,Programming,4.5
4,18,Laboratory,2.0


In [10]:
#Filtering out the movies from the input
userInterest = interestWithCourse_df[interestWithCourse_df['interestId'].isin(inputInterest['interestId'].tolist())]
userInterest

Unnamed: 0,interestId,title,course,Ordinary Diploma in Civil Engineering,Ordinary Diploma in Electrical Engineering,Ordinary Diploma in Computer Engineering,Ordinary Diploma in Biotechnology,Ordinary Diploma Renewable Energy Technology,Ordinary Diploma in Food Science and Technology,Ordinary diploma in Biomedical Equipment Engineering,Ordinary Diploma in Information Technology,Ordinary Diploma Multimedia and Film Technology,Ordinary Diploma in Electronics and Telecommunications Engineering,Unnamed: 14,Ordinary Diploma in Mining Engineering,Ordinary Diploma in Mechanical Engineering,Ordinary Diploma in Science and Laboratory Technology
2,3,Health,"[Ordinary Diploma in Biotechnology, Ordinary D...",0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,Computer,"[Ordinary Diploma in Computer Engineering, Ord...",0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
12,13,Medicine,"[Ordinary Diploma in Biotechnology, Ordinary D...",0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13,14,Programming,"[Ordinary Diploma in Computer Engineering, Ord...",0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
17,18,Laboratory,"[Ordinary Diploma in Biotechnology, Ordinary D...",0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [11]:
#Resetting the index to avoid future issues
userInterest = userInterest.reset_index(drop=True)

#Dropping unnecessary issues due to save memory and to avoid issues
userMovieTable = userInterest.drop('interestId', 1).drop('title', 1).drop('course', 1)
userMovieTable

  """


Unnamed: 0,Ordinary Diploma in Civil Engineering,Ordinary Diploma in Electrical Engineering,Ordinary Diploma in Computer Engineering,Ordinary Diploma in Biotechnology,Ordinary Diploma Renewable Energy Technology,Ordinary Diploma in Food Science and Technology,Ordinary diploma in Biomedical Equipment Engineering,Ordinary Diploma in Information Technology,Ordinary Diploma Multimedia and Film Technology,Ordinary Diploma in Electronics and Telecommunications Engineering,Unnamed: 11,Ordinary Diploma in Mining Engineering,Ordinary Diploma in Mechanical Engineering,Ordinary Diploma in Science and Laboratory Technology
0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [12]:
inputInterest['rating']

0    5.0
1    3.5
2    5.0
3    4.5
4    2.0
Name: rating, dtype: float64

In [13]:
#Dot produt to get weights
userProfile = userMovieTable.transpose().dot(inputInterest['rating'])

#The user profile
userProfile

Ordinary Diploma in Civil Engineering                                  0.0
Ordinary Diploma in Electrical Engineering                             0.0
Ordinary Diploma in Computer Engineering                               8.0
Ordinary Diploma in Biotechnology                                     12.0
Ordinary Diploma Renewable Energy Technology                           0.0
Ordinary Diploma in Food Science and Technology                       12.0
Ordinary diploma in Biomedical Equipment Engineering                   7.0
Ordinary Diploma in Information Technology                             8.0
Ordinary Diploma Multimedia and Film Technology                        0.0
Ordinary Diploma in Electronics and Telecommunications Engineering     3.5
                                                                       0.0
Ordinary Diploma in Mining Engineering                                 0.0
Ordinary Diploma in Mechanical Engineering                             0.0
Ordinary Diploma in Scien

In [14]:
#Now let's get the genres of every movie in our original dataframe
courseTable = interestWithCourse_df.set_index(interestWithCourse_df['interestId'])

#And drop the unnecessary information
courseTable = courseTable.drop('interestId', 1).drop('title', 1).drop('course', 1)
courseTable.head()

  """


Unnamed: 0_level_0,Ordinary Diploma in Civil Engineering,Ordinary Diploma in Electrical Engineering,Ordinary Diploma in Computer Engineering,Ordinary Diploma in Biotechnology,Ordinary Diploma Renewable Energy Technology,Ordinary Diploma in Food Science and Technology,Ordinary diploma in Biomedical Equipment Engineering,Ordinary Diploma in Information Technology,Ordinary Diploma Multimedia and Film Technology,Ordinary Diploma in Electronics and Telecommunications Engineering,Unnamed: 11_level_0,Ordinary Diploma in Mining Engineering,Ordinary Diploma in Mechanical Engineering,Ordinary Diploma in Science and Laboratory Technology
interestId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0
5,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0


In [15]:
courseTable.shape

(18, 14)

In [16]:
#Multiply the genres by the weights and then take the weighted average
recommendationTable_df = ((courseTable*userProfile).sum(axis=1))/(userProfile.sum())
recommendationTable_df.head()

interestId
1    0.152381
2    0.228571
3    0.590476
4    0.828571
5    0.371429
dtype: float64

In [17]:
recommended_courses = interest_df.loc[interest_df['interestId'].isin(recommendationTable_df.head(6).keys())]

In [18]:
recommended_courses

Unnamed: 0,interestId,title,course
0,1,Building,"[Ordinary Diploma in Civil Engineering, Ordina..."
1,2,Research,"[Ordinary Diploma in Civil Engineering, Ordina..."
2,3,Health,"[Ordinary Diploma in Biotechnology, Ordinary D..."
3,4,Technology,"[Ordinary Diploma in Computer Engineering, Ord..."
4,5,Computer,"[Ordinary Diploma in Computer Engineering, Ord..."
5,6,Electricity,"[Ordinary Diploma in Electrical Engineering, O..."


In [19]:
interest_df.loc[interest_df['interestId'].isin(recommendationTable_df.head(6).keys())]

Unnamed: 0,interestId,title,course
0,1,Building,"[Ordinary Diploma in Civil Engineering, Ordina..."
1,2,Research,"[Ordinary Diploma in Civil Engineering, Ordina..."
2,3,Health,"[Ordinary Diploma in Biotechnology, Ordinary D..."
3,4,Technology,"[Ordinary Diploma in Computer Engineering, Ord..."
4,5,Computer,"[Ordinary Diploma in Computer Engineering, Ord..."
5,6,Electricity,"[Ordinary Diploma in Electrical Engineering, O..."


In [39]:
userInput

[{'title': 'Health', 'rating': 5},
 {'title': 'Experiments', 'rating': 3.5},
 {'title': 'Laboratory', 'rating': 2},
 {'title': 'Medicine', 'rating': 5},
 {'title': 'Technology', 'rating': 4.5}]

In [40]:
userInput = [
            {'title':'Computer', 'rating':5},
            {'title':'Energy', 'rating':3.5},
            {'title':'Laboratory', 'rating':2},
            {'title':"Programming", 'rating':5},
            {'title':'Technology', 'rating':4.5}
         ] 
inputInterest = pd.DataFrame(userInput)
inputInterest

Unnamed: 0,title,rating
0,Computer,5.0
1,Energy,3.5
2,Laboratory,2.0
3,Programming,5.0
4,Technology,4.5


In [41]:
interest_df.loc[interest_df['interestId'].isin(recommendationTable_df.head(6).keys())]

Unnamed: 0,interestId,title,course
0,1,Building,"[Ordinary Diploma in Civil Engineering, Ordina..."
1,2,Research,"[Ordinary Diploma in Civil Engineering, Ordina..."
2,3,Health,"[Ordinary Diploma in Biotechnology, Ordinary D..."
3,4,Technology,"[Ordinary Diploma in Computer Engineering, Ord..."
4,5,Computer,"[Ordinary Diploma in Computer Engineering, Ord..."
5,6,Electricity,"[Ordinary Diploma in Electrical Engineering, O..."


In [42]:
#Filtering out the movies by title
inputId = interest_df[interest_df['title'].isin(inputInterest['title'].tolist())]

#Then merging it so we can get the movieId. It's implicitly merging it by title.
inputInterest = pd.merge(inputId, inputInterest)

#Dropping information we won't use from the input dataframe
inputInterest = inputInterest.drop('course', 1)

inputInterest

  


Unnamed: 0,interestId,title,rating
0,4,Technology,4.5
1,5,Computer,5.0
2,8,Energy,3.5
3,14,Programming,5.0
4,18,Laboratory,2.0


In [43]:
#Filtering out the movies from the input
userInterest = interestWithCourse_df[interestWithCourse_df['interestId'].isin(inputInterest['interestId'].tolist())]
userInterest

Unnamed: 0,interestId,title,course,Ordinary Diploma in Civil Engineering,Ordinary Diploma in Electrical Engineering,Ordinary Diploma in Computer Engineering,Ordinary Diploma in Biotechnology,Ordinary Diploma Renewable Energy Technology,Ordinary Diploma in Food Science and Technology,Ordinary diploma in Biomedical Equipment Engineering,Ordinary Diploma in Information Technology,Ordinary Diploma Multimedia and Film Technology,Ordinary Diploma in Electronics and Telecommunications Engineering,Unnamed: 14,Ordinary Diploma in Mining Engineering,Ordinary Diploma in Mechanical Engineering,Ordinary Diploma in Science and Laboratory Technology
3,4,Technology,"[Ordinary Diploma in Computer Engineering, Ord...",0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0
4,5,Computer,"[Ordinary Diploma in Computer Engineering, Ord...",0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
7,8,Energy,"[Ordinary Diploma in Mining Engineering, Ordin...",0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
13,14,Programming,"[Ordinary Diploma in Computer Engineering, Ord...",0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
17,18,Laboratory,"[Ordinary Diploma in Biotechnology, Ordinary D...",0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [44]:
#Resetting the index to avoid future issues
userInterest = userInterest.reset_index(drop=True)

#Dropping unnecessary issues due to save memory and to avoid issues
userMovieTable = userInterest.drop('interestId', 1).drop('title', 1).drop('course', 1)
userMovieTable

  """


Unnamed: 0,Ordinary Diploma in Civil Engineering,Ordinary Diploma in Electrical Engineering,Ordinary Diploma in Computer Engineering,Ordinary Diploma in Biotechnology,Ordinary Diploma Renewable Energy Technology,Ordinary Diploma in Food Science and Technology,Ordinary diploma in Biomedical Equipment Engineering,Ordinary Diploma in Information Technology,Ordinary Diploma Multimedia and Film Technology,Ordinary Diploma in Electronics and Telecommunications Engineering,Unnamed: 11,Ordinary Diploma in Mining Engineering,Ordinary Diploma in Mechanical Engineering,Ordinary Diploma in Science and Laboratory Technology
0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0
1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [45]:
inputInterest['rating']

0    4.5
1    5.0
2    3.5
3    5.0
4    2.0
Name: rating, dtype: float64

In [46]:
#Dot produt to get weights
userProfile = userMovieTable.transpose().dot(inputInterest['rating'])

#The user profile
userProfile

Ordinary Diploma in Civil Engineering                                  0.0
Ordinary Diploma in Electrical Engineering                             0.0
Ordinary Diploma in Computer Engineering                              14.5
Ordinary Diploma in Biotechnology                                      6.5
Ordinary Diploma Renewable Energy Technology                           3.5
Ordinary Diploma in Food Science and Technology                        6.5
Ordinary diploma in Biomedical Equipment Engineering                   2.0
Ordinary Diploma in Information Technology                            14.5
Ordinary Diploma Multimedia and Film Technology                        4.5
Ordinary Diploma in Electronics and Telecommunications Engineering     9.5
                                                                       0.0
Ordinary Diploma in Mining Engineering                                 3.5
Ordinary Diploma in Mechanical Engineering                             0.0
Ordinary Diploma in Scien

In [47]:
#Now let's get the genres of every movie in our original dataframe
courseTable = interestWithCourse_df.set_index(interestWithCourse_df['interestId'])

#And drop the unnecessary information
courseTable = courseTable.drop('interestId', 1).drop('title', 1).drop('course', 1)
courseTable.head()

  """


Unnamed: 0_level_0,Ordinary Diploma in Civil Engineering,Ordinary Diploma in Electrical Engineering,Ordinary Diploma in Computer Engineering,Ordinary Diploma in Biotechnology,Ordinary Diploma Renewable Energy Technology,Ordinary Diploma in Food Science and Technology,Ordinary diploma in Biomedical Equipment Engineering,Ordinary Diploma in Information Technology,Ordinary Diploma Multimedia and Film Technology,Ordinary Diploma in Electronics and Telecommunications Engineering,Unnamed: 11_level_0,Ordinary Diploma in Mining Engineering,Ordinary Diploma in Mechanical Engineering,Ordinary Diploma in Science and Laboratory Technology
interestId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0
5,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0


In [48]:
courseTable.shape

(18, 14)

In [49]:
#Multiply the genres by the weights and then take the weighted average
recommendationTable_df = ((courseTable*userProfile).sum(axis=1))/(userProfile.sum())
recommendationTable_df.head()

interestId
1    0.216418
2    0.149254
3    0.223881
4    0.835821
5    0.574627
dtype: float64

In [50]:
interest_df.loc[interest_df['interestId'].isin(recommendationTable_df.head(6).keys())]

Unnamed: 0,interestId,title,course
0,1,Building,"[Ordinary Diploma in Civil Engineering, Ordina..."
1,2,Research,"[Ordinary Diploma in Civil Engineering, Ordina..."
2,3,Health,"[Ordinary Diploma in Biotechnology, Ordinary D..."
3,4,Technology,"[Ordinary Diploma in Computer Engineering, Ord..."
4,5,Computer,"[Ordinary Diploma in Computer Engineering, Ord..."
5,6,Electricity,"[Ordinary Diploma in Electrical Engineering, O..."


In [20]:
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,2,2,4.0,964981247
2,3,3,4.0,964982224
3,4,4,5.0,964983815
4,5,5,5.0,964982931
