In [2]:
import numpy as np
import pandas as pd

# loading data

In [3]:
interest_df = pd.read_csv('interest.csv')
ratings_df = pd.read_csv('ratings.csv')

In [4]:
interest_df.head()

Unnamed: 0,interestId,title,course
0,1,Building,Ordinary Diploma in Civil Engineering|Ordinary...
1,2,Research,Ordinary Diploma in Civil Engineering|Ordinary...
2,3,Health,Ordinary Diploma in Biotechnology|Ordinary Dip...
3,4,Technology,Ordinary Diploma in Computer Engineering|Ordin...
4,5,Computer,Ordinary Diploma in Computer Engineering|Ordin...


In [5]:
interest_df['title']

0          Building
1          Research
2            Health
3        Technology
4          Computer
5       Electricity
6        Networking
7            Energy
8     Communication
9        Automation
10          Geology
11             Film
12         Medicine
13      Programming
14            Media
15      Experiments
16      Maintenance
17       Laboratory
Name: title, dtype: object

In [6]:
interest_df['title'] = interest_df['title'].apply(lambda x: x.strip())

In [7]:
interest_df.head()

Unnamed: 0,interestId,title,course
0,1,Building,Ordinary Diploma in Civil Engineering|Ordinary...
1,2,Research,Ordinary Diploma in Civil Engineering|Ordinary...
2,3,Health,Ordinary Diploma in Biotechnology|Ordinary Dip...
3,4,Technology,Ordinary Diploma in Computer Engineering|Ordin...
4,5,Computer,Ordinary Diploma in Computer Engineering|Ordin...


In [8]:
ratings_df.head()

Unnamed: 0,userId,interestId,rating,timestamp
0,1,1,4.0,964982703
1,2,2,4.0,964981247
2,3,3,4.0,964982224
3,4,4,5.0,964983815
4,5,5,5.0,964982931


In [9]:
ratings_df = ratings_df.drop('timestamp', 1)
ratings_df.head()

  """Entry point for launching an IPython kernel.


Unnamed: 0,userId,interestId,rating
0,1,1,4.0
1,2,2,4.0
2,3,3,4.0
3,4,4,5.0
4,5,5,5.0


# Let's begin by creating an input user to recommend course to:

In [10]:
userInput = [
            {'title':'Computer', 'rating':5},
            {'title':'Health', 'rating':3.5},
            {'title':'Medicine', 'rating':2},
            {'title':'Programming', 'rating':5},
            {'title':'Technology', 'rating':4.5}
         ] 
inputInterest = pd.DataFrame(userInput)
inputInterest

Unnamed: 0,title,rating
0,Computer,5.0
1,Health,3.5
2,Medicine,2.0
3,Programming,5.0
4,Technology,4.5


In [11]:
#Filtering out the interest by title
inputId = interest_df[interest_df['title'].isin(inputInterest['title'].tolist())]

#Then merging it so we can get the interestId. It's implicitly merging it by title.
inputInterest = pd.merge(inputId, inputInterest)

#Dropping information we won't use from the input dataframe
inputInterest = inputInterest.drop('course', 1)

inputInterest

  


Unnamed: 0,interestId,title,rating
0,3,Health,3.5
1,4,Technology,4.5
2,5,Computer,5.0
3,13,Medicine,2.0
4,14,Programming,5.0


# Users who have selected the same interest

In [12]:
#Filtering out users that have watched movies that the input has watched and storing it
userSubset = ratings_df[ratings_df['interestId'].isin(inputInterest['interestId'].tolist())]
userSubset.head()

Unnamed: 0,userId,interestId,rating
2,3,3,4.0
3,4,4,5.0
4,5,5,5.0
11,12,13,5.0
12,13,14,3.0


In [13]:
#Groupby creates several sub dataframes where they all have the same value in the column specified as the parameter
userSubsetGroup = userSubset.groupby(['userId'])

In [14]:
userSubsetGroup

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7f2efbed7e50>

In [15]:
userSubsetGroup.get_group(13)

Unnamed: 0,userId,interestId,rating
12,13,14,3.0


In [16]:
#Groupby creates several sub dataframes where they all have the same value in the column specified as the parameter
userSubsetGroup = userSubset.groupby(['interestId'])

In [17]:
userSubsetGroup.get_group(13)

Unnamed: 0,userId,interestId,rating
11,12,13,5.0
24,25,13,4.0
38,39,13,5.0
51,52,13,5.0
64,65,13,5.0
...,...,...,...
37374,37375,13,3.0
37387,37388,13,4.0
37400,37401,13,4.0
37413,37414,13,4.0


In [18]:
len(userSubsetGroup.get_group(13))

2879

In [19]:
userSubsetGroup.get_group(13)

Unnamed: 0,userId,interestId,rating
11,12,13,5.0
24,25,13,4.0
38,39,13,5.0
51,52,13,5.0
64,65,13,5.0
...,...,...,...
37374,37375,13,3.0
37387,37388,13,4.0
37400,37401,13,4.0
37413,37414,13,4.0


In [20]:
#Groupby creates several sub dataframes where they all have the same value in the column specified as the parameter
userSubsetGroup = userSubset.groupby(['interestId'])

In [21]:
userSubsetGroup.get_group(14)

Unnamed: 0,userId,interestId,rating
12,13,14,3.0
25,26,14,5.0
39,40,14,3.0
52,53,14,5.0
65,66,14,5.0
...,...,...,...
37375,37376,14,3.0
37388,37389,14,4.0
37401,37402,14,5.0
37414,37415,14,3.5


In [22]:
len(userSubsetGroup.get_group(14))

2879

In [23]:
#Sorting it so users with interest most in common with the input will have priority
userSubsetGroup = sorted(userSubsetGroup,  key=lambda x: len(x[1]), reverse=True)

In [24]:
#Top most user with id 75 having all 5 similar interest
userSubsetGroup[0]

(3,
        userId  interestId  rating
 2           3           3     4.0
 15         16           3     5.0
 29         30           3     4.0
 42         43           3     3.0
 55         56           3     5.0
 ...       ...         ...     ...
 37378   37379           3     3.5
 37391   37392           3     3.0
 37404   37405           3     4.0
 37417   37418           3     4.5
 37430   37431           3     4.5
 
 [2880 rows x 3 columns])

In [25]:
#name of top user group
userSubsetGroup[0][0]

3

In [26]:
#dataframe of top user group
userSubsetGroup[0][1]

Unnamed: 0,userId,interestId,rating
2,3,3,4.0
15,16,3,5.0
29,30,3,4.0
42,43,3,3.0
55,56,3,5.0
...,...,...,...
37378,37379,3,3.5
37391,37392,3,3.0
37404,37405,3,4.0
37417,37418,3,4.5


In [27]:
userSubsetGroup = userSubsetGroup[0:100]

In [28]:
#Store the Pearson Correlation in a dictionary, where the key is the user Id and the value is the coefficient
pearsonCorrelationDict = {}

#For every user group in our subset
for name, group in userSubsetGroup:
    
    #Let's start by sorting the input and current user group so the values aren't mixed up later on
    group = group.sort_values(by='interestId')
    inputInterest = inputInterest.sort_values(by='interestId')
    
    #Get the N (total similar movies watched) for the formula 
    nRatings = len(group)
    
    #Get the review scores for the movies that they both have in common
    temp_df = inputInterest[inputInterest['interestId'].isin(group['interestId'].tolist())]
    
    ###For Debugging Purpose
    #if nRatings<5:
    #    print(inputMovies['movieId'].isin(group['movieId'].tolist()))
    #    break
    #else:
    #    continue
    
    #And then store them in a temporary buffer variable in a list format to facilitate future calculations
    tempRatingList = temp_df['rating'].tolist()
    
    #Let's also put the current user group reviews in a list format
    tempGroupList = group['rating'].tolist()
    
    #Now let's calculate the pearson correlation between two users, so called, x and y

    #For package based
    #scipy.stats import pearsonr
    #pearsonr(tempRatingList,tempGroupList)[0]

    #For hard code based
    Sxx = sum([i**2 for i in tempRatingList]) - pow(sum(tempRatingList),2)/float(nRatings)
    Syy = sum([i**2 for i in tempGroupList]) - pow(sum(tempGroupList),2)/float(nRatings)
    Sxy = sum( i*j for i, j in zip(tempRatingList, tempGroupList)) - sum(tempRatingList)*sum(tempGroupList)/float(nRatings)
    
    #If the denominator is different than zero, then divide, else, 0 correlation.
    if Sxx != 0 and Syy != 0:
        pearsonCorrelationDict[name] = Sxy/np.sqrt(Sxx*Syy)
    else:
        pearsonCorrelationDict[name] = 0

In [29]:
pearsonCorrelationDict.items()

dict_items([(3, 0.007416970157945006), (4, 0.02569109556964057), (5, 0.025820400435073523), (13, 0.025923369641531725), (14, -0.010604082776185573)])

In [30]:
pearsonDF = pd.DataFrame.from_dict(pearsonCorrelationDict, orient='index')
pearsonDF.head()

Unnamed: 0,0
3,0.007417
4,0.025691
5,0.02582
13,0.025923
14,-0.010604


In [31]:
pearsonDF.columns = ['similarityIndex']
pearsonDF['userId'] = pearsonDF.index
pearsonDF.index = range(len(pearsonDF))
pearsonDF.head()

Unnamed: 0,similarityIndex,userId
0,0.007417,3
1,0.025691,4
2,0.02582,5
3,0.025923,13
4,-0.010604,14


In [32]:
topUsers=pearsonDF.sort_values(by='similarityIndex', ascending=False)[0:50]
topUsers.head()

Unnamed: 0,similarityIndex,userId
3,0.025923,13
2,0.02582,5
1,0.025691,4
0,0.007417,3
4,-0.010604,14


In [33]:
topUsersRating = topUsers.merge(ratings_df, left_on='userId', right_on='userId', how='inner')
topUsersRating.head()

Unnamed: 0,similarityIndex,userId,interestId,rating
0,0.025923,13,14,3.0
1,0.02582,5,5,5.0
2,0.025691,4,4,5.0
3,0.007417,3,3,4.0
4,-0.010604,14,1,5.0


In [34]:
#Multiplies the similarity by the user's ratings
topUsersRating['weightedRating'] = topUsersRating['similarityIndex']*topUsersRating['rating']
topUsersRating.head()

Unnamed: 0,similarityIndex,userId,interestId,rating,weightedRating
0,0.025923,13,14,3.0,0.07777
1,0.02582,5,5,5.0,0.129102
2,0.025691,4,4,5.0,0.128455
3,0.007417,3,3,4.0,0.029668
4,-0.010604,14,1,5.0,-0.05302


In [35]:
#Applies a sum to the topUsers after grouping it up by userId
tempTopUsersRating = topUsersRating.groupby('interestId').sum()[['similarityIndex','weightedRating']]
tempTopUsersRating.columns = ['sum_similarityIndex','sum_weightedRating']
tempTopUsersRating.head()

Unnamed: 0_level_0,sum_similarityIndex,sum_weightedRating
interestId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,-0.010604,-0.05302
3,0.007417,0.029668
4,0.025691,0.128455
5,0.02582,0.129102
14,0.025923,0.07777


In [36]:
#Creates an empty dataframe
recommendation_df = pd.DataFrame()
#Now we take the weighted average
recommendation_df['weighted average recommendation score'] = tempTopUsersRating['sum_weightedRating']/tempTopUsersRating['sum_similarityIndex']
recommendation_df['interestId'] = tempTopUsersRating.index
recommendation_df.head()

Unnamed: 0_level_0,weighted average recommendation score,interestId
interestId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,5.0,1
3,4.0,3
4,5.0,4
5,5.0,5
14,3.0,14


In [37]:
recommendation_df = recommendation_df.sort_values(by='weighted average recommendation score', ascending=False)
recommendation_df.head()

Unnamed: 0_level_0,weighted average recommendation score,interestId
interestId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,5.0,1
4,5.0,4
5,5.0,5
3,4.0,3
14,3.0,14


In [38]:
interest_df.loc[interest_df['interestId'].isin(recommendation_df.head(20)['interestId'].tolist())]

Unnamed: 0,interestId,title,course
0,1,Building,Ordinary Diploma in Civil Engineering|Ordinary...
2,3,Health,Ordinary Diploma in Biotechnology|Ordinary Dip...
3,4,Technology,Ordinary Diploma in Computer Engineering|Ordin...
4,5,Computer,Ordinary Diploma in Computer Engineering|Ordin...
13,14,Programming,Ordinary Diploma in Computer Engineering|Ordin...


In [39]:
rec = interest_df.loc[interest_df['interestId'].isin(recommendation_df.head(6)['interestId'].tolist())]

In [40]:
rec.head()

Unnamed: 0,interestId,title,course
0,1,Building,Ordinary Diploma in Civil Engineering|Ordinary...
2,3,Health,Ordinary Diploma in Biotechnology|Ordinary Dip...
3,4,Technology,Ordinary Diploma in Computer Engineering|Ordin...
4,5,Computer,Ordinary Diploma in Computer Engineering|Ordin...
13,14,Programming,Ordinary Diploma in Computer Engineering|Ordin...


In [41]:
userInput

[{'title': 'Computer', 'rating': 5},
 {'title': 'Health', 'rating': 3.5},
 {'title': 'Medicine', 'rating': 2},
 {'title': 'Programming', 'rating': 5},
 {'title': 'Technology', 'rating': 4.5}]

In [42]:
rec = interest_df.loc[interest_df['interestId'].isin(recommendation_df.head(3)['interestId'].tolist())]

In [43]:
rec.head()

Unnamed: 0,interestId,title,course
0,1,Building,Ordinary Diploma in Civil Engineering|Ordinary...
2,3,Health,Ordinary Diploma in Biotechnology|Ordinary Dip...
3,4,Technology,Ordinary Diploma in Computer Engineering|Ordin...
4,5,Computer,Ordinary Diploma in Computer Engineering|Ordin...
13,14,Programming,Ordinary Diploma in Computer Engineering|Ordin...


In [44]:
rec = interest_df.loc[interest_df['interestId'].isin(recommendation_df.head(3)['interestId'].tolist())]

In [45]:
rec.head()

Unnamed: 0,interestId,title,course
0,1,Building,Ordinary Diploma in Civil Engineering|Ordinary...
3,4,Technology,Ordinary Diploma in Computer Engineering|Ordin...
4,5,Computer,Ordinary Diploma in Computer Engineering|Ordin...


In [46]:
userInput

[{'title': 'Computer', 'rating': 5},
 {'title': 'Health', 'rating': 3.5},
 {'title': 'Medicine', 'rating': 2},
 {'title': 'Programming', 'rating': 5},
 {'title': 'Technology', 'rating': 4.5}]