# Libraries

In [None]:
!pip install surprise

In [63]:
import pandas as pd 
import numpy as np
from numpy import dot
from numpy.linalg import norm 
import matplotlib.pyplot as plt
import seaborn as sns

from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds

from surprise import Dataset, Reader, accuracy, KNNWithMeans
from surprise.model_selection import GridSearchCV, train_test_split

import random
from random import randint

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth', 1000)

%matplotlib inline

# Data Generation

In [2]:
# Generate data required for the Recommender System
def generateData(n_courses=25, n_topics=5, n_professors=20, n_students=5000, size=20000):
  '''
  This function will generate a dataset with features associated to courses. 
  The dataset will have the following columns:
    - course_id (String): Unique identifier for the course
    - course_topic (Integer): An integer value representing the topic for the book, value is between 1 and 5, 
                              indicating that there are 15 unique topics. Each course can only have 1 topic.
    - professor_id (String): Unique identifier for the professor
    - student_id (String): Unique identifier for the student
    - course_rating (Integer): A value between 0 and 5
  
  params:
    n_courses (Integer): The number of courses in the dataset
    n_topics (Integer): The number of course topics to be chosen from
    n_professors (Integer): Number of professors to be generated
    n_students (Integer): Number of students for the dataset
    size (Integer): The number of rows in the dataset
  
  example:
    data = generateData()
  '''

  d = pd.DataFrame(
      {
          'course_id': [randint(1, n_courses) for _ in range(size)], 
          'course_topic': [randint(1, n_topics) for _ in range(size)],
          'professor_id': [randint(1, n_professors) for _ in range(size)],
          'student_id': [randint(1, n_students) for _ in range(size)],
          'course_ratings': [randint(1, 5) for _ in range(size)]
      }
  ).drop_duplicates()

  return d

random.seed(123)
raw_data = generateData()
raw_data.head(15)

Unnamed: 0,course_id,course_topic,professor_id,student_id,course_ratings
0,2,2,8,3397,3
1,9,3,14,3080,4
2,3,2,11,4588,2
3,25,5,3,1336,1
4,14,3,5,4345,5
5,9,3,9,4256,4
6,4,3,1,4660,4
7,2,5,9,2572,1
8,13,1,15,619,5
9,18,4,16,1608,4


# Manual Implementation
## Collaborative Filtering

In [3]:
# Create the sparse student by course matrix for course ratings
df = raw_data.pivot_table(columns='course_id', index='student_id', values='course_ratings').fillna(0)
mat = df.values
mat = csr_matrix(mat)

df.head()

course_id,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25
student_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
1,0.0,4.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,3.0,0.0,5.0,0.0,0.0,1.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,4.0,0.0,2.0
3,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,2.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [35]:
# Normalize predictions
def normalizePrediction(pred):
  return (pred - pred.min()) / (pred.max() - pred.min())

# Generate predictions
def prediction(mat, df, n_factors):
  if not 1 <= n_factors < min(mat.shape):
    raise ValueError('Must be 1 <= n_factors < min(mat.shape)')
  
  # Matrix factorization
  u, s, v = svds(mat, k=n_factors)
  s = np.diag(s)

  # Calculate predictions
  pred = np.dot(np.dot(u, s), v)
  pred = normalizePrediction(pred)

  # Convert it back to dataframe
  pred_df = pd.DataFrame(pred, columns = df.columns, index=list(df.index)).transpose()

  return pred_df

pred_df = prediction(mat, df, 5)

In [36]:
def collaborativeRecommender(pred_df, student_id, top_rec):
  student_pred = pred_df[student_id].reset_index().rename({student_id: 'rating'}, axis=1)
  recommendations = student_pred.sort_values(by='rating', ascending=False).head(top_rec).reset_index(drop=True)

  return recommendations

collaborativeRecommender(pred_df, 1301, 3)

Unnamed: 0,course_id,rating
0,6,0.442055
1,25,0.388326
2,2,0.386245


# Recommender System

## Collaborative Filtering

In [57]:
random.seed(123)
raw_data = generateData()

reader = Reader(rating_scale=(1,5))
data = Dataset.load_from_df(raw_data[['student_id', 'course_id', 'course_ratings']], reader)
trainset, testset = train_test_split(data, test_size=0.3, random_state=123)

In [58]:
model = KNNWithMeans(sim_options={'name': 'cosine', 
                                  'user_based': True})
model.fit(trainset)

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x7f75e51e9590>

In [59]:
pred = model.predict(1301, 6)
print('predicted course rating:', pred.est, '\n')
display(raw_data[(raw_data['student_id']==1301) & (raw_data['course_id']==6)])

predicted course rating: 3.8895238095238094 



Unnamed: 0,course_id,course_topic,professor_id,student_id,course_ratings
5989,6,2,11,1301,4


In [60]:
raw_ratings = data.raw_ratings
random.shuffle(raw_ratings)

# train = 80% of the data, holdout = 20% of the data
threshold = int(.8*len(raw_ratings))
train = raw_ratings[:threshold]
holdout = raw_ratings[threshold:]

data.raw_ratings = train 

params = {
    'sim_options': {
        'name': ['msd', 'cosine', 'pearson'], 
        'min_support': [3, 4, 5], 
        'user_based': [True]
        }
    }

cv = GridSearchCV(KNNWithMeans, params, measures=['rmse', 'mae'], cv=5)
cv.fit(data)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computi

In [61]:
print('Best parameters:', cv.best_params['rmse'])
print('Best parameters:', cv.best_params['mae'])

print('Best RMSE Score:', cv.best_score['rmse'])
print('Best MAE Score:', cv.best_score['mae'])

Best parameters: {'sim_options': {'name': 'pearson', 'min_support': 5, 'user_based': True}}
Best parameters: {'sim_options': {'name': 'pearson', 'min_support': 5, 'user_based': True}}
Best RMSE Score: 1.7339858829325372
Best MAE Score: 1.4157454688194182


In [64]:
model = cv.best_estimator['rmse']

# retrain on the whole set A
trainset = data.build_full_trainset()
model.fit(trainset)

# Compute biased accuracy on A
predictions = model.test(trainset.build_testset())
print('Biased accuracy on the Training,', end='   ')
accuracy.rmse(predictions)

# Compute unbiased accuracy on B
testset = data.construct_testset(holdout)  # testset is now the set B
predictions = model.test(testset)
print('Unbiased accuracy on the Holdout,', end=' ')
accuracy.rmse(predictions)

Computing the pearson similarity matrix...
Done computing similarity matrix.
Biased accuracy on the Training,   RMSE: 0.4717
Unbiased accuracy on the holdout, RMSE: 1.7517


1.7516865109738355

## Content Based System

In [65]:
raw_ratings = data.raw_ratings
random.shuffle(raw_ratings)

# train = 80% of the data, holdout = 10% of the data
threshold = int(.8*len(raw_ratings))
train = raw_ratings[:threshold]
holdout = raw_ratings[threshold:]

data.raw_ratings = train 

params = {
    'sim_options': {
        'name': ['msd', 'cosine', 'pearson'], 
        'min_support': [3, 4, 5], 
        'user_based': [False]
        }
    }

cv = GridSearchCV(KNNWithMeans, params, measures=['rmse', 'mae'], cv=5)
cv.fit(data)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computi

In [66]:
print('Best parameters:', cv.best_params['rmse'])
print('Best parameters:', cv.best_params['mae'])

print('Best RMSE Score:', cv.best_score['rmse'])
print('Best MAE Score:', cv.best_score['mae'])

Best parameters: {'sim_options': {'name': 'cosine', 'min_support': 3, 'user_based': False}}
Best parameters: {'sim_options': {'name': 'cosine', 'min_support': 3, 'user_based': False}}
Best RMSE Score: 1.715431889309326
Best MAE Score: 1.4118926402300784


In [67]:
model = cv.best_estimator['rmse']

# retrain on the whole set the Training
trainset = data.build_full_trainset()
model.fit(trainset)

# Compute biased accuracy on the Training
predictions = model.test(trainset.build_testset())
print('Biased accuracy on the Training,', end='   ')
accuracy.rmse(predictions)

# Compute unbiased accuracy on the Holdout
testset = data.construct_testset(holdout) 
predictions = model.test(testset)
print('Unbiased accuracy on the Holdout,', end=' ')
accuracy.rmse(predictions)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Biased accuracy on the Training,   RMSE: 1.0617
Unbiased accuracy on the holdout, RMSE: 1.7098


1.7097859484347893