# Importing the Libraries

In [1]:
!pip install scikit-surprise
import os
import pandas as pd

Collecting scikit-surprise
  Downloading scikit-surprise-1.1.3.tar.gz (771 kB)
     -------------------------------------- 772.0/772.0 kB 3.0 MB/s eta 0:00:00
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py): started
  Building wheel for scikit-surprise (setup.py): finished with status 'done'
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.3-cp39-cp39-win_amd64.whl size=1095050 sha256=5f42e2a94945372eee12f4266b0852ba83b36e384a6705394cb9f6cc0d6fa2c3
  Stored in directory: c:\users\hchinta1\appdata\local\pip\cache\wheels\c6\3a\46\9b17b3512bdf283c6cb84f59929cdd5199d4e754d596d22784
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Successfully installed scikit-surprise-1.1.3


In [2]:
from surprise import Reader
from surprise import Dataset
from surprise.model_selection import KFold
from surprise.model_selection import cross_validate
from surprise.model_selection import train_test_split
from surprise.model_selection import GridSearchCV
from surprise.accuracy import rmse
from collections import defaultdict
from surprise import KNNBasic
from surprise import SVD
from surprise import SVDpp
from surprise import NMF
from surprise import accuracy
from surprise import AlgoBase
from surprise.model_selection import KFold


# Helper functions

In [3]:
# Function to load the ratings and movies datasets

# dir = '/content/drive/MyDrive/SWM/Data'
def loadDataset():
  # ratings_df = pd.read_csv(dir+'/ratings.csv')
  # movies_df = pd.read_csv(dir+'/movies.csv')
  ratings_df = pd.read_csv('./Data/ratings.csv')
  movies_df = pd.read_csv('./Data/movies.csv')
  
  ratings_df.drop('timestamp',axis =1, inplace = True)

  return ratings_df,movies_df


In [4]:
# Calculates the evaluations metrics Precision and Recall for each loop of cross validation

def metricsAtK(predictions, k=10, threshold=3.5):

    # dictionary to store user and the respective predicted ratings of the movies
    user_predicted = defaultdict(list)
    for u_id, _, rating_actual, rating_predicted, _ in predictions:
        user_predicted[u_id].append((rating_predicted, rating_actual))

    precisions = dict()
    recalls = dict()

    for u_id, u_ratings in user_predicted.items():
        u_ratings.sort(key=lambda x: x[0], reverse=True)

        # Calculating actual values of the ratings
        actual = sum((ratings_true >= threshold) for (_, ratings_true) in u_ratings)
        # Calculating predicted values of the ratings
        predicted = sum((estimate_value >= threshold) for (estimate_value, _) in u_ratings[:k])

        # Calculating true positives and negatives
        positive_true = sum(((ratings_true >= threshold) and (estimate_value >= threshold))
                              for (estimate_value, ratings_true) in u_ratings[:k])
        negative_true = sum(((ratings_true < threshold) and (estimate_value < threshold))
                              for (estimate_value, ratings_true) in u_ratings[:k])
        
        # Calculating actual and predicted positives and negatives
        positive_predicted = predicted if predicted != 0 else 1
        negative_predicted = predicted if predicted != 1 else 0
        positive_actual = actual if actual != 0 else 1
        negative_actual = actual if actual != 1 else 0

        # Calculating precision, recall and accuracy
        precisions[u_id] = positive_true / positive_predicted
        recalls[u_id] = positive_true / positive_actual

    return precisions, recalls

In [5]:
def fit(model, ratings_data, num_of_splits=5):
  # Generator object for k - cross validation
  kf = KFold(n_splits=num_of_splits)
  split_df = list()

  i = 1
  for train, test in kf.split(ratings_data):
      predictions = model.fit(train).test(test)
      rmse = accuracy.rmse(predictions, verbose=False)
      mae = accuracy.mae(predictions, verbose=False)
      precisions, recalls = metricsAtK(predictions, k=5, threshold=4)
      precision = sum(prec for prec in precisions.values()) / len(precisions)
      recall = sum(rec for rec in recalls.values()) / len(recalls)
      f1_score = (2*precision*recall)/(precision+recall)
      split_df.append([i,precision,recall,rmse,f1_score,mae])
      i +=1

  split_df = pd.DataFrame(split_df, columns=['Split', 'Precision', 'Recall', 'RMSE', 'F1 Score', 'MAE'])
  return split_df

In [6]:
def getSortedPredictions(predictions):
    
    sorted_predictions = defaultdict(list)    
    for u_id, id, _, rating_predicted, _ in predictions:
        sorted_predictions[u_id].append((id, rating_predicted))

    for u_id, u_ratings in sorted_predictions.items():
        u_ratings.sort(key=lambda x: x[1], reverse=True)

    return sorted_predictions

In [7]:
def inference(model):
  trainset = ratings_data.build_full_trainset()

  # build_anti_testset will generate the entires for the movies which the user has not rated. i.e 
  # The completement of the user's ratings. It assumes the rating to be equal to the global mean of the ratings.
  testset = trainset.build_anti_testset() 

  # Training the model with trainset and getting predictions using the generated testset
  model.fit(trainset)
  predictions = model.test(testset)

  return predictions


In [8]:
# Post processing the predictions to produce the list of recommendations
def getRecommendations(predictions, n=10):

  # Getting sorted predictions
  total = getSortedPredictions(predictions)

  # Extracting only n number of movie predictions
  for user_id, user_ratings in total.items():
      total[user_id] = user_ratings[:n]

  total_df = pd.DataFrame.from_dict(total)
  total_df = total_df.transpose()

  result = []
  for user_id,user_ratings in total.items():
    result.append(total_df.loc[user_id])

  #Developing recommendations
  recommendations = []
  for i in result:
    recommended_movieIds=[]
    for x in range(0, n):
      recommended_movieIds.append(i[x][0])
    recommendations.append(recommended_movieIds)

  recommendation_list = []
  for i in recommendations:
    df = movies_df[movies_df['movieId'].isin(i)]
    temp = df['title'].tolist()
    recommendation_list.append(temp)

  recommendation_df = pd.DataFrame(recommendation_list)
  return recommendation_df


# Loading Dataset

In [9]:
ratings_df , movies_df = loadDataset()

In [10]:
reader = Reader(rating_scale=(0.5, 5.0))
ratings_keys = ['userId', 'movieId', 'rating']
ratings_filter = ratings_df[ratings_keys]
ratings_data = Dataset.load_from_df(ratings_filter, reader)

# Modelling Section

## K Nearest Neighbors

### Using MSD similarity

In [11]:
msd_knn = KNNBasic(k= 40, n_epochs=20)
#msd_preds = msd_knn.fit(trainset).test(testset)


In [12]:
# Generating evaluation metrics Precision, Recall, F1-Score, RMSE, MAE for the ratings dataset
msd_metrics = fit(msd_knn, ratings_data)
msd_metrics

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.


Unnamed: 0,Split,Precision,Recall,RMSE,F1 Score,MAE
0,1,0.67194,0.27693,0.947999,0.392215,0.725111
1,2,0.67235,0.26145,0.947533,0.376496,0.725553
2,3,0.675369,0.272897,0.946729,0.388722,0.722828
3,4,0.672195,0.273666,0.948169,0.388973,0.72853
4,5,0.665654,0.261422,0.943976,0.375409,0.725491


In [13]:
msd_predictions = inference(msd_knn)


Computing the msd similarity matrix...
Done computing similarity matrix.


In [15]:
msd_df = getRecommendations(msd_predictions)
msd_df.to_csv('msd_knn.csv',index = False)


### Using Pearson Correlation

In [16]:
# Defining the similarity options with pearson correlation.
sim_options = {
    'name': 'pearson'
}

pearson_knn = KNNBasic(k= 35, n_epochs=25,sim_options = sim_options)
#cosine_preds = cosine_knn.fit(trainset).test(testset)



In [17]:
# Generating evaluation metrics Precision, Recall, F1-Score, RMSE, MAE for the ratings dataset
pearson_metrics = fit(pearson_knn, ratings_data)
pearson_metrics

Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.


Unnamed: 0,Split,Precision,Recall,RMSE,F1 Score,MAE
0,1,0.660546,0.25314,0.961238,0.366013,0.744061
1,2,0.650219,0.246703,0.979441,0.357692,0.753912
2,3,0.642295,0.244734,0.969965,0.354422,0.750371
3,4,0.658907,0.250208,0.983169,0.36269,0.758474
4,5,0.64386,0.237663,0.971412,0.347175,0.74999


In [18]:

pearson_predictions = inference(pearson_knn)

Computing the pearson similarity matrix...
Done computing similarity matrix.


In [19]:

pearson_df = getRecommendations(pearson_predictions)
pearson_df.to_csv('pearson_knn.csv',index = False)

### Using Cosine similarity

In [20]:
# Defining the similarity options with cosine similarity.
sim_options = {
    'name': 'cosine'
}

cosine_knn = KNNBasic(k= 35, n_epochs=25)
#cosine_preds = cosine_knn.fit(trainset).test(testset)


In [21]:
# Generating evaluation metrics Precision, Recall, F1-Score, RMSE, MAE for the ratings dataset
cosine_metrics = fit(cosine_knn, ratings_data)
cosine_metrics

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.


Unnamed: 0,Split,Precision,Recall,RMSE,F1 Score,MAE
0,1,0.667596,0.27407,0.950828,0.388605,0.729382
1,2,0.676355,0.267268,0.946588,0.383136,0.722245
2,3,0.657718,0.254623,0.939984,0.367122,0.716903
3,4,0.660301,0.270438,0.956019,0.383717,0.73312
4,5,0.677732,0.276924,0.943801,0.393189,0.72503


In [22]:
cosine_predictions = inference(cosine_knn)


Computing the msd similarity matrix...
Done computing similarity matrix.


In [23]:
cosine_df = getRecommendations(cosine_predictions)
cosine_df.to_csv('cosine_knn.csv',index = False)
