<a href="https://colab.research.google.com/github/AnshumanJain101/Recommendation-System/blob/main/content_based.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [37]:
from collections import Counter, defaultdict
import math
import numpy as np
import os
import pandas as pd
import re
from math import sqrt
from scipy.sparse import csr_matrix
from sklearn.metrics import mean_squared_error
import urllib.request
import zipfile

In [38]:
def cosine_sim(a, b):
    """Return the cosine similarity of two sparse row vectors.

    The similarity is dot(a, b) / (||a|| * ||b||), where ||x|| is the
    Euclidean norm taken over all entries of x.

    Args:
        a: sparse matrix exposing ``toarray()``; intended to be one row (1, n).
        b: sparse matrix with the same number of columns as ``a``.

    Returns:
        A dense ndarray with the similarity; for two (1, n) inputs its shape
        is (1, 1). If either input has zero norm the similarity is reported
        as 0.0 instead of NaN, so an all-zero feature row simply carries no
        weight for callers that average over similarities.
    """
    a = a.toarray()
    b = b.toarray()
    dot = np.dot(a, b.T)
    denom = np.sqrt(np.sum(np.square(a))) * np.sqrt(np.sum(np.square(b)))
    if denom == 0:
        # 0/0 would yield NaN (plus a RuntimeWarning); treat as "no similarity".
        return np.zeros_like(dot, dtype=float)
    return dot / denom

In [39]:
def tokenize_string(my_string):
    """Split a string into lowercase tokens.

    A token is a maximal run of word characters and hyphens, so a genre
    string such as "Adventure|Sci-Fi" becomes ['adventure', 'sci-fi'].

    Args:
        my_string: the text to tokenize.

    Returns:
        List of lowercase tokens in order of appearance (empty for "").
    """
    # Raw string: in a plain literal the regex escapes are invalid string
    # escapes, which newer Python versions warn about.
    return re.findall(r'[\w\-]+', my_string.lower())

In [40]:
def tokenize(movies):
    """Add a ``tokens`` column with the tokenized ``genres`` of each movie.

    The frame is modified in place and the same object is returned.
    """
    genre_tokens = []
    for genre_text in movies['genres']:
        genre_tokens.append(tokenize_string(genre_text))

    movies['tokens'] = genre_tokens
    return movies

In [41]:
def make_vocab(movies):
    """Build the token vocabulary and attach a TF-IDF feature row to each movie.

    tfidf(i, d) := tf(i, d) / max_k tf(k, d) * log10(N / df(i))
    where:
      i is a term
      d is a document (movie)
      tf(i, d) is the frequency of term i in document d
      max_k tf(k, d) is the maximum frequency of any term in document d
      N is the number of documents (movies)
      df(i) is the number of unique documents containing term i

    Args:
        movies: DataFrame with a ``tokens`` column holding one list of tokens
            per movie. It is modified in place: a ``features`` column is added.

    Returns:
        (movies, vocab) where ``movies`` is the same frame, whose ``features``
        column holds one csr_matrix of shape (1, len(vocab)) per movie, and
        ``vocab`` maps each unique token to its column index (tokens sorted
        alphabetically). A movie with no tokens gets an all-zero row.
    """
    # Vocabulary of all unique tokens, built in pure Python so that empty
    # token lists (or an empty frame) need no special handling.
    unique_tokens = sorted({token for tokens in movies.tokens for token in tokens})
    vocab = {token: idx for idx, token in enumerate(unique_tokens)}

    # df[token] = number of movies whose token list contains the token.
    df = Counter()
    for tokens in movies.tokens:
        df.update(set(tokens))

    n_movies = len(movies)
    n_terms = len(vocab)

    all_csr = []
    for tokens in movies.tokens:
        tf = Counter(tokens)
        max_k = max(tf.values(), default=0)  # 0 only when the movie has no tokens

        cols = [vocab[token] for token in tf]
        data = [(freq / max_k) * math.log10(n_movies / df[token])
                for token, freq in tf.items()]

        # One sparse row per movie, built once after all terms are collected.
        rows = np.zeros(len(data), dtype=int)
        all_csr.append(csr_matrix(
            (np.asarray(data, dtype=float), (rows, np.asarray(cols, dtype=int))),
            shape=(1, n_terms)))

    movies['features'] = all_csr

    return movies, vocab

In [42]:
def train_test(ratings, test_every=1000):
    """Split ratings into train and test sets by row position.

    The split is deterministic, not random: rows at positions
    0, test_every, 2 * test_every, ... form the test set and every other row
    goes to the training set. Row order is preserved in both parts.

    Args:
        ratings: DataFrame of ratings.
        test_every: positive integer stride between held-out rows
            (default 1000, i.e. roughly 0.1% of the rows are held out).

    Returns:
        (train, test) DataFrames selected from ``ratings``.

    Raises:
        ValueError: if ``test_every`` is smaller than 1.
    """
    if test_every < 1:
        raise ValueError("test_every must be a positive integer")

    held_out = np.arange(len(ratings)) % test_every == 0
    return ratings.iloc[~held_out], ratings.iloc[held_out]

In [43]:
def predicting_main_fxn(movies, ratings_train, ratings_test):
    """Predict a rating for every (userId, movieId) pair in ``ratings_test``.

    Each prediction is the weighted average of the ratings the same user gave
    in ``ratings_train``, where the weight of a rated movie is the cosine
    similarity between its feature row and the target movie's feature row.

    Fallbacks:
      * If no rated movie has a usable non-zero similarity (including when
        the target movie has no entry in ``movies``), the user's mean
        training rating is used.
      * If the user has no training ratings at all, the mean of all training
        ratings is used.
    Rated movies without an entry in ``movies`` and non-finite similarities
    are ignored.

    Args:
        movies: DataFrame with ``movieId`` and ``features`` columns.
        ratings_train: DataFrame with ``userId``, ``movieId``, ``rating``.
        ratings_test: DataFrame with ``userId`` and ``movieId``.

    Returns:
        1-D ndarray with one prediction per row of ``ratings_test``, in order.
    """
    # Map movieId -> feature row once instead of scanning ``movies`` for
    # every pair; setdefault keeps the first row if an id is duplicated.
    features_by_id = {}
    for movie_id, feats in zip(movies.movieId, movies.features):
        features_by_id.setdefault(int(movie_id), feats)

    global_mean = ratings_train.rating.mean()

    predictions = []
    for test_userid, test_movieid in zip(ratings_test.userId, ratings_test.movieId):
        user_rows = ratings_train.loc[ratings_train.userId == test_userid]
        # The target movie's features do not change across the user's ratings.
        target_features = features_by_id.get(int(test_movieid))

        weight_ratings = []
        weights = []
        if target_features is not None:
            for train_movieid, rating in zip(user_rows.movieId, user_rows.rating):
                rated_features = features_by_id.get(int(train_movieid))
                if rated_features is None:
                    continue
                # cosine_sim returns a size-1 array; reduce it to a scalar.
                weight = np.asarray(cosine_sim(rated_features, target_features)).item()
                if not np.isfinite(weight):
                    continue
                weight_ratings.append(rating * weight)
                weights.append(weight)

        weight_total = np.sum(weights)
        if weight_total != 0:
            predictions.append(np.sum(weight_ratings) / weight_total)
        elif len(user_rows) > 0:
            predictions.append(user_rows.rating.mean())
        else:
            predictions.append(global_mean)

    return np.asarray(predictions)

In [44]:
def error(predictions, ratings_test):
    """Return the mean absolute error of ``predictions``.

    The true values are read from the ``rating`` column of ``ratings_test``
    and compared element-wise with ``predictions``.
    """
    actual = np.array(ratings_test.rating)
    absolute_diffs = np.abs(predictions - actual)
    return absolute_diffs.mean()

In [45]:
def get_rmse(pred, actual):
    """Return the root-mean-squared error of ``pred``.

    Args:
        pred: predicted ratings, one per row of ``actual``.
        actual: object whose ``rating`` attribute holds the true ratings.

    Returns:
        The RMSE as a float.
    """
    # scikit-learn's signature is mean_squared_error(y_true, y_pred).
    return sqrt(mean_squared_error(actual.rating, pred))

In [46]:
# Load the rating and movie tables from CSV files in the working directory.
ratings = pd.read_csv('ratings.csv')
movies = pd.read_csv('movies.csv')

# Turn each genre string into tokens, then into a TF-IDF feature row;
# ``vocab`` maps every token to its feature column.
movies, vocab = make_vocab(tokenize(movies))

# Hold out a test subset of the ratings and predict it from the remainder.
ratings_train, ratings_test = train_test(ratings)
predictions = predicting_main_fxn(movies, ratings_train, ratings_test)

# Report both error metrics on the held-out ratings.
print('MAE=%f' % error(predictions, ratings_test))
print("\n")
print('RMSE=%f' % get_rmse(predictions, ratings_test))


MAE=0.700948


RMSE=0.875552
