In [None]:
import os
import re
import numpy as np
import pandas as pd
from collections import defaultdict
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

# Directory that holds ratings.csv (read here) and the tags.csv / movies.csv
# files read by later cells.
dataset_path = "ml-20m"

ratings_df = pd.read_csv(os.path.join(dataset_path, "ratings.csv"), encoding="utf-8", sep=",")

# user_ratings_map[user][movie]   -> raw rating as a float
# user_mean_sd_ratings[user]      -> (mean, std-dev) of that user's ratings
# normalized_ratings[user][movie] -> (rating - mean) / std-dev, or 0.0 when std-dev is 0
user_ratings_map, user_mean_sd_ratings = defaultdict(dict), defaultdict(float)
normalized_ratings = defaultdict(dict)

# Walk the three columns in lock-step. Dedicated loop names are used so the
# column data is never rebound to a scalar half-way through the cell.
for uid, mid, rating in zip(ratings_df[u'userId'].tolist(),
                            ratings_df[u'movieId'].tolist(),
                            ratings_df[u'rating'].tolist()):
    user_ratings_map[uid][mid] = float(rating)

for uid, movie_rating_map in user_ratings_map.items():
    user_ratings = list(movie_rating_map.values())
    # np.std defaults to ddof=0, i.e. the population standard deviation.
    mean_r, sd_r = np.mean(user_ratings), np.std(user_ratings)

    user_mean_sd_ratings[uid] = (mean_r, sd_r)

    for mid, rating in movie_rating_map.items():
        if sd_r == 0:
            # A user who gave every movie the same rating carries no signal;
            # also avoids a division by zero.
            normalized_ratings[uid][mid] = 0.0
        else:
            normalized_ratings[uid][mid] = float(rating - mean_r) / sd_r
        
    


In [None]:
tags_df = pd.read_csv(os.path.join(dataset_path, "tags.csv"), encoding="utf-8", sep=",")

genres_df = pd.read_csv(os.path.join(dataset_path, "movies.csv"), encoding="utf-8", sep=",")

stop_words = set(stopwords.words('english'))

# movie_tag_map[movie] -> list of cleaned tag phrases (one entry per tag row)
movie_tag_map = defaultdict(list)

# Rows without a tag are skipped: str(NaN) would otherwise inject a bogus "nan" tag.
labelled_tags_df = tags_df.dropna(subset=[u'tag'])

for tagged_movie_id, raw_tag in zip(labelled_tags_df[u'movieId'].tolist(),
                                    labelled_tags_df[u'tag'].tolist()):
    # Lower-case and replace everything except letters, digits and spaces.
    cleaned = re.sub(r"[^a-zA-Z0-9 ]", " ", str(raw_tag).lower())

    # split() with no argument also trims the ends and collapses runs of whitespace.
    kept_words = [word for word in cleaned.split() if word not in stop_words]

    # Skip tags that are empty once punctuation and stop-words are removed,
    # so no empty-string tag is ever stored.
    if kept_words:
        movie_tag_map[tagged_movie_id].append(" ".join(kept_words))

In [None]:
# Add each movie's genres (pipe-separated in movies.csv) to its tag list.
# NOTE: this mutates movie_tag_map in place, so re-running this cell without
# first re-running the tag-cleaning cell appends the genres a second time.
for genre_movie_id, genre_field in zip(genres_df[u'movieId'].tolist(),
                                       genres_df[u'genres'].tolist()):
    movie_tag_map[genre_movie_id].extend(genre_field.lower().split("|"))

# Row order of the matrix follows the insertion order of movie_tag_map.
movie_ids = list(movie_tag_map)

# Plain dict on purpose: looking up an unknown movie id must fail loudly
# instead of silently resolving to row 0.
movie_ids_index = {m_id: row for row, m_id in enumerate(movie_ids)}

# One "document" per movie; "###" separates whole tag phrases so that
# multi-word tags survive as single tokens.
movie_tags = ["###".join(movie_tag_map[m_id]) for m_id in movie_ids]

# Create the TF-IDF weighted movie-tag matrix.
# token_pattern=None because the custom tokenizer makes the default pattern unused.
vectorizer = TfidfVectorizer(tokenizer=lambda sent: sent.split("###"), token_pattern=None,
                             ngram_range=(1,1), stop_words='english')
movie_tag_mat = vectorizer.fit_transform(movie_tags)


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression


def get_user_model(user_id, user_ratings_map, movie_tag_mat, movie_ids_index, n_splits=5, random_state=42):
    """Fit a per-user linear regression from movie tag features to ratings.

    Parameters
    ----------
    user_id : key into ``user_ratings_map``.
    user_ratings_map : mapping ``user -> {movie_id: rating}``.
    movie_tag_mat : row-indexable feature matrix, one row per movie.
    movie_ids_index : mapping ``movie_id -> row`` of ``movie_tag_mat``.
    n_splits : maximum number of cross-validation folds (default 5).
    random_state : seed for the shuffled fold assignment (default 42).

    Returns
    -------
    (model, cv_error) : a ``LinearRegression`` fitted on all of the user's
        usable ratings, and the mean of the per-fold mean squared errors.
        ``cv_error`` is NaN when the user has fewer than two usable ratings,
        because no train/test split exists in that case.

    Raises
    ------
    ValueError : if none of the user's rated movies appear in ``movie_ids_index``.
    """
    # Single pass over the user's ratings so rows and labels stay aligned.
    rated = [(movie_ids_index[m_id], rating)
             for m_id, rating in user_ratings_map[user_id].items()
             if m_id in movie_ids_index]

    if not rated:
        raise ValueError("user %r has no rated movies present in movie_ids_index" % (user_id,))

    movie_ids_rows = [row for row, _ in rated]
    labels = np.array([rating for _, rating in rated])
    train_data = movie_tag_mat[movie_ids_rows, :]

    errors = []

    # KFold needs n_splits <= n_samples and at least 2 folds.
    folds = min(n_splits, len(rated))
    if folds >= 2:
        kf = KFold(n_splits=folds, shuffle=True, random_state=random_state)

        for train_index, test_index in kf.split(train_data):
            X_train, X_test = train_data[train_index], train_data[test_index]
            y_train, y_test = labels[train_index], labels[test_index]

            fold_model = LinearRegression()
            fold_model.fit(X_train, y_train)

            errors.append(mean_squared_error(y_test, fold_model.predict(X_test)))

    # The returned model is refitted on every usable rating.
    model = LinearRegression()
    model.fit(train_data, labels)

    cv_error = np.mean(errors) if errors else float("nan")
    return model, cv_error




In [None]:
def get_recommendations(model, user_id, user_ratings_map, movie_tag_mat, movie_ids_index, num_rec=10):
    """Return the top ``num_rec`` unrated movies for a user, best first.

    Parameters
    ----------
    model : object with a ``predict(features)`` method returning one score per row.
    user_id : key into ``user_ratings_map``.
    user_ratings_map : mapping ``user -> {movie_id: rating}``.
    movie_tag_mat : row-indexable feature matrix, one row per movie.
    movie_ids_index : mapping ``movie_id -> row`` of ``movie_tag_mat``.
    num_rec : maximum number of recommendations to return.

    Returns
    -------
    list of ``(movie_id, predicted_score)`` tuples sorted by descending score;
    empty when the user has already rated every indexed movie.
    """
    rated_movie_ids = set(user_ratings_map[user_id])
    unrated_movie_ids = [m_id for m_id in movie_ids_index if m_id not in rated_movie_ids]

    if not unrated_movie_ids:
        return []

    # Rows must come from the *unrated* ids so each score lines up with its movie id.
    movie_ids_rows = [movie_ids_index[m_id] for m_id in unrated_movie_ids]
    test_data = movie_tag_mat[movie_ids_rows, :]

    scores = model.predict(test_data)
    ranked = sorted(zip(unrated_movie_ids, scores), key=lambda pair: -pair[1])

    return ranked[:min(len(ranked), num_rec)]

In [None]:
import random

# Evaluate only on the first 100 users that have more than 500 ratings.
selected_user_ids = [uid for uid, rate_map in user_ratings_map.items() if len(rate_map) > 500]
selected_user_ids = selected_user_ids[:100]

# Dedicated, seeded generator so the hold-out set is the same on every run.
holdout_rng = random.Random(42)

# (user_id, movie_id, rating) triples held out for evaluation.
validation_data = []

for user_id in selected_user_ids:
    movie_ratings_map = user_ratings_map[user_id]
    # Named differently from the global `movie_ids` list built by the TF-IDF
    # cell so that list is not clobbered here.
    rated_movie_ids = list(movie_ratings_map)

    # Hold out 1% of this user's rated movies (rounded down).
    selected_movie_ids = holdout_rng.sample(rated_movie_ids, int(0.01 * len(rated_movie_ids)))

    for s_id in selected_movie_ids:
        validation_data.append((user_id, s_id, movie_ratings_map[s_id]))

In [None]:
# models[user] -> fitted per-user regression model
models = dict()

# Remove the held-out ratings before any model sees them.
# pop(..., None) keeps this cell re-runnable: a second run finds the entries
# already gone instead of raising KeyError.
for user_id, movie_id, _ in validation_data:
    user_ratings_map[user_id].pop(movie_id, None)
    # Also drop the normalized copy, otherwise the held-out rating still takes
    # part in the user-user similarity used by the collaborative-filtering cells.
    normalized_ratings[user_id].pop(movie_id, None)

for user_id in selected_user_ids:
    model, err = get_user_model(user_id, user_ratings_map, movie_tag_mat, movie_ids_index)
    models[user_id] = model
    

In [None]:
def get_predicted_rating(model, movie_id, movie_tag_mat, movie_ids_index):
    """Predict one movie's rating with a fitted content-based model.

    NOTE: a later cell in this notebook defines another function with this same
    name but a different signature; once that cell has run, this definition is
    shadowed.

    Parameters
    ----------
    model : object with a ``predict(features)`` method.
    movie_id : id of the movie to score.
    movie_tag_mat : row-indexable feature matrix, one row per movie.
    movie_ids_index : mapping ``movie_id -> row`` of ``movie_tag_mat``.

    Returns
    -------
    The first (only) element of ``model.predict`` for that movie's feature row.

    Raises
    ------
    KeyError : if ``movie_id`` is not in ``movie_ids_index``.
    """
    # Explicit membership test: with a defaultdict index a plain lookup would
    # silently insert the id and score row 0 instead.
    if movie_id not in movie_ids_index:
        raise KeyError(movie_id)

    row = movie_ids_index[movie_id]
    # List index keeps the selection two-dimensional (1 x n_features).
    test_data = movie_tag_mat[[row], :]
    preds = model.predict(test_data)

    return preds[0]

In [None]:
import math
# Root-mean-squared error of the content-based models on the held-out ratings.
# This cell expects get_predicted_rating to be the 4-argument content-based
# version, i.e. it must run before the cell that redefines that name.
sums = 0.0

for user_id, movie_id, actual_rating in validation_data:
    model = models[user_id]
    pred_rating = get_predicted_rating(model, movie_id, movie_tag_mat, movie_ids_index)
    sums += (pred_rating - actual_rating) ** 2.0

# Guard the empty case so an empty validation set reports NaN instead of dividing by zero.
content_rmse = math.sqrt(float(sums) / len(validation_data)) if validation_data else float("nan")
print("content-based RMSE: %.4f" % content_rmse)

In [None]:
import math

def user_similarity(user_ratings_1, user_ratings_2):
    """Score how alike two users rate the movies they have both rated.

    Both arguments map ``movie_id -> rating``. The score is
    ``1 - tanh(sqrt(2 * d))`` where ``d`` is the mean squared rating difference
    over the shared movies; users with no shared movie score 0.0.
    """
    first_movies = set(list(user_ratings_1.keys()))
    second_movies = set(list(user_ratings_2.keys()))
    shared = first_movies.intersection(second_movies)

    # Nothing in common: no evidence of similarity.
    if len(shared) == 0:
        return 0.0

    squared_gap = 0.0
    for shared_movie in shared:
        squared_gap += (user_ratings_1[shared_movie] - user_ratings_2[shared_movie]) ** 2

    mean_squared_gap = squared_gap / float(len(shared))

    return 1.0 - np.tanh(math.sqrt(2 * mean_squared_gap))
        
    
def get_similar_users(user_id, all_user_ids, normalized_ratings, num_sim=5):
    """Return the ``num_sim`` candidates most similar to ``user_id``.

    Every id in ``all_user_ids`` other than ``user_id`` is scored with
    ``user_similarity`` on the two users' entries in ``normalized_ratings``.
    The result is a list of ``[candidate_id, similarity]`` pairs, highest
    similarity first.
    """
    scored = [
        [candidate, user_similarity(normalized_ratings[candidate], normalized_ratings[user_id])]
        for candidate in all_user_ids
        if candidate != user_id
    ]

    # Ascending sort on the negated score gives descending similarity.
    scored.sort(key=lambda pair: -pair[1])

    keep = min(len(scored), num_sim)
    return scored[:keep]

def get_predicted_rating(user_id,movie_id,normalized_ratings,user_mean_sd_ratings,similar_user_ids):
    """Predict a rating from the normalized ratings of similar users.

    NOTE: this reuses the name of the content-based ``get_predicted_rating``
    defined in an earlier cell and replaces it once this cell has run.

    Parameters
    ----------
    user_id : user to predict for; key into ``user_mean_sd_ratings``.
    movie_id : movie to predict.
    normalized_ratings : mapping ``user -> {movie_id: normalized rating}``.
    user_mean_sd_ratings : mapping ``user -> (mean, std-dev)`` of raw ratings.
    similar_user_ids : iterable of ``(neighbour_id, similarity)`` pairs.

    Returns
    -------
    ``std-dev * weighted_average + mean`` for ``user_id``, where the weighted
    average is the similarity-weighted mean of the neighbours' normalized
    ratings of ``movie_id``. Falls back to the user's mean when no neighbour
    rated the movie or the total similarity is zero.
    """
    mean_r, sd_r = user_mean_sd_ratings[user_id][0], user_mean_sd_ratings[user_id][1]

    weighted_sum, weight_total = 0.0, 0.0

    for similar_uid, sim in similar_user_ids:
        neighbour_ratings = normalized_ratings[similar_uid]
        if movie_id in neighbour_ratings:
            weighted_sum += sim * neighbour_ratings[movie_id]
            # Only neighbours who actually rated the movie count towards the
            # normaliser, so the weights in the average sum to one.
            weight_total += sim

    # No usable neighbour (or all similarities are zero): avoid 0/0 and
    # predict the user's own average.
    if weight_total == 0:
        return mean_r

    return sd_r * (weighted_sum / float(weight_total)) + mean_r
    

In [None]:
# Every user id in the ratings data. The keys of user_ratings_map are the
# users; iterating one user's inner dict would yield movie ids instead.
all_users_ids = list(user_ratings_map)

# Squared-error accumulator for the collaborative-filtering predictions.
# This cell expects get_predicted_rating to be the 5-argument
# collaborative-filtering version defined in the preceding cell.
sums = 0.0

for user_id, movie_id, actual_rating in validation_data:
    # Only users who rated this movie can contribute to its prediction.
    filtered_user_id = [u_id for u_id in all_users_ids if movie_id in normalized_ratings[u_id]]
    similar_user_ids = get_similar_users(user_id, filtered_user_id, normalized_ratings, 3)

    pred_rating = get_predicted_rating(user_id, movie_id, normalized_ratings, user_mean_sd_ratings, similar_user_ids)

    sums += (pred_rating - actual_rating) ** 2.0

# Report both MSE and RMSE; guard the empty case against division by zero.
cf_mse = float(sums) / len(validation_data) if validation_data else float("nan")
cf_rmse = math.sqrt(cf_mse)
print("collaborative-filtering MSE: %.4f, RMSE: %.4f" % (cf_mse, cf_rmse))