#### **Note**: This model and its implementation is directly based on the implementation by Avinash Navlani (https://www.datacamp.com/tutorial/streaming-platform-analysis)



In [79]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import RegexpTokenizer
import numpy as np
from sklearn import preprocessing
from scipy.sparse import hstack
import pandas_profiling
import re


In [127]:
# Get movie streaming data
df = pd.read_csv("data/MoviesOnStreamingPlatforms.csv")
# remove unnamed index column
df = df.iloc[:, 1:]
# Make all columns lowercase for consistency across datasets
df.columns = [x.lower() for x in df.columns]

# Get ratings data
data = pd.io.parsers.read_csv("data/ratings.dat",
                              names=["user_id", "movie_id",
                                     "rating", "time"],
                              engine="python",
                              delimiter="::")

movie_data = pd.io.parsers.read_csv("data/movies.dat",
                                    names=["movie_id", "title", "genre"],
                                    engine="python",
                                    delimiter="::")

# Fix a title formatting issue that is found within the dataset
"""
Titles with the word "the" at the start were displayed at the end. If the movie
was named "The horse (2000)" it would be displayed as "horse, The (2000)"
"""
movie_data.title = np.where(
    movie_data.title.str.contains(", The"),
    "The " + movie_data.title.str.replace(", The", ""),
    movie_data.title)


In [128]:
# Combine title with year for consistency
df["title"] = df["title"] + " (" + df["year"].astype(str) + ")"

# Remove "year" column
df.drop("year", axis=1, inplace=True)

df["inDatabase?"] = np.where(df.title.str == movie_data.title.str, 1, 0)
df.head()

Unnamed: 0,id,title,age,rotten tomatoes,netflix,hulu,prime video,disney+,type,inDatabase?
0,1,The Irishman (2019),18+,98/100,1,0,0,0,0,0
1,2,Dangal (2016),7+,97/100,1,0,0,0,0,0
2,3,David Attenborough: A Life on Our Planet (2020),7+,95/100,1,0,0,0,0,0
3,4,Lagaan: Once Upon a Time in India (2001),7+,94/100,1,0,0,0,0,0
4,5,Roma (2018),18+,94/100,1,0,0,0,0,0


In [None]:
# Creating a rating matrix with rows as movies and columns as users
ratings_mat = np.ndarray(
    shape=(int(np.max(data.movie_id.values)), int(np.max(data.user_id.values))),
    dtype=np.uint8)
ratings_mat[data.movie_id.values-1, data.user_id.values-1] = data.rating.values


In [None]:
# Normalize the matrix
normalised_mat = ratings_mat - np.asarray([(np.mean(ratings_mat, 1))]).T


In [None]:
# Compute the Singular Value Decommposition (SVD)
A = normalised_mat.T / np.sqrt(ratings_mat.shape[0] - 1)
U, S, V = np.linalg.svd(A)


In [None]:
# Define a function to calculate the cosine similarity. Sort by most similar and return the top N results
def top_cosine_similarity(data, movie_id, top_n=10):
    # Movie id starts from 1 in the dataset
    index = movie_id - 1
    movie_row = data[index, :]
    magnitude = np.sqrt(np.einsum('ij, ij -> i', data, data))
    similarity = np.dot(movie_row, data.T) / (magnitude[index] * magnitude)
    sort_indexes = np.argsort(-similarity)
    return sort_indexes[:top_n]


In [None]:
# Define a function to print top N similar movies
def print_similar_movies(movie_data, movie_id, top_indexes):
    print('Recommendations for {0}: \n'.format(
        movie_data[movie_data.movie_id == movie_id].title.values[0]))
    for id in top_indexes + 1:
        print(movie_data[movie_data.movie_id == id].title.values[0])


In [None]:
# Initialize the value of k principal components, id of the move as given in the dataset, and number of top elements to be printed.
k = 50
# Get an id from movies.dat
movie_id = 11
top_n = 10
# Representative data
sliced = V.T[:, :k]
indexes = top_cosine_similarity(sliced, movie_id, top_n)


  similarity = np.dot(movie_row, data.T) / (magnitude[index] * magnitude)


In [None]:
print_similar_movies(movie_data, movie_id, indexes)

Recommendations for American President, The (1995): 

American President, The (1995)
Dave (1993)
While You Were Sleeping (1995)
Notting Hill (1999)
Sleepless in Seattle (1993)
Phenomenon (1996)
Pretty Woman (1990)
Sabrina (1995)
Bridges of Madison County, The (1995)
Speechless (1994)


In [None]:
# Remove missing values from streaming platform dataset
""" 
Note: In his tutorial Navlani drops all NA and drops any column(s) which have
more than 50% missing values. We chose to mimic this as this model is based
off of his model and implementation
"""
# Dropping values with missing % more than 50%
df.drop(['Rotten Tomatoes', 'Age'], axis=1, inplace=True)
# Dropping Na's from the following columns
df.dropna(
    subset=['Directors', 'Genres', 'Country', 'Language', 'Runtime'],
    inplace=True)
df.reset_index(inplace=True,drop=True)
# Converting into object type
df.Year = df.Year.astype("object")

#### References

Recommendation System for Streaming Platforms Tutorial. (n.d.). Www.datacamp.com. Retrieved June 27, 2022, from https://www.datacamp.com/tutorial/streaming-platform-analysis


Movies on Netflix, Prime Video, Hulu and Disney+. (n.d.). Www.kaggle.com. Retrieved June 27, 2022, from https://www.kaggle.com/datasets/ruchi798/movies-on-netflix-prime-video-hulu-and-disney


https://grouplens.org/datasets/movielens/1m/