#### Note:
The algorithm and a large part of the model implementation is based off of Kaggle user "Niharika Pandit"'s notebook at
https://www.kaggle.com/code/niharika41298/netflix-visualizations-recommendation-eda

In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import streamlit as st


In [3]:
netflix = pd.read_csv("data/netflix_titles.csv")
amazon = pd.read_csv("data/amazon_prime_titles.csv")
hulu = pd.read_csv("data/hulu_titles.csv")
disney = pd.read_csv("data/disney_plus_titles.csv")

In [4]:
# Combining the initial data without any labels for origin
data = pd.concat([netflix, amazon, hulu, disney], ignore_index=True)

# Removing "na" fields
data = data.fillna("")

# Removing any duplicate movies
data = data.drop_duplicates(subset=["title", "type"], keep="last")

# Adding a label to all the data with their source
netflix = netflix.assign(netflix=1, amazon=0, hulu=0, disney=0)
amazon = amazon.assign(netflix=0, amazon=1, hulu=0, disney=0)
hulu = hulu.assign(netflix=0, amazon=0, hulu=1, disney=0)
disney = disney.assign(netflix=0, amazon=0, hulu=0, disney=1)

# Defining aggregate functions
g = {"netflix": "sum", "amazon": "sum", "hulu": "sum", "disney": "sum"}

# Creating a temporary dataframe with only titles, type, & origins
data_temp = pd.concat([netflix, amazon, hulu, disney], ignore_index=True).groupby(
    ["title", "type"], as_index=False).agg(g).reset_index()
data_temp = data_temp.fillna("")

# Merging the two dataframes together to create a complete system
# Inner merge information found via TutorialsPoint tutorial
# Source: https://www.tutorialspoint.com/python_pandas/python_pandas_merging_joining.htm
data = pd.merge(data, data_temp, on=["title", "type"], how="inner")

untouched_data = data


In [5]:
def clean_data(data):
    """
    Define a function to "clean" the data by forcing all needed groups lowercase
    """
    return str.lower(data.replace(" ", ""))

In [6]:
# Identifying features for the model
features = ["title", "director", "cast", "listed_in", "description"]
data = data[features]

# Apply cleaning function
for feature in features:
    data[feature] = data[feature].apply(clean_data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[feature] = data[feature].apply(clean_data)


In [7]:
def create_soup(data):
    """
    Create a "soup" or "bag of words" for all rows
    """
    return data["title"] + " " + data["director"] + " " + data["cast"] + " " + data["listed_in"] + " " + data["description"]


In [8]:
data["soup"] = data.apply(create_soup, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["soup"] = data.apply(create_soup, axis=1)


In [9]:
count = CountVectorizer(stop_words="english")
count_matrix = count.fit_transform(data["soup"])

cosine_sim = cosine_similarity(count_matrix, count_matrix)
cosine_sim.shape

(22242, 22242)

In [10]:
data = data.reset_index()
indices = pd.Series(data.index, index=data["title"])

In [11]:
def get_recommendations(title, cosine_sim=cosine_sim):
    """
    Get recommendations based off of input movie title
    """
    title = title.replace(' ', '').lower()
    idx = indices[title]

    # Get the pairwsie similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies
    sim_scores = sim_scores[1:11]

    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar movies
    return untouched_data['title'].iloc[movie_indices]


In [12]:
def get_best_service(recommendation):
    """
    Returns the count of where the recommendations can be found
    """
    output = {
        "netflix": 0,
        "amazon": 0,
        "hulu": 0,
        "disney": 0
    }
    print(recommendation)
    for title in recommendation:
        output["netflix"] += int(untouched_data.loc[untouched_data["title"]
                                 == title, "netflix"].values[0])
        output["amazon"] += int(untouched_data.loc[untouched_data["title"]
                                == title, "amazon"].values[0])
        output["hulu"] += int(untouched_data.loc[untouched_data["title"]
                              == title, "hulu"].values[0])
        output["disney"] += int(untouched_data.loc[untouched_data["title"]
                                == title, "disney"].values[0])

    return output


In [13]:
recommendations = get_recommendations("naruto", cosine_sim)

final = get_best_service(recommendations)
print(final)

20536                                     Naruto Shippuden
20706                                              Sonic X
20672                                            One Piece
18814                                           Fire Force
19400                                     Digimon Frontier
20363                                             Basilisk
20778                                             Primeval
19669       Pokémon the Movie: Hoopa and the Clash of Ages
19670    Pokémon The Movie: Volcanion and the Mechanica...
19978                               Voltron: Fleet Of Doom
Name: title, dtype: object
{'netflix': 1, 'amazon': 1, 'hulu': 10, 'disney': 0}
