In [1]:
%matplotlib inline

In [2]:
import daal4py as d4p

In [3]:
import numpy as np
import pandas as pd
from collections import Counter
import seaborn as sns
import matplotlib.pyplot as plt
import pandas_profiling as pp

In [4]:
# Load the Hulu catalogue into a DataFrame; the path is relative to the working directory.
data = pd.read_csv('hulu_titles.csv')

In [5]:
# Initialise the daal4py runtime.
# NOTE(review): none of the cells visible here call a daal4py algorithm afterwards —
# confirm this initialisation is still required.
d4p.daalinit()

In [6]:
# List the names of the columns that contain at least one missing value.
data.columns[data.isna().any()].tolist()

['director',
 'cast',
 'country',
 'date_added',
 'rating',
 'duration',
 'description']

In [7]:
# Percentage of missing values per column, reported only for columns that have any.
null_mask = data.isnull()
columns_with_nulls = data.columns[null_mask.any()]
null_mask[columns_with_nulls].sum() * 100 / len(data)

director        99.902376
cast           100.000000
country         47.282786
date_added       0.911162
rating          16.921575
duration        15.587374
description      0.130166
dtype: float64

In [8]:
# Show the column labels of the Hulu table.
data.columns

Index(['show_id', 'type', 'title', 'director', 'cast', 'country', 'date_added',
       'release_year', 'rating', 'duration', 'listed_in', 'description'],
      dtype='object')

In [9]:
# Frequency of each distinct 'listed_in' value; a row's whole comma-separated
# genre string is counted as a single value here.
data.listed_in.value_counts()

Documentaries                          214
Action, Adventure, Anime               171
Horror, Thriller                        82
Action, Adventure, Drama                81
Sports                                  74
                                      ... 
Action, Drama, Horror                    1
Black Stories, Drama, LGBTQ+             1
Comedy, Latino, News                     1
Adventure, Drama, History                1
Classics, Science Fiction, Thriller      1
Name: listed_in, Length: 442, dtype: int64

In [10]:
# Glue every row's genre string into one long ", "-separated string, then cut it
# back apart so each individual genre label becomes its own list element.
joined_genres = ", ".join(data['listed_in'])
categories = joined_genres.split(", ")

# (genre, count) pairs for at most the 50 most frequent genres, most frequent first.
genre_tally = Counter(categories)
counter_list = genre_tally.most_common(50)
counter_list

[('Drama', 907),
 ('Comedy', 667),
 ('Adventure', 556),
 ('Action', 555),
 ('Documentaries', 524),
 ('Anime', 329),
 ('Horror', 304),
 ('Reality', 247),
 ('Thriller', 231),
 ('Crime', 217),
 ('International', 216),
 ('Family', 205),
 ('Romance', 186),
 ('Kids', 162),
 ('Lifestyle & Culture', 155),
 ('Sports', 139),
 ('Science Fiction', 122),
 ('Sitcom', 118),
 ('Black Stories', 113),
 ('News', 106),
 ('Latino', 92),
 ('Mystery', 81),
 ('Cooking & Food', 80),
 ('Music', 79),
 ('History', 70),
 ('LGBTQ+', 67),
 ('Science & Technology', 48),
 ('Adult Animation', 37),
 ('Classics', 35),
 ('Teen', 34),
 ('Cartoons', 34),
 ('Game Shows', 18),
 ('Stand Up', 12),
 ('Health & Wellness', 8),
 ('Late Night', 4),
 ('Sketch Comedy', 3)]

In [11]:
# Tabulate the (genre, count) pairs and keep the ten most frequent genres.
Genres = pd.DataFrame(data=counter_list, columns=['Genre', 'Genre_count'])
top_10_genres = Genres.iloc[:10]
top_10_genres

Unnamed: 0,Genre,Genre_count
0,Drama,907
1,Comedy,667
2,Adventure,556
3,Action,555
4,Documentaries,524
5,Anime,329
6,Horror,304
7,Reality,247
8,Thriller,231
9,Crime,217


In [12]:
# Print the dtype, non-null count and memory usage summary of the Hulu table.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3073 entries, 0 to 3072
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   show_id       3073 non-null   object 
 1   type          3073 non-null   object 
 2   title         3073 non-null   object 
 3   director      3 non-null      object 
 4   cast          0 non-null      float64
 5   country       1620 non-null   object 
 6   date_added    3045 non-null   object 
 7   release_year  3073 non-null   int64  
 8   rating        2553 non-null   object 
 9   duration      2594 non-null   object 
 10  listed_in     3073 non-null   object 
 11  description   3069 non-null   object 
dtypes: float64(1), int64(1), object(10)
memory usage: 288.2+ KB


In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF vectoriser reused by the cells below; stop_words='english' makes it drop
# scikit-learn's built-in English stop words while tokenising.
tfidf = TfidfVectorizer(stop_words='english')

In [14]:
# Fit the vectoriser on the genre strings ('listed_in') so that titles sharing
# genres end up with similar TF-IDF rows.
genre_text = data['listed_in']
tfidf_matrix = tfidf.fit_transform(genre_text)

# (number of titles, number of vocabulary terms) of the matrix used for recommendations.
tfidf_matrix.shape

(3073, 44)

In [15]:
from sklearn.metrics.pairwise import linear_kernel 
# Pairwise dot products between all rows of the genre TF-IDF matrix.
# NOTE(review): this equals cosine similarity only while the vectoriser keeps its
# default L2 row normalisation — confirm if the `tfidf` options ever change.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [16]:
# Reverse lookup: title -> row label in `data`.
indices = pd.Series(data.index, index=data['title'])
# Series.drop_duplicates() compares the *values* (the already-unique row labels),
# so it never removed a repeated title. De-duplicate on the title index instead,
# keeping the first occurrence, so that indices[title] is always a single label.
indices = indices[~indices.index.duplicated(keep='first')]

In [17]:
#here we will build the recommendation function itself. We will call this with an existing title.
def recs(title, n=10):
    """Return the titles most similar to ``title`` by genre TF-IDF similarity.

    Parameters
    ----------
    title : str
        Exact title as it appears in ``data['title']``.
    n : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.Series
        Up to ``n`` entries of ``data['title']``, most similar first. The
        queried row itself is never part of the result.

    Raises
    ------
    KeyError
        If ``title`` is not a label of ``indices``.
    """
    idx = indices[title]
    # A title that occurs more than once yields a Series of row labels;
    # use the first occurrence so the similarity lookup stays one row.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    # Exclude the queried row explicitly. Relying on it being first after the
    # sort is wrong when other rows tie with it (identical genre strings score
    # the same), which dropped an unrelated title and recommended the query
    # title to itself.
    scores = [(row, score) for row, score in enumerate(cosine_sim[idx]) if row != idx]
    scores.sort(key=lambda pair: pair[1], reverse=True)

    show_indices = [row for row, _ in scores[:n]]
    return data['title'].iloc[show_indices]

In [18]:
# Example lookup: show the titles recommended for 'Settlers'.
recs('Settlers')

123     Species: The Awakening
168              Minor Premise
884                  Intersect
978                       2067
1501                  Zoo-Head
277                    Gattaca
1781                      Devs
117                      Signs
120                    Species
290                   Phase IV
Name: title, dtype: object

In [19]:
# Example lookup: show the titles recommended for 'The Bachelorette'.
recs('The Bachelorette')

350           Bachelor in Paradise
687                   The Bachelor
694        The Farmer Wants A Wife
738               Are You the One?
750            Married to Medicine
753                Shahs of Sunset
906               Dress to Impress
2843    The Millionaire Matchmaker
2970                 Daisy of Love
2971                  Rock of Love
Name: title, dtype: object

In [20]:
# Example lookup: show the titles recommended for 'Demon Slayer Kimetsu No Yaiba'.
recs('Demon Slayer Kimetsu No Yaiba')

1034    Is It Wrong to Try to Pick Up Girls in a Dungeon?
1062                               The Promised Neverland
1145                                 Lupin the 3rd Part 5
1228                             By the Grace of the Gods
1235               Wandering Witch: The Journey of Elaina
1340                                    Dragon Ball Super
1449                                         No Guns Life
1451                                           DECA-DENCE
1480                         A Certain Scientific Railgun
1525                                         Black Butler
Name: title, dtype: object

In [21]:
# Ask for a title and show its recommendations.
quest = input("Please enter a Hulu title: ")
try:
    quest_recs = recs(quest)
except KeyError:
    # Only an unknown title is expected here; any other error should surface
    # instead of being hidden by a bare `except`.
    quest_recs = None
    print(f' Either {quest} is not on Hulu or is invalid. Please try again or select a different service.')

# Display the result computed above. Calling recs(quest) a second time outside
# the try block re-raised the very error the handler had just reported.
quest_recs

Please enter a Hulu title: Signs


120                           Species
290                          Phase IV
310     The X-Files: Fight the Future
529                     Let's Be Evil
1004                        Possessor
1199                          Sputnik
1669               The Mandela Effect
1739                         The Host
1880                        Coherence
4                            Settlers
Name: title, dtype: object

### This is where we build an additional recommendation algorithm using two different datasets

In [25]:
# `new`: table read from tmdbC.csv (its columns are renamed to id/title/cast/crew
# in the following cell).
new = pd.read_csv('tmdbC.csv')
# `new1`: table read from tmdbM.csv; it is the left side of the merge that follows.
new1 = pd.read_csv('tmdbM.csv')

In [26]:
# Relabel the four columns of `new` so that its key column is named 'id',
# matching the column `new1` is joined on.
new.columns = ['id', 'title', 'cast', 'crew']

# Join the two tables on 'id'. Both sides carry a 'title' column, so the merged
# frame exposes them as 'title_x' (from new1) and 'title_y' (from new).
moredata = pd.merge(new1, new, on='id')

In [27]:
# Preview the first rows of the merged frame to check the join result.
moredata.head()

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,runtime,spoken_languages,status,tagline,title_x,vote_average,vote_count,title_y,cast,crew
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...",...,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...",...,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466,Spectre,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,250000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",http://www.thedarkknightrises.com/,49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...",...,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106,The Dark Knight Rises,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://movies.disney.com/john-carter,49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]",...,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124,John Carter,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [28]:
# Replace missing overviews with empty strings so every row later handed to the
# vectoriser is a str.
moredata['overview'] = moredata['overview'].fillna('')

In [29]:
# Build the TF-IDF matrix of the movie overviews.
# NOTE(review): this re-fits the same `tfidf` object that was fitted on the Hulu
# genre strings earlier, replacing its learned vocabulary — use a separate
# vectoriser if the Hulu one must remain usable for transform().
new_matrix = tfidf.fit_transform(moredata['overview'])

In [30]:
# Pairwise dot products between all overview TF-IDF rows: one row and one column
# per row of `moredata`.
new_sim = linear_kernel(new_matrix, new_matrix)

In [34]:
# Reverse lookup: movie title -> row label in `moredata`.
ind = pd.Series(moredata.index, index=moredata['title_y'])
# Series.drop_duplicates() compares the *values* (the already-unique row labels),
# so repeated titles were never removed. De-duplicate on the title index instead,
# keeping the first occurrence, so that ind[name] is always a single label.
ind = ind[~ind.index.duplicated(keep='first')]

In [35]:
#building the new function
def improvedrecs(name, new_sim=None, n=10):
    """Return the movies whose overviews are most similar to that of ``name``.

    Parameters
    ----------
    name : str
        Exact title as it appears in ``moredata['title_y']``.
    new_sim : 2-D array-like, optional
        Square similarity matrix aligned with the rows of ``moredata``. When
        omitted, the module-level ``new_sim`` is looked up at call time, so a
        re-computed matrix is picked up without redefining this function.
    n : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.Series
        Up to ``n`` entries of ``moredata['title_y']``, most similar first.
        The queried row itself is never part of the result.

    Raises
    ------
    KeyError
        If ``name`` is not a label of ``ind``.
    """
    if new_sim is None:
        # The parameter shadows the global of the same name, hence globals().
        new_sim = globals()['new_sim']

    idx = ind[name]
    # A title shared by several movies yields a Series of row labels; use the
    # first occurrence so the similarity lookup stays a single row.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    # Exclude the queried row explicitly instead of assuming it sorts first;
    # a tie with another row would otherwise drop the wrong entry.
    sim_scores = [(row, score) for row, score in enumerate(new_sim[idx]) if row != idx]
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)

    movie_indices = [row for row, _ in sim_scores[:n]]

    # Return the top-n most similar movies
    return moredata['title_y'].iloc[movie_indices]


In [36]:
# Example lookup: show the movies recommended for 'The Godfather'.
improvedrecs('The Godfather')

2731     The Godfather: Part II
1873                 Blood Ties
867     The Godfather: Part III
3727                 Easy Money
3623                       Made
3125                     Eulogy
3896                   Sinister
4506            The Maid's Room
3783                        Joe
2244      The Cold Light of Day
Name: title_y, dtype: object

In [38]:
# Example lookup: show the movies recommended for 'Avatar'.
improvedrecs('Avatar')

3604                       Apollo 18
2130                    The American
634                       The Matrix
1341            The Inhabited Island
529                 Tears of the Sun
1610                           Hanna
311     The Adventures of Pluto Nash
847                         Semi-Pro
775                        Supernova
2628             Blood and Chocolate
Name: title_y, dtype: object