In [10]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer, util

In [11]:
df = pd.read_csv("./netflix_titles.csv")

In [12]:
df

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...
...,...,...,...,...,...,...,...,...,...,...,...,...
8802,s8803,Movie,Zodiac,David Fincher,"Mark Ruffalo, Jake Gyllenhaal, Robert Downey J...",United States,"November 20, 2019",2007,R,158 min,"Cult Movies, Dramas, Thrillers","A political cartoonist, a crime reporter and a..."
8803,s8804,TV Show,Zombie Dumb,,,,"July 1, 2019",2018,TV-Y7,2 Seasons,"Kids' TV, Korean TV Shows, TV Comedies","While living alone in a spooky town, a young g..."
8804,s8805,Movie,Zombieland,Ruben Fleischer,"Jesse Eisenberg, Woody Harrelson, Emma Stone, ...",United States,"November 1, 2019",2009,R,88 min,"Comedies, Horror Movies",Looking to survive in a world taken over by zo...
8805,s8806,Movie,Zoom,Peter Hewitt,"Tim Allen, Courteney Cox, Chevy Chase, Kate Ma...",United States,"January 11, 2020",2006,PG,88 min,"Children & Family Movies, Comedies","Dragged from civilian life, a former superhero..."


In [13]:
titles = df.title.to_numpy()
descriptions = df.description.to_numpy()


In [14]:
all_genres = np.unique(np.concatenate(df.listed_in.str.split(',').apply(lambda x: [name.strip() for name in x])))

In [15]:
all_actors = np.concatenate(df.cast.dropna().reset_index(drop=True).str.split(',').apply(lambda x: [name.strip() for name in x]))
unique_actor, actor_appearance = np.unique(all_actors, return_counts=True) 
indices_recurring = actor_appearance > 1
recurring_actor, recurring_count = unique_actor[indices_recurring], actor_appearance[indices_recurring] 

In [16]:
genre_dict = {'Action & Adventure' :'Action & Adventure', 'Anime Features': 'Anime', 'Anime Series': 'Anime', 'British TV Shows' : 'UK',
 'Children & Family Movies' : 'Family', 'Classic & Cult TV': 'Classic', 'Classic Movies': 'Classic',
 'Comedies': 'Comedy', 'Crime TV Shows' :'Crime', 'Cult Movies': 'Cult', 'Documentaries': 'Documentary', 'Docuseries':'Documentary',
 'Dramas':'Drama', 'Faith & Spirituality': 'Spiritual', 'Horror Movies':'Horror', 'Independent Movies' : 'Indie',
 'International Movies': 'International', 'International TV Shows': 'International', "Kids' TV":'Family',
 'Korean TV Shows' : 'Korean', 'LGBTQ Movies': 'LGBTQ', 'Movies': 'Movies', 'Music & Musicals': 'Music', 'Reality TV': 'Reality',
 'Romantic Movies': 'Romance', 'Romantic TV Shows': 'Romance', 'Sci-Fi & Fantasy': 'Fantasy/SciFi',
 'Science & Nature TV': 'Science/Nature', 'Spanish-Language TV Shows': 'Spanish', 'Sports Movies': 'Sports',
 'Stand-Up Comedy': 'Comedy', 'Stand-Up Comedy & Talk Shows': 'Comedy', 'TV Action & Adventure':'Action & Adventure',
 'TV Comedies': 'Comedy', 'TV Dramas':'Drama', 'TV Horror':'Horror', 'TV Mysteries': 'Mystery',
 'TV Sci-Fi & Fantasy': 'Fantasy/SciFi', 'TV Shows': 'TV', 'TV Thrillers': 'Thriller', 'Teen TV Shows': 'Teen',
 'Thrillers': 'Thriller'}

In [17]:
genre_weights = {}

In [18]:
genre_matrix = df.loc[:, ["listed_in", "show_id"]]
genre_matrix.loc[:, "listed_in"] = genre_matrix.listed_in.str.split(',').map(lambda x: [genre_dict[i.strip()] for i in x])

In [19]:
mlb = MultiLabelBinarizer(classes=genre_matrix.listed_in.explode().unique())
binary_labels = mlb.fit_transform(genre_matrix["listed_in"])
genres = mlb.classes_

In [20]:
mlb_df = pd.DataFrame(binary_labels,columns=mlb.classes_)
mlb_df.drop(["TV", "Movies"], axis=1, inplace=True)

In [21]:
similarity = cosine_similarity(mlb_df, dense_output =True)

In [22]:
genre_similarities = pd.DataFrame(similarity, columns=df["show_id"], index=df["show_id"].to_numpy())

In [23]:
genre_similarities

show_id,s1,s2,s3,s4,s5,s6,s7,s8,s9,s10,...,s8798,s8799,s8800,s8801,s8802,s8803,s8804,s8805,s8806,s8807
s1,1.000000,0.000000,0.000000,0.707107,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
s2,0.000000,1.000000,0.333333,0.000000,0.333333,0.666667,0.000000,0.666667,0.0,0.408248,...,0.000000,0.666667,0.816497,0.666667,0.666667,0.333333,0.000000,0.000000,0.000000,0.666667
s3,0.000000,0.333333,1.000000,0.000000,0.333333,0.000000,0.000000,0.333333,0.0,0.000000,...,0.000000,0.333333,0.408248,0.333333,0.333333,0.000000,0.000000,0.000000,0.000000,0.333333
s4,0.707107,0.000000,0.000000,1.000000,0.000000,0.000000,0.000000,0.000000,0.5,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
s5,0.000000,0.333333,0.333333,0.000000,1.000000,0.000000,0.000000,0.333333,0.0,0.408248,...,0.000000,0.666667,0.408248,0.666667,0.333333,0.000000,0.333333,0.408248,0.408248,0.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
s8803,0.000000,0.333333,0.000000,0.000000,0.000000,0.333333,0.000000,0.333333,0.0,0.408248,...,0.000000,0.333333,0.408248,0.333333,0.666667,1.000000,0.000000,0.000000,0.000000,0.333333
s8804,0.000000,0.000000,0.000000,0.000000,0.333333,0.000000,0.577350,0.000000,0.0,0.408248,...,0.577350,0.333333,0.000000,0.000000,0.000000,0.000000,1.000000,0.408248,0.816497,0.000000
s8805,0.000000,0.000000,0.000000,0.000000,0.408248,0.408248,0.000000,0.000000,0.0,0.500000,...,0.000000,0.408248,0.000000,0.000000,0.000000,0.000000,0.408248,1.000000,0.500000,0.000000
s8806,0.000000,0.000000,0.000000,0.000000,0.408248,0.000000,0.707107,0.000000,0.0,0.500000,...,0.707107,0.408248,0.000000,0.000000,0.000000,0.000000,0.816497,0.500000,1.000000,0.000000


In [24]:
model = SentenceTransformer('all-MiniLM-L6-v2')
encoding = model.encode(df["description"].to_numpy())

In [25]:
cos_sim = util.cos_sim(encoding, encoding)

In [26]:
cos_sim[0][1:].max()

tensor(0.4997)

In [27]:
print(descriptions[0], "\n",
descriptions[cos_sim[0][1:].argmax()])

As her father nears the end of his life, filmmaker Kirsten Johnson stages his death in inventive and comical ways to help them both face the inevitable. 
 Four best friends navigate loss and major life changes – and smoke a lot of weed – during their last two weeks of high school.


In [28]:
sum_cos = cos_sim + similarity*0.05

In [29]:
sum_cos

tensor([[1.0500, 0.1937, 0.0732,  ..., 0.2222, 0.2088, 0.1809],
        [0.1937, 1.0500, 0.0293,  ..., 0.2947, 0.1171, 0.1109],
        [0.0732, 0.0293, 1.0500,  ..., 0.1468, 0.3781, 0.3183],
        ...,
        [0.2222, 0.2947, 0.1468,  ..., 1.0500, 0.3199, 0.2461],
        [0.2088, 0.1171, 0.3781,  ..., 0.3199, 1.0500, 0.2236],
        [0.1809, 0.1109, 0.3183,  ..., 0.2461, 0.2236, 1.0500]],
       dtype=torch.float64)

In [30]:
sum_cos = np.array(sum_cos)

sum_cos[sum_cos >= 1.05] = 0.0
sum_cos  /= 1.05


# sum_cos[sum_cos == 1.05] = 0.0

In [31]:
sum_cos

array([[0.        , 0.18451925, 0.0696943 , ..., 0.21161274, 0.19884632,
        0.17228283],
       [0.18451925, 0.        , 0.02786184, ..., 0.28062111, 0.11156865,
        0.10561996],
       [0.0696943 , 0.02786184, 0.        , ..., 0.13979435, 0.36010436,
        0.30314694],
       ...,
       [0.21161274, 0.28062111, 0.13979434, ..., 0.        , 0.3046938 ,
        0.23434954],
       [0.1988463 , 0.11156865, 0.36010433, ..., 0.3046938 , 0.        ,
        0.21290597],
       [0.17228283, 0.10561996, 0.30314691, ..., 0.23434954, 0.21290597,
        0.        ]])

In [32]:
similar_movies = sum_cos.argmax(axis=0)

In [33]:
print(descriptions[0], descriptions[similar_movies[0]])

As her father nears the end of his life, filmmaker Kirsten Johnson stages his death in inventive and comical ways to help them both face the inevitable. A record company exec joins his estranged dad, a famous photographer who's dying, on a road trip to the last lab still developing Kodachrome film.


In [34]:
similar_movies

array([4921, 1357, 3297, ...,  696, 4586, 1841], dtype=int64)

In [35]:
sum_cos

array([[0.        , 0.18451925, 0.0696943 , ..., 0.21161274, 0.19884632,
        0.17228283],
       [0.18451925, 0.        , 0.02786184, ..., 0.28062111, 0.11156865,
        0.10561996],
       [0.0696943 , 0.02786184, 0.        , ..., 0.13979435, 0.36010436,
        0.30314694],
       ...,
       [0.21161274, 0.28062111, 0.13979434, ..., 0.        , 0.3046938 ,
        0.23434954],
       [0.1988463 , 0.11156865, 0.36010433, ..., 0.3046938 , 0.        ,
        0.21290597],
       [0.17228283, 0.10561996, 0.30314691, ..., 0.23434954, 0.21290597,
        0.        ]])

In [36]:
sum_cos[3685]

array([0.14186943, 0.37886689, 0.0950331 , ..., 0.27473958, 0.19998865,
       0.31748011])

In [37]:
df.loc[2847].to_numpy()

array(['s2848', 'Movie', 'Freaks', 'Adam B. Stein, Zach Lipovsky',
       'Emile Hirsch, Bruce Dern, Lexy Kolker, Grace Park, Amanda Crew, Aleks Paunovic, Michelle Harrison',
       'United States, Canada', 'March 3, 2020', 2018, 'R', '105 min',
       'Independent Movies, Sci-Fi & Fantasy, Thrillers',
       'Hidden away by her eccentric father, a mysterious young girl uncovers frightening truths when she starts to make contact with the outside world.'],
      dtype=object)

In [38]:
sum_cos[329].argmax()

8615

In [39]:
df.loc[329].listed_in

'Dramas'

In [40]:
df.loc[8615].listed_in

'Dramas, Thrillers'

In [41]:
df[df.title == "Cuties"]

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
2031,s2032,Movie,Cuties,Maïmouna Doucouré,"Fathia Youssouf, Médina El Aidi-Azouni, Esther...",France,"September 9, 2020",2020,TV-MA,96 min,"Dramas, International Movies",Eleven-year-old Amy starts to rebel against he...
