In [25]:
import pandas as pd
import numpy as np
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

## Step 1

Preprocessing...
Merging the two datasets: the general movie data and the cast/crew credits.
Keeping only the movies that have been released.
Removing movies with missing values in columns ['overview', 'genres', 'keywords'].

In [26]:
# Load the TMDB movie metadata and the matching cast/crew credits.
df_movies = pd.read_csv('../../datasets/tmdb_5000_movies.csv')
df_credits = pd.read_csv('../../datasets/tmdb_5000_credits.csv')

# The credits file calls its key `movie_id`; rename it so both frames join on `id`.
df_credits = df_credits.rename(columns={'movie_id': 'id'})
movies = df_movies.merge(df_credits, on='id')

# Keep only released movies that have every text field used further down.
is_usable = (
    (movies['status'] == 'Released')
    & movies['overview'].notna()
    & movies['genres'].notna()
    & movies['keywords'].notna()
)
# .copy() gives an independent frame, so the column assignments below do not
# write into a filtered slice of the merged frame (SettingWithCopyWarning).
# NOTE: the index is intentionally not reset here, so index labels have gaps
# wherever rows were dropped; look rows up by position, not by label.
movies = movies[is_usable].copy()

# Year = first four characters of release_date (assumes 'YYYY-MM-DD' strings,
# as in the displayed sample). Missing dates become '' first and are then
# replaced by the placeholder text.
movies['year'] = (
    movies['release_date']
    .fillna('')
    .astype(str)
    .str[:4]
    .replace('', 'нет данных о годе')
)

# Both source frames have a `title` column, so the merge produced
# title_x / title_y; keep one under the plain name and drop the duplicate.
movies = movies.rename(columns={'title_x': 'title'}).drop(columns=['title_y'])
movies.head(5)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,cast,crew,year
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de...",2009
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...",...,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de...",2007
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...",...,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de...",2015
3,250000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",http://www.thedarkknightrises.com/,49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...",...,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de...",2012
4,260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://movies.disney.com/john-carter,49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]",...,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de...",2012


## Step 2

To implement a content-based recommendation algorithm, we will merge the columns (overview) and (keywords) and create a Tf-Idf matrix for movie descriptions.

In [27]:
# Merge the text columns and extract the tag (keyword) names.
# fillna returns a new Series, so the result must be assigned back;
# calling it as a bare statement leaves the column unchanged.
movies['overview'] = movies['overview'].fillna('')
def extract_tags(s):
    """Join the ``name`` fields of a JSON-encoded list of records.

    Parameters
    ----------
    s : str
        JSON text for a list whose items are either ``{"name": ...}`` dicts
        or lists of such dicts.

    Returns
    -------
    str
        The names in their original order, separated by ``", "``; an empty
        string when the list is empty.
    """
    names = []
    for entry in json.loads(s):
        # An entry may itself be a list of records; wrap a bare record so
        # both shapes go through the same inner loop.
        group = entry if isinstance(entry, list) else [entry]
        for record in group:
            names.append(record["name"])
    return ", ".join(names)

# Flatten each movie's JSON keyword list into a plain "tag, tag, ..." string.
movies['keywords2'] = movies['keywords'].map(extract_tags)

# Overview text followed by the keyword string, separated by a single space.
movies['full_description'] = (
    movies['overview'].astype(str) + ' ' + movies['keywords2'].astype(str)
)

# Show the first combined description as a sanity check.
movies['full_description'].iloc[0]


'In the 22nd century, a paraplegic Marine is dispatched to the moon Pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization. culture clash, future, space war, space colony, society, space travel, futuristic, romance, space, alien, tribe, alien planet, cgi, marine, soldier, battle, love affair, anti war, power relations, mind and soul, 3d'

In [28]:
# Build the TF-IDF matrix over the combined descriptions: English stop words
# are removed and the vocabulary is capped at 10,000 features.
vectorizer = TfidfVectorizer(max_features=10000, stop_words='english')
tfidf_matrix = vectorizer.fit_transform(movies['full_description'])

# Vocabulary terms, in the same order as the matrix columns.
feature_names = vectorizer.get_feature_names_out()
print(tfidf_matrix.shape)

(4792, 10000)


## Step 3

Calculating cosine similarity
Exporting DataFrames to .csv

In [29]:
# Pairwise dot products between all TF-IDF rows.
# NOTE(review): this equals cosine similarity only if the TF-IDF rows are
# unit-length (the 1.0 diagonal in the output suggests they are) - confirm
# against the TfidfVectorizer `norm` default.
cosine_sim = linear_kernel(tfidf_matrix)

# Label both axes with the TMDB movie id; the row axis is named `movie_id`.
cos_sim_df = pd.DataFrame(
    cosine_sim,
    index=movies['id'],
    columns=movies['id'],
).rename_axis('movie_id')
cos_sim_df.head(5)

id,19995,285,206647,49026,49529,559,38757,99861,767,209112,...,182291,286939,124606,14337,67238,9367,72766,231617,126186,25975
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
19995,1.0,0.006698,0.0,0.011519,0.193628,0.043216,0.008125,0.048032,0.0,0.012418,...,0.0,0.0,0.017703,0.033747,0.0,0.0,0.0,0.00494,0.013178,0.0
285,0.006698,1.0,0.0,0.014123,0.040554,0.023146,0.004609,0.014998,0.0,0.0,...,0.0,0.0,0.014527,0.0,0.0,0.009755,0.0,0.019802,0.0,0.0
206647,0.0,0.0,1.0,0.018515,0.014561,0.014341,0.019677,0.04723,0.01692,0.006142,...,0.031525,0.0,0.0,0.0,0.017365,0.014662,0.0,0.011729,0.0,0.0
49026,0.011519,0.014123,0.018515,1.0,0.006272,0.02484,0.024703,0.040374,0.011192,0.226748,...,0.006161,0.0,0.004955,0.018283,0.010281,0.0,0.0,0.028046,0.035781,0.01766
49529,0.193628,0.040554,0.014561,0.006272,1.0,0.020283,0.020632,0.05631,0.0,0.014569,...,0.005251,0.0,0.0,0.010926,0.0,0.0,0.0,0.004216,0.0,0.0


In [30]:
# Persist the similarity matrix; the index (movie_id) is written as the first column.
cos_sim_df.to_csv(r'../assets/distance.csv', index=True)
# Persist the cleaned movie table without its index.
movies.to_csv(r'../assets/movies.csv', index=False)

# Recommendation function

In [31]:

def get_recommendations(movies_dataset, title, cosine_sim, top_k=10):
    """Return the titles of the ``top_k`` movies most similar to ``title``.

    Parameters
    ----------
    movies_dataset : pandas.DataFrame
        Frame with a ``title`` column whose row order matches the rows and
        columns of ``cosine_sim``. Its index labels are not used for the
        lookup, so a filtered frame with gaps in its index works.
    title : str
        Exact title to look up. If several rows share it, the first is used.
    cosine_sim : array-like, shape (n_movies, n_movies)
        Pairwise similarity scores, indexable by row position.
    top_k : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.Series
        Titles of the most similar movies, best match first. The queried row
        itself is never included. The Series keeps the frame's index labels.

    Raises
    ------
    KeyError
        If ``title`` does not occur in ``movies_dataset['title']``.
    """
    titles = movies_dataset['title']

    # Row *position* of the title. Index labels cannot be used here: they stop
    # matching similarity-matrix rows as soon as the frame has been filtered.
    matches = np.flatnonzero((titles == title).to_numpy())
    if matches.size == 0:
        raise KeyError(title)
    row = int(matches[0])  # duplicate titles: deterministically take the first

    scores = np.asarray(cosine_sim[row], dtype=float).ravel()
    # Stable descending sort, so equal scores keep their original row order.
    ranked = np.argsort(-scores, kind='stable')
    # Drop the queried movie explicitly rather than assuming it sorts first;
    # another movie with an identical description would tie with it at 1.0.
    ranked = ranked[ranked != row]

    return titles.iloc[ranked[:max(int(top_k), 0)]]

In [32]:
# Example query: five movies closest to this title.
aaa = 'The Pirates! In an Adventure with Scientists!'
get_recommendations(
    movies_dataset=movies,
    title=aaa,
    cosine_sim=cosine_sim,
    top_k=5,
)

3820                                           The Pirate
199     Pirates of the Caribbean: The Curse of the Bla...
2590       VeggieTales: The Pirates Who Don't Do Anything
1709                         Space Pirate Captain Harlock
340                                      Cutthroat Island
Name: title, dtype: object