# Movie Recommendation
Data: scraped from imdb top 200
Goal:
- content based recommendation system

In [2]:
import pandas as pd

pd.set_option('display.max_colwidth', None)

In [3]:
movies_df = pd.read_csv("./data/movies/movies.csv").set_index("id")
movies_df.head()

Unnamed: 0_level_0,name,metascore,imdbscore,categories,director_id
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
tt15239678,Dune: Part Two,79.0,8.7,"['Action', 'Abenteuer', 'Drama']",nm0898288
tt23289160,Godzilla Minus One,81.0,8.3,"['Action', 'Abenteuer', 'Drama']",nm0945724
tt23849204,12th Fail,,9.0,"['Biografie', 'Drama']",nm0006765
tt15398776,Oppenheimer,90.0,8.3,"['Biografie', 'Drama', 'Geschichte']",nm0634240
tt9362722,Spider-Man: Across the Spider-Verse,86.0,8.6,"['Animation', 'Action', 'Abenteuer']",nm1690966


In [10]:
# content based approach with tfidf
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

df = movies_df.copy().reset_index()
df['Text'] = df['name'] + ' ' + df['categories'] + ' ' + df['director_id']
cv = CountVectorizer(stop_words='english') # CountVectorizer probably better in this context with categories
cv_matrix = cv.fit_transform(df['Text'])
# Compute the cosine similarity matrix
# After TF-IDF calculating the dot product gives us directly the cosine simularity, thats why we can use linear_kernel instead of cosine_simularity
cosine_sim = cosine_similarity(cv_matrix)
indices = pd.Series(df.index, index=df['Text']) # reverse mapping of row numbers to 
print(f"Movies: {cv_matrix.shape[0]}\nDifferent Words: {cv_matrix.shape[1]}")

Books: 250
Different Words: 700


In [12]:
def get_recommendations(title, k, cosine_sim, df):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    book_indices = [i[0] for i in sim_scores[1:k]]  # exclude 0, because its the search term
    return df['Text'].iloc[book_indices]

In [15]:
get_recommendations("Dune: Part Two ['Action', 'Abenteuer', 'Drama'] nm0898288", 10, cosine_sim, df)

100             Gladiator ['Action', 'Abenteuer', 'Drama'] nm0000631
1      Godzilla Minus One ['Action', 'Abenteuer', 'Drama'] nm0945724
15      Avengers: Endgame ['Action', 'Abenteuer', 'Drama'] nm0751577
5                    Top Gun: Maverick ['Action', 'Drama'] nm2676052
10                  1917 ['Action', 'Drama', 'Geschichte'] nm0005222
38                 Prisoners ['Krimi', 'Drama', 'Mystery'] nm0898288
44                Warrior ['Action', 'Drama', 'Sportfilm'] nm0640334
64       Into the Wild ['Abenteuer', 'Biografie', 'Drama'] nm0000576
82                   Oldboy ['Action', 'Drama', 'Mystery'] nm0661791
Name: Text, dtype: object

In [17]:
get_recommendations("Oppenheimer ['Biografie', 'Drama', 'Geschichte'] nm0634240", 10, cosine_sim, df)

8                              Hamilton ['Biografie', 'Drama', 'Geschichte'] nm2371802
76                         Hotel Ruanda ['Biografie', 'Drama', 'Geschichte'] nm0313623
77                        Der Untergang ['Biografie', 'Drama', 'Geschichte'] nm0386570
133                    Schindlers Liste ['Biografie', 'Drama', 'Geschichte'] nm0000229
39                     12 Years a Slave ['Biografie', 'Drama', 'Geschichte'] nm2588606
24     Hacksaw Ridge - Die Entscheidung ['Biografie', 'Drama', 'Geschichte'] nm0000154
244        Die Passion der Jeanne d'Arc ['Biografie', 'Drama', 'Geschichte'] nm0003433
2                                           12th Fail ['Biografie', 'Drama'] nm0006765
10                                    1917 ['Action', 'Drama', 'Geschichte'] nm0005222
Name: Text, dtype: object

In [18]:
get_recommendations("Spider-Man: Across the Spider-Verse ['Animation', 'Action', 'Abenteuer'] nm1690966", 10, cosine_sim, df)

16       Spider-Man: A New Universe ['Animation', 'Action', 'Abenteuer'] nm2130108
6             Spider-Man: No Way Home ['Action', 'Abenteuer', 'Fantasy'] nm1218281
75                Die Unglaublichen ['Animation', 'Action', 'Abenteuer'] nm0083348
114             Prinzessin Mononoke ['Animation', 'Action', 'Abenteuer'] nm0594503
51     Drachenzähmen leicht gemacht ['Animation', 'Action', 'Abenteuer'] nm0213450
105          Der Gigant aus dem All ['Animation', 'Action', 'Abenteuer'] nm0083348
0                        Dune: Part Two ['Action', 'Abenteuer', 'Drama'] nm0898288
11                           Klaus ['Animation', 'Abenteuer', 'Komödie'] nm0655053
57                            Oben ['Animation', 'Abenteuer', 'Komödie'] nm0230032
Name: Text, dtype: object