## IMPORT & READ CSV

In [19]:
import pandas as pd

# Input tables:
#   content        -> provides the 'soup' text and 'tmdb_id' columns used further down
#   movies_details -> merged onto recommendation results through 'tmdb_id'
content = pd.read_csv(filepath_or_buffer='./src/preprocessed_content_3_3.csv')
movies_details = pd.read_csv(filepath_or_buffer='./src/TMDB_content.csv')

## COUNT VECTORIZER (BAG-OF-WORDS)

In [5]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Bag-of-words encoding of every 'soup' text: one row per movie, one column per
# vocabulary term, raw term counts as values (English stop words are skipped).
count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(raw_documents=content['soup'])

# Pairwise cosine similarity between all rows; with a single argument the
# matrix is compared against itself.
cosine_sim = cosine_similarity(count_matrix)
cosine_sim.shape

## CONTENT-BASED FILTERING RECOMMENDER

In [7]:
# Lookup table: tmdb_id -> row position of that movie in `content`, which is also its
# row/column position in the similarity matrix. Positions (not index labels) are stored
# because the matrix and the `.iloc` lookup below are both positional.
# Duplicates have to be removed on the index (the ids): when an id occurs more than
# once only its first row is kept, so a lookup always yields a single position.
movie_ids = pd.Series(range(len(content)), index=content['tmdb_id'])
movie_ids = movie_ids[~movie_ids.index.duplicated(keep='first')]


def get_recommendations(movie_id, cosine_sim=cosine_sim, top_n=30):
    """Return the movies most similar to ``movie_id``, best match first.

    Parameters
    ----------
    movie_id
        Value to look up in the 'tmdb_id' column of ``content``.
    cosine_sim
        Square pairwise similarity matrix whose rows/columns follow the row
        order of ``content``. Defaults to the module-level matrix.
    top_n : int, default 30
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        Columns 'tmdb_id' and 'score', sorted by descending score, with a
        fresh 0..n-1 index and at most ``top_n`` rows. The query movie itself
        is never part of the result.

    Raises
    ------
    KeyError
        If ``movie_id`` is not present in ``movie_ids``.
    ValueError
        If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError("top_n must be non-negative")

    # Row position of the query movie
    idx = movie_ids[movie_id]

    # Score every *other* movie. The query movie is excluded by position instead of
    # assuming it sorts first: a tie at the top (or an all-zero row) would otherwise
    # leave it inside the results and push out a genuine match.
    similarity_scores = [
        (position, score)
        for position, score in enumerate(cosine_sim[idx])
        if position != idx
    ]

    # Stable sort: equally scored movies keep their original row order
    similarity_scores.sort(key=lambda pair: pair[1], reverse=True)
    similarity_scores = similarity_scores[:top_n]

    # Split into row positions and scores, then translate positions to tmdb ids
    movie_indices = [position for position, _ in similarity_scores]
    movie_scores = [score for _, score in similarity_scores]
    movie_tmdb_id = content['tmdb_id'].iloc[movie_indices]

    # Assemble the result and discard the row labels inherited from `content`
    df = pd.DataFrame(data={'tmdb_id': movie_tmdb_id,
                            'score': movie_scores})
    return df.reset_index(drop=True)

## GET RECOMMENDATIONS

In [21]:
# Try the recommender on a Harry Potter movie (tmdb_id 671):
result = get_recommendations(671)

# Attach the movie metadata and hide the columns that are not needed for display:
result_enriched = (
    result
    .merge(movies_details, how='left', on='tmdb_id')
    .drop(columns=['poster_path', 'year', 'keywords', 'cast', 'director', 'watch_providers'])
)
result_enriched.head(15)

Unnamed: 0,tmdb_id,score,title,genres
0,672,0.7,Harry Potter and the Chamber of Secrets,"Adventure, Fantasy"
1,32657,0.521749,Percy Jackson & the Olympians: The Lightning T...,"Adventure, Fantasy, Family"
2,674,0.476731,Harry Potter and the Goblet of Fire,"Adventure, Fantasy"
3,767,0.466667,Harry Potter and the Half-Blood Prince,"Adventure, Fantasy"
4,673,0.456435,Harry Potter and the Prisoner of Azkaban,"Adventure, Fantasy"
5,12445,0.414039,Harry Potter and the Deathly Hallows: Part 2,"Fantasy, Adventure"
6,12444,0.407046,Harry Potter and the Deathly Hallows: Part 1,"Adventure, Fantasy"
7,675,0.4,Harry Potter and the Order of the Phoenix,"Adventure, Fantasy"
8,772,0.379536,Home Alone 2: Lost in New York,"Comedy, Family, Adventure"
9,9441,0.367423,Stepmom,"Drama, Romance, Comedy"


## GATHER & SAVE ALL SCORES

In [12]:
# For every movie, compute its recommendations and keep them as a list of
# {'tmdb_id': ..., 'score': ...} records next to the movie's own tmdb_id.
all_movies = content['tmdb_id'].values
all_scores = [
    [movie, get_recommendations(movie).to_dict(orient='records')]
    for movie in all_movies
]

# One row per movie; 'score_cb' holds that movie's list of records.
# NOTE(review): the row index is written as an unnamed first column because
# index=False is not passed — confirm that readers of this file expect it.
df_final = pd.DataFrame(data=all_scores, columns=['tmdb_id', 'score_cb'])
df_final.to_csv('./src/TMDB_content_based_3_3.csv')

## (FOR TEST)

In [23]:
# Recommendations for three sample movies, stacked into one table and exported
# without the row index.
film1, film2, film3 = (
    get_recommendations(sample_id) for sample_id in (671, 259316, 411)
)
combined_results = pd.concat(objs=[film1, film2, film3])
combined_results.to_csv(path_or_buf='./results/recommendations_from_cb.csv', index=False)