# Movie Recommendation Using a BERT Model

In [None]:
!pip install -q sentence-transformers faiss-cpu tqdm

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m66.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m96.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m73.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m45.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [52]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
from tqdm import tqdm
from sklearn.model_selection import train_test_split


## Step 1: Load movie metadata, user profiles, and ratings

In [53]:
# Load the three input tables: movie metadata, per-user last-watched genres,
# and user ratings. The files are read relative to the working directory.
try:
    movie_df = pd.read_parquet('cbf_movie.parquet')
    user_profile_df = pd.read_csv('user_last_genres.csv')
    ratings_df = pd.read_parquet('user_ratings_200users_30each_int.parquet')
except FileNotFoundError:
    # Tell the reader which files are required, then re-raise: swallowing the
    # error would leave the frames undefined and make every later cell fail
    # with an unrelated NameError.
    print("Make sure 'cbf_movie.parquet', 'user_last_genres.csv', and 'user_ratings_200users_30each_int.parquet' are in the correct directory.")
    raise

In [54]:
# Preview the first rows of the movie metadata table.
movie_df.head()

Unnamed: 0,movie_id,status,title,adult,overview,original_language,release_date,vote_count,vote_average,popularity,budget,revenue,runtime,genres,keywords,companies,languages,countries
0,1094579,Released,How we made Asterix & Obelix: Mission Cleopatra,False,Behind the scenes of Chabat's take on Asterix.,fr,2002-08-28,6,7.833,3.181,0,0,115,"Comedy, Documentary","tutorial, dvd extra, chabat, bonus feature, ma...","Dreamlight Entertainment, Preface",French,france
1,102884,Released,The War at Sea from Hawaii to Malaya,False,Japanese Navy air cadets train for the attacks...,ja,1942-12-02,0,0.0,2.492,0,0,116,"Drama, War, History","fighter pilot, pacific war, military training,...",Toho Film (Eiga) Co. Ltd.,Japanese,japan
2,47692,Released,Felicia's Journey,False,"Seventeen and pregnant, Felicia travels to Eng...",en,1999-10-08,93,6.253,5.782,0,0,116,Drama,"birmingham, series of murders","Marquis Films Ltd, Alliance Atlantis, Icon Ent...","English, French, Irish","united kingdom, canada"
3,11285,Released,Cocoon: The Return,False,The reinvigorated elderly group that left Eart...,en,1988-09-13,932,6.552,11.883,17500000,25024919,116,"Science Fiction, Comedy","shape shifting alien, ice cream sundae, expect...","20th Century Fox, Zanuck/Brown Productions",English,united states of america
4,48198,Released,Wings of Fame,False,A famous movie actor claims that he has writte...,en,1990-03-23,15,6.6,3.544,0,0,116,"Fantasy, Comedy, Drama","afterlife, murder, revenge",First Floor Features,English,netherlands


In [55]:
# Preview the first rows of the per-user "last genres" profile table.
user_profile_df.head()

Unnamed: 0,user_id,Last_genres0,Last_genres1,Last_genres2
0,1,Drama,Drama,Drama
1,2,Adventure,TV Movie,Adventure
2,3,Adventure,Adventure,Adventure
3,4,Adventure,Fantasy,Fantasy
4,5,Romance,Adventure,Adventure


In [56]:
# Preview the first rows of the user ratings table.
ratings_df.head()

Unnamed: 0,user_id,movie_id,title,genres,keywords,rating
0,1,386564,"Hang in There, Kids!","Drama, Family, Comedy",woman director,3.08
1,1,67077,The Plane Tree,"Drama, Family, Comedy",woman director,3.59
2,1,189005,Pennies from Heaven,"Drama, Music, Comedy","welfare, singer, welfare worker",2.91
3,1,154764,Miyamoto Musashi,"Drama, History, Action","jidaigeki, miyamoto musashi, chambara",3.2
4,1,512062,Vinayapoorvam Vidyadharan,"Drama, Family, Comedy","dysfunctional marriage, romance",3.79


## Step 2: Prepare movie text for embedding

In [57]:
# Build the text that gets embedded for each movie:
#   "<title>. <overview> Genres: <genres>"
#
# The guard makes the cell safe to re-run: once `movie_df` has been reduced to
# ['movie_id', 'text'] the source columns are gone, so the transformation is
# applied only while the 'text' column does not exist yet.
if 'text' not in movie_df.columns:
    movie_df = (
        movie_df
        # Rows missing any of the three text fields cannot be described; drop
        # them (this returns a new frame instead of mutating the loaded one).
        .dropna(subset=['title', 'overview', 'genres'])
        # No fillna is needed here: the dropna above already removed every
        # missing value in these columns.
        .assign(text=lambda df: df['title'] + '. ' + df['overview'] + ' Genres: ' + df['genres'])
        # Keep only what the later steps read.
        [['movie_id', 'text']]
    )

movie_df.head()

Unnamed: 0,movie_id,text
0,1094579,How we made Asterix & Obelix: Mission Cleopatr...
1,102884,The War at Sea from Hawaii to Malaya. Japanese...
2,47692,"Felicia's Journey. Seventeen and pregnant, Fel..."
3,11285,Cocoon: The Return. The reinvigorated elderly ...
4,48198,Wings of Fame. A famous movie actor claims tha...


## Step 3: Generate BERT embeddings for movie descriptions

In [59]:
# Load the pre-trained sentence-embedding model 'all-MiniLM-L6-v2'.
# No device is forced: SentenceTransformer picks a GPU when one is available
# and falls back to the CPU otherwise, so the notebook also runs on machines
# without CUDA.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Batch size used when embedding the movie texts.
batch_size = 64

# Parallel lists: movie_texts[i] is the text for movie_ids[i]. The embedding
# matrix and the FAISS index built later rely on this row order.
movie_texts = movie_df['text'].tolist()
movie_ids = movie_df['movie_id'].tolist()

In [None]:
# Embed every movie text in one call. `encode` does its own batching and
# progress reporting, so no manual slicing loop is needed; passing
# `batch_size` explicitly makes the configured value the one that is actually
# used (a manual outer loop would still be re-split by encode's default).
# The device is not forced here: encode runs on whatever device the model
# is already on.
all_embeddings = model.encode(
    movie_texts,
    batch_size=batch_size,
    show_progress_bar=True,
    convert_to_numpy=True,
)

# One row per movie, in the same order as movie_texts; float32 is the dtype
# FAISS expects.
all_embeddings = np.asarray(all_embeddings, dtype='float32')

In [61]:
# Lookup table: movie_id -> embedding row, pairing the two sequences by position.
movie_embed_dict = {
    movie_id: embedding
    for movie_id, embedding in zip(movie_ids, all_embeddings)
}

embedded_count = len(movie_embed_dict)
print(f"Generated embeddings for {embedded_count} movies.")

Generated embeddings for 135886 movies.


In [60]:
# Cache the embedding matrix on disk: reuse the saved file when it exists,
# otherwise persist the embeddings computed earlier in this session so the
# next run can skip the expensive encoding step.
try:
    all_embeddings = np.load("movie_embeddings.npy")
except FileNotFoundError:
    np.save("movie_embeddings.npy", all_embeddings)

# Row i of the matrix is mapped back to a movie through movie_ids[i]; a cache
# written for a different movie table would silently produce wrong
# recommendations, so fail loudly on a size mismatch.
if len(all_embeddings) != len(movie_ids):
    raise ValueError(
        f"movie_embeddings.npy has {len(all_embeddings)} rows but there are "
        f"{len(movie_ids)} movies; delete the file and regenerate the embeddings."
    )

## Step 4: Generate user embeddings from last genres

In [62]:
# Describe each user by their 'Last_genres*' columns and embed those
# descriptions with the same model used for the movies, so users and movies
# share one vector space.
genre_cols = [col for col in user_profile_df.columns if col.startswith('Last_genres')]
user_ids = user_profile_df['user_id'].tolist()
user_texts = [
    "User's last watched genres: " + ', '.join(str(genre) for genre in genres)
    for genres in user_profile_df[genre_cols].itertuples(index=False, name=None)
]

# Encode all users in one batched call instead of one model call per row;
# the model's current device is used (no hard-coded CUDA requirement).
user_vectors = model.encode(user_texts) if user_texts else []

# user_id -> embedding vector
user_embeddings = dict(zip(user_ids, user_vectors))

In [38]:
# Sanity check: how many users received an embedding.
print(f"Number of users embedded: {len(user_embeddings)}")

Number of users embedded: 200


## Step 5: Index movie embeddings using FAISS

In [63]:
# Exact (brute-force) L2 index over the movie embeddings; the index
# dimensionality is the embedding width.
embedding_dim = all_embeddings.shape[1]
index = faiss.IndexFlatL2(embedding_dim)

# FAISS only accepts C-contiguous float32 arrays. Embeddings loaded from disk
# are not guaranteed to satisfy that, so normalise the layout/dtype here
# (this is a no-op when the array already conforms).
index.add(np.ascontiguousarray(all_embeddings, dtype='float32'))

## Step 6: Prepare train/test data for evaluation

In [40]:
# NOTE: disabled experiment, kept for reference only — nothing in this cell runs.
# It would keep ratings >= 1.0 as relevant and split each user's ratings
# 80/20 into train/test (users with fewer than 5 rows are skipped).

# ratings_df['relevant'] = (ratings_df['rating'] >= 1.0).astype(int)
# ratings_df = ratings_df[ratings_df['relevant'] == 1]

# train_rows, test_rows = [], []
# for uid, group in ratings_df.groupby("user_id"):
#     if len(group) >= 5:
#         train, test = train_test_split(group, test_size=0.2, random_state=42)
#         train_rows.append(train)
#         test_rows.append(test)

# train_df = pd.concat(train_rows)
# test_df = pd.concat(test_rows)

In [64]:
# Flag a rating as relevant when it is >= 0.0 and store the flag as 0/1 on
# ratings_df; the evaluation set is every row carrying the flag.
is_relevant = ratings_df['rating'] >= 0.0
ratings_df['relevant'] = is_relevant.astype(int)
test_df = ratings_df[is_relevant]

## Step 7: Evaluate Precision@k and Recall@k (k = 5 and 10)

In [65]:
def evaluate(user_id, k=5):
    """Compute Precision@k and Recall@k for one user.

    The user's embedding is used to query the FAISS index for the ``k``
    nearest movies; those are compared with the movies the user has in
    ``test_df``.

    Args:
        user_id: Key into ``user_embeddings`` and value of ``test_df['user_id']``.
        k: Number of neighbours to retrieve; also the precision denominator.

    Returns:
        A ``(precision, recall)`` tuple. ``(0, 0)`` is returned when the user
        has no embedding; recall is ``0`` when the user has no test movies.
    """
    if user_id not in user_embeddings:
        return 0, 0

    # FAISS expects a 2-D float32 query matrix: one row per query vector.
    user_vec = np.asarray(user_embeddings[user_id], dtype='float32').reshape(1, -1)
    _, neighbor_idx = index.search(user_vec, k)

    # FAISS pads the result with -1 when fewer than k neighbours exist; a
    # negative value used as a list index would silently pick the last movie,
    # so those slots are skipped.
    recommended = {movie_ids[i] for i in neighbor_idx[0] if i >= 0}

    # Unique movies for this user. Hits are counted on unique ids, so the
    # recall denominator must be as well or recall could never reach 1 for a
    # user with a repeated movie.
    actual = set(test_df.loc[test_df['user_id'] == user_id, 'movie_id'])

    hits = len(recommended & actual)
    precision = hits / k
    recall = hits / len(actual) if actual else 0
    return precision, recall

In [66]:
# Score every user that appears in the test set at cutoff 5, then average
# the per-user precision and recall.
results = [evaluate(user, k=5) for user in test_df['user_id'].unique()]
avg_precision = np.mean([precision for precision, _ in results])
avg_recall = np.mean([recall for _, recall in results])

print(f"Precision@5: {avg_precision:.4f}")
print(f"Recall@5: {avg_recall:.4f}")

Precision@5: 0.0000
Recall@5: 0.0000


In [70]:
# Same evaluation at cutoff 10: per-user scores first, then their mean.
results = [evaluate(user, k=10) for user in test_df['user_id'].unique()]
avg_precision = np.mean([precision for precision, _ in results])
avg_recall = np.mean([recall for _, recall in results])

print(f"Precision@10: {avg_precision:.4f}")
print(f"Recall@10: {avg_recall:.4f}")

Precision@10: 0.0000
Recall@10: 0.0000


In [67]:
def recommend_movies(user_id, top_k=10):
    """Return the ``top_k`` movies nearest to a user's embedding.

    Args:
        user_id: Key into ``user_embeddings``.
        top_k: Number of neighbours requested from the FAISS index.

    Returns:
        A DataFrame with columns Rank, Movie ID, Title, Genres and Score
        (one row per recommendation, best first), or ``None`` — after
        printing a message — when the user has no embedding.
    """
    if user_id not in user_embeddings:
        print(f"User ID {user_id} not found in user_embeddings.")
        return

    # FAISS expects a 2-D float32 query matrix: one row per query vector.
    user_vec = np.asarray(user_embeddings[user_id], dtype='float32').reshape(1, -1)
    distances, neighbor_idx = index.search(user_vec, top_k)

    print(f"\nTop {top_k} recommended movies for user {user_id}:\n")
    results = []

    for distance, movie_idx in zip(distances[0], neighbor_idx[0]):
        # FAISS pads with -1 when the index holds fewer than top_k vectors;
        # a negative list index would silently return the wrong movie.
        if movie_idx < 0:
            continue

        movie_id = movie_ids[movie_idx]
        text = movie_df.loc[movie_df['movie_id'] == movie_id, 'text'].iloc[0]

        # `text` was built as "<title>. <overview> Genres: <genres>". Cutting
        # at the first ". " (rather than the first ".") keeps titles such as
        # "E.T." intact. A title that itself contains ". " is still truncated,
        # because the original title column is no longer available here.
        title = text.partition('. ')[0]
        genres = text.split('Genres: ')[-1]

        # IndexFlatL2 reports squared L2 distances; map them into (0, 1] so
        # that a smaller distance gives a larger score. This is a monotone
        # pseudo-similarity, not a cosine similarity.
        similarity_score = 1 / (1 + float(distance))

        results.append({
            "Rank": len(results) + 1,
            "Movie ID": movie_id,
            "Title": title,
            "Genres": genres,
            "Score": round(similarity_score, 4)
        })

    # Explicit columns keep the schema stable even when nothing was found.
    return pd.DataFrame(results, columns=["Rank", "Movie ID", "Title", "Genres", "Score"])

In [68]:
# Example: request the top-10 recommendations for user 1 and display the table.
recommendation_df = recommend_movies(1, top_k=10)
recommendation_df


Top 10 recommended movies for user 1:



Unnamed: 0,Rank,Movie ID,Title,Genres,Score
0,1,411381,Afterglow,Drama,0.6364
1,2,861871,Only One,Drama,0.627
2,3,391434,Passion,"Romance, Drama",0.6186
3,4,1358295,Burnt Earth,Drama,0.6166
4,5,1081597,Behold,Drama,0.6099
5,6,557281,What Do You Think? (Number Three),Drama,0.5967
6,7,689418,Fast Forward,"Romance, Drama",0.595
7,8,49488,Kill the Day,Drama,0.5937
8,9,391193,The Road To,Drama,0.5918
9,10,74612,Men's Group,Drama,0.5904
