In [1]:
!pip install sentence-transformers -q

In [20]:
from sentence_transformers import SentenceTransformer, InputExample, util
import pandas as pd
from torch.utils.data import DataLoader
import re
import string
import numpy as np

In [2]:
data = pd.read_csv('movies.csv', sep=',')

In [3]:
data.drop(['Year', 'Rating'], axis=1 ,inplace=True)

In [12]:
data['Description'][0]

'Barbie and Ken are having the time of their lives in the colorful and seemingly perfect world of Barbie Land. However, when they get a chance to go to the real world, they soon discover the joys and perils of living among\xa0humans.'

In [13]:
def clean(text):
    text = str(text)
    text = text.lower()  # нижний регистр
    text = re.sub(r"http\S+", " ", text)  # удаляем ссылки
    text = re.sub(r"@\w+", " ", text)  # удаляем упоминания пользователей
    text = re.sub(r"#\w+", " ", text)  # удаляем хэштеги
    text = re.sub(r"\d+", " ", text)  # удаляем числа
    text = text.translate(str.maketrans("", "", string.punctuation))
    text = re.sub(r"<.*?>", " ", text)  #
    text = re.sub(r"[️«»—]", " ", text)
    text = text.lower()
    return text

In [9]:
type(data['Description'][0])

str

In [14]:
clean_text = [clean(text) for text in data['Description']]

In [15]:
data['cleaned_text'] = clean_text

In [16]:
data['cleaned_text'][0]

'barbie and ken are having the time of their lives in the colorful and seemingly perfect world of barbie land however when they get a chance to go to the real world they soon discover the joys and perils of living among\xa0humans'

In [21]:
examples = []
for i in range(len(data)):
    for j in range(i + 1, len(data)):
        examples.append(InputExample(texts=[data['Description'][i], data['Description'][j]]))

In [24]:
next(iter(examples))

<sentence_transformers.readers.InputExample.InputExample at 0x7d262596ebf0>

In [19]:
model = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens')

In [None]:
descriptions = data['cleaned_text'].tolist()
film_embeddings = model.encode(descriptions, convert_to_tensor=True)

In [24]:
film_embeddings.size()

torch.Size([4860, 768])

In [25]:
np.save('film_embedded.npy', film_embeddings)

In [26]:
film_embeddings = np.load('film_embedded.npy')

In [27]:
def get_top_10_recommendations(query, top_k=10):
    query_embedding = model.encode(query, convert_to_tensor=True).cpu()
    similarities = util.pytorch_cos_sim(query_embedding, film_embeddings)[0]
    top_results = similarities.cpu().numpy().argsort()[::-1][:top_k]
    top_books = data.iloc[top_results].copy()
    similarity_scores = similarities.cpu().numpy()[top_results].copy()
    top_books['similarity_score'] = similarity_scores
    return top_books

In [32]:
query = 'corruption in city'

In [33]:
reccomendations = get_top_10_recommendations(query)

In [34]:
reccomendations

Unnamed: 0,Title,Description,Poster URL,Page URL,cleaned_text,similarity_score
3310,Urban Legend,A college campus is plagued by a vicious seria...,https://a.ltrbxd.com/resized/sm/upload/4d/ak/k...,https://letterboxd.com/film/urban-legend/,a college campus is plagued by a vicious seria...,0.444257
3338,The Young and the Damned,A group of juvenile delinquents lives a crimin...,https://a.ltrbxd.com/resized/sm/upload/kg/6s/a...,https://letterboxd.com/film/the-young-and-the-...,a group of juvenile delinquents lives a crimin...,0.384566
3540,Sin City: A Dame to Kill For,Some of Sin City’s most hard-boiled citizens c...,https://a.ltrbxd.com/resized/sm/upload/0w/yh/4...,https://letterboxd.com/film/sin-city-a-dame-to...,some of sin city’s most hardboiled citizens cr...,0.372968
4484,Sharper,"A small, wealthy family in New York City gets ...",https://a.ltrbxd.com/resized/sm/upload/ka/7m/i...,https://letterboxd.com/film/sharper/,a small wealthy family in new york city gets p...,0.362835
2809,A Most Violent Year,A thriller set in New York City during the win...,https://a.ltrbxd.com/resized/sm/upload/6q/z8/d...,https://letterboxd.com/film/a-most-violent-year/,a thriller set in new york city during the win...,0.347674
3838,Detroit,A police raid in Detroit in 1967 results in on...,https://a.ltrbxd.com/resized/sm/upload/qi/nw/u...,https://letterboxd.com/film/detroit/,a police raid in detroit in results in one o...,0.346753
1057,Sin City,Welcome to Sin City. This town beckons to the ...,https://a.ltrbxd.com/resized/sm/upload/ob/ac/1...,https://letterboxd.com/film/sin-city/,welcome to sin city this town beckons to the t...,0.346244
1932,Falling Down,An ordinary man frustrated with the various fl...,https://a.ltrbxd.com/resized/sm/upload/7j/05/d...,https://letterboxd.com/film/falling-down/,an ordinary man frustrated with the various fl...,0.342199
4598,Gods of Egypt,A common thief joins a mythical god on a quest...,https://a.ltrbxd.com/resized/sm/upload/d5/i8/p...,https://letterboxd.com/film/gods-of-egypt/,a common thief joins a mythical god on a quest...,0.338787
2374,A Scanner Darkly,An undercover cop in a not-too-distant future ...,https://a.ltrbxd.com/resized/sm/upload/e3/kt/u...,https://letterboxd.com/film/a-scanner-darkly/,an undercover cop in a nottoodistant future be...,0.33395


In [41]:
from sklearn.metrics.pairwise import cosine_similarity

In [54]:
from scipy.spatial.distance import euclidean

In [74]:
import numpy as np

In [81]:
data.columns

Index(['Title', 'Description', 'Poster URL', 'Page URL', 'cleaned_text',
       'embeddings', 'similarity'],
      dtype='object')

In [85]:
def recommend_books(query, dataframe_, top_n=20):
    # Clean the query (if needed)
    query_cleaned = clean(query)
    #print(query_cleaned)
    # Generate the embedding for the query
    query_embedding = model.encode([query_cleaned])
    #print(type(query_embedding),query_embedding.shape)
    # Calculate cosine similarity between the query and each book's annotation
    dataframe_['similarity'] = dataframe_['embeddings'].apply(lambda x: cosine_similarity([x], query_embedding).flatten()[0])
    # Sort the books by similarity and get the top_n results
    recommended_books = dataframe_.sort_values(by='similarity', ascending=False).head(top_n)

    # Return the relevant columns
    return recommended_books[['Description', 'similarity', 'Title', 'Poster URL', 'Page URL']]

In [83]:
query = 'Man who fighting with corruption in Gotham city'

In [86]:
recommend_books(query, data)# cosine similarity

Unnamed: 0,Description,similarity,Title,Poster URL,Page URL
713,Batman must face his most ruthless nemesis whe...,0.710602,Batman,https://a.ltrbxd.com/resized/sm/upload/qg/q3/m...,https://letterboxd.com/film/batman-1989/
14,"In his second year of fighting crime, Batman u...",0.700508,The Batman,https://a.ltrbxd.com/resized/sm/upload/h1/on/1...,https://letterboxd.com/film/the-batman/
16,Batman raises the stakes in his war on crime. ...,0.627921,The Dark Knight,https://a.ltrbxd.com/resized/sm/upload/zu/51/m...,https://letterboxd.com/film/the-dark-knight/
173,"Driven by tragedy, billionaire Bruce Wayne ded...",0.619559,Batman Begins,https://a.ltrbxd.com/resized/sm/upload/08/e5/a...,https://letterboxd.com/film/batman-begins/
695,An ambitious carnival man with a talent for ma...,0.562763,Nightmare Alley,https://a.ltrbxd.com/resized/sm/upload/pw/kj/w...,https://letterboxd.com/film/nightmare-alley-2021/
353,Obsessive master thief Neil McCauley leads a t...,0.560189,Heat,https://a.ltrbxd.com/resized/sm/upload/el/67/4...,https://letterboxd.com/film/heat-1995/
205,"To take down South Boston’s Irish Mafia, the p...",0.560165,The Departed,https://a.ltrbxd.com/resized/sm/upload/33/lu/2...,https://letterboxd.com/film/the-departed/
523,Fearing the actions of a god-like Super Hero l...,0.556899,Batman v Superman: Dawn of Justice,https://a.ltrbxd.com/resized/sm/upload/ua/mn/p...,https://letterboxd.com/film/batman-v-superman-...
1494,"When an armed, masked gang enter a Manhattan b...",0.555001,Inside Man,https://a.ltrbxd.com/resized/sm/upload/9j/92/f...,https://letterboxd.com/film/inside-man/
1441,Batman must battle a disfigured district attor...,0.53663,Batman Forever,https://a.ltrbxd.com/resized/sm/upload/6u/zy/2...,https://letterboxd.com/film/batman-forever/


In [80]:
#  NDCG@K function
def ndcg_at_k(actual, predicted, k):
    def dcg(predicted):
        return sum((1 / np.log2(idx + 2) for idx, item in enumerate(predicted) if item in actual))

    dcg_max = dcg(actual[:k])
    if not dcg_max:
        return 0.0
    return dcg(predicted[:k]) / dcg_max

# Example usage:
# Define a search query
query = "A boy who survived"

# Generate top N recommendations using the recommend_books function
recommended_books_df = recommend_books(query, data, top_n=10)

# Extract the recommended titles
predicted_titles = recommended_books_df['Description'].tolist()

# Assume you have a list of actual relevant titles (this could come from a user’s ratings or some other ground truth)
actual_relevant_titles = ["Film Title 1", "Film Title 2", "Film Title 3"]  # Example list

# Calculate NDCG@K for k=10
k = 10
ndcg_score = ndcg_at_k(actual_relevant_titles, predicted_titles, k)
print(f"NDCG@{k}: {ndcg_score:.4f}") == recommend_books(query, data)
print(recommended_books_df)

NDCG@10: 0.0000
                                            Description  similarity
578   A struggling salesman takes custody of his son...    0.376437
993   The story of August Pullman – a boy with facia...    0.299535
425   Held captive for 7 years in an enclosed space,...    0.291777
1345  A boy experiences first love, friendships and ...    0.291410
584   A young boy learns that he has extraordinary p...    0.290662
1806  After 20 years abroad, Mark Renton returns to ...    0.288750
1117  A 12-year-old boy searches for the one thing t...    0.277160
290   Elastigirl springs into action to save the day...    0.271562
1440  A mother lives quietly with her son. One day, ...    0.269286
670   In the wake of his dramatic escape from captiv...    0.263908


In [87]:
def save_embeddings_to_txt(books_df, filename='embeddings.txt'):
    with open(filename, 'w') as f:
        for i, row in books_df.iterrows():
            title = row['Description']
            embedding = row['embeddings']
            # Convert the embedding to a string
            embedding_str = ','.join(map(str, embedding))
            # Write to the file with the title and embedding
            f.write(f"{title}\t{embedding_str}\n")

# Step 2: Generate the embeddings (this should have been done already)
# Assuming `books` is your DataFrame with `embeddings` calculated
# Call the function to save to a text file
save_embeddings_to_txt(data, 'book_embeddings.txt')

print("Embeddings saved to book_embeddings.txt")

Embeddings saved to book_embeddings.txt


In [88]:
data

Unnamed: 0,Title,Description,Poster URL,Page URL,cleaned_text,embeddings,similarity
0,Barbie,Barbie and Ken are having the time of their li...,https://a.ltrbxd.com/resized/sm/upload/mm/bt/i...,https://letterboxd.com/film/barbie/,barbie and ken are having the time of their li...,"[-0.32578936, -0.033459865, 0.96370727, 0.2552...",0.173046
1,Parasite,"All unemployed, Ki-taek’s family takes peculia...",https://a.ltrbxd.com/resized/sm/upload/oi/ha/7...,https://letterboxd.com/film/parasite-2019/,all unemployed kitaek’s family takes peculiar ...,"[0.35172287, 0.42902425, 0.9482514, -0.1909772...",0.269765
2,Everything Everywhere All at Once,An aging Chinese immigrant is swept up in an i...,https://a.ltrbxd.com/resized/sm/upload/qo/9b/x...,https://letterboxd.com/film/everything-everywh...,an aging chinese immigrant is swept up in an i...,"[-0.2394578, -0.1444907, 0.33220854, -0.949551...",0.267534
3,Fight Club,A ticking-time-bomb insomniac and a slippery s...,https://a.ltrbxd.com/resized/sm/upload/b0/iz/e...,https://letterboxd.com/film/fight-club/,a tickingtimebomb insomniac and a slippery soa...,"[-0.09523395, 0.13974749, 0.88120854, -0.25531...",0.332923
4,La La Land,"Mia, an aspiring actress, serves lattes to mov...",https://a.ltrbxd.com/resized/sm/upload/a6/th/c...,https://letterboxd.com/film/la-la-land/,mia an aspiring actress serves lattes to movie...,"[0.9547254, 0.57873636, 0.7349273, -0.5343978,...",0.150125
...,...,...,...,...,...,...,...
1855,Pitch Perfect 3,After the highs of winning the world champions...,https://a.ltrbxd.com/resized/sm/upload/9h/ur/a...,https://letterboxd.com/film/pitch-perfect-3/,after the highs of winning the world champions...,"[0.6471697, -0.02907161, 0.53647745, -0.053459...",0.150860
1856,Star Trek Beyond,The USS Enterprise crew explores the furthest ...,https://a.ltrbxd.com/resized/sm/upload/3r/7k/g...,https://letterboxd.com/film/star-trek-beyond/,the uss enterprise crew explores the furthest ...,"[-0.98714054, 0.26338544, -0.29051733, -0.5166...",0.123505
1857,Color Out of Space,The Gardner family moves to a remote farmstead...,https://a.ltrbxd.com/resized/sm/upload/hn/4x/x...,https://letterboxd.com/film/color-out-of-space/,the gardner family moves to a remote farmstead...,"[-0.17554207, 0.7062792, 0.4019348, -1.3173394...",0.162444
1858,Ghostbusters II,Having lost their status and credibility five ...,https://a.ltrbxd.com/resized/sm/upload/1j/2r/v...,https://letterboxd.com/film/ghostbusters-ii/,having lost their status and credibility five ...,"[-0.008193281, 0.13329205, 1.0149481, -0.62428...",0.287521


In [89]:
data.to_csv('films_to_streamlit.csv', index=False)

In [91]:
print(type(data['embeddings'][0]))

<class 'numpy.ndarray'>


In [93]:
data['Poster URL'][10]

'https://a.ltrbxd.com/resized/sm/upload/sk/f3/f1/b2/whiplash-1200-1200-675-675-crop-000000.jpg?v=771f1a019d'