In [6]:
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
import pandas as pd
import numpy as np
import torch
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModel

In [7]:
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
sentences = [
    "The weather is lovely today.",
    "Today class will be over at 3:30",
    "He drove to the stadium.",
    "Tommorow there won't be class",
    "She drove to the gym."
]
embeddings = model.encode(sentences)

similarities = model.similarity(embeddings, embeddings)
print(similarities.shape)

torch.Size([5, 5])


In [8]:
similarities

tensor([[1.0000, 0.2484, 0.1046, 0.0367, 0.0627],
        [0.2484, 1.0000, 0.0590, 0.5646, 0.1128],
        [0.1046, 0.0590, 1.0000, 0.0454, 0.4679],
        [0.0367, 0.5646, 0.0454, 1.0000, 0.1149],
        [0.0627, 0.1128, 0.4679, 0.1149, 1.0000]])

In [9]:
df = pd.read_csv("titles.csv")
df_indian_movies = df[(df['production_countries'].str.contains('IN')) & (df['type']=="MOVIE" )]
df_indian_movies['description'].isnull().sum()

np.int64(0)

In [10]:
all_desc = list( df_indian_movies['description'] )
out_embs = []
for r in tqdm(all_desc):
    embd = model.encode(r)
    out_embs.append(embd)
out_embs = np.array(out_embs)
out_embs.shape

100%|██████████| 577/577 [00:17<00:00, 32.64it/s]


(577, 384)

In [None]:
query = "irritating girl who fakes accent"
qy_emb = model.encode([query])
sims = []
for e in tqdm(out_embs):
    sims.append(model.similarity(qy_emb,e).numpy()[0])
df_ind = df_indian_movies[['title','description']]
df_ind['sims'] = np.array(sims)
df_ind.sort_values('sims', ascending=False)

  0%|          | 0/577 [00:00<?, ?it/s]

100%|██████████| 577/577 [00:00<00:00, 3729.01it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_ind['sims'] = np.array(sims)


Unnamed: 0,title,description,sims
4130,Indoo Ki Jawani,A feisty girl from Ghaziabad makes her profile...,0.419743
4336,Bulbul Can Sing,Coming-of-age drama about a young girl living ...,0.349811
2254,Secret Superstar,Insia Malik is a talented 15-year-old school g...,0.325644
4308,Kappela,"Set in the village areas of Wayanad, Malappura...",0.321447
4275,Miss India,"Uprooted to America, an aspiring Indian entrep...",0.315878
...,...,...,...
2727,Rajma Chawal,A father attempts to reconnect with his estran...,-0.068461
614,Dhan Dhana Dhan Goal,A bankrupt soccer team must win the championsh...,-0.076244
3368,Magamuni,Two long-separated brothers end up in life-thr...,-0.084172
5674,Laabam,The president of a farmers' association wants ...,-0.090820


100%|██████████| 577/577 [00:00<00:00, 3734.05it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_ind.loc[:,'sims'] = np.array(sims)
100%|██████████| 577/577 [00:00<00:00, 4512.93it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_ind.loc[:,'sims'] = np.array(sims)
100%|██████████| 577/577 [00:00<00:00, 4573.37it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#return

In [15]:
# query = "husband finds his wife as a serial killer"
# qy_emb = model.encode([query])
# sims = []
# for e in tqdm(out_embs):
#     sims.append(model.similarity(qy_emb,e).numpy()[0])
# df_ind = df_indian_movies[['id', 'title','description']]
# df_ind['sims'] = np.array(sims)
# df_ind.sort_values('sims', ascending=False)

In [17]:
import gradio as gr
     

def search_similar( query ):
    qy_emb = model.encode([query])
    sims = []
    for e in tqdm(out_embs):
        sims.append(model.similarity(qy_emb,e).numpy()[0])

    df_ind = df_indian_movies[['id', 'title','description']]
    df_ind.loc[:,'sims'] = np.array(sims)
    return df_ind.sort_values('sims', ascending=False)[:5]
     

demo = gr.Interface(fn = search_similar,
                    inputs=[gr.Textbox(label="Your Review:")],
                    outputs=[gr.Dataframe(label="Most Similar Description Movies:")])
demo.launch()
     


* Running on local URL:  http://127.0.0.1:7860
* To create a public link, set `share=True` in `launch()`.


