In [None]:
!pip install fastembed

Installing collected packages: py-rust-stemmers, mmh3, loguru, humanfriendly, coloredlogs, onnxruntime, fastembed
Successfully installed coloredlogs-15.0.1 fastembed-0.7.1 humanfriendly-10.0 loguru-0.7.3 mmh3-5.2.0 onnxruntime-1.22.1 py-rust-stemmers-0.1.5


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from fastembed import TextEmbedding
import pandas as pd
import numpy as np

In [None]:
# Load the movie table from 'imdb.csv' (relative path, resolved against the
# kernel's current working directory).
df = pd.read_csv('imdb.csv')
# Display (rows, columns) as a quick sanity check on the load.
df.shape

(250, 13)

In [None]:
# Print each column's non-null count and dtype to check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250 entries, 0 to 249
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   rank         250 non-null    int64  
 1   name         250 non-null    object 
 2   year         250 non-null    object 
 3   rating       250 non-null    float64
 4   genre        250 non-null    object 
 5   certificate  250 non-null    object 
 6   run_time     250 non-null    object 
 7   tagline      250 non-null    object 
 8   budget       250 non-null    object 
 9   box_office   250 non-null    object 
 10  casts        250 non-null    object 
 11  directors    250 non-null    object 
 12  writers      250 non-null    object 
dtypes: float64(1), int64(1), object(11)
memory usage: 25.5+ KB


In [None]:
# Cast every 'year' value to str (the column is already loaded as object dtype).
# NOTE(review): 'year' is not used by any later cell visible here - confirm this
# cast is still needed.
df['year'] = df['year'].astype(str)

In [None]:
# Re-check the frame's dimensions.
# NOTE(review): the recorded output (250, 12) can only come from a run in which
# the 'certificate' column had already been dropped; on a fresh top-to-bottom
# run this cell executes before that drop and would report 13 columns.
df.shape

(250, 12)

In [None]:
# Remove the 'certificate' column by reassignment instead of inplace=True, and
# tolerate the column already being absent so that re-running this cell is a
# no-op rather than a KeyError.
df = df.drop(columns="certificate", errors="ignore")

In [None]:
# Build one free-text field per movie: the descriptive columns of each row are
# joined, in the order listed, with single spaces.
df['Overview'] = (
    df[['name', 'tagline', 'genre', 'casts', 'directors', 'writers']]
    .apply(' '.join, axis=1)
)

In [None]:
# Inspect the combined text column that was just created.
df['Overview']

In [None]:
# Scratch example of a free-text query.
# NOTE(review): `s` is only referenced by the next cell; the query actually
# passed to the recommenders is defined separately as `query`.
s = "funny comedy movie"

In [None]:
# Show the query wrapped in a one-element list, i.e. as a collection that
# contains a single document.
[s]

In [None]:
class TFIDFRecommender:
    """Rank the rows of a DataFrame against a text query by TF-IDF cosine similarity.

    Call ``fit`` once with the corpus, then ``recommend`` for each query.
    """

    def __init__(self):
        # English stop words are removed; max_features=5000 bounds the vocabulary size.
        self.vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
        self.tfidf_matrix = None  # TF-IDF matrix of the corpus, set by fit()
        self.df = None            # corpus DataFrame, set by fit()

    def fit(self, df, text_column):
        """Fit the vectorizer on ``df[text_column]`` and vectorize every row.

        Args:
            df: DataFrame holding the corpus. It must also contain a ``'name'``
                column, which ``recommend`` uses to label results.
            text_column: Name of the column to index; its values are cast to ``str``.
        """
        self.df = df
        texts = df[text_column].astype(str)
        self.tfidf_matrix = self.vectorizer.fit_transform(texts)

    def recommend(self, query, top_k=5):
        """Return the ``top_k`` corpus rows most similar to ``query``.

        Args:
            query: Free-text search string; the whole string is one document.
            top_k: Number of results to keep after sorting.

        Returns:
            A list of ``{'name': ..., 'similarity': ...}`` dicts ordered by
            descending cosine similarity.
        """
        # The vectorizer takes an iterable of documents, so the query is wrapped
        # in a one-element list. Passing query.split() would vectorize each word
        # as a separate document and yield one row of similarities per word,
        # which cannot be paired one-to-one with the corpus rows below.
        query_vector = self.vectorizer.transform([query])
        similarities = cosine_similarity(query_vector, self.tfidf_matrix).flatten()

        results_df = pd.DataFrame({
            'name': self.df['name'],
            'similarity': similarities
        })
        results_df = results_df.sort_values('similarity', ascending=False).head(top_k)

        return results_df[['name', 'similarity']].to_dict('records')

In [None]:
class DenseRecommender:
    """Rank the rows of a DataFrame against a text query using dense embeddings.

    Embeddings come from the ``TextEmbedding`` model
    "jinaai/jina-embeddings-v2-base-en"; rows are scored by cosine similarity.
    """

    def __init__(self):
        self.model = TextEmbedding("jinaai/jina-embeddings-v2-base-en")
        self.embeddings = None  # array of corpus embeddings, set by fit()
        self.df = None          # corpus DataFrame, set by fit()

    def fit(self, df, text_column):
        """Embed every value of ``df[text_column]`` (cast to ``str``) and keep the result.

        ``df`` must also contain a ``'name'`` column, which ``recommend`` uses
        to label results.
        """
        self.df = df
        corpus = df[text_column].astype(str).tolist()
        corpus_vectors = list(self.model.embed(corpus))
        self.embeddings = np.array(corpus_vectors)

    def recommend(self, query, top_k=5):
        """Return the ``top_k`` rows most similar to ``query``.

        The result is a list of ``{'name': ..., 'similarity': ...}`` dicts in
        descending order of cosine similarity.
        """
        # embed() is given a one-element batch; take its single embedding.
        query_batch = np.array(list(self.model.embed([query])))
        query_embedding = query_batch[0]
        scores = cosine_similarity([query_embedding], self.embeddings).flatten()

        # Pair each score with its row's name, then sort best-first and truncate.
        ranked = (
            pd.DataFrame({'name': self.df['name'], 'similarity': scores})
            .sort_values('similarity', ascending=False)
            .head(top_k)
        )
        return ranked[['name', 'similarity']].to_dict('records')

In [None]:
# Create one instance of each recommender. Both start with no corpus attached
# and need fit() before they can answer queries.
tfidf_recommender = TFIDFRecommender()
# The constructor builds a TextEmbedding for "jinaai/jina-embeddings-v2-base-en".
dense_recommender = DenseRecommender()

In [None]:
# Column holding the combined per-movie text that both recommenders index.
text_column = 'Overview'

print("Fitting TF-IDF recommender...")
# Fits the vectorizer on df['Overview'] and stores the resulting TF-IDF matrix.
tfidf_recommender.fit(df, text_column) # training

Fitting TF-IDF recommender...


In [None]:
print("Fitting dense recommender...")
# Embeds every df['Overview'] value with the embedding model and stores the vectors.
dense_recommender.fit(df, text_column)

Fitting dense recommender...


In [None]:
# Free-text query sent to both recommenders so their results can be compared.
query = "funny comedy movie"

In [None]:
# Top 3 matches from the TF-IDF (lexical) recommender.
tfidf_results = tfidf_recommender.recommend(query, top_k=3)

In [None]:
# Display the TF-IDF results: a list of {'name', 'similarity'} records, best first.
tfidf_results

[{'name': 'The Exorcist', 'similarity': 0.15829510639195857},
 {'name': 'The Apartment', 'similarity': 0.14817741837385537},
 {'name': 'Toy Story 3', 'similarity': 0.0698034239605489}]

In [None]:
# Top 3 matches from the dense-embedding recommender for the same query.
dense_results = dense_recommender.recommend(query, top_k=3)

In [None]:
# Display the dense results: a list of {'name', 'similarity'} records, best first.
dense_results

[{'name': 'Snatch', 'similarity': 0.754506028435991},
 {'name': 'Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb',
  'similarity': 0.7448568512086527},
 {'name': 'Sherlock Jr.', 'similarity': 0.7421745630841639}]