In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from langchain_huggingface import HuggingFaceEmbeddings
import json
import pickle




In [2]:
# Read the dataset
df = pd.read_csv("data/movie_dataset.csv")

# Display basic information about the dataset
df.info()

# Display first few rows of the dataset
print(df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 24 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   index                 4803 non-null   int64  
 1   budget                4803 non-null   int64  
 2   genres                4775 non-null   object 
 3   homepage              1712 non-null   object 
 4   id                    4803 non-null   int64  
 5   keywords              4391 non-null   object 
 6   original_language     4803 non-null   object 
 7   original_title        4803 non-null   object 
 8   overview              4800 non-null   object 
 9   popularity            4803 non-null   float64
 10  production_companies  4803 non-null   object 
 11  production_countries  4803 non-null   object 
 12  release_date          4802 non-null   object 
 13  revenue               4803 non-null   int64  
 14  runtime               4801 non-null   float64
 15  spoken_languages     

In [3]:
# Check for duplicates
print("Raw duplicates:", df.duplicated().sum())
print("Duplicated ID:",df['id'].duplicated().sum())
print("Duplicated Title:", df['original_title'].duplicated().sum())


Raw duplicates: 0
Duplicated ID: 0
Duplicated Title: 2


In [4]:
# Reduce the dataset to meet the requirements of around 500 rows
# We want to keep the rows that provide most information
# For that purpose, we will prioritize movies which less nan values over the columns
df['n_null'] = df.isnull().sum(axis=1)
df.sort_values(by = 'n_null', ascending=True, inplace=True)
df = df.head(500)
df.describe()

Unnamed: 0,index,budget,id,popularity,revenue,runtime,vote_average,vote_count,n_null
count,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0
mean,1937.246,27149020.0,60997.674,28.754919,85672420.0,108.004,6.2838,959.378,0.0
std,936.987056,17659180.0,89595.122879,22.492237,154747000.0,16.920419,0.913009,1195.672809,0.0
min,0.0,0.0,28.0,0.126676,0.0,63.0,1.9,4.0,0.0
25%,1332.0,20000000.0,7263.5,13.361109,15509720.0,96.0,5.8,243.0,0.0
50%,1702.0,28000000.0,21341.5,23.719995,50889270.0,107.0,6.4,572.0,0.0
75%,2163.25,36000000.0,63000.75,37.126731,104592900.0,118.0,6.9,1127.75,0.0
max,4773.0,237000000.0,385383.0,150.437577,2787965000.0,195.0,8.3,11800.0,0.0


In [5]:
# Explor overview content
for idx,row in df.sample(5).iterrows():
    print(row['overview'])


Two policemen must join forces to take on an international drug- smuggling gang - one, an unorthodox Irish policeman and the other, a straitlaced FBI agent. Sergeant Gerry Boyle is an eccentric small-town cop with a confrontational and crass personality and a subversive sense of humor. A longtime policeman in County Galway, Boyle is a maverick with his own moral code. He has seen enough of the world to know there isn't much to it and has had plenty of time to think about it. When a fellow police officer disappears and Boyle's small town becomes key to a large drug trafficking investigation, he is forced to at least feign interest when dealing with the humorless FBI agent Wendell Everett assigned to the case.
In 1987, five young men, using brutally honest rhymes and hardcore beats, put their frustration and anger about life in the most dangerous place in America into the most powerful weapon they had: their music.  Taking us back to where it all began, Straight Outta Compton tells the t

In [6]:
df.columns

Index(['index', 'budget', 'genres', 'homepage', 'id', 'keywords',
       'original_language', 'original_title', 'overview', 'popularity',
       'production_companies', 'production_countries', 'release_date',
       'revenue', 'runtime', 'spoken_languages', 'status', 'tagline', 'title',
       'vote_average', 'vote_count', 'cast', 'crew', 'director', 'n_null'],
      dtype='object')

In [7]:
# Explore content/ format of columns
for col in ['genres','original_language','spoken_languages']:
    print("Column:",col)
    print(df[col].value_counts(dropna=False))
    print('*'*50)

Column: genres
genres
Drama                        42
Comedy                       33
Drama Romance                16
Comedy Romance               13
Horror Thriller              12
                             ..
Crime Drama                   1
Drama Mystery Crime           1
Action Comedy Romance         1
Thriller Action Crime         1
Action Comedy Crime Drama     1
Name: count, Length: 237, dtype: int64
**************************************************
Column: original_language
original_language
en    482
zh      5
fr      4
ru      1
fa      1
el      1
ja      1
cn      1
ko      1
hi      1
de      1
pt      1
Name: count, dtype: int64
**************************************************
Column: spoken_languages
spoken_languages
[{"iso_639_1": "en", "name": "English"}]                                                                                                                                           341
[{"iso_639_1": "en", "name": "English"}, {"iso_639_1": "es", "name": "

In [8]:
# Function to extract and process language names
def extract_languages(json_string):
    languages = json.loads(json_string)  # Parse JSON string
    return ','.join([lang['name'].lower().replace('ñ','n').replace('ê','e') for lang in languages] ) # Extract and lowercase names

In [9]:
# Clean Spoken languages
df['spoken_languages_processed'] = df['spoken_languages'].apply(extract_languages)

In [10]:
df['spoken_languages_processed']

0             english,espanol
1893                  english
1892                  english
1888    english,deutsch,norsk
1884                  english
                ...          
4347                  english
4348                  english
3418                  english
3417                portugues
3416                  english
Name: spoken_languages_processed, Length: 500, dtype: object

## First Approach: TF-IDF and cosine similarity
In this version we will consider the basic similarity approach between the user query and some representative information from the movie
For that purpose we will pick some features that may help to provide further information about the movide
So we can recommend movies that match the user query

In [11]:

## Based on the exploration , we consider that the columns 'genres', 'spoken_languages', 'keywords' and 'overview' can provide good information that can help
# Merge all text-based columns into one
df["combined_features"] = ("Genres: "+
    df["genres"].str.lower() + " \nLanguages: " +
    df["spoken_languages_processed"] + " \nKeywords:" +
    df["keywords"].str.lower() + " \nOverview" +
    df["overview"].str.lower()
)

In [12]:
df["combined_features"]

0       Genres: action adventure fantasy science ficti...
1893    Genres: fantasy horror thriller \nLanguages: e...
1892    Genres: action adventure crime mystery thrille...
1888    Genres: comedy drama history \nLanguages: engl...
1884    Genres: thriller crime drama \nLanguages: engl...
                              ...                        
4347    Genres: thriller horror \nLanguages: english \...
4348    Genres: comedy drama thriller \nLanguages: eng...
3418    Genres: horror \nLanguages: english \nKeywords...
3417    Genres: drama action crime \nLanguages: portug...
3416    Genres: action comedy crime drama \nLanguages:...
Name: combined_features, Length: 500, dtype: object

In [13]:
# Function to compute cosine similarity and return top recommendations
def get_recommendations(query, df, tfidf_vectorizer, tfidf_matrix, top_n=5):
    query_tfidf = tfidf_vectorizer.transform([query])
    cosine_similarities = cosine_similarity(query_tfidf, tfidf_matrix).flatten()
    top_indices = cosine_similarities.argsort()[-top_n:][::-1]
    return df.iloc[top_indices][['title', 'overview', 'popularity']]

In [14]:
# Compute TF-IDF for the 'overview' column
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(df['combined_features'])

### Run some examples:

In [15]:
query_example_1 = "I like action movies set in space"
get_recommendations(query_example_1,df,tfidf_vectorizer,tfidf_matrix)

Unnamed: 0,title,overview,popularity
1531,Moonraker,During the transportation of a Space Shuttle a...,29.887404
1354,Space Chimps,Circus monkey Ham III works in a circus where ...,6.293217
0,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577
2198,Lockout,"Set in the near future, Lockout follows a fals...",35.232278
4606,Tupac: Resurrection,"Home movies, photographs, and recited poetry i...",1.314576


In [16]:
query_example_2 = "I like action movies but never in the space"
get_recommendations(query_example_2,df,tfidf_vectorizer,tfidf_matrix)

Unnamed: 0,title,overview,popularity
1531,Moonraker,During the transportation of a Space Shuttle a...,29.887404
1354,Space Chimps,Circus monkey Ham III works in a circus where ...,6.293217
0,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577
2198,Lockout,"Set in the near future, Lockout follows a fals...",35.232278
4606,Tupac: Resurrection,"Home movies, photographs, and recited poetry i...",1.314576


In [17]:
query_example_3 = "I love musical funny movies"
get_recommendations(query_example_3,df,tfidf_vectorizer,tfidf_matrix)

Unnamed: 0,title,overview,popularity
2238,Bandslam,A high school social outcast and the popular g...,7.716606
4606,Tupac: Resurrection,"Home movies, photographs, and recited poetry i...",1.314576
1874,August Rush,"A drama with fairy tale elements, where an orp...",29.533437
1695,Aladdin,Princess Jasmine grows tired of being forced t...,92.982009
1080,Across the Universe,Musical based on The Beatles songbook and set ...,11.872841


## Second Approach: Use embeddings


In [18]:
# Convert embeddings to a NumPy array for similarity computation  
# Compute embeddings for each movie
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
df["embedding"] = df["combined_features"].apply(lambda text: embeddings.embed_query(text))
embedding_matrix = np.vstack(df["embedding"].values)

  from .autonotebook import tqdm as notebook_tqdm


In [19]:
# Function to find similar movies
def find_similar_movies(query, top_n=5):
    query_embedding = np.array(embeddings.embed_query(query)).reshape(1, -1)
    similarities = cosine_similarity(query_embedding, embedding_matrix)[0]
    df["similarity"] = similarities  # Store similarity scores
    top_movies = df.sort_values(by="similarity", ascending=False).head(top_n)
    
    return top_movies[["title", "similarity", "combined_features"]]



### Run some examples:

In [20]:
query_example_1 = "I like action movies set in space"
find_similar_movies(query_example_1)

Unnamed: 0,title,similarity,combined_features
1319,Riddick,0.428826,Genres: science fiction action thriller \nLang...
1271,Pandorum,0.425957,Genres: action horror mystery science fiction ...
1198,Escape from Planet Earth,0.412677,Genres: animation comedy adventure family scie...
2194,Disaster Movie,0.411781,Genres: action comedy \nLanguages: english \nK...
0,Avatar,0.408533,Genres: action adventure fantasy science ficti...


In [21]:
query_example_2 = "I like action movies but never in the space"
find_similar_movies(query_example_2)

Unnamed: 0,title,similarity,combined_features
1271,Pandorum,0.416869,Genres: action horror mystery science fiction ...
1354,Space Chimps,0.414458,Genres: animation family \nLanguages: english ...
1198,Escape from Planet Earth,0.408543,Genres: animation comedy adventure family scie...
1319,Riddick,0.405433,Genres: science fiction action thriller \nLang...
0,Avatar,0.40362,Genres: action adventure fantasy science ficti...


In [22]:
query_example_3 = "I love musical funny movies"
find_similar_movies(query_example_3)

Unnamed: 0,title,similarity,combined_features
1601,Step Up 3D,0.422445,Genres: drama romance \nLanguages: english \nK...
2136,Team America: World Police,0.418468,Genres: music adventure animation action comed...
2194,Disaster Movie,0.379709,Genres: action comedy \nLanguages: english \nK...
1695,Aladdin,0.37922,Genres: animation family comedy adventure fant...
4687,Good Dick,0.374908,Genres: comedy drama romance \nLanguages: engl...


## Save artifacts for later inference

In [23]:
df.to_csv("artifacts/Processed_dataset.csv")
import scipy.sparse as sp
sp.save_npz("artifacts/sparse_matrix.npz", tfidf_matrix)

with open('artifacts/tfidf_vectorizer.pkl', 'wb') as file:
    pickle.dump(tfidf_vectorizer, file)