In [3]:
import pandas as pd
from ast import literal_eval
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import numpy as np

In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [10]:
# Load the movie table from CSV. The path is relative, so it is resolved
# against the kernel's current working directory.
movies = pd.read_csv("../../data/clean/english_movies_clean.csv")

In [11]:
# Keep only rows that have an overview; the text pipeline below calls
# str methods on this column and cannot handle NaN.
# `.copy()` detaches the filtered frame from the original so that later
# column assignments (`movies[c] = ...`) write to an independent object
# instead of a possible view (avoids SettingWithCopyWarning).
movies = movies[movies["overview"].notna()].copy()

In [7]:
# Display (rows, columns) of the current frame.
# NOTE(review): the recorded output below shows 328020 rows while the later
# `movies.info()` output shows 320977 entries, so this output appears to have
# been captured before the NaN-overview filter ran — re-run to confirm.
movies.shape

(328020, 9)

In [5]:
# List the column labels available in the frame.
movies.columns

Index(['id', 'title', 'overview', 'popularity', 'release_date',
       'clean_keywords', 'clean_genres', 'cast', 'director'],
      dtype='object')

In [12]:
# Columns that are parsed from their CSV string form into Python objects
# (presumably stringified lists — the "[]" fill value suggests so).
columns = ["clean_keywords", "clean_genres", "cast", "director"]
for c in columns:
    # Missing cells are filled with "[]" so they parse to an empty list.
    # Only strings are handed to literal_eval: it raises ValueError on
    # non-string objects such as lists, so without this guard re-running the
    # cell on already-parsed data would fail. With the guard a re-run is a no-op.
    movies[c] = movies[c].fillna("[]").apply(
        lambda value: literal_eval(value) if isinstance(value, str) else value
    )

In [8]:
# Drop rows without an overview. The same filter already appears in an
# earlier cell; this form is idempotent, so running both is harmless.
# `.copy()` makes the result an independent frame so later column
# assignments do not trigger SettingWithCopyWarning.
movies = movies[movies["overview"].notna()].copy()

In [9]:
# Print per-column non-null counts and dtypes.
# NOTE(review): the recorded output below still reports nulls in the
# list-valued columns (e.g. clean_keywords), which the fillna("[]") cell
# would have removed — the output looks like it predates that cell. Re-run
# after a fresh Restart & Run All to confirm.
movies.info()

<class 'pandas.core.frame.DataFrame'>
Index: 320977 entries, 0 to 328019
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   id              320977 non-null  float64
 1   title           320976 non-null  object 
 2   overview        320977 non-null  object 
 3   popularity      320977 non-null  float64
 4   release_date    284466 non-null  object 
 5   clean_keywords  100978 non-null  object 
 6   clean_genres    213989 non-null  object 
 7   cast            205672 non-null  object 
 8   director        253599 non-null  object 
dtypes: float64(2), object(7)
memory usage: 24.5+ MB


In [12]:
# NLTK resources shared by every preprocess() call; filled on first use.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return the cached ``(stop_words, lemmatizer)`` pair, building it once."""
    if not _PREPROCESS_RESOURCES:
        _PREPROCESS_RESOURCES["stop_words"] = frozenset(stopwords.words('english'))
        _PREPROCESS_RESOURCES["lemmatizer"] = WordNetLemmatizer()
    return _PREPROCESS_RESOURCES["stop_words"], _PREPROCESS_RESOURCES["lemmatizer"]


def preprocess(text):
    """Normalize a piece of text for TF-IDF vectorization.

    Steps, in order:
      1. lowercase the text;
      2. replace every run of non-word characters or digits with a space
         (underscores survive because ``\\W`` does not match them);
      3. split on whitespace, drop English stop words, lemmatize each
         remaining token, and re-join with single spaces.

    Parameters
    ----------
    text : str
        Raw text. Non-string input (e.g. NaN) is not handled and fails on
        ``.lower()``.

    Returns
    -------
    str
        Space-separated lemmatized tokens; ``""`` when nothing remains.
    """
    # The stop-word set and lemmatizer do not depend on `text`; building them
    # per call (as when this function is applied row-by-row to a large frame)
    # repeats the same work for every row, so they are fetched from a cache.
    stop_words, lemmatizer = _get_preprocess_resources()

    # Convert to lowercase
    text = text.lower()

    # Remove special characters and digits
    text = re.sub(r'\W+|\d+', ' ', text)

    # Remove stopwords and lemmatize
    return ' '.join(
        lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words
    )

In [13]:
# Run every overview through preprocess() and store the normalized text
# in a new column next to the raw one.
movies["clean_overview"] = movies["overview"].map(preprocess)

In [50]:
# Fit a TF-IDF vocabulary on the cleaned overviews and vectorize them.
# Row i of `tfidf_matrix` corresponds to the i-th row of `movies` by position.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(movies['clean_overview'])

In [26]:
def tfidf_search(query, tfidf_matrix, vectorizer):
    """Rank corpus rows by cosine similarity to a free-text query.

    Parameters
    ----------
    query : str
        Query text; it is passed to ``vectorizer.transform`` as a
        one-element list.
    tfidf_matrix
        Document-term matrix to compare the query against.
    vectorizer
        Object whose ``transform`` maps text into the same feature space as
        ``tfidf_matrix``.

    Returns
    -------
    numpy.ndarray
        1-D array of row positions into ``tfidf_matrix``, most similar first.
    """
    scores = cosine_similarity(vectorizer.transform([query]), tfidf_matrix)
    # Negating the scores turns argsort's ascending order into descending.
    descending = np.argsort(-scores)
    return descending.flatten()

In [1]:
# Keywords that are joined into a single search query in the cells below.
x = ["Lawyer", "Law", "Courtroom", "Justice", "Morality", "Ethics"]

In [2]:
# Preview the space-separated query string built from the keyword list.
" ".join(x)

'Lawyer Law Courtroom Justice Morality Ethics'

In [54]:
# Rank all movies against the keyword query using the overview TF-IDF space.
res = tfidf_search(
    query=" ".join(x),
    tfidf_matrix=tfidf_matrix,
    vectorizer=vectorizer,
)

In [55]:
# Show the five best overview matches. `res` holds row positions (not index
# labels), hence `.iloc`.
# NOTE(review): the recorded output below already contains the
# `clean_keywords_soup` column, which is only created in a later cell — this
# output was captured after out-of-order execution.
movies.iloc[res[:5]]

Unnamed: 0,id,title,overview,popularity,release_date,clean_keywords,clean_genres,cast,director,clean_overview,clean_keywords_soup
304473,1169405.0,Fatal Justice,"When the law fails, family justice is the only...",0.213,2023-07-14,[],[Thriller],[],[],law fails family justice way,
202948,723838.0,All the Beautiful Girls,a film by Justice,0.867,2020-07-12,[],[],"[Christophe Mulai, Christoph Mulai, Chris Mula...",[Justice],film justice,
146687,539588.0,Billy Rango,Death is the only justice that we all have.,0.214,,[],[],[],[],death justice,
305749,1175658.0,James Bulger: The Trial,Thirty years on from the trial that shocked th...,0.322,2023-09-06,"[child murder, murder, courtroom, true crime, ...",[Documentary],[],[],thirty year trial shocked world new document r...,child murder murder courtroom true crime child...
135301,504996.0,In the Shadow of a Killer,Courtroom drama involving the mafia inspired b...,3.468,1992-04-27,[],[Crime],"[Scott Bakula, Robert Clohessy, James Russo, L...",[Alan Metzger],courtroom drama involving mafia inspired true ...,


In [37]:
# Collapse each movie's keyword list into one space-separated string so it
# can be fed to the text vectorizer.
movies["clean_keywords_soup"] = movies["clean_keywords"].apply(" ".join)

In [38]:
# Vectorize the keyword strings.
# NOTE(review): this re-fits the same `vectorizer` object that produced
# `tfidf_matrix`. After this cell runs, `vectorizer` carries the keyword
# vocabulary and only pairs with `tfidf_matrix_keywords`; re-running the
# overview search without re-fitting first will use mismatched feature spaces.
# Consider a dedicated vectorizer per text field.
tfidf_matrix_keywords = vectorizer.fit_transform(movies['clean_keywords_soup'])

In [43]:
# Rank all movies against the same query, this time in the keyword TF-IDF space.
res2 = tfidf_search(
    query=" ".join(x),
    tfidf_matrix=tfidf_matrix_keywords,
    vectorizer=vectorizer,
)

In [44]:
# Show the five best keyword matches; `res2` holds row positions, hence `.iloc`.
movies.iloc[res2[:5]]

Unnamed: 0,id,title,overview,popularity,release_date,clean_keywords,clean_genres,cast,director,clean_overview,clean_keywords_soup
275138,1031922.0,Worthy,Fresh off a winning case that brought her unwa...,1.41,2018-10-31,"[law and ethics, morality, justice, social jus...","[Drama, Crime]","[Kally Khourshid, Tom Jenkins, Matthew Hancock...",[Michael Vaughn Hernandez],fresh winning case brought unwanted internet f...,law and ethics morality justice social justice...
48168,178819.0,Midnight Court,"After losing his bid for district attorney, an...",4.632,1937-03-06,"[lawyer, courtroom]","[Romance, Crime, Drama]","[John Litel, William B. Davidson, Joan Woodbur...",[Frank McDonald],losing bid district attorney aspiring young la...,lawyer courtroom
124096,464683.0,Creating a Monster,Creating a Monster is about reality television...,0.203,2016-12-17,[ethics],[Documentary],[],[Gena Lida Riess],creating monster reality television sub textua...,ethics
35069,112977.0,Deadlocked,A young man is accused of rape and murder and ...,3.915,2000-06-18,"[lawyer, racism, justice, framed for murder, law]","[Crime, Thriller, Drama, Mystery, TV Movie]","[Tom Butler, Charles S. Dutton, John Finn, Dav...",[Michael W. Watkins],young man accused rape murder placed trial fat...,lawyer racism justice framed for murder law
59172,239220.0,The Advocate,A grieving defense attorney gets caught in a t...,2.864,2013-02-28,"[detective, murder, lawyer, courtroom]","[Thriller, Crime, Mystery]","[Kristina Klebe, Steffinnie Phrommany, Michael...",[Tamas Harangi],grieving defense attorney get caught twisted g...,detective murder lawyer courtroom
