# Imports and Installs

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
!pip install nltk
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
nltk.download('omw-1.4')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Unzipping corpora/omw-1.4.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True



---



---



# Input Query

In [2]:
q_terms = ['crime', 'war', 'gangster']
df_q = pd.DataFrame(q_terms, columns=['Query'])
df_q

Unnamed: 0,Query
0,crime
1,war
2,gangster




---



---



# Importing Dataset

In [3]:
df0 = pd.read_csv('imdb_top_1000.csv') 
df0.head()

Unnamed: 0,Poster_Link,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross
0,https://m.media-amazon.com/images/M/MV5BMDFkYT...,The Shawshank Redemption,1994,A,142 min,Drama,9.3,Two imprisoned men bond over a number of years...,80.0,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,2343110,28341469
1,https://m.media-amazon.com/images/M/MV5BM2MyNj...,The Godfather,1972,A,175 min,"Crime, Drama",9.2,An organized crime dynasty's aging patriarch t...,100.0,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,1620367,134966411
2,https://m.media-amazon.com/images/M/MV5BMTMxNT...,The Dark Knight,2008,UA,152 min,"Action, Crime, Drama",9.0,When the menace known as the Joker wreaks havo...,84.0,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,2303232,534858444
3,https://m.media-amazon.com/images/M/MV5BMWMwMG...,The Godfather: Part II,1974,A,202 min,"Crime, Drama",9.0,The early life and career of Vito Corleone in ...,90.0,Francis Ford Coppola,Al Pacino,Robert De Niro,Robert Duvall,Diane Keaton,1129952,57300000
4,https://m.media-amazon.com/images/M/MV5BMWU4N2...,12 Angry Men,1957,U,96 min,"Crime, Drama",9.0,A jury holdout attempts to prevent a miscarria...,96.0,Sidney Lumet,Henry Fonda,Lee J. Cobb,Martin Balsam,John Fiedler,689845,4360000




---



---



# Dataset Preprocessing & Reduction

In [4]:
df = df0[["Series_Title", "Genre", "Overview"]]
df = df.rename(columns={'Series_Title': "Title"})
df.head()

Unnamed: 0,Title,Genre,Overview
0,The Shawshank Redemption,Drama,Two imprisoned men bond over a number of years...
1,The Godfather,"Crime, Drama",An organized crime dynasty's aging patriarch t...
2,The Dark Knight,"Action, Crime, Drama",When the menace known as the Joker wreaks havo...
3,The Godfather: Part II,"Crime, Drama",The early life and career of Vito Corleone in ...
4,12 Angry Men,"Crime, Drama",A jury holdout attempts to prevent a miscarria...


In [5]:
df['Overview'] = df['Overview'].str.lower()
df["Overview"] = df['Overview'].str.replace('[^\w\s]','', regex=True)

In [6]:
df.head()

Unnamed: 0,Title,Genre,Overview
0,The Shawshank Redemption,Drama,two imprisoned men bond over a number of years...
1,The Godfather,"Crime, Drama",an organized crime dynastys aging patriarch tr...
2,The Dark Knight,"Action, Crime, Drama",when the menace known as the joker wreaks havo...
3,The Godfather: Part II,"Crime, Drama",the early life and career of vito corleone in ...
4,12 Angry Men,"Crime, Drama",a jury holdout attempts to prevent a miscarria...


In [7]:
documents=df['Overview'].unique()
print(documents.shape)
documents[:10]

(1000,)


array(['two imprisoned men bond over a number of years finding solace and eventual redemption through acts of common decency',
       'an organized crime dynastys aging patriarch transfers control of his clandestine empire to his reluctant son',
       'when the menace known as the joker wreaks havoc and chaos on the people of gotham batman must accept one of the greatest psychological and physical tests of his ability to fight injustice',
       'the early life and career of vito corleone in 1920s new york city is portrayed while his son michael expands and tightens his grip on the family crime syndicate',
       'a jury holdout attempts to prevent a miscarriage of justice by forcing his colleagues to reconsider the evidence',
       'gandalf and aragorn lead the world of men against saurons army to draw his gaze from frodo and sam as they approach mount doom with the one ring',
       'the lives of two mob hitmen a boxer a gangster and his wife and a pair of diner bandits intertwine 



---



---



# Stop Word Removal

In [8]:
stop_words = stopwords.words('english')
#stop_words.append('') 
df['Overview'] = df['Overview'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))
df_q['Query'] = df_q['Query'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))

In [9]:
df.Overview[:10]

0    two imprisoned men bond number years finding s...
1    organized crime dynastys aging patriarch trans...
2    menace known joker wreaks havoc chaos people g...
3    early life career vito corleone 1920s new york...
4    jury holdout attempts prevent miscarriage just...
5    gandalf aragorn lead world men saurons army dr...
6    lives two mob hitmen boxer gangster wife pair ...
7    germanoccupied poland world war ii industriali...
8    thief steals corporate secrets use dreamsharin...
9    insomniac office worker devilmaycare soapmaker...
Name: Overview, dtype: object



---



---



# Lemmatization

In [10]:
lemmatizer = WordNetLemmatizer()
def lemmatize_words(text):
    words = text.split()
    words = [lemmatizer.lemmatize(word,pos='v') for word in words]
    return ' '.join(words)
df['Overview'] = df['Overview'].apply(lemmatize_words)
df_q['Query'] = df_q['Query'].apply(lemmatize_words)

In [11]:
df.Overview[:10]

0    two imprison men bond number years find solace...
1    organize crime dynastys age patriarch transfer...
2    menace know joker wreak havoc chaos people got...
3    early life career vito corleone 1920s new york...
4    jury holdout attempt prevent miscarriage justi...
5    gandalf aragorn lead world men saurons army dr...
6    live two mob hitmen boxer gangster wife pair d...
7    germanoccupied poland world war ii industriali...
8    thief steal corporate secrets use dreamsharing...
9    insomniac office worker devilmaycare soapmaker...
Name: Overview, dtype: object



---



---



# Documents

In [12]:
documents=df['Overview'].unique()
print(documents.shape)
documents[:10]

(1000,)


array(['two imprison men bond number years find solace eventual redemption act common decency',
       'organize crime dynastys age patriarch transfer control clandestine empire reluctant son',
       'menace know joker wreak havoc chaos people gotham batman must accept one greatest psychological physical test ability fight injustice',
       'early life career vito corleone 1920s new york city portray son michael expand tighten grip family crime syndicate',
       'jury holdout attempt prevent miscarriage justice force colleagues reconsider evidence',
       'gandalf aragorn lead world men saurons army draw gaze frodo sam approach mount doom one ring',
       'live two mob hitmen boxer gangster wife pair diner bandits intertwine four tales violence redemption',
       'germanoccupied poland world war ii industrialist oskar schindler gradually become concern jewish workforce witness persecution nazis',
       'thief steal corporate secrets use dreamsharing technology give inverse task 



---



---



# Resultant Indexed Query 

In [13]:
qi_terms = df_q['Query']
qi_terms

0       crime
1         war
2    gangster
Name: Query, dtype: object



---



---



# Vectorization

In [14]:
vectorizer = CountVectorizer(stop_words='english')
documents_vectorized = vectorizer.fit_transform(df.Overview)
vocabulary = vectorizer.get_feature_names_out()

In [15]:
print('We have a {} document corpus with a {} term vocabulary'.format(*documents_vectorized.shape))

We have a 1000 document corpus with a 4831 term vocabulary


In [16]:
df = pd.DataFrame(documents_vectorized.toarray(), columns=vocabulary)
doc_ids = df.index.values
df.head()

Unnamed: 0,00,007,10000,100000,100th,10yearold,1183,11yearold,12,12yearold,...,zero,zodiac,zombie,zombiefilled,zombies,zone,zorg,zuckerberg,zulu,édith
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0




---



---



# BM25 Model

In [17]:
def BM25_IDF_df(df):

  dfs = (df > 0).sum(axis=0)
  N = df.shape[0]
  idfs = -np.log(dfs / N)
  
  k_1 = 1.2
  b = 0.8
  dls = df.sum(axis=1) 
  avgdl = np.mean(dls)

  numerator = np.array((k_1 + 1) * df)
  denominator = np.array(k_1 *((1 - b) + b * (dls / avgdl))).reshape(N,1) + np.array(df)

  BM25_tf = numerator / denominator

  idfs = np.array(idfs)

  BM25_score = BM25_tf * idfs
  return pd.DataFrame(BM25_score, columns=vocabulary)



---



---



# BM25 Weights

In [18]:
bm25_df = BM25_IDF_df(df) 
print(bm25_df.shape)
bm25_df.head()

(1000, 4831)


Unnamed: 0,00,007,10000,100000,100th,10yearold,1183,11yearold,12,12yearold,...,zero,zodiac,zombie,zombiefilled,zombies,zone,zorg,zuckerberg,zulu,édith
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
def retrieve_ranking(query, bm25_df):
  q_terms = query.split(' ')
  q_terms_only = bm25_df[q_terms]
  score_q_d = q_terms_only.sum(axis=1)
  return sorted(zip(documents, score_q_d.values), key = lambda tup:tup[1], reverse=True)

In [20]:
result_war = retrieve_ranking('war', bm25_df) #input query here to get top 20 ranked results
columns1 = ['Indexed Overview', 'BM25_score']
df_r_war = pd.DataFrame(result_war, columns = columns1)
df_r_war[:20]

Unnamed: 0,Indexed Overview,BM25_score
0,ally prisoners war plan several hundred number...,3.963332
1,story te lawrence english officer successfully...,3.468042
2,former prisoner war brainwash unwitting assass...,3.308007
3,american civil war veteran embark journey resc...,3.181591
4,three trappers protect daughters british colon...,3.181591
5,young boy little sister struggle survive japan...,3.064481
6,1948 american court occupy germany try four na...,3.064481
7,ancient india adventurous dare man become invo...,3.064481
8,world war ii phase career controversial americ...,3.064481
9,young english boy struggle survive japanese oc...,3.064481




---



---



# Relevance Judgments

In [21]:
def retrieve_ranking(query, bm25_df):
  q_terms = query.split(' ')
  q_terms_only = bm25_df[q_terms]
  score_q_d = q_terms_only.sum(axis=1)
  return sorted(zip(bm25_df.index.values, score_q_d.values), key = lambda tup:tup[1], reverse=True)

In [22]:
rj_crime = retrieve_ranking('crime', bm25_df) #input query to get top 20 ranked result indexes
df_rj_crime = pd.DataFrame(rj_crime, columns = ['Index', 'BM25_score'])
df_rj_crime[:20]

Unnamed: 0,Index,BM25_score
0,186,4.464397
1,299,4.464397
2,53,4.13574
3,87,4.13574
4,196,4.13574
5,888,4.13574
6,1,3.988914
7,86,3.988914
8,136,3.988914
9,974,3.988914


In [23]:
rj_war = retrieve_ranking('war', bm25_df) #input query to get top 20 ranked result indexes
df_rj_war = pd.DataFrame(rj_war, columns = ['Index', 'BM25_score'])
df_rj_war[:20]

Unnamed: 0,Index,BM25_score
0,179,3.963332
1,116,3.468042
2,548,3.308007
3,553,3.181591
4,821,3.181591
5,46,3.064481
6,182,3.064481
7,331,3.064481
8,542,3.064481
9,832,3.064481


In [24]:
#Relevance judgments for top 20 ranked results
# QUERIES dictionary with {query_id: query}
queries = dict(enumerate([
    'crime',
    'war',
    
]))
# RELEVANCE JUDGEMENTS list with [(query_id, document_id, judgement), ...] judgement 0 | 1 with 1 = relevant
qrels = [
         (0, 186, 1),#crime
         (0, 299, 1),
         (0, 53, 0),
         (0, 87, 1),
         (0, 196, 1),
         (0, 888, 1),
         (0, 1, 1),
         (0, 86, 0),
         (0, 136, 1),
         (0, 974, 1),
         (0, 450, 1),
         (0, 814, 1),
         (0, 827, 1),
         (0, 298, 0),
         (0, 255, 1),
         (0, 620, 0),
         (0, 474, 1),
         (0, 147, 1),
         (0, 3, 1),
         (0, 702, 1),


         (1, 179, 1),#war
         (1, 116, 1),
         (1, 548, 0),
         (1, 553, 0),
         (1, 821, 1),
         (1, 46, 1),
         (1, 182, 1),
         (1, 331, 0),
         (1, 542, 1),
         (1, 832, 1),
         (0, 38, 1),
         (0, 256, 1),
         (0, 461, 1),
         (0, 822, 0),
         (0, 270, 1),
         (0, 314, 0),
         (0, 454, 1),
         (0, 595, 0),
         (0, 747, 1),
         (0, 936, 1),

        
]



---



---



# Evaluation

In [25]:
def precision_at_k(query_id, k=20):

  doc_ranking = retrieve_ranking(queries[query_id], bm25_df)
  retrieved = [doc[0] for doc in doc_ranking[:k]] 

  TP = np.array([int((query_id, doc, 1) in qrels) for doc in retrieved]).sum()
  FP = np.array([int((query_id, doc, 0) in qrels) for doc in retrieved]).sum()

  precision = TP / (TP+FP)

  return TP, FP, precision

In [26]:
def f1_score_at_k(query_id, k=20):
  # calculate f_1 score
  doc_ranking = retrieve_ranking(queries[query_id], bm25_df)
  retrieved = [doc[0] for doc in doc_ranking[:k]] 
  
  TP, FP, precision = precision_at_k(query_id, k)
  relevant_docs = np.array(qrels)
  relevant_docs = relevant_docs[relevant_docs[:, 0] == query_id][:,2].sum()
  FN = relevant_docs - TP

  recall = TP / (TP+FN)
  f1 = (2 * precision * recall) / (precision + recall)
  
  return f1, recall

In [27]:
#retrieve and calculate accuracy metrics for each query by looping over them
k = 20
for query_id, query in queries.items():
  tp, fp, precision = precision_at_k(query_id, k=k)
  f1_score, recall = f1_score_at_k(query_id, k=k)
  print('retrieved query "{}" with Precision@{} = {} Recall@{} = {} and F1-score = {}'.format(query, k, precision, k, recall, f1_score));

retrieved query "crime" with Precision@20 = 0.8 Recall@20 = 0.6956521739130435 and F1-score = 0.7441860465116279
retrieved query "war" with Precision@20 = 0.7 Recall@20 = 1.0 and F1-score = 0.8235294117647058




---



---



# Results for Input Query

In [28]:
q_terms_only_df = bm25_df[df_q['Query']]

score_q_d = q_terms_only_df.sum(axis=1)
score_q_d

0      0.000000
1      3.988914
2      0.000000
3      3.194932
4      0.000000
         ...   
995    0.000000
996    0.000000
997    0.000000
998    2.759734
999    0.000000
Length: 1000, dtype: float64

In [29]:
df_fin = df0.drop(['Poster_Link', 'Certificate', 'No_of_Votes', 'Gross', 'Runtime', 'Meta_score','Director', 'Star1', 'Star2', 'Star3', 'Star4', 'Overview'], axis = 1)
df_fin = df_fin.rename(columns={'Series_Title': "Title",
                        'Released_Year': 'Year',
                        'IMDB_Rating': 'Rating'})

title = df_fin['Title'].values.tolist()
year = df_fin['Year'].values.tolist()
genre = df_fin['Genre'].values.tolist()
rating = df_fin['Rating'].values.tolist()

In [30]:
result = sorted(zip(title, score_q_d.values, year, rating, genre, documents), key = lambda tup:tup[1], reverse=True)
result = [x for x in result if not 0.0 in x]
result[:3]    # Top 3 results

[('Key Largo',
  7.313097127435036,
  '1948',
  7.8,
  'Action, Crime, Drama',
  'man visit war buddys family hotel find gangster run things hurricane approach two end confront'),
 ("Miller's Crossing",
  6.484197291013432,
  '1990',
  7.7,
  'Crime, Drama, Thriller',
  'tom reagan advisor prohibitionera crime boss try keep peace war mob get catch divide loyalties'),
 ('Munna Bhai M.B.B.S.',
  5.9291469455656625,
  '2003',
  8.1,
  'Comedy, Drama, Musical',
  'gangster set fulfill father dream become doctor')]

In [33]:
pd.set_option('display.max_colwidth', None)
og_ov = df0['Overview']

columns = ['Title', 'Score', 'Year', 'IMBD Rating', 'Genre', 'Overview Documents']
def_res = pd.DataFrame(result, columns=columns)
def_res = def_res.sort_values(by=['Score'], ascending=False)
def_res = def_res.drop(['Overview Documents'], axis=1)
def_res = def_res.assign(Overview=og_ov)
def_res

Unnamed: 0,Title,Score,Year,IMBD Rating,Genre,Overview
0,Key Largo,7.313097,1948,7.8,"Action, Crime, Drama","Two imprisoned men bond over a number of years, finding solace and eventual redemption through acts of common decency."
1,Miller's Crossing,6.484197,1990,7.7,"Crime, Drama, Thriller",An organized crime dynasty's aging patriarch transfers control of his clandestine empire to his reluctant son.
2,Munna Bhai M.B.B.S.,5.929147,2003,8.1,"Comedy, Drama, Musical","When the menace known as the Joker wreaks havoc and chaos on the people of Gotham, Batman must accept one of the greatest psychological and physical tests of his ability to fight injustice."
3,Lucky Number Slevin,5.562296,2006,7.7,"Action, Crime, Drama","The early life and career of Vito Corleone in 1920s New York City is portrayed, while his son, Michael, expands and tightens his grip on the family crime syndicate."
4,A Bronx Tale,5.457973,1993,7.8,"Crime, Drama, Romance",A jury holdout attempts to prevent a miscarriage of justice by forcing his colleagues to reconsider the evidence.
...,...,...,...,...,...,...
93,Hacksaw Ridge,2.181288,2016,8.1,"Biography, Drama, History","In Nazi-occupied France during World War II, a plan to assassinate Nazi leaders by a group of Jewish U.S. soldiers coincides with a theatre owner's vengeful plans for the same."
94,Gran Torino,2.181288,2008,8.1,Drama,"When their relationship turns sour, a couple undergoes a medical procedure to have each other erased from their memories."
95,Mandariinid,2.125597,2013,8.2,"Drama, War","Amélie is an innocent and naive girl in Paris with her own sense of justice. She decides to help those around her and, along the way, discovers love."
96,1917,2.072679,2019,8.3,"Drama, Thriller, War","Unscrupulous boxing promoters, violent bookmakers, a Russian gangster, incompetent amateur robbers and supposedly Jewish jewelers fight to track down a priceless stolen diamond."
