In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
!pip install nltk
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
nltk.download('omw-1.4')
nltk.download('stopwords')
nltk.download('wordnet')

pd.set_option('display.max_colwidth', None)

[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Unzipping corpora/omw-1.4.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [2]:
def app():

  #Welcome, user input Query
  print('Hello and welcome to the movie matcher application!\nPlease enter 3 key words describing the movie you want to watch.\nFor example, you can enter "gangster", "war", "crime".')
  query1, query2, query3 = input(''), input(''), input('')

  df0 = pd.read_csv('imdb_top_1000.csv') 
  og_ov = df0['Overview']
  df= pd.read_csv('imdb_top_1000.csv') 


  q_terms = [query1, query2, query3]
  df_q = pd.DataFrame(q_terms, columns=['Query'])

  # Indexing
  df['Overview'] = df['Overview'].str.lower()
  df["Overview"] = df['Overview'].str.replace('[^\w\s]','', regex=True)
  documents=df['Overview'].unique()

  stop_words = stopwords.words('english')
  df['Overview'] = df['Overview'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))
  df_q['Query'] = df_q['Query'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))

  lemmatizer = WordNetLemmatizer()
  def lemmatize_words(text):
      words = text.split()
      words = [lemmatizer.lemmatize(word,pos='v') for word in words]
      return ' '.join(words)
  df['Overview'] = df['Overview'].apply(lemmatize_words)
  df_q['Query'] = df_q['Query'].apply(lemmatize_words)

  documents=df['Overview'].unique()

  # Vectorization
  vectorizer = CountVectorizer(stop_words='english')
  documents_vectorized = vectorizer.fit_transform(df.Overview)
  vocabulary = vectorizer.get_feature_names_out()

  df = pd.DataFrame(documents_vectorized.toarray(), columns=vocabulary)
  doc_ids = df.index.values

  # Retrieval Framework
  def BM25_IDF_df(df):

    dfs = (df > 0).sum(axis=0)
    N = df.shape[0]
    idfs = -np.log(dfs / N)
    
    k_1 = 1.2
    b = 0.8
    dls = df.sum(axis=1) 
    avgdl = np.mean(dls)

    numerator = np.array((k_1 + 1) * df)
    denominator = np.array(k_1 *((1 - b) + b * (dls / avgdl))).reshape(N,1) + np.array(df)
    BM25_tf = numerator / denominator
    idfs = np.array(idfs)
    BM25_score = BM25_tf * idfs

    return pd.DataFrame(BM25_score, columns=vocabulary)

  bm25_df = BM25_IDF_df(df) 

  # BM25 Scores
  q_terms_only_df = bm25_df[q_terms]
  score_q_d = q_terms_only_df.sum(axis=1)

  # Results
  df_fin = df0.drop(['Poster_Link', 'Certificate', 'No_of_Votes', 'Gross', 'Runtime', 'Meta_score','Director', 'Star1', 'Star2', 'Star3', 'Star4', 'Overview'], axis = 1)
  df_fin = df_fin.rename(columns={'Series_Title': "Title", 'Released_Year': 'Year', 'IMDB_Rating': 'Rating'})

  title = df_fin['Title'].values.tolist()
  year = df_fin['Year'].values.tolist()
  genre = df_fin['Genre'].values.tolist()
  rating = df_fin['Rating'].values.tolist()

  result = sorted(zip(title, score_q_d.values, year, rating, genre, documents), key = lambda tup:tup[1], reverse=True)
  result = [x for x in result if not 0.0 in x]

  # Results Dataframe, assign un-indexed overview
  columns2 = ['Title', 'Score', 'Year', 'IMBD Rating', 'Genre', 'Overview']
  def_res = pd.DataFrame(result, columns=columns2)
  def_res = def_res.sort_values(by=['Score'], ascending=False)
  def_res = def_res.drop(['Overview'], axis=1)
  def_res = def_res.assign(Overview=og_ov)

  # Optimisation: Apply Shuffle?
  print('\nWould you like to shuffles movie match results?\nPlease enter "Yes" or "No"')
  choice = input()

  if choice == 'No':
    print('\nThank you for using our movie recommendation engine. Enjoy!\n')

    return def_res

  if choice == 'Yes':
    print('\nThank you for using our movie recommendation engine. Enjoy!\n')

    split = int(len(def_res)/2)

    shuffle1 = def_res[:split]
    shuffle2 = def_res[split:]

    s1 = shuffle1.sample(frac=1)
    s2 = shuffle2.sample(frac=1)

    return pd.concat([s1, s2])

app()

Hello and welcome to the movie matcher application!
Please enter 3 key words describing the movie you want to watch.
For example, you can enter "gangster", "war", "crime".
gangster
war
crime

Would you like to shuffles movie match results?
Please enter "Yes" or "No"
No

Thank you for using our movie recommendation engine. Enjoy!



Unnamed: 0,Title,Score,Year,IMBD Rating,Genre,Overview
0,Key Largo,7.313097,1948,7.8,"Action, Crime, Drama","Two imprisoned men bond over a number of years, finding solace and eventual redemption through acts of common decency."
1,Miller's Crossing,6.484197,1990,7.7,"Crime, Drama, Thriller",An organized crime dynasty's aging patriarch transfers control of his clandestine empire to his reluctant son.
2,Munna Bhai M.B.B.S.,5.929147,2003,8.1,"Comedy, Drama, Musical","When the menace known as the Joker wreaks havoc and chaos on the people of Gotham, Batman must accept one of the greatest psychological and physical tests of his ability to fight injustice."
3,Lucky Number Slevin,5.562296,2006,7.7,"Action, Crime, Drama","The early life and career of Vito Corleone in 1920s New York City is portrayed, while his son, Michael, expands and tightens his grip on the family crime syndicate."
4,A Bronx Tale,5.457973,1993,7.8,"Crime, Drama, Romance",A jury holdout attempts to prevent a miscarriage of justice by forcing his colleagues to reconsider the evidence.
...,...,...,...,...,...,...
93,Hacksaw Ridge,2.181288,2016,8.1,"Biography, Drama, History","In Nazi-occupied France during World War II, a plan to assassinate Nazi leaders by a group of Jewish U.S. soldiers coincides with a theatre owner's vengeful plans for the same."
94,Gran Torino,2.181288,2008,8.1,Drama,"When their relationship turns sour, a couple undergoes a medical procedure to have each other erased from their memories."
95,Mandariinid,2.125597,2013,8.2,"Drama, War","Amélie is an innocent and naive girl in Paris with her own sense of justice. She decides to help those around her and, along the way, discovers love."
96,1917,2.072679,2019,8.3,"Drama, Thriller, War","Unscrupulous boxing promoters, violent bookmakers, a Russian gangster, incompetent amateur robbers and supposedly Jewish jewelers fight to track down a priceless stolen diamond."
