# Setting Up

In [None]:
!pip install scikit-learn
!pip install nltk

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

import json

# Reading data from file

In [None]:
class Movie:
    """A single movie record used as one document of the search corpus.

    All constructor arguments are stored as given, except ``title``: a
    leading numeric rank prefix such as ``"12. "`` is removed from it.
    """

    def __init__(self, _id, title, plot, directors, writers, actors):
        """Store the movie fields.

        Args:
            _id: Identifier of the movie, exposed as ``imdb_id``.
            title: Title string, optionally prefixed with ``"<digits>. "``.
            plot: Plot text.
            directors: Iterable of strings (space-joined by ``to_process``).
            writers: Iterable of strings (space-joined by ``to_process``).
            actors: Iterable of mappings, each providing a ``'name'`` key.
        """
        self.imdb_id = _id
        # Strip only a leading "<digits>. " prefix.  Splitting on every
        # '. ' and keeping the last piece would truncate titles that contain
        # that sequence themselves (e.g. "Kill Bill: Vol. 1" -> "1").
        prefix, separator, remainder = title.partition('. ')
        if separator and prefix.strip().isdigit():
            self.title = remainder
        else:
            self.title = title
        self.plot = plot
        self.directors = directors
        self.writers = writers
        self.actors = actors

    def to_process(self):
        """Return all searchable text of the movie as one string.

        The title, plot, directors, writers and actor names are concatenated
        in that order, separated by single spaces.
        """
        actor_names = [actor['name'] for actor in self.actors]
        return ' '.join([
            self.title,
            self.plot,
            ' '.join(self.directors),
            ' '.join(self.writers),
            ' '.join(actor_names),
        ])

    def __str__(self):
        """Return ``<imdb_id> - "<title>"``."""
        return f'{self.imdb_id} - "{self.title}"'

In [None]:
def as_movie(dct):
    """Build a ``Movie`` from one decoded JSON record.

    Args:
        dct: Mapping providing the keys 'imdb_id', 'title', 'plot',
            'directors', 'writers' and 'actors'.

    Returns:
        The ``Movie`` constructed from those six values, in that order.

    Raises:
        KeyError: If any of the keys is missing from ``dct``.
    """
    field_names = ('imdb_id', 'title', 'plot', 'directors', 'writers', 'actors')
    return Movie(*(dct[name] for name in field_names))

In [None]:
# Load the corpus: one JSON object per line (JSONL), one Movie per object.
movies = []
skipped_lines = 0  # number of non-blank lines that were not valid JSON
with open('./imdb_search/imdb-movies.jsonl', 'r', encoding='utf-8') as f:
    for line in f:
        # Blank lines (e.g. a trailing newline) carry no record.
        if not line.strip():
            continue
        try:
            movie_data = json.loads(line)
        except json.JSONDecodeError:
            # Best-effort load: keep going, but remember that data was
            # dropped so it can be reported instead of vanishing silently.
            skipped_lines += 1
            continue
        movie = as_movie(movie_data)
        movies.append(movie)

if skipped_lines:
    print(f'Skipped {skipped_lines} malformed line(s) while loading movies.')

# Preprocessing

In [None]:
# Lazily-filled cache so the stop-word list is read and the stemmer is built
# once, not on every call (preprocess_text runs once per document).
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return the shared (stop-word set, stemmer) pair, creating it on first use."""
    if not _PREPROCESS_RESOURCES:
        _PREPROCESS_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _PREPROCESS_RESOURCES['stemmer'] = PorterStemmer()
    return _PREPROCESS_RESOURCES['stop_words'], _PREPROCESS_RESOURCES['stemmer']


def preprocess_text(text):
    """Normalize ``text`` for indexing or querying.

    Steps, in order: case folding (lower-casing), tokenization with
    ``word_tokenize``, removal of English stop words, Porter stemming of the
    remaining tokens.

    Args:
        text: The raw string to normalize.

    Returns:
        The processed tokens joined by single spaces.
    """
    # case folding
    lowered = text.lower()

    stop_words, stemmer = _get_preprocess_resources()

    # stopword removal + stemming
    tokens = word_tokenize(lowered)
    stems = [stemmer.stem(token) for token in tokens if token not in stop_words]

    return ' '.join(stems)

# Information Retrieval


In [None]:
def documents_to_cosine_similarities(documents, search_query):
    """Score every document against a query using TF-IDF cosine similarity.

    A ``TfidfVectorizer`` is fitted on ``documents``; the query is then
    projected with that same fitted vectorizer and compared to each document.

    Args:
        documents: The corpus texts to fit the vectorizer on.
        search_query: The query text, passed to the vectorizer as a
            one-element list.

    Returns:
        The result of ``cosine_similarity(query_vector, document_matrix)``.
    """
    vectorizer = TfidfVectorizer()
    document_matrix = vectorizer.fit_transform(documents)
    query_vector = vectorizer.transform([search_query])
    return cosine_similarity(query_vector, document_matrix)


# IR System

In [None]:
# Read a query, rank every movie against it and print the best match.
query = input()
processed_movies = [preprocess_text(movie.to_process()) for movie in movies]
query = preprocess_text(query)
cosine_movies = documents_to_cosine_similarities(processed_movies, query)

scores = cosine_movies.flatten()
best_index = scores.argmax()
# When the query shares no term with any document every score is 0 and
# argmax() falls back to index 0; report "no match" instead of presenting
# the first movie of the corpus as a result.
if scores[best_index] > 0:
    print(movies[best_index])
else:
    print('No matching movie found.')

