# **Some preprocessing stuff**

In [None]:
import nltk
import string
import pickle
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from google.colab import drive

In [None]:
# Download the NLTK resources this notebook requests: the 'stopwords' corpus
# (read via stopwords.words in generate_tokens) and the 'punkt' package
# (presumably what nltk.word_tokenize needs — confirm for the installed NLTK version).
# The value of the last call is what the cell displays.
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
# Mount Google Drive at /content/drive; the dataset paths used later live under it.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
def generate_tokens(txt: str) -> list:
    """
    Tokenizes a string and returns its normalized tokens.

    The text is split with ``nltk.word_tokenize``. Tokens that are not purely
    alphabetic, or whose lower-cased form is an English stop word, are dropped;
    the remaining tokens are lower-cased and stemmed with the Porter stemmer.

    :param txt: The string to process.
    :return: A list of stemmed, lower-cased tokens in their original order.
    """
    # A set gives constant-time membership tests; the previous list was scanned
    # linearly for every token. Punctuation stays in the set for parity with the
    # old filter, although isalpha() already rejects punctuation tokens.
    stop_words = set(stopwords.words('english')) | set(string.punctuation)
    stemmer = PorterStemmer()
    tokens = []
    for word in nltk.word_tokenize(txt):
        lowered = word.lower()  # lower-case once, reuse for filter and stemmer
        if word.isalpha() and lowered not in stop_words:
            tokens.append(stemmer.stem(lowered))
    return tokens

In [None]:
def create_revert_index(tokens: list) -> dict:
    """
    Builds a positional inverted index for one token list.

    Each distinct token maps to a dict with two keys:
    ``'repeat'`` (how many times the token occurs) and ``'indexes'`` (the
    positions at which it occurs). Positions are offsets into ``tokens`` —
    i.e. into the already filtered token list — not offsets into the original
    string the tokens were extracted from.

    :param tokens: A list of tokens.
    :return: A dictionary keyed by token, in order of first occurrence.
    """
    postings = {}
    for position, term in enumerate(tokens):
        # Create the entry on first sight, then update it uniformly.
        entry = postings.setdefault(term, {'repeat': 0, 'indexes': []})
        entry['repeat'] += 1
        entry['indexes'].append(position)
    return postings

In [None]:
def process_data(path) -> dict:
    """
    Reads a tab-separated file and indexes every record in it.

    Each non-blank line is expected to hold an id, a tab character and a text.
    For every record the text is tokenized with ``generate_tokens`` and indexed
    with ``create_revert_index``.

    :param path: Path of the file to read (one ``id<TAB>text`` record per line).
    :return: A dict mapping each id to
             ``{'summary': text, 'tokens': list, 'index': dict}``.
             The summary is stored exactly as read, including its line ending.
    :raises ValueError: If a non-blank line contains no tab separator.
    """
    processed = {}
    print('Started generating...')
    # NOTE(review): UTF-8 is assumed for the dataset files — confirm.
    with open(path, encoding='utf-8') as file:
        for line_number, line in enumerate(file, start=1):
            if not line.strip():
                # Blank lines (e.g. a trailing newline at end of file) carry no record.
                continue
            # Split on the FIRST tab only, so a summary that itself contains
            # tabs no longer breaks the two-value unpacking.
            movie_id, separator, summary = line.partition('\t')
            if not separator:
                raise ValueError(
                    'line %d of %s has no tab separator' % (line_number, path)
                )
            tokens = generate_tokens(summary)
            index = create_revert_index(tokens)
            processed[movie_id] = {
                'summary': summary,
                'tokens': tokens,
                'index': index
            }
    print('Finished...')
    return processed

In [None]:
# Index every movie plot summary; `processed` is keyed by the id in the file's first column.
processed = process_data("/content/drive/MyDrive/Information Retrival/plot_summaries.txt")

Started generating...
Finished...


In [None]:
# Index the queries with the same pipeline as the plots, so both share one token space.
query = process_data("/content/drive/MyDrive/Information Retrival/queries.txt")

Started generating...
Finished...


merge index of queries and documents:

In [None]:
# Merge the query records into the plot records, in place.
# NOTE(review): dict.update overwrites on key collision — a plot whose id equals a
# query id would be silently replaced; confirm the two id spaces are disjoint.
processed.update(query)

# **Calculate TF-IDF:**

Calculate the Document Frequency (DF): for each word, collect the set of ids of the documents that contain it.

In [None]:
# DF maps every word to the set of ids (plots and queries) whose index contains it;
# len(DF[word]) is therefore the document frequency of that word.
DF = {}
for doc_id, detail in processed.items():
    for word in detail['index']:
        # setdefault replaces the former bare `except:`, which would also have
        # hidden unrelated errors (and caught KeyboardInterrupt).
        DF.setdefault(word, set()).add(doc_id)

calculate tfidf:

**statistics = {'movieid'= {words}}**</br>
**words = {'tf', 'tfidf'}** </br>
idf is not stored as a key of its own because: 1. it can be derived from the separate DF dict; 2. it is not a stable value (it changes whenever a new movie plot is added).

Because tf and idf are multiplied in the end, it is unnecessary to calculate idf for every vocabulary word in each document or query:</br>
if word X does not occur in a document, its tf is 0 and so the product is 0 as well.

In [None]:
import math

# N in idf = log10(N / df) is the number of indexed texts (plots plus merged
# queries), not the vocabulary size len(DF) that was used before.
total_docs = len(processed)

# statistics[doc_id][word] = {'tf': log-scaled term frequency, 'tfidf': tf * idf}
statistics = {}
for doc_id, detail in processed.items():
    words = {}
    for word, posting in detail['index'].items():
        # df <= N always holds, so idf (and hence tfidf) is never negative.
        idf = math.log10(total_docs / len(DF[word]))
        # Sub-linear tf scaling: 1 + log10(raw count); the count is >= 1 here.
        tf = 1 + math.log10(posting['repeat'])
        words[word] = {
            'tf': tf,
            'tfidf': tf * idf
        }
    statistics[doc_id] = words

# **Cosine similarity**
between queries and documents

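Normalize each plot vector to unit length by dividing the tfidf of each word by the Euclidean norm of the vector, i.e. the square root of the sum of the squared tfidf values of all its words.</br>
With $w_i$ the tfidf of word $i$: $\hat{w}_i = \dfrac{w_i}{\sqrt{\sum_j w_j^2}}$</br>
Update `statistics` and add the normalized score to each word's statistics: **words = {'tf', 'tfidf', 'normalized'}**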

In [None]:
# Length-normalize every tf-idf vector so that the dot product of two vectors
# is their cosine similarity: normalized_i = tfidf_i / sqrt(sum_j tfidf_j ** 2).
for doc_id, words in statistics.items():
    norm = math.sqrt(sum(stat['tfidf'] ** 2 for stat in words.values()))
    for stat in words.values():
        # A vector whose weights are all zero has no direction; give it zeros
        # instead of raising ZeroDivisionError.
        stat['normalized'] = stat['tfidf'] / norm if norm else 0.0

Calculate the similarity between queries and plots: for each word of a query that also occurs in a plot, multiply the two normalized weights, then sum the products.

In [None]:
# Queries were merged into `statistics` under the ids '1' .. '10'; every other
# key is a plot to be scored against them.
query_ids = [str(number) for number in range(1, 11)]
candidate_docs = set(statistics) - set(query_ids)

# output[query_id][doc_id] = dot product of the two normalized vectors,
# taken over the words the query and the plot have in common.
output = {}
for query_id in query_ids:
    query_vector = statistics[query_id]
    scores = {}
    for doc_id in candidate_docs:
        doc_vector = statistics[doc_id]
        scores[doc_id] = sum(
            weight['normalized'] * doc_vector[term]['normalized']
            for term, weight in query_vector.items()
            if term in doc_vector
        )
    output[query_id] = scores

sort results and find 10 most similar plots to each query

In [None]:
import heapq
from operator import itemgetter

# topitems[i - 1] holds the ids of the 10 best-scoring plots for query i, best first.
topitems = []
for i in range(1, 11):
    # Rank (doc_id, score) pairs by the score. Ranking the bare keys with
    # itemgetter(1) compared the second CHARACTER of each id string, which
    # produced the same list for every query.
    best = heapq.nlargest(10, output[str(i)].items(), key=itemgetter(1))
    topitems.append([doc_id for doc_id, _score in best])
print(topitems)

[['29443845', '9951615', '29991802', '19310709', '6954675', '9962261', '2918617', '29320666', '893465', '4965004'], ['29443845', '9951615', '29991802', '19310709', '6954675', '9962261', '2918617', '29320666', '893465', '4965004'], ['29443845', '9951615', '29991802', '19310709', '6954675', '9962261', '2918617', '29320666', '893465', '4965004'], ['29443845', '9951615', '29991802', '19310709', '6954675', '9962261', '2918617', '29320666', '893465', '4965004'], ['29443845', '9951615', '29991802', '19310709', '6954675', '9962261', '2918617', '29320666', '893465', '4965004'], ['29443845', '9951615', '29991802', '19310709', '6954675', '9962261', '2918617', '29320666', '893465', '4965004'], ['29443845', '9951615', '29991802', '19310709', '6954675', '9962261', '2918617', '29320666', '893465', '4965004'], ['29443845', '9951615', '29991802', '19310709', '6954675', '9962261', '2918617', '29320666', '893465', '4965004'], ['29443845', '9951615', '29991802', '19310709', '6954675', '9962261', '2918617'

Finally..
create 10 files to save result of queries.

In [None]:
# Write one result file per query: a header quoting the query text, followed by
# one "<plot id>    <summary>" line for each of its top-ranked plots.
for i in range(1, 11):
    p = '/content/drive/MyDrive/Information Retrival/query' + str(i) + '.txt'
    with open(p, 'w', encoding='utf-8') as writefile:
        # Summaries may still carry the line ending they were read with; strip
        # it so the closing quote stays on the header line, then end each line
        # explicitly.
        query_text = processed[str(i)]['summary'].strip()
        writefile.write('Top 10 of similar movie plot with "' + query_text + '"\n')
        # topitems is 0-based while query ids start at 1 (the previous name,
        # queryResult, was never defined and raised NameError).
        for docId in topitems[i - 1]:
            writefile.write(docId + "    " + processed[docId]['summary'].strip() + "\n")

# **Junk**

In [None]:
from itertools import chain

# Vocabulary of the whole collection: every word that is a key of any record's index.
index_vocabularies = (plot['index'] for plot in processed.values())
uniqueWords = set(chain.from_iterable(index_vocabularies))

Creating a count vector means that, if we have: </br>
...     'This is the first document.', </br>
...     'This document is the second document.',</br>
...     'And this is the third one.',</br>
...     'Is this the first document?',</br>
generate a vocab array that looks like: </br>
array(['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third',
       'this'], ...)</br>
and finally our count vector will be: </br>
[[0 1 1 1 0 0 1 0 1]</br>
 [0 2 0 1 0 1 1 0 1]</br>
 [1 0 0 1 1 0 1 1 1]</br>
 [0 1 1 1 0 0 1 0 1]]

In [None]:
import numpy as np

# Column of each vocabulary word and row of each record in the count matrix.
vocabulary = {word: column for column, word in enumerate(uniqueWords)}
movies = {movie_id: row for row, movie_id in enumerate(processed.keys())}

# countVector[row] is the dense term-count vector of one record, in the row
# order given by `movies`.
# NOTE(review): one dense vector of len(uniqueWords) floats per record can be
# very large for a big collection — confirm it fits in memory.
countVector = []
for movie_id, detail in processed.items():
    movie = np.zeros(len(uniqueWords))
    for word, posting in detail['index'].items():
        movie[vocabulary[word]] = posting['repeat']
    # Append once per record. The append used to sit inside the word loop, so a
    # record was added once per distinct word (and never if it had no words),
    # which broke the row alignment with `movies`.
    countVector.append(movie)


In [None]:
from itertools import chain
def UniqueWords(dic) -> list:
    """
    Collects the distinct words used across all records.

    :param dic: A dict whose values each hold an ``'index'`` mapping keyed by word.
    :return: A list of the distinct words, in no particular order.
    """
    vocabulary = {word for plot in dic.values() for word in plot['index']}
    return list(vocabulary)