In [2]:
#import used 
import re
import csv
import pandas as pd
from os import listdir
from os.path import isfile, join
import os.path

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer 
from nltk.tokenize import sent_tokenize, word_tokenize

import math
import numpy as np
import scipy 
import heapq
from tqdm import tqdm_notebook

# Used for Output
from IPython.display import Markdown, display
def printmd(string):
    """Show ``string`` in the cell output, rendered as Markdown."""
    rendered = Markdown(string)
    display(rendered)

# Download HTML pages

In [8]:
#import used
import requests
import urllib
import time
from bs4 import BeautifulSoup

# Parse the page saved from GitHub. The context manager closes the file as
# soon as BeautifulSoup has read it (the original left the handle open).
with open(r"moviespart1.html") as html_file:
    movies1 = BeautifulSoup(html_file, "html.parser")

# Collect the target of every <a> tag that carries an href attribute.
urls = [link['href'] for link in movies1.find_all('a', href=True)]

print(urls[0:10])  # show the first 10 URLs as an example


['https://en.wikipedia.org/wiki/Love_by_the_Light_of_the_Moon', 'https://en.wikipedia.org/wiki/The_Martyred_Presidents', 'https://en.wikipedia.org/wiki/Terrible_Teddy,_the_Grizzly_King', 'https://en.wikipedia.org/wiki/Jack_and_the_Beanstalk_(1902_film)', 'https://en.wikipedia.org/wiki/Alice_in_Wonderland_(1903_film)', 'https://en.wikipedia.org/wiki/The_Great_Train_Robbery_(1903_film)', 'https://en.wikipedia.org/wiki/The_Suburbanite', 'https://en.wikipedia.org/wiki/The_Little_Train_Robbery', 'https://en.wikipedia.org/wiki/The_Night_Before_Christmas_(1905_film)', 'https://en.wikipedia.org/wiki/Dream_of_a_Rarebit_Fiend_(1906_film)']


#### We implemented a for loop instead of a function.####

In [None]:
# Download every page listed in `urls` and store it as article<c>.html.
# Each page is requested once; the response body is written to disk only
# when the server answered with status 200.
MAX_ATTEMPTS = 3           # tries per URL before giving up on it
PAUSE_BETWEEN_PAGES = 2    # seconds to wait after each page
PAUSE_AFTER_ERROR = 1200   # seconds (20 minutes) to wait after a failure

c = 0  # index of the next article file; only advanced after a successful save
for url in urls:
    for attempt in range(MAX_ATTEMPTS):
        try:
            r = requests.get(url, timeout=60)
        except requests.RequestException:
            r = None  # network-level failure, handled like a bad status code

        if r is not None and r.status_code == 200:
            with open("article{}.html".format(c), "wb") as out_file:
                out_file.write(r.content)
            c += 1
            break

        # Back off before retrying the same URL.
        time.sleep(PAUSE_AFTER_ERROR)
    else:
        # All attempts failed: report it and move on without consuming an
        # index, so the saved files stay numbered without gaps.
        print("Could not download URL" + url)

    time.sleep(PAUSE_BETWEEN_PAGES)

# Parser Html Files

#### These are the functions used to get TITLE, INTRO, PLOT and INFOBOX from the downloaded HTML pages ####

In [None]:
def title (soup):
    """Return the text of the first element matching ``#firstHeading``."""
    heading = soup.select("#firstHeading")[0]
    return heading.text

def intro(soup):
    """Return the introduction of a parsed page as one string.

    The introduction is the first ``<p>`` of the page (or the second one
    when the first is the ``mw-empty-elt`` placeholder) followed by every
    sibling that comes directly after it for as long as the siblings are
    ``<p>`` elements. Returns ``None`` when the required paragraph does not
    exist.
    """
    try:
        paragraphs = soup.findAll('p')
        lead = paragraphs[0]
        # Skip the empty placeholder paragraph when it comes first.
        if lead == soup.find("p", class_="mw-empty-elt"):
            lead = paragraphs[1]
    except IndexError:
        return None

    pieces = [lead.text]
    sibling = lead.find_next_sibling()
    # Stop at the first sibling that is missing or is not a paragraph.
    while sibling and sibling.name == 'p':
        pieces.append(sibling.text)
        sibling = sibling.find_next_sibling()

    return "".join(pieces)

def plot (soup):
    """Return the plot text of a parsed page as one string.

    The plot is taken from below the first ``<h2>`` heading whose text is
    neither ``'Contents'`` nor ``'Cast'``: the first ``<p>`` sibling after
    that heading plus every directly following sibling for as long as the
    siblings are ``<p>`` elements.

    Returns ``None`` when the page has no usable ``<h2>`` heading, and an
    empty string when the heading has no ``<p>`` sibling.
    """
    skipped_headings = ('Contents', 'Cast')

    # The original code tried to skip a 'Cast' heading that follows
    # 'Contents' but re-selected the same heading; scanning forward skips
    # every leading heading that cannot introduce the plot.
    heading = None
    for candidate in soup.findAll('h2'):
        if candidate.text not in skipped_headings:
            heading = candidate
            break
    if heading is None:
        return None

    paragraphs = []
    node = heading.find_next_sibling('p')
    # Stop at the first sibling that is missing or is not a paragraph.
    while node and node.name == 'p':
        paragraphs.append(node.text)
        node = node.find_next_sibling()

    return "".join(paragraphs)

def infobox(soup):
    """Extract the requested fields from the page's infobox table.

    Returns a dictionary that always has the same ten keys in the same
    order. A field found in the ``infobox vevent`` table maps to its cell
    text, with the text fragments of the cell joined by ``", "``; a field
    that is absent (or every field, when the page has no infobox) maps to
    ``None``.
    """
    fields = (
        "Directed by",
        "Produced by",
        "Written by",
        "Starring",
        "Music by",
        "Release date",
        "Running time",
        "Country",
        "Language",
        "Budget",
    )
    final = dict.fromkeys(fields)  # every field starts out as missing

    table = soup.find('table', class_='infobox vevent')
    if table is None:
        return final

    # The first row is skipped, as in the original implementation.
    for row in table.find_all('tr')[1:]:
        header = row.find('th')
        cell = row.find('td')
        # Rows that lack a header or a value cell carry no field; skipping
        # them keeps one such row from discarding the whole infobox.
        if header is None or cell is None:
            continue
        label = header.text
        if label in final:
            fragments = cell.get_text(strip=True, separator='|').split('|')
            final[label] = ", ".join(fragments)

    return final

### We decided to store everything in a dictionary first, also adding the URL of every film even though it wasn't requested explicitly. We thought it was the fastest way of retrieving this information, which we need to show in the output of the search engines ###

In [None]:
# NOTE(review): absolute, machine-specific path; adjust it to the folder that
# holds the downloaded article<i>.html files.
dir_path = r"C:\Users\loren\Downloads\HW3\Movie1"

# Folder that receives one TSV file per movie; created on first use.
tsv_dir = "TSV"
os.makedirs(tsv_dir, exist_ok=True)

# The loop assumes dir_path contains only article0.html ... article<n-1>.html.
for i in tqdm_notebook(range (len(os.listdir(dir_path)))):
    file_name = os.path.join(dir_path, "article{}.html".format(i))
    with open(file_name, encoding="utf8") as html_file:
        # Name the parser explicitly so every page is parsed the same way.
        soup = BeautifulSoup(html_file, "html.parser")

    # The page's own URL is published in <link rel="canonical">.
    canonical_link = soup.find_all("link",{"rel" : "canonical"})
    url = canonical_link[0].get('href')

    # One dictionary per movie: Title, Intro, Plot, Url, then the infobox fields.
    final = {
        'Title': title(soup),
        'Intro': intro(soup),
        'Plot': plot(soup),
        'Url': url,
    }
    final.update(infobox(soup))

    # newline='' is what the csv module requires; without it Windows output
    # gets an extra carriage return at the end of every row.
    out_path = os.path.join(tsv_dir, 'article_{}.tsv'.format(i))
    with open(out_path, 'wt', encoding="utf8", newline='') as out_file:
        tsv_writer = csv.DictWriter(out_file, final.keys(), delimiter ='\t')
        tsv_writer.writeheader()
        tsv_writer.writerow(final)

# Vocabulary and inverted index

In [None]:
def save_dict_to_file(dic, file):
    """Write ``str(dic)`` to ``<file>.txt`` (UTF-8), replacing any old content.

    ``file`` is the target path without the ``.txt`` extension.
    """
    # The context manager closes the file even when the write fails.
    with open('{}.txt'.format(file), 'w', encoding="utf8") as f:
        f.write(str(dic))
    
def information(data_frame):
    """Collect the text of every column of a one-movie data frame.

    For each column, in order, the value of row 0 is used, or the value of
    row 1 when row 0 is null. Newlines are replaced by spaces.

    Returns ``(col, message)``:

    * ``(texts, '')`` with one string per column on success. A column other
      than Intro/Plot with no usable row contributes an empty string.
    * ``(texts_so_far, 'Continue')`` when the ``Intro`` or ``Plot`` column
      has a null first row and no second row; the caller is expected to
      skip that document.
    """
    col = []

    # Iterate the frame that was passed in (the original read the global `df`).
    for column in data_frame:
        values = data_frame[column]
        required = column == 'Plot' or column == 'Intro'

        if pd.isnull(values[0]):
            try:
                text = str(values[1])
            except (KeyError, IndexError):
                if required:
                    # `filename` is set by the indexing cell; fall back to an
                    # empty label when the function is used without it.
                    print(globals().get('filename', ''), column)
                    return col, 'Continue'
                text = ''
        else:
            text = str(values[0])

        col.append(text.replace('\n', ' '))

    return col, ''

#### Let's start to build up the indexes #### 

In [None]:
# Containers filled while the documents are indexed:
#   documentlist   -> {doc_i: ['love', 'movie', ...]}   pre-processed words of every file
#   vocabulary     -> {'love': 3, 'movie': 2}           term -> term_id
#   inverted_index -> {1: [doc_1, doc_5, ...], ...}     term_id -> documents containing the term
documentlist, vocabulary, inverted_index = {}, {}, {}

# Last term_id handed out to a word of the vocabulary.
word_index = 0

In [None]:
# NOTE(review): absolute, machine-specific path; adjust it to the folder that
# holds the article_<i>.tsv files.
dir_path = r"C:\Users\Luca\Desktop\-\Università\Magistrale\Primo anno\Primo semestre\ADM\Homeworks\Homework #3\TSV"

# Built once instead of once per token: the stopword list is read from the
# corpus on every call and the stemmer does not change between words.
stop_words = set(nltk.corpus.stopwords.words('english'))
stemmer = nltk.stem.PorterStemmer()
# A token is dropped when it is a substring of this string (same test as before).
special_characters = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~·'

for i in tqdm_notebook(range (len(os.listdir(dir_path)))):
    filename = os.path.join(dir_path, "article_{}.tsv".format(i))

    # Creating a dataframe for each movie
    df = pd.read_csv(filename, sep='\t', encoding  = 'utf-8')
    doc = 'article_{}.tsv'.format(i)

    col, message = information(df)
    if message == 'Continue':
        # Intro or Plot is missing: the document is not indexed.
        continue

    # Step 2: join the text of every column (Title, Intro, Plot, Url and the
    # infobox fields), then tokenize, filter and stem it.
    to_tokenize = ' '.join(col)
    tokens = nltk.word_tokenize(to_tokenize)
    filtered_words = [stemmer.stem(word)
                      for word in tokens
                      if word not in stop_words and word not in special_characters]

    # Step 3: remember the processed words of the document ...
    documentlist[doc] = filtered_words

    # ... and register each distinct word once. dict.fromkeys keeps the order
    # of first appearance, so term ids are assigned exactly as before.
    for w in dict.fromkeys(filtered_words):

        # VOCABULARY
        if w not in vocabulary:
            word_index += 1
            vocabulary[w] = word_index

        # INVERTED INDEX: add the document's name unless it is already listed
        postings = inverted_index.setdefault(vocabulary[w], [])
        if doc not in postings:
            postings.append(doc)

        

### Inverted index with TF*IDF score

In [None]:
docpaths = r"C:\Users\Luca\Desktop\-\Università\Magistrale\Primo anno\Primo semestre\ADM\Homeworks\Homework #3\TSV"

# NOTE(review): eval() executes whatever these files contain. Only load files
# written by this notebook; consider ast.literal_eval for anything else.
with open('vocabulary.txt', 'r', encoding = 'utf-8') as f:
    vocabulary = eval(f.read())

with open('inverted_index.txt', 'r', encoding = 'utf-8') as f:
    inverted_index = eval(f.read())

with open('documentlist.txt', 'r', encoding = 'utf-8') as f:
    documentlist = eval(f.read())

# term_id -> [(document name, tf-idf score), ...]
# tfidf() is defined in the "Search engine n.2" section and must be run first.
new_inverted_index = {}
for key, doc in tqdm_notebook(documentlist.items()):
    # Score each distinct word once per document. The original scored every
    # occurrence and removed the resulting duplicates afterwards.
    for w in dict.fromkeys(doc):
        new_inverted_index.setdefault(vocabulary[w], []).append((key, tfidf(w, doc)))
    


In [None]:
# Persist the TF*IDF inverted index as inverted_index_tfidf.txt
# (save_dict_to_file appends the .txt extension itself).
save_dict_to_file(new_inverted_index,"inverted_index_tfidf")

# Search engine n. 1

### At this moment, we narrow our interest on the intro and plot of each document. It means that the first Search Engine will evaluate queries with respect to the aforementioned information.

In [9]:
# Taking the user's query
query = input()

# Tokenize the query, drop stopwords and special characters, then stem.
# The stopword list and the stemmer are built once instead of once per token.
stop_words = set(stopwords.words('english'))
stemmer = nltk.stem.PorterStemmer()
tokens = nltk.word_tokenize(query)
query_tokens = [stemmer.stem(token)
                for token in tokens
                if token not in stop_words and token not in '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~·']

# Load vocabulary and inverted index.
# NOTE(review): eval() executes whatever these files contain. Only load files
# written by this notebook.
with open('vocabulary.txt', 'r', encoding = 'utf-8') as f:
    vocabulary = eval(f.read())
with open('inverted_index.txt', 'r', encoding = 'utf-8') as f:
    inverted_index = eval(f.read())

# term_ids of the query terms; terms missing from the vocabulary are ignored
term_ids = [vocabulary[token] for token in query_tokens if token in vocabulary]

# For every term_id, the documents listed in the inverted index
search_results = [inverted_index[i] for i in term_ids if i in inverted_index]

# Removing duplicates and preparing for intersection (conjunction)
new_list = [set(list_) for list_ in search_results]

# Documents that contain all the (known) words of the query. With no known
# word there is nothing to intersect, so the result is empty instead of a
# TypeError from set.intersection().
intersect = set.intersection(*new_list) if new_list else set()

 disney 


#### Output

In [None]:
col_names = ["Title","Intro", 'Url']

# Read every matching document and keep only TITLE, INTRO and URL.
# The files are read from the TSV folder that the parsing cell writes to and
# that the other search engines read from.
frames = []
for doc in intersect:
    df = pd.read_csv(os.path.join('TSV', doc), sep='\t', encoding  = 'utf-8')
    # `columns=` replaces the positional axis argument rejected by pandas >= 2.0
    df = df.drop(columns=df.columns.difference(col_names))
    frames.append(df)

# One concat at the end instead of growing the frame inside the loop.
if frames:
    search1 = pd.concat(frames, axis = 0, ignore_index=True, sort = False)
else:
    search1 = pd.DataFrame(columns = col_names)

# Making the rows more readable
printmd("The intitial query was: ***{}***".format(query))
pd.set_option('max_colwidth',500)
display(search1.style.set_table_styles([ {'selector': '.row_heading, .blank', 'props': [('display', 'none;')]}]))

# Search engine n.2

### Part 1. Create Inverted Index containing TF*IDF score

The TF*IDF algorithm is used to weigh a keyword in any content and assign the importance to that keyword based on the number of times it appears in the document. 
* The __TF__ (term frequency) of a word is the frequency of a word (i.e. number of times it appears) in a document. When you know it, you’re able to see if you’re using a term too much or too little.
* The __IDF__ (inverse document frequency) of a word is the measure of how significant that term is in the whole corpus.


In [None]:
# Normalized term frequency: occurrences of the word divided by the document
# length, so that long documents are not favoured.
def tf(word, document):
    """Return ``document.count(word) / len(document)``."""
    occurrences = document.count(word)
    length = len(document)
    return occurrences / length

# Number of documents with the same word
def document_frequency(word):
    """Return how many documents the inverted index lists for ``word``.

    Reads the notebook-level ``vocabulary`` (term -> term_id) and
    ``inverted_index`` (term_id -> list of documents). A word that is not in
    the vocabulary, or whose id is not in the index, occurs in 0 documents.
    """
    term_id = vocabulary.get(word)
    if term_id is None:
        return 0
    return len(inverted_index.get(term_id, ()))

# IDF(word) = log(Total Number Of Documents / Number Of Documents containing the certain term (word))
def idf(word):
    """Return the inverse document frequency of ``word``.

    The total number of documents is ``len(documentlist)``, the notebook-level
    dictionary with one entry per indexed document. The original used
    ``len(docpaths)``, which is the number of characters in the folder path.
    A word that occurs in no document gets an IDF of 0.0.
    """
    containing = document_frequency(word)
    if containing == 0:
        return 0.0
    return math.log(len(documentlist) / containing)

def tfidf(word, document):
    """Return the TF*IDF score: ``tf(word, document)`` times ``idf(word)``."""
    term_frequency = tf(word, document)
    inverse_frequency = idf(word)
    return term_frequency * inverse_frequency

## Part 1. Query

#### Let's open all the .txt files we need

In [None]:
# Tokenizing the query
query = input()
tokens = nltk.word_tokenize(query)

# Drop stopwords and special characters, then stem. The stopword list and the
# stemmer are built once instead of once per token.
stop_words = set(stopwords.words('english'))
stemmer = nltk.stem.PorterStemmer()
query_tokens = [stemmer.stem(token)
                for token in tokens
                if token not in stop_words and token not in '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~·']

In [None]:
# TF*IDF score of every query term that exists in the vocabulary.
# Unknown terms are skipped so that this vector has one entry per element of
# `term_ids` (built with the same filter) and lines up with the per-document
# score lists it is compared against.
tfidf_query_array = [tf(w, query_tokens) * idf(w)
                     for w in query_tokens
                     if w in vocabulary]

In [None]:
# Opening the new inverted index
# NOTE(review): eval() executes whatever the file contains. Only load files
# written by this notebook.
with open('inverted_index_tfidf.txt', 'r', encoding = 'utf-8') as f:
    inverted_index_tfidf = eval(f.read())

# Term ids of the term/s in the query
term_ids = [vocabulary[token] for token in query_tokens if token in vocabulary]
# List of tuples (doc_id, score) for each id
search_results = [inverted_index_tfidf[idx] for idx in term_ids if idx in inverted_index_tfidf]

# Group the scores by document: {doc_id: [score of term 1, score of term 2, ...]}
cos_arrays = {}
for list_ in search_results:
    for doc_id, doc_score in list_:
        cos_arrays.setdefault(doc_id, []).append(doc_score)

# Take the documents that have all of the words
final = [(key,cos_arrays[key]) for key in cos_arrays if len(cos_arrays[key]) == len(term_ids)]
print(final)

In [None]:
# {doc_i: similarity_score, ...}
doc_sim = {}
query_vector = np.array(tfidf_query_array)

for doc_id, doc_scores in final:  # doc_scores: the document's tf-idf scores for the query terms
    doc_vector = np.array(doc_scores)
    if not query_vector.any() or not doc_vector.any():
        # The cosine is undefined for an all-zero vector and would produce
        # nan, which cannot be ranked; treat it as "no similarity".
        sim = 0.0
    else:
        # Cosine similarity = 1 - cosine distance
        sim = 1 - scipy.spatial.distance.cosine(query_vector, doc_vector)
    doc_sim[doc_id] = sim
print(doc_sim)

#### Output

In [None]:
# The 10 most similar documents as (doc, similarity), best first; ties are
# ordered by document name, as with the previous heap-based selection.
largest = sorted(doc_sim.items(), key=lambda item: (-item[1], item[0]))[:10]

#Printing the search results
col_names = ["Title","Intro", "Similarity", "Url"]

frames = []
for doc_name, similarity in largest:
    df = pd.read_csv(os.path.join('TSV', doc_name), sep='\t', encoding  = 'utf-8')
    # `columns=` replaces the positional axis argument rejected by pandas >= 2.0
    df = df.drop(columns=df.columns.difference(['Title','Intro', 'Url', 'Similarity']))
    df.loc[0, 'Similarity'] = similarity
    frames.append(df)

# One concat at the end instead of growing the frame inside the loop;
# the rows stay ordered from the most to the least similar document.
if frames:
    score = pd.concat(frames, axis = 0, ignore_index=True, sort = False)
else:
    score = pd.DataFrame(columns = col_names)

# Output
printmd("The intitial query was: ***{}***".format(query))
pd.set_option('max_colwidth',500)
score.style.background_gradient(cmap='Blues')

# Search engine n. 3

In [None]:
# Function to define the weight of RUNTIME and RELEASE DATE
def importance(dataframe):
    """Return ``(score_runtime, score_release)`` for the user's preference.

    ``dataframe`` is, despite its name, the user's answer (the notebook calls
    ``importance(user)``): 1 favours the running time, 2 favours the release
    date, anything else weighs both equally. The original ignored the
    argument and read the global ``user`` instead.
    """
    if dataframe == 1:
        return 0.7, 0.3
    if dataframe == 2:
        return 0.3, 0.7
    return 0.5, 0.5

## Part 1

First you pick the language. The main query will be based on this.

Second, you pick whether the **length of the movie** is more relevant to you than the **release date**, the other way around, or whether they matter equally.


The weights used in the score depend on the answer:
* If someone answers 1 ---> weight is 0.7 for runtime and 0.3 for release date
* If someone answers 2 ---> weight is 0.3 for runtime and 0.7 for release date
* If someone answers 3 ---> weight is 0.5 for runtime and 0.5 for release date


Now that we have our weights we can compute the score:

$$score = (runtime\_score  {1 \over \ \mid runtime-user\_runtime \mid +1 }) + (release\_score  {1 \over \ \mid release-user\_release \mid +1 })$$


Where:
* runtime = runtime in TSV files
* user_runtime = runtime that the user inputs
* release = release in TSV files
* user_release = release that the user inputs

In [None]:
# Language the movie should be in
language = input('What language would you like to see the movie in? ')

# Desired running time (minutes) and release year, both read as integers
runtime_answer = input('Insert the length of the movie you would like to watch(in minutes): ')
user_runtime = int(runtime_answer)
release_answer = input('Insert the release year of the movie you would like to watch: ')
user_release = int(release_answer)

In [None]:
user = int(input("Now, what is more important to you? Is it the running time or the release date?\nAswer 1 for the first, 2 for the second or 3 for both: "))
our_score = {}
# Just to check how many documents have a Null running time or release date
error = []

# The weights depend only on the user's answer, so they are computed once.
score_runtime, score_release = importance(user)

# Iterating through the intersection list created in the 1st search engine
for doc in intersect:
    df = pd.read_csv(os.path.join('TSV', doc), sep='\t', encoding  = 'utf-8')
    runtime_cell = df['Running time'][0]
    release_cell = df['Release date'][0]

    if pd.isnull(release_cell) or pd.isnull(runtime_cell):
        print(doc)
        error.append(doc)
        continue

    # The cells may hold numbers or free text. Going through str() handles
    # both, and re.search takes the first number, so trailing digits such as
    # a footnote marker are not glued onto the running time. The first
    # four-digit number of the release date is used as the year.
    runtime_match = re.search(r"\d+", str(runtime_cell))
    release_match = re.search(r"\b\d{4}\b", str(release_cell))
    if runtime_match is None or release_match is None:
        continue
    runtime = int(runtime_match.group())
    release = int(release_match.group())

    # Is the language we picked the same as the movie we are analyzing?
    if language != df['Language'][0]:
        continue

    runtime_gap = abs(runtime - user_runtime)
    release_gap = abs(release - user_release)
    score = score_release / (release_gap + 1) + score_runtime / (runtime_gap + 1)

    # Full score when the runtime is within +- 10 minutes or the release year
    # within +- 5 years; otherwise the whole score is halved (the original
    # halved only the runtime term because of operator precedence).
    if runtime_gap > 10 and release_gap > 5:
        score *= 0.5

    our_score[doc] = score



#### Output

In [None]:
# The 10 best documents as (doc, score), best first; ties are ordered by
# document name, as with the previous heap-based selection.
largest = sorted(our_score.items(), key=lambda item: (-item[1], item[0]))[:10]

# Printing the search results
col_names = ["Title","Intro", "Url"]

frames = []
for doc_name, doc_score in largest:
    df_final = pd.read_csv(os.path.join('TSV', doc_name), sep='\t', encoding  = 'utf-8')
    # The columns to drop are computed from df_final itself (the original
    # used the leftover global `df`); `columns=` replaces the positional axis
    # argument rejected by pandas >= 2.0.
    df_final = df_final.drop(columns=df_final.columns.difference(col_names))
    df_final['Score'] = doc_score
    frames.append(df_final)

# NOTE: `our_score` is rebound from the score dictionary to the result table,
# as before; re-run the scoring cell before running this cell a second time.
if frames:
    our_score = pd.concat(frames, axis = 0, ignore_index=True, sort = False)
else:
    our_score = pd.DataFrame(columns = col_names)

# Output
pd.set_option('max_colwidth',500)
our_score.style.background_gradient(cmap='Blues')