# PART1 (Fetching Urls)

In [None]:
import time
import urllib.request
from bs4 import BeautifulSoup
from random import randint

"""
THIS SCRIPT IS FOR DOWNLOADING THE HTML FILES OF THE GIVEN DOCUMENTS
"""
soup = BeautifulSoup(open("movies1.html"), features="lxml") # create a soup object
url_list = []
for url in soup.findAll('a', href=True): # go through all links in the html file
    url_list.append(url['href']) # append urls ot a list
a = 9430, 9671
i = 0
while i <= (len(url_list)): # loop through all list and download the links as html files
    link = url_list[i]
    try:
        response = urllib.request.urlopen(link)
        webContent = response.read()
        with open("movies\\article_"+str(i)+".html", "wb")as file:
            file.write(webContent)
        time.sleep(randint(1, 5))
        print("in try ", i)
        i += 1
    except:
        print("in except ", i)
        time.sleep(120)

# the files with number 9429 and 9670 can not be downloaded because of an unexpected error








# PART2 (Parsing)

In [1]:
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize, RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import string

"""
This script is for parsing the html pages
for each html page;
    *The title
    *Intro
    *plot
are fetched and cleaned by removing stopwords and punctuation

After that, some relevant information are fetched from the Info box of each html file

At the end, the extracted information is a stored as tsv files for each film 
"""

# create a Data Frame to store the information for each film
columns = ["title", "intro", "plot", "film_name", "director", "producer", "writer", "starring", "music", "release_date","running time", "country", "language", "budget"]
a = np.empty((10000, 14,))
a[:] = np.nan
df = pd.DataFrame(data=a, columns=columns)

# define a function to clean by using the nltk library
def clean_text(sentence):
    stop_words = set(stopwords.words("english"))
    tokens = RegexpTokenizer(r"\w+")
    porter = PorterStemmer()
    stem_words = list(map(porter.stem, tokens.tokenize(sentence)))
    words = filter(lambda x: x not in string.punctuation, stem_words)
    cleaned_text = filter(lambda x: x not in stop_words, words)
    return cleaned_text

# Loop through each film and record the relevant info to the Data Frame
for film in range(10000): # this is for the first file of movies
    if film in [9429, 9671]:
        continue # the files with number 9429 and 9670 can not be downloaded because of an unexpected error because of that, they are excluded from the loop

    soup = BeautifulSoup(open("movies\\article_"+str(film)+".html", encoding="utf8"), "html.parser")
    #exctract the title
    df.iloc[film, 0] = soup.find("h1").text
    df.iloc[film, 3] = soup.find("h1").text
    #extract the Intro
    Intro = ""
    par = soup.p # first paragraph of the html page
    while par.next_element.name != "h2" and par.next_element.name != "h3": # until a heading, combine all paragraphs
        if par.name == "p":
            Intro += par.get_text()
        par = par.next_element
    # clean it with clean text function
    df.iloc[film, 1] = " ".join(list(clean_text(Intro)))

    # Parse the Plot
    for heading in soup.find_all(["h2", "h3"]):
        try:
            if (heading.contents[0].get("id") == "Plot" or heading.contents[0].get("id") == "Plot_summary" or
                heading.contents[0].get("id") == "Plot_Summary" or
                    heading.contents[0].get("id") == "Premise"): # first find the plot heading that can have different id's
                break
        except AttributeError:
            pass
    Plot = ""
    try:
        while heading.next_element.name != "h2" and heading.next_element.name != "h3": # starting from plot heading, concatanete all paragraphs
            if heading.name == "p":
                Plot += heading.get_text()
            heading = heading.next_element
        # clean it with clean text function
        df.iloc[film, 2] = " ".join(list(clean_text(Plot)))
    except AttributeError:
        pass

    # Info box Parsing

    info_box = soup.find("table", {"class": "infobox vevent"})
    if info_box is None:
        df.iloc[film, :].to_csv("parsed_clean\\" + str(film) + ".tsv", sep='\t', encoding='utf-8')
        continue
    tags = info_box.contents[0].contents # find tags in infobox
    for tr in tags:
        if len(tr.contents) == 2:
            flag = True
            s = tr.contents[0].get_text() # get the headings in infobox
            for i in range(len(columns)):
                sub_Str = s[:4].lower()
                target_str = columns[i]
                if sub_Str in target_str: # find correct column to write the relevant information
                    flag = False
                    break
            if flag:
                continue
            l = []
            if len(tr.contents[1].contents) > 1:
                for j in tr.contents[1].contents:
                    if j.string is not None:
                        l.append(j.string)
                df.iloc[film, i] = " ".join(l)
            else:
                df.iloc[film, i] = [tr.contents[1].get_text()]



    df.iloc[film, :].to_csv("parsed_clean\\"+str(film)+".tsv", sep='\t', encoding='utf-8') # create a different folder called parsed and save the tsv files in it


print(df)



KeyboardInterrupt: 

# PART3(Vocabulary)

In [2]:
import pandas as pd
"""
this script creates a vocabulary csv which contains the whole words contained
in the html files
"""
text =""
for film in range(10000):
    if film in [9429, 9671]: # the files with number 9429 and 9670 can not be downloaded because of an unexpected error because of that, they are excluded from the loop
        continue
    df = pd.read_csv("parsed_clean\\"+str(film)+".tsv", sep='\t', encoding='utf-8') #vocab read csv files
    df = df.fillna("")
    # get the intro and plot and combine them.
    intro = df.iloc[0, 1]
    plot = df.iloc[1, 1]
    text = text + " " + intro + " " + plot #concatanete all intros and plots for all films

arr = set(text.split()) # use set to eliminate repeating words
vocab = pd.DataFrame(arr)
vocab.to_csv("vocab.csv") #save vocabulary as csv with indices for each
vocab

Unnamed: 0,0
0,huer
1,ephraim
2,featur
3,kleiner
4,letterhead
5,spokeswoman
6,barti
7,chip
8,octaroon
9,tau


# PART4(inverted indices)

In [3]:
import pandas as pd
"""
This script creates inverted indices file for each word in vocabulary.csv

"""
vocab = pd.read_csv("vocab.csv")
inverted_indices = {}
vocab = vocab.set_index("0").to_dict()["Unnamed: 0"] # set the keys of the inverted indices dictionary as the id's in vocabulary.csv
for index in range(len(vocab.values())):
    inverted_indices[index] = [] # set the values of inverted indices dictionary empty lists, later the documents containig relevant word
                                # would be appended to this list
for film in range(10000):
    text=""
    if film in [9429, 9671]:
        continue
    # get the intro + plot of each film as text
    df = pd.read_csv("parsed_clean\\"+str(film)+".tsv", sep='\t', encoding='utf-8')
    df = df.fillna("")
    intro = df.iloc[0, 1]
    plot = df.iloc[1, 1]
    text = intro + " " + plot
    # go through each word in the text and ad the document to the value in the inverted indicies dictionary
    for word in text.split():
        try:
            term_id =int(vocab[word.lower()])
            inverted_indices[term_id].append("doc_"+str(film))
        except KeyError:
            continue
# save the dictionary as csv
import csv
with open('inverted_indices.csv', "w") as f:  # Just use 'w' mode in 3.x
    w = csv.DictWriter(f, inverted_indices.keys())
    w.writeheader()
    w.writerow(inverted_indices)

op=pd.read_csv("inverted_indices.csv").transpose()

In [6]:
print(inverted_indices)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



# PART5(first search engine)

In [4]:
"""
This script is the first search engine without a score
returns title, intro and url for relevant films
"""
import pandas as pd
from bs4 import BeautifulSoup
import re
# get the urls of each movie
soup = BeautifulSoup(open("movies1.html"), features="lxml")
url_list = []
for url in soup.findAll('a', href=True):
    url_list.append(url['href'])

# import the vocab csv
vocab = pd.read_csv("vocab.csv")
vocab = vocab.set_index("0").to_dict()["Unnamed: 0"]
op = pd.read_csv("inverted_indices.csv").transpose()

sentence = "lion king"

htmls = []
for word in sentence.split(): # go trough each word in sentence and match the word with id's in vocab csv
    try:
        id = vocab[word]
    except KeyError:
        word = word[:-1] # sometimes the last letter of a word is droped when nltk library used because of an un known reason
        try:            # to overcome this, the matching is done with eliminating the last letter
            id = vocab[word]
        except KeyError:
            continue

    doc = op.iloc[id].values[0].split()
    doc[0] = doc[0][1:]
    doc[-1] = doc[-1][:-1]
    htmls.append(set(doc))

# use set intersection to get the documents containing all words
intersection = set.intersection(*htmls)
# create a results dataframe to store the info
results = pd.DataFrame({'Title': [], 'Intro': [], 'url': []})
counter = 0
for document in intersection:
    document = re.findall(r'\d+', document)[0]

    soup = BeautifulSoup(open("movies\\article_" + document + ".html", encoding="utf8"), "html.parser")
    # get title ( same in parse)
    title = soup.find("h1").text
    # get intro (same in parse)
    Intro = ""
    par = soup.p # first paragraph of the html page
    while par.next_element.name != "h2" and par.next_element.name != "h3": # until a heading, combine all paragraphs
        if par.name == "p":
            Intro += par.get_text()
        par = par.next_element
    # get url ( same in fetch_urls)
    url = url_list[int(document)]
    results.loc[counter] = [title, Intro, url]
    counter += 1

print(results)



                                     Title  \
0                      The Creeping Terror   
1                      The Egyptian (film)   
2         His Majesty, the Scarecrow of Oz   
3                           The Wiz (film)   
4   The Wonderful Wizard of Oz (1910 film)   
5                         The 300 Spartans   
6                            Paper Bullets   
7           You Never Can Tell (1951 film)   
8             The Adventures of Marco Polo   
9          Nothing but Trouble (1944 film)   
10                     Merry Andrew (film)   
11                      White Witch Doctor   
12         Douglas Fairbanks in Robin Hood   
13                         One More Spring   
14                         The Lion (film)   
15                A Lion Is in the Streets   
16               Tarzan and the Slave Girl   
17   A Midsummer Night's Dream (1935 film)   
18                 Tarzan and the Huntress   
19            Mighty Joe Young (1949 film)   

                                 