# Segmentation - Build input file

## Inizializzazione

### Import

In [60]:
import gensim.downloader as api
from gensim.test.utils import simple_preprocess
from nltk.corpus import stopwords
import random
import requests
from bs4 import BeautifulSoup
from bs4.element import Tag
import re
from collections import Counter
import math
import json

#### Variabili Globali

In [61]:
# URLs of the Wikipedia pages we want to extract
WIKI_PAGES = ['https://en.wikipedia.org/wiki/Film', 'https://en.wikipedia.org/wiki/Therapy', 'https://en.wikipedia.org/wiki/Napoleon_Bonaparte']

# Base names of the files in which each page is saved (same order as WIKI_PAGES)
FILE_NAMES = ['film', 'therapy', 'napoleon']

# Seed for reproducibility
random.seed(2.4)

STOPWORDS = stopwords.words("english")

# Hyperparameters
# NOTE: the identifier keeps its original spelling because other cells reference it.
FRAGMENTS_LENGHT = 5 # Number of paragraphs per topic that are placed in the algorithm's input file

## Costruzione del file di input

### Da Wikipedia al file page_[nome].txt

Per estrarre i dati dalle pagine di wikipedia le scarichiamo come testo integrale HTML e utilizziamo BeautifulSoup per navigare l'albero dei tag HTML ed estrarre il testo contenuto in essi.

In [62]:
def save_to_file(wiki_pages, file_names):
    """Download each Wikipedia page and store its plain text on disk.

    The page at ``wiki_pages[i]`` is written (UTF-8) to
    ``resources/building_res/page_{file_names[i]}.txt`` and a confirmation
    line is printed for every file written.

    Args:
        wiki_pages: Sequence of page URLs, indexed in parallel with
            ``file_names``.
        file_names: Iterable of base names used to build the output paths.
    """
    for i, file_name in enumerate(file_names):
        url = wiki_pages[i]
        # Fetch before opening the output file: opening in 'w' mode truncates
        # it, so a failed download would otherwise leave an empty file behind.
        data = retrieve_wiki_page(url)
        with open(f'resources/building_res/page_{file_name}.txt', 'w', encoding='utf-8') as f:
            f.write(data)
        print(f'Content of Wiki Page "{url}" saved in file "page_{file_name}"')

def retrieve_wiki_page(url, timeout=30):
    """Download a page and return its text with bracketed markers removed.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The plain text produced by ``to_plaintext``, stripped of bracketed
        numeric markers such as ``[12]`` and lowercase ones such as ``[a]``.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # Without a timeout the request may wait indefinitely on a stalled server.
    response = requests.get(url, timeout=timeout)
    # Fail loudly instead of saving the body of an error page as page content.
    response.raise_for_status()
    text = to_plaintext(response.text)
    # Two separate passes are kept so the substitution order is unchanged.
    text = re.sub(r'\[[0-9]*\]', '', text)
    text = re.sub(r'\[[a-z]*\]', '', text)
    return text

def to_plaintext(html_text):
    """Convert an HTML document into newline-separated plain text.

    Args:
        html_text: The HTML source as a string.

    Returns:
        The stripped text of every block found by ``_extract_blocks`` under
        ``<body>``, one block per line. An empty string is returned when the
        parsed document has no ``<body>`` element.
    """
    soup = BeautifulSoup(html_text, features="lxml")
    body = soup.body
    if body is None:
        # Nothing to walk (e.g. empty input); iterating None would raise.
        return ""
    return "\n".join(block.get_text().strip() for block in _extract_blocks(body))

def _extract_blocks(parent_tag):
    blocks = ["p", "h1", "h2", "h3", "h4", "h5", "blockquote"] # Nomi dei tag html da cui vogliamo estrarre testo
    extracted_blocks = []
    for tag in parent_tag:
        if tag.name in blocks:
            extracted_blocks.append(tag)
            continue
        if isinstance(tag, Tag):
            if len(tag.contents) > 0:
                inner_blocks = _extract_blocks(tag)
                if len(inner_blocks) > 0:
                    extracted_blocks.extend(inner_blocks)
    return extracted_blocks

# Download every configured page and store it as page_[name].txt
save_to_file(WIKI_PAGES, FILE_NAMES)

Content of Wiki Page "https://en.wikipedia.org/wiki/Film" saved in file "page_film"
Content of Wiki Page "https://en.wikipedia.org/wiki/Therapy" saved in file "page_therapy"
Content of Wiki Page "https://en.wikipedia.org/wiki/Napoleon_Bonaparte" saved in file "page_napoleon"


### Da page_[nome].txt a data_[nome].txt

Nella prossima funzione, partendo dal file "page", costruiamo un file che conterrà i dati da preprocessare, scartando i titoli delle sezioni e i paragrafi troppo corti per essere rilevanti (righe di 100 caratteri o meno). Sostituiamo inoltre i "...", sequenza molto comune nelle pagine di Wikipedia, con un singolo punto.

In [63]:
def create_data_file(file_names):
    """Build ``data_{name}.txt`` from ``page_{name}.txt`` for every name.

    Only lines longer than 100 characters are kept (the length includes the
    trailing newline), which drops section titles and very short paragraphs.
    In the kept lines every ``...`` is replaced by a single ``.``. A
    confirmation line is printed for each file created.

    Args:
        file_names: Iterable of base names; files are read from and written
            to ``resources/building_res/``.
    """
    for file_name in file_names:
        with open(f'resources/building_res/page_{file_name}.txt', 'r', encoding='utf-8') as f:
            # Filter on the original line length first, then clean the line.
            data = [line.replace('...', '.') for line in f if len(line) > 100]

        with open(f'resources/building_res/data_{file_name}.txt', 'w',  encoding='utf-8') as f:
            f.writelines(data)

        print(f'Data file named "data_{file_name}.txt" created.')

# Build the filtered data_[name].txt file for every downloaded page
create_data_file(FILE_NAMES)

Data file named "data_film.txt" created.
Data file named "data_therapy.txt" created.
Data file named "data_napoleon.txt" created.


### Costruzione del file di input

Costruiamo quindi il documento di input all'algoritmo, prendendo `fragments_length` paragrafi consecutivi (a partire da una posizione casuale) da ogni file `data_[nome].txt` e dividendoli in frasi. 
Durante la costruzione salviamo anche le posizioni corrette dei tagli, cioè gli indici di frase in cui termina ogni argomento: le utilizzeremo nella valutazione dell'algoritmo.

In [64]:
def get_data(paths):
    """Read every ``data_{name}.txt`` file into a list of its lines.

    Args:
        paths: Iterable of base names (not full paths); each one is expanded
            to ``resources/building_res/data_{name}.txt``.

    Returns:
        One list per file, in input order, holding that file's lines with
        leading/trailing newline characters stripped. Empty lines are kept.
    """
    locations = [f'resources/building_res/data_{name}.txt' for name in paths]
    documents = []
    for location in locations:
        with open(location, 'r', encoding='utf-8') as handle:
            documents.append([line.strip('\n') for line in handle])
    return documents

def build_input_doc(documents, fragments_length):
    """Assemble the input document from a random window of each document.

    For every document, ``fragments_length`` consecutive paragraphs are taken
    starting at a random index, and each paragraph is split on ``'. '``. A
    document with fewer than ``fragments_length`` paragraphs is used whole.

    Args:
        documents: List of documents, each a list of paragraph strings.
        fragments_length: Number of paragraphs to take from each document.

    Returns:
        A tuple ``(data, correct_cuts)``: ``data`` is the flat list of
        fragments from all documents; ``correct_cuts`` holds, for every
        document except the last, the length of ``data`` after that document
        was added (the boundary between two topics).
    """
    data = []
    correct_cuts = []
    for doc in documents:
        # Clamp at 0: a negative upper bound makes random.randint raise
        # ValueError when the document is shorter than the window.
        last_start = max(0, len(doc) - fragments_length)
        start = random.randint(0, last_start)
        for paragraph in doc[start:start + fragments_length]:
            data.extend(paragraph.split('. '))
        correct_cuts.append(len(data))
    # The end of the last document is the end of the text, not a cut.
    return data, correct_cuts[:-1]

# Load the filtered paragraphs of every topic and assemble the input document
documents = get_data(FILE_NAMES)
input_data, correct_cuts = build_input_doc(documents, FRAGMENTS_LENGHT)
print(f'Input document: {input_data}')
# Cell output: number of fragments in the input document
len(input_data)

Input document: ['A preview performance refers to a showing of a film to a select audience, usually for the purposes of corporate promotions, before the public film premiere itself', 'Previews are sometimes used to judge audience reaction, which if unexpectedly negative, may result in recutting or even refilming certain sections based on the audience response', "One example of a film that was changed after a negative response from the test screening is 1982's First Blood", 'After the test audience responded very negatively to the death of protagonist John Rambo, a Vietnam veteran, at the end of the film, the company wrote and re-shot a new ending in which the character survives.', 'Trailers or previews are advertisements for films that will be shown in 1 to 3 months at a cinema', 'Back in the early days of cinema, with theaters that had only one or two screens, only certain trailers were shown for the films that were going to be shown there', "Later, when theaters added more screens or

77

### Salvataggio su file

In [65]:
def save_to_file(path, input_data, correct_cuts):
    """Write the input document and its correct cut positions to ``path``.

    The first line of the file holds every fragment of ``input_data``, each
    one terminated by ``'. '``. The second line holds the cut positions
    separated by single spaces. A confirmation message is printed afterwards.

    NOTE: this definition reuses the name of the page-download helper defined
    earlier in this file and replaces it once executed.

    Args:
        path: Destination file path (written as UTF-8).
        input_data: List of text fragments forming the input document.
        correct_cuts: Sequence of cut positions; any length is accepted.
    """
    with open(path, 'w', encoding='utf8') as f:
        for sent in input_data:
            # endswith() is safe on empty fragments, where sent[-1] would
            # raise. A fragment that already ends with a period still needs
            # the space, otherwise it fuses with the fragment that follows.
            terminator = ' ' if sent.endswith('.') else '. '
            f.write(sent + terminator)
        # Write all cuts rather than assuming there are exactly two.
        f.write('\n' + ' '.join(str(cut) for cut in correct_cuts))
    print("File correctly saved.")

# Persist the input document together with its correct cut positions
save_to_file('resources/input_data.txt', input_data, correct_cuts)

File correctly saved.


## Costruzione degli embeddings

### Recupero del modello di gensim

In [66]:
# Load the pretrained 'fasttext-wiki-news-subwords-300' vectors through gensim's downloader API
word_embeddings_model = api.load('fasttext-wiki-news-subwords-300')

### Individuazione parole più frequenti

In [67]:
def build_freq_dict(input_doc):
    """Count how often each non-stopword occurs in the input document.

    The fragments are joined with spaces and tokenised with
    ``simple_preprocess``. Tokens found in ``STOPWORDS`` are skipped, and
    tokens flagged by ``is_plural`` are reduced with ``to_singular`` before
    being counted.

    Args:
        input_doc: List of text fragments.

    Returns:
        A dict mapping each word to its number of occurrences, in order of
        first appearance.
    """
    counts = {}
    for token in simple_preprocess(" ".join(input_doc)):
        if token in STOPWORDS:
            continue
        key = to_singular(token) if is_plural(token) else token
        counts[key] = counts.get(key, 0) + 1
    return counts

def is_plural(word):
    """Return True when ``word`` ends with ``'s'``.

    This is a purely orthographic heuristic, so a word such as ``'glass'``
    is reported as plural too. The empty string yields False.
    """
    # Any word ending in 'es' also ends in 's', so a single check suffices;
    # endswith() also avoids the IndexError that word[-1] raises on ''.
    return word.endswith('s')

def to_singular(word):
    """Return a heuristic singular form of an English ``word``.

    Rules, applied in order:
      * words not ending in ``s``, or ending in ``ss``/``us``/``is``
        (``glass``, ``status``, ``analysis``), are returned unchanged;
      * ``...ies`` becomes ``...y`` for words longer than four letters
        (``therapies`` -> ``therapy``);
      * ``es`` is stripped only after ``ss``, ``x``, ``ch`` or ``sh``
        (``classes`` -> ``class``, ``boxes`` -> ``box``);
      * otherwise only the final ``s`` is stripped (``names`` -> ``name``).

    This is spelling-based, not a real lemmatiser: irregular plurals and
    nouns whose singular ends in ``ie`` (``movies``) are not handled.
    """
    if not word.endswith('s') or word.endswith(('ss', 'us', 'is')):
        return word
    if word.endswith('ies') and len(word) > 4:
        return word[:-3] + 'y'
    # Stripping 'es' unconditionally would mangle regular plurals of words
    # ending in 'e' ('names' -> 'nam'), so limit it to these endings.
    if word.endswith(('sses', 'xes', 'ches', 'shes')):
        return word[:-2]
    return word[:-1]

def get_most_frequent_words(frequency_dict, n_words):
    """Return the ``n_words`` most frequent words, most frequent first.

    Args:
        frequency_dict: Mapping from word to occurrence count.
        n_words: Number of words to return.

    Returns:
        A list of words ordered by decreasing count; words with equal counts
        keep the order in which they appear in ``frequency_dict``.
    """
    ranked = list(frequency_dict)
    ranked.sort(key=lambda word: frequency_dict[word], reverse=True)
    return ranked[:n_words]

# Count word frequencies in the input document and keep the 9 most frequent words
freq_dict = build_freq_dict(input_data)
most_freq_words = get_most_frequent_words(freq_dict, 9)
print(f'Most frequent words: {most_freq_words}')

Most frequent words: ['film', 'napoleon', 'trailer', 'care', 'therapy', 'corsican', 'shown', 'year', 'french']


### Costruzione file con embeddings delle parole più frequenti

In [68]:
def get_embeddings(words):
    """Look up the embedding vector of every word in ``words``.

    Args:
        words: Iterable of words to look up in ``word_embeddings_model``.

    Returns:
        A dict mapping each word to its vector as a list of Python floats
        (so that it can be serialised as JSON).
    """
    return {
        word: [float(component) for component in word_embeddings_model[word]]
        for word in words
    }

def save_embeddings(path, embeddings):
    """Serialise ``embeddings`` as JSON into the file at ``path``.

    Args:
        path: Destination file path (written as UTF-8).
        embeddings: JSON-serialisable mapping from word to vector.

    A confirmation message is printed once the file has been written.
    """
    with open(path, 'w', encoding='utf8') as out:
        json.dump(embeddings, out)
    print("Embeddings correctly saved.")

# Look up the vectors of the most frequent words and store them as JSON
embedding_path = 'resources/embeddings.json'
embeddings = get_embeddings(most_freq_words)
save_embeddings(embedding_path, embeddings)

Embeddings correctly saved.
