# Preprocessing, Word Association with POS Tagging and BM25 improvement

This notebook develops the preprocessing and the POS-tagged word association for our corpus of newspaper texts. 

## Preprocessing: Normalization with POS tagging

First, we define the functions that prepare our data by tagging each word with its part of speech (POS). The tagging is done to get better results when looking for word associations: the tags let us restrict the comparison to words that belong to the same POS before applying the similarity measures detailed later.

In [None]:
import nltk
from bs4 import BeautifulSoup
import re

In [None]:
def extract_text_from_corpus(path):
    """Read every file under *path* and return its lowercased, markup-free text.

    The directory is listed through NLTK's ``PlaintextCorpusReader``, the
    contents of all listed files are concatenated, and the result is passed
    through BeautifulSoup (``lxml`` parser) to keep only the text.

    Args:
        path: Directory that holds the corpus files. A trailing path
            separator is optional.

    Returns:
        A single lowercase string with the text of the whole corpus.
    """
    import os

    # The reader is only used to enumerate the files of the directory.
    corpus = nltk.corpus.PlaintextCorpusReader(path, '.*')

    # Reuniting all text content from files on the directory; the pieces are
    # joined once instead of growing a string inside the loop.
    pieces = []
    for file_id in corpus.fileids():
        # os.path.join works whether or not `path` ends with a separator.
        with open(os.path.join(path, file_id), encoding = 'utf-8') as rfile:
            pieces.append(rfile.read())
    all_text = ''.join(pieces)

    # Cleaning HTML tags
    soup = BeautifulSoup(all_text, 'lxml')
    return soup.get_text().lower()

In [None]:
def tokenize_by_words(text):
    """Split *text* on whitespace and keep only lowercase Spanish letters.

    Every character that is not in ``a-z`` or one of ``á é í ó ú ñ ü`` is
    removed from each whitespace-separated word (uppercase letters included,
    so the text is expected to be lowercased beforehand). Words that end up
    empty are discarded.

    Args:
        text: String to tokenize.

    Returns:
        List of purely alphabetic tokens, in their original order.
    """
    # The original class was '[a-záéíóúñü+$]': the quantifier and the anchor
    # had slipped inside the brackets, so '+' and '$' were kept as letters.
    non_alphabetic = re.compile(r'[^a-záéíóúñü]')

    stripped_words = (non_alphabetic.sub('', word) for word in text.split())
    return [token for token in stripped_words if token != '']

def tokenize_by_sents(text):
    """Split *text* into sentences and tokenize each one into words.

    Sentence boundaries come from NLTK's Spanish Punkt model; every sentence
    is then passed to ``tokenize_by_words``.

    Args:
        text: String to tokenize.

    Returns:
        List with one list of word tokens per detected sentence.
    """
    sentence_splitter = nltk.data.load("tokenizers/punkt/spanish.pickle")
    return [tokenize_by_words(sentence) for sentence in sentence_splitter.tokenize(text)]

In [None]:
def remove_stop_words_from_sents(sents, path = './stopwords_es.txt'):
    """Drop the stop words listed in *path* from every sentence.

    Args:
        sents: Iterable of sentences, each an iterable of (hashable) word
            tokens.
        path: UTF-8 text file with one stop word per line; surrounding
            whitespace on each line is ignored.

    Returns:
        List of sentences (lists of tokens) without the stop words. Sentences
        that become empty are kept as empty lists.
    """
    with open(path, encoding = 'utf-8') as f:
        # A set makes each membership test constant-time; the original list
        # was scanned from the start for every single token.
        stop_words = {line.strip() for line in f}

    return [[word for word in sent if word not in stop_words] for sent in sents]

In [None]:
from nltk.corpus import cess_esp
from pickle import dump

def make_and_save_spanish_tagger(fname):
    """Train a Spanish POS tagger on the CESS-ESP corpus and pickle it.

    The tagger is a backoff chain: a unigram tagger trained on the CESS-ESP
    tagged sentences, then a regular-expression tagger, and finally a default
    tagger that always answers with the most frequent tag of the corpus.

    Args:
        fname: Path of the pickle file to write.
    """
    cess_tagged_sents = cess_esp.tagged_sents()

    # Most frequent tag of the training corpus, used as the last resort.
    # Counting from a generator avoids rebuilding a list for every sentence.
    tag_counts = nltk.FreqDist(tag for sent in cess_tagged_sents for (_, tag) in sent)
    default_tagger = nltk.DefaultTagger(tag_counts.max())

    patterns = [
        (r'.*o$', 'n'),
        (r'.*os$', 'n'),
        (r'.*a$', 'n'),
        (r'.*as$', 'n'),
        (r'.*e$', 'n'),
        (r'.*es$', 'n'),
        # Fixed: the original '.^[0-9]+$' could never match, because '^'
        # came after a character had already been consumed.
        (r'^[0-9]+$', 'z'),
    ]

    regexp_tagger = nltk.RegexpTagger(patterns, backoff = default_tagger)
    spanish_tagger = nltk.UnigramTagger(cess_tagged_sents, backoff = regexp_tagger)

    # The context manager closes the file even if pickling fails.
    with open(fname, 'wb') as output:
        dump(spanish_tagger, output, -1)

In [None]:
# Train the tagger and store it at the path that `tag` loads by default.
make_and_save_spanish_tagger('./spanish_tagger.pkl')

In [None]:
from pickle import load

def tag(text, path = './spanish_tagger.pkl'):
    """Tag every sentence of *text* with the pickled tagger stored at *path*.

    Args:
        text: Iterable of sentences, each one a list of word tokens.
        path: Pickle file that contains an object with a ``tag(sentence)``
            method.

    Returns:
        List with the result of ``tagger.tag(sentence)`` for each sentence.
    """
    # NOTE(review): unpickling can execute arbitrary code; only load tagger
    # files that were produced locally or come from a trusted source.
    # The context manager closes the file even when unpickling fails.
    with open(path, 'rb') as input_f:
        tagger = load(input_f)

    return [tagger.tag(sent) for sent in text]

In [None]:
def lemmatize_from_sents(text, path = './generate.txt'):
    """Replace each (word, tag) pair of *text* with its (lemma, tag) pair.

    The lemma dictionary is read from *path* (latin-1 encoded). Each line is
    split on whitespace: the first field is the word form (``#`` marks are
    removed), the last field is the lemma and the field before it is the tag,
    of which only the lowercased first character is kept.

    Args:
        text: Iterable of sentences, each an iterable of (word, tag) pairs.
        path: Path to the lemma dictionary file.

    Returns:
        List of sentences where every pair found in the dictionary has been
        replaced by (lemma, tag); unknown pairs are kept unchanged.
    """
    lemmas = dict()
    with open(path, encoding = 'latin1') as file:
        for line in file:
            fields = line.split()
            # Blank lines and lines with a single field carry no tag; skip
            # them instead of failing with IndexError on fields[-2].
            if len(fields) < 2:
                continue
            token = fields[0].replace('#', '')
            lemma = fields[-1]
            pos = fields[-2][0].lower()
            lemmas[(token, pos)] = (lemma, pos)

    return [[lemmas.get(word, word) for word in sent] for sent in text]

In [None]:
def normalize_by_sents(path):
    """Run the full normalization pipeline over the corpus stored at *path*.

    Steps: extract the text, tokenize it by sentences, remove stop words,
    POS-tag every sentence, reduce each tag to its lowercased first character
    and lemmatize the resulting (word, tag) pairs.

    Args:
        path: Directory of the corpus, forwarded to
            ``extract_text_from_corpus``.

    Returns:
        The value returned by ``lemmatize_from_sents`` for the tagged
        sentences.
    """
    raw_text = extract_text_from_corpus(path)
    sentences = remove_stop_words_from_sents(tokenize_by_sents(raw_text))

    # Only the first character of each tag is kept, lowercased.
    coarse_tagged_sents = [
        [(pair[0], pair[1][0].lower()) for pair in sent]
        for sent in tag(text = sentences)
    ]

    return lemmatize_from_sents(text = coarse_tagged_sents)

In [None]:
def preprocessing(path = './../EXCELSIOR_100_files/'):
    """Normalize the corpus at *path*, reporting progress or failure on stdout.

    Args:
        path: Directory of the corpus to preprocess.

    Returns:
        The normalized sentences, or ``None`` when any exception was raised
        (the exception is printed, not propagated).
    """
    try:
        normalized_sents = normalize_by_sents(path)
        print('\033[1mNormalization by sentence tokens completed\033[0m')
    except Exception as error:
        print('An error has occured: ', error)
        return None

    return normalized_sents

#### Preprocessing with sentence tokens

In [None]:
# Run the pipeline on the default corpus directory; the result is None when
# preprocessing() caught an error.
preprocessed_text_sents = preprocessing()

In [None]:
# Flatten the preprocessed sentences into a single list of tokens.
words = [word for sent in preprocessed_text_sents for word in sent]

In [None]:
# Unique tokens in sorted order; sorted() already returns a list.
vocabulary = sorted(set(words))
len(vocabulary)

## Word associations and BM25 improvement

Once the preprocessing is complete, we define the functions that measure the similarity between words and, from it, obtain the word associations in our corpus. To do that, we first need a function that extracts the context of each word from the preprocessed, sentence-tokenized text. Then we need a function that computes the probability vector of each word. Finally, we need functions that quantify the similarity between words based on the dot product or the cosine of their vectors.

In [None]:
def get_contexts_sents(vocabulary, text, window = 8):
    """Collect, for every vocabulary entry, the tokens that surround it.

    For each occurrence of an entry inside a sentence, up to ``window / 2``
    tokens to its left and up to ``window / 2`` tokens to its right (never
    crossing the sentence boundary) are appended to the entry's context.

    Args:
        vocabulary: Iterable of hashable tokens to collect contexts for.
        text: Iterable of sentences, each a list of tokens.
        window: Total window size; half of it (rounded towards zero) is
            taken on each side.

    Returns:
        Dict mapping every vocabulary entry to the list of its context
        tokens, in order of appearance. Entries that never occur map to an
        empty list.
    """
    # A non-positive window yields empty contexts.
    half = max(int(window / 2), 0)

    # One pass over the corpus instead of rescanning it for every entry.
    contexts = {w: [] for w in vocabulary}
    for sent in text:
        for i, token in enumerate(sent):
            context = contexts.get(token)
            if context is None:
                continue  # token is not part of the vocabulary
            # Slices clamp at the sentence boundaries on both sides.
            context.extend(sent[max(i - half, 0):i])
            context.extend(sent[i + 1:i + 1 + half])
    return contexts

In [None]:
import numpy as np

In [None]:
def get_vectors(vocabulary, contexts):
    """Build a BM25-weighted, sum-normalized context vector per entry.

    For every vocabulary entry the raw vector holds, at position ``i``, how
    many times ``vocabulary[i]`` appears in the entry's context. Each raw
    vector is then transformed with ``bm25`` (using the average of the raw
    vector sums as the average length) and divided by its own sum when that
    sum is not zero.

    Args:
        vocabulary: Sequence of hashable tokens; it defines the vector
            positions.
        contexts: Dict mapping each vocabulary entry to its list of context
            tokens.

    Returns:
        Dict mapping each vocabulary entry to its numpy vector.
    """
    from collections import Counter

    vectors = dict()
    for v in vocabulary:
        # Count the context once instead of calling list.count per entry.
        counts = Counter(contexts[v])
        vectors[v] = np.array([counts[voc] for voc in vocabulary])

    if not vectors:
        # Nothing to normalize; also avoids dividing by a zero length below.
        return vectors

    # Average "document length": mean of the raw vector sums.
    dls = [np.sum(raw_vector) for raw_vector in vectors.values()]
    avdl = np.sum(dls) / len(dls)

    for k, v in vectors.items():
        new_vector = bm25(v, avdl)
        s = np.sum(new_vector)
        if s != 0:
            new_vector = new_vector / s
        vectors[k] = new_vector
    return vectors

def get_idf(vectors):
    """Compute an inverse document frequency for every vector position.

    Each vector is treated as one document. For position ``i`` the document
    frequency is the number of vectors whose ``i``-th component is non-zero,
    and the weight is ``log((N + 1) / df_i)`` with ``N`` the number of
    vectors. Positions that are zero in every vector get weight 0.

    Args:
        vectors: Dict whose values are equally sized numeric vectors.

    Returns:
        List with one weight per vector position (empty for an empty dict).
    """
    num_context = len(vectors)
    if num_context == 0:
        return []

    # The counter takes its size from the vectors themselves; the original
    # sized it with len(vectors), which only works for a square matrix.
    total_aparitions = None
    for v in vectors.values():
        present = (np.asarray(v) != 0).astype(int)
        if total_aparitions is None:
            total_aparitions = present
        else:
            total_aparitions = total_aparitions + present

    return [
        np.log((num_context + 1) / count) if count != 0 else 0
        for count in total_aparitions
    ]

In [None]:
def bm25(vector, avdl, k = 0.25, b = 0.25):
    """Apply the BM25 term-frequency transformation to *vector*.

    Each component ``f`` becomes
    ``(k + 1) * f / (f + k * (1 - b + b * sum(vector) / avdl))``.

    Args:
        vector: Array-like of raw frequencies.
        avdl: Average of the vector sums over the collection.
        k: Saturation parameter of the formula.
        b: Weight of the length normalization term.

    Returns:
        numpy array with the transformed components.
    """
    vector = np.asarray(vector, dtype = float)
    # With avdl == 0 the length ratio is undefined (0 / 0 gave NaN for every
    # component), so the length normalization is dropped in that case.
    length_ratio = np.sum(vector) / avdl if avdl != 0 else 0.0
    return np.divide((k + 1) * vector, vector + k * (1 - b + b * length_ratio))

def s_bm25(word, idf, vectors, aux_path = ''):
    """Rank the keys of *vectors* by similarity to *word* and save the ranking.

    The similarity is the dot product between the IDF-weighted vector of
    *word* and the stored (unweighted) vector of each key. The ranking is
    written, one ``(key, similarity)`` tuple per line and in descending
    order, to ``./bm25_idf/similar_words_to_<word>_<tag>_with_bm25_idf<aux_path>.txt``.

    Args:
        word: Key of *vectors*; indexable as (word, tag) with string items.
        idf: Sequence of weights, one per vector position.
        vectors: Dict mapping keys to numeric vectors.
        aux_path: Suffix appended to the file name before the extension.
    """
    import os

    # Only the query vector is IDF-weighted; the candidates are used as
    # stored.
    v = np.multiply(idf, vectors[word])
    similarities = {w: np.dot(v, v2) for w, v2 in vectors.items()}
    similarities = sorted(similarities.items(), key = lambda item: item[1], reverse = True)

    # Create the output directory on first use; open() does not create it.
    os.makedirs('./bm25_idf', exist_ok = True)
    with open('./bm25_idf/similar_words_to_' + word[0] + '_' + word[1] + '_with_bm25_idf' + aux_path + '.txt', 'w', encoding = 'utf-8') as f:
        f.writelines(str(item) + '\n' for item in similarities)

def s_dot_product(word, idf, vectors, aux_path = ''):
    """Rank the keys of *vectors* by IDF-weighted dot product with *word*.

    Both the vector of *word* and the vector of each key are multiplied by
    *idf* before taking the dot product. The ranking is written, one
    ``(key, similarity)`` tuple per line and in descending order, to
    ``./dot_product_idf/similar_words_to_<word>_<tag>_with_dot_product_idf<aux_path>.txt``.

    Args:
        word: Key of *vectors*; indexable as (word, tag) with string items.
        idf: Sequence of weights, one per vector position.
        vectors: Dict mapping keys to numeric vectors.
        aux_path: Suffix appended to the file name before the extension.
    """
    import os

    v = np.multiply(vectors[word], idf)
    similarities = {
        w: np.dot(np.multiply(candidate, idf), v)
        for w, candidate in vectors.items()
    }
    similarities = sorted(similarities.items(), key = lambda item: item[1], reverse = True)

    # Create the output directory on first use; open() does not create it.
    os.makedirs('./dot_product_idf', exist_ok = True)
    with open('./dot_product_idf/similar_words_to_' + word[0] + '_' + word[1] + '_with_dot_product_idf' + aux_path + '.txt', 'w', encoding = 'utf-8') as f:
        f.writelines(str(item) + '\n' for item in similarities)


def s_cosine(word, idf, vectors, aux_path = ''):
    """Rank the keys of *vectors* by IDF-weighted cosine similarity to *word*.

    Both vectors are multiplied by *idf* before the cosine is computed; when
    either weighted vector has zero norm the similarity is 0. The ranking is
    written, one ``(key, similarity)`` tuple per line and in descending
    order, to ``./cosine_idf/similar_words_to_<word>_<tag>_with_cosine_idf<aux_path>.txt``.

    Args:
        word: Key of *vectors*; indexable as (word, tag) with string items.
        idf: Sequence of weights, one per vector position.
        vectors: Dict mapping keys to numeric vectors.
        aux_path: Suffix appended to the file name before the extension.
    """
    import os

    v = np.multiply(vectors[word], idf)
    # The query norm does not change between candidates; compute it once.
    norm_v = np.linalg.norm(v)

    similarities = dict()
    for w, candidate in vectors.items():
        v2 = np.multiply(candidate, idf)
        norm_v2 = np.linalg.norm(v2)
        if norm_v == 0 or norm_v2 == 0:
            similarities[w] = 0
        else:
            similarities[w] = np.dot(v, v2) / (norm_v * norm_v2)
    similarities = sorted(similarities.items(), key = lambda item: item[1], reverse = True)

    # Create the output directory on first use; open() does not create it.
    os.makedirs('./cosine_idf', exist_ok = True)
    with open('./cosine_idf/similar_words_to_' + word[0] + '_' + word[1] + '_with_cosine_idf' + aux_path + '.txt', 'w', encoding = 'utf-8') as f:
        f.writelines(str(item) + '\n' for item in similarities)

In [None]:
def similar_words_improved(word, vectors, vocabulary, aux_path = '', bm25 = False, dot_product = False, cosine = False):
    """Save similarity rankings for every tagged variant of *word*.

    Every vocabulary entry whose first item equals *word* is processed
    separately: the candidates are restricted to the vectors whose key has
    the same second item (the tag), and each requested measure writes its
    ranking through the corresponding ``s_*`` function.

    Args:
        word: Surface form to look up in the vocabulary.
        vectors: Dict mapping (word, tag) keys to vectors.
        vocabulary: Iterable of (word, tag) entries.
        aux_path: Suffix forwarded to the ``s_*`` functions.
        bm25: Whether to run ``s_bm25``.
        dot_product: Whether to run ``s_dot_product``.
        cosine: Whether to run ``s_cosine``.
    """
    # The IDF is computed over all the vectors, before filtering by tag.
    idf = get_idf(vectors)

    matching_entries = [entry for entry in vocabulary if entry[0] == word]

    for entry in matching_entries:
        entry_tag = entry[1]
        same_tag_vectors = {
            key: vector for key, vector in vectors.items() if key[1] == entry_tag
        }

        if bm25:
            s_bm25(entry, idf, same_tag_vectors, aux_path)

        if dot_product:
            s_dot_product(entry, idf, same_tag_vectors, aux_path)

        if cosine:
            s_cosine(entry, idf, same_tag_vectors, aux_path)

In [None]:
# Collect the context of every vocabulary entry from the preprocessed text.
contexts_s = get_contexts_sents(vocabulary, preprocessed_text_sents)
# Turn each context into a BM25-weighted, sum-normalized vector.
vectors_s = get_vectors(vocabulary, contexts_s)
# IDF weight of every vector position; the sum of the weights is printed.
idf = get_idf(vectors_s)
print(np.sum(idf))

In [None]:
# Compute and save the three similarity rankings for one query word.
w = "empresa"
print(w)
try:
    similar_words_improved(w, vectors_s, vocabulary, aux_path = '', bm25 = True, dot_product = True, cosine = True)
    print('Similarity estimations of \033[1m(' + w + ')\033[0m completed\n')
except Exception as e:
    # `w` is a plain string, so it is reported whole; indexing it with
    # w[0] and w[1] printed only its first two characters.
    print('An error has occured: ' + str(e) + ' in word ' + w)

In [None]:
# Display the computed vectors as the cell output.
vectors_s