# Preprocessing and Word Association

This notebook develops the preprocessing and word-association steps for our corpus of newspaper texts.

## Preprocessing: Normalization

First, we are going to define our functions to prepare our data before looking for word associations.

In [1]:
import os
import re
from collections import Counter

import nltk
from bs4 import BeautifulSoup

In [2]:
def extract_text_from_corpus(path):
    """Read every file under `path`, strip HTML markup and lowercase the text.

    Parameters
    ----------
    path : str
        Directory that holds the corpus files. A trailing separator is optional.

    Returns
    -------
    str
        Concatenated, tag-free, lowercased content of all corpus files.
    """
    # The corpus reader is only used here to enumerate the file ids under `path`
    corpus = nltk.corpus.PlaintextCorpusReader(path, '.*')
    file_list = corpus.fileids()

    # Collect the content of every file and join once at the end
    contents = []
    for file in file_list:
        # os.path.join builds a valid path whether or not `path` ends with a separator
        with open(os.path.join(path, file), encoding = 'utf-8') as rfile:
            contents.append(rfile.read())
    all_text = ''.join(contents)

    # Cleaning HTML tags
    soup = BeautifulSoup(all_text, 'lxml')
    clean_text = soup.get_text()

    return clean_text.lower()

In [3]:
def tokenize_by_words(text):
    """Split `text` on whitespace and keep only lowercase Spanish letters in each token.

    Every character outside `a-z`, the accented vowels, `ñ` and `ü` is removed
    from each whitespace-separated chunk; chunks that end up empty are dropped.
    The text is expected to be lowercased already, since uppercase letters are
    removed as well.

    Parameters
    ----------
    text : str
        Text to tokenize.

    Returns
    -------
    list of str
        The alphabetic tokens, in order of appearance.
    """
    # The original pattern had '+' and '$' inside the character class, so those
    # symbols were kept as letters; here only real letters are accepted.
    non_alphabetic = re.compile(r'[^a-záéíóúñü]')

    alphabetic_words = list()
    for word in text.split():
        token = non_alphabetic.sub('', word)
        if token != '':
            alphabetic_words.append(token)

    return alphabetic_words

def tokenize_by_sents(text):
    """Split `text` into sentences and tokenize each sentence into alphabetic words.

    Sentence boundaries come from the Spanish punkt model loaded through NLTK;
    each sentence is then passed to `tokenize_by_words`.

    Returns
    -------
    list of list of str
        One token list per detected sentence (a sentence without any
        alphabetic token yields an empty list).
    """
    sentence_splitter = nltk.data.load("tokenizers/punkt/spanish.pickle")
    return [tokenize_by_words(sentence) for sentence in sentence_splitter.tokenize(text)]

In [4]:
def remove_stop_words_from_words(words, path = './stopwords_es.txt'):
    """Drop from `words` every token listed in the stop-word file.

    Parameters
    ----------
    words : list of str
        Tokens to filter.
    path : str
        UTF-8 text file with one stop word per line.

    Returns
    -------
    list of str
        The tokens of `words` that are not stop words, in their original order.
    """
    with open(path, encoding = 'utf-8') as f:
        # A set makes each membership test constant-time instead of a list scan
        stop_words = {line.strip() for line in f}

    return [word for word in words if word not in stop_words]
    
def remove_stop_words_from_sents(sents, path = './stopwords_es.txt'):
    """Drop stop words from every tokenized sentence in `sents`.

    Parameters
    ----------
    sents : list of list of str
        Tokenized sentences.
    path : str
        UTF-8 text file with one stop word per line.

    Returns
    -------
    list of list of str
        One filtered token list per input sentence; sentences are kept even
        when all of their tokens are removed.
    """
    with open(path, encoding = 'utf-8') as f:
        # A set makes each membership test constant-time instead of a list scan
        stop_words = {line.strip() for line in f}

    return [[word for word in sent if word not in stop_words] for sent in sents]

In [5]:
def _load_lemmas(path):
    """Build the token -> lemma dictionary from the lemma file at `path`.

    Each non-blank line is split on whitespace: the first field, with every
    '#' removed, is the token and the last field is its lemma. When a token
    appears on several lines the last one wins.
    """
    lemmas = dict()
    with open(path, encoding = 'latin1') as file:
        for line in file:
            fields = line.split()
            if fields:
                lemmas[fields[0].replace('#', '')] = fields[-1]
    return lemmas


def lemmatize_from_words(text, path = './generate.txt'):
    """Replace every token of `text` by its lemma when one is known.

    Parameters
    ----------
    text : list of str
        Tokens to lemmatize.
    path : str
        Lemma file, read as latin1 (see `_load_lemmas` for the line format).

    Returns
    -------
    list of str
        Same length as `text`; tokens without a dictionary entry are kept as is.
    """
    lemmas = _load_lemmas(path)
    return [lemmas.get(word, word) for word in text]


def lemmatize_from_sents(text, path = './generate.txt'):
    """Replace every token of every sentence in `text` by its lemma when one is known.

    Parameters
    ----------
    text : list of list of str
        Tokenized sentences.
    path : str
        Lemma file, read as latin1 (see `_load_lemmas` for the line format).

    Returns
    -------
    list of list of str
        One lemmatized token list per input sentence; tokens without a
        dictionary entry are kept as is.
    """
    # The dictionary is loaded once and shared by all sentences
    lemmas = _load_lemmas(path)
    return [[lemmas.get(word, word) for word in sent] for sent in text]

In [6]:
def normalize_by_words(path):
    """Run the word-level pipeline on the corpus stored under `path`.

    Steps, in order: text extraction, word tokenization, stop-word removal
    and lemmatization (the last two with their default resource files).

    Returns
    -------
    list of str
        The normalized tokens of the whole corpus.
    """
    tokens = tokenize_by_words(extract_text_from_corpus(path))
    content_tokens = remove_stop_words_from_words(tokens)
    return lemmatize_from_words(text = content_tokens)

def normalize_by_sents(path):
    """Run the sentence-level pipeline on the corpus stored under `path`.

    Steps, in order: text extraction, sentence tokenization, stop-word removal
    and lemmatization (the last two with their default resource files).

    Returns
    -------
    list of list of str
        The normalized tokens, grouped by sentence.
    """
    sentences = tokenize_by_sents(extract_text_from_corpus(path))
    content_sentences = remove_stop_words_from_sents(sentences)
    return lemmatize_from_sents(text = content_sentences)

In [7]:
def preprocessing(path = './../EXCELSIOR_100_files/', by_words = True, by_sents = False):
    """Normalize the corpus under `path` by word tokens or by sentence tokens.

    `by_words` takes precedence: `by_sents` is only considered when `by_words`
    is falsy. Any exception raised while normalizing is printed, not raised.

    Returns
    -------
    list or None
        The normalized text for the selected mode, or None when no mode is
        selected or when normalization failed.
    """
    if by_words:
        mode = 'words'
    elif by_sents:
        mode = 'sents'
    else:
        return None

    try:
        if mode == 'words':
            result = normalize_by_words(path)
            done_message = '\033[1mNormalization by word tokens completed\033[0m'
        else:
            result = normalize_by_sents(path)
            done_message = '\033[1mNormalization by sentence tokens completed\033[0m'
        print(done_message)
        return result
    except Exception as error:
        print('An error has occured: ', error)

    return None

#### Preprocessing with word tokens

In [8]:
# Word-token pipeline on the default corpus path; the result is None if preprocessing failed
preprocessed_text_words = preprocessing(by_words = True, by_sents = False)

[1mNormalization by word tokens completed[0m


#### Preprocessing with sentence tokens

In [9]:
# Sentence-token pipeline on the default corpus path; by_words must be False for by_sents to take effect
preprocessed_text_sents = preprocessing(by_words = False, by_sents = True)

[1mNormalization by sentence tokens completed[0m


In addition, we build a vocabulary from the text that was preprocessed with word tokens.

In [10]:
def make_vocabulary(words):
    """Return the distinct items of `words` as a sorted list."""
    distinct_words = set(words)
    return sorted(distinct_words)

In [11]:
# Sorted list of the distinct tokens produced by the word-token pipeline
vocabulary = make_vocabulary(preprocessed_text_words)

In [12]:
# Number of distinct tokens in the vocabulary
len(vocabulary)

5471

## Word associations

Once preprocessing is complete, we define the functions that measure the similarity between words and thereby yield the word associations in our corpus. To do that, we need functions that extract each word's contexts from both preprocessed texts (the one built from word tokens and the one built from sentence tokens). We also need a function that computes the probability vector of each word. Finally, we need functions that quantify the similarity between two words using either the dot product or the cosine of their vectors.

In [13]:
def get_contexts_words(vocabulary, text, window = 8):
    """Collect, for every vocabulary word, the tokens that surround its occurrences.

    For each occurrence of a word in `text`, up to `int(window / 2)` tokens
    before it and up to `int(window / 2)` tokens after it are appended to that
    word's context (fewer near the ends of the text).

    Parameters
    ----------
    vocabulary : iterable of str
        Words whose contexts are wanted.
    text : list of str
        The token sequence to scan.
    window : int
        Total context width; half is taken on each side.

    Returns
    -------
    dict
        Maps each vocabulary word to the list of its context tokens, in order
        of appearance. Words that never occur map to an empty list.
    """
    # A non-positive window yields empty contexts
    half = max(int(window / 2), 0)

    contexts = {w: list() for w in vocabulary}
    # Single pass over the text instead of rescanning it for every vocabulary word
    for i, token in enumerate(text):
        context = contexts.get(token)
        if context is None:
            continue
        # Slices clamp at the ends of the text, so no bounds handling is needed
        context.extend(text[max(i - half, 0):i])
        context.extend(text[i + 1:i + 1 + half])

    return contexts


def get_contexts_sents(vocabulary, text, window = 8):
    """Collect, for every vocabulary word, the tokens that surround it within its sentence.

    For each occurrence of a word in a sentence, up to `int(window / 2)` tokens
    before it and up to `int(window / 2)` tokens after it are appended to that
    word's context. Contexts never cross sentence boundaries.

    Parameters
    ----------
    vocabulary : iterable of str
        Words whose contexts are wanted.
    text : list of list of str
        Tokenized sentences to scan.
    window : int
        Total context width; half is taken on each side.

    Returns
    -------
    dict
        Maps each vocabulary word to the list of its context tokens, in order
        of appearance. Words that never occur map to an empty list.
    """
    # A non-positive window yields empty contexts
    half = max(int(window / 2), 0)

    contexts = {w: list() for w in vocabulary}
    # Single pass over the sentences instead of rescanning them for every vocabulary word
    for words in text:
        for i, token in enumerate(words):
            context = contexts.get(token)
            if context is None:
                continue
            # Slices clamp at the sentence ends, so no bounds handling is needed
            context.extend(words[max(i - half, 0):i])
            context.extend(words[i + 1:i + 1 + half])

    return contexts

In [14]:
import numpy as np

def get_vectors(vocabulary, contexts):
    """Turn every word's context into a vector of relative frequencies.

    Component `k` of a word's vector is the number of times `vocabulary[k]`
    appears in that word's context, divided by the sum of all components.
    Context tokens that are not in the vocabulary are ignored.

    Parameters
    ----------
    vocabulary : list of str
        Defines both the words to process and the order of the vector components.
    contexts : dict
        Maps each vocabulary word to the list of its context tokens.

    Returns
    -------
    dict
        Maps each vocabulary word to a NumPy array of length `len(vocabulary)`.
        A word whose counts are all zero keeps an all-zero vector.
    """
    probs = dict()
    for v in vocabulary:
        # Count the context once instead of calling list.count per vocabulary word
        counts = Counter(contexts[v])
        vector_normalized = np.array([counts[voc] for voc in vocabulary])
        s = np.sum(vector_normalized)
        if s != 0:
            vector_normalized = vector_normalized / s
        probs[v] = vector_normalized
    return probs


def s_dot_product(word, vectors, aux_path = ''):
    """Rank every word by the dot product of its vector with the vector of `word`.

    The ranking, highest similarity first, is written one `(word, similarity)`
    tuple per line to
    `./dot_product/similar_words_to_<word>_with_dot_product_of_<aux_path>.txt`.

    Parameters
    ----------
    word : str
        Reference word; must be a key of `vectors`.
    vectors : dict
        Maps each word to its NumPy vector.
    aux_path : str
        Label appended to the output file name.

    Raises
    ------
    KeyError
        If `word` is not in `vectors`.
    """
    v = vectors[word]
    similarities = dict()
    for w, other in vectors.items():
        similarities[w] = np.dot(other, v)
    similarities = sorted(similarities.items(), key = lambda item: item[1], reverse = True)

    # Create the output directory on first use instead of failing with FileNotFoundError
    os.makedirs('./dot_product', exist_ok = True)
    with open('./dot_product/similar_words_to_' + word + '_with_dot_product_of_' + aux_path + '.txt', 'w', encoding = 'utf-8') as f:
        for item in similarities:
            f.write(str(item) + '\n')


def s_cosine(word, vectors, aux_path = ''):
    """Rank every word by the cosine between its vector and the vector of `word`.

    A similarity of 0 is assigned whenever either vector has zero norm. The
    ranking, highest similarity first, is written one `(word, similarity)`
    tuple per line to
    `./cosine/similar_words_to_<word>_with_cosine_of_<aux_path>.txt`.

    Parameters
    ----------
    word : str
        Reference word; must be a key of `vectors`.
    vectors : dict
        Maps each word to its NumPy vector.
    aux_path : str
        Label appended to the output file name.

    Raises
    ------
    KeyError
        If `word` is not in `vectors`.
    """
    v = vectors[word]
    # The norm of the reference vector does not change inside the loop
    norm1 = np.linalg.norm(v)

    similarities = dict()
    for w, v2 in vectors.items():
        norm2 = np.linalg.norm(v2)
        if norm1 == 0 or norm2 == 0:
            similarities[w] = 0
        else:
            similarities[w] = np.dot(v, v2) / (norm1 * norm2)
    similarities = sorted(similarities.items(), key = lambda item: item[1], reverse = True)

    # Create the output directory on first use instead of failing with FileNotFoundError
    os.makedirs('./cosine', exist_ok = True)
    with open('./cosine/similar_words_to_' + word + '_with_cosine_of_' + aux_path + '.txt', 'w', encoding = 'utf-8') as f:
        for item in similarities:
            f.write(str(item) + '\n')

In [15]:
def similar_words(word, vectors, aux_path = '', tf_idf = False, dot_product = False, cosine = False):
    """Write the similarity rankings of `word` for each requested measure.

    `dot_product` and `cosine` select which rankings are produced (dot product
    first when both are set). `tf_idf` is accepted but not used here.
    """
    requested_measures = (
        (dot_product, lambda: s_dot_product(word, vectors, aux_path)),
        (cosine, lambda: s_cosine(word, vectors, aux_path)),
    )
    for enabled, write_ranking in requested_measures:
        if enabled:
            write_ranking()

#### Word associations from preprocessed text with word tokens

In [16]:
# Context windows and probability vectors built from the word-token text
contexts_w = get_contexts_words(vocabulary, preprocessed_text_words)
probs_w = get_vectors(vocabulary, contexts_w)
w = "empresa"
try:
    similar_words(w, probs_w, aux_path = 'prob_vectors_by_words', dot_product = True, cosine = True)
    print('Similarity estimations of \033[1m' + w + '\033[0m completed\n')
except Exception as e:
    # str(e) is required: concatenating the exception object to a str raises TypeError
    print('An error has occured: ' + str(e) + ' in word ' + w)

Similarity estimations of [1mempresa[0m completed



#### Word associations from preprocessed text with sentence tokens

In [17]:
# Context windows and probability vectors built from the sentence-token text
contexts_s = get_contexts_sents(vocabulary, preprocessed_text_sents)
probs_s = get_vectors(vocabulary, contexts_s)
w = "empresa"
try:
    similar_words(w, probs_s, aux_path = 'prob_vectors_by_sents', dot_product = True, cosine = True)
    print('Similarity estimations of \033[1m' + w + '\033[0m completed\n')
except Exception as e:
    # str(e) is required: concatenating the exception object to a str raises TypeError
    print('An error has occured: ' + str(e) + ' in word ' + w)

Similarity estimations of [1mempresa[0m completed

