# Preprocessing, Entropy and Mutual Information between words

This notebook develops the preprocessing pipeline and the logic behind the entropy and mutual information estimations, which are then applied to our corpus of newspaper texts.

## Preprocessing: Normalization

First, we are going to define our functions to prepare our data before looking for entropy and mutual information estimations.

In [1]:
import nltk
from bs4 import BeautifulSoup
import re

In [2]:
def extract_text_from_corpus(path):
    """Read every file under ``path`` and return the combined, cleaned text.

    The file contents are concatenated in the order reported by the NLTK
    corpus reader, markup is stripped with BeautifulSoup (``lxml`` parser)
    and the result is lower-cased.

    Parameters
    ----------
    path : str
        Directory holding the corpus files. A trailing path separator is
        optional.

    Returns
    -------
    str
        Lower-cased text of the whole corpus with markup removed.
    """
    import os

    # Getting corpus from the directory
    corpus = nltk.corpus.PlaintextCorpusReader(path, '.*')

    # Reuniting all text content from files on the directory
    contents = []
    for file_id in corpus.fileids():
        # os.path.join gives the same result as `path + file_id` when `path`
        # ends with a separator, and also works when it does not.
        with open(os.path.join(path, file_id), encoding = 'utf-8') as rfile:
            contents.append(rfile.read())
    all_text = ''.join(contents)

    # Cleaning HTML tags
    soup = BeautifulSoup(all_text, 'lxml')
    return soup.get_text().lower()

In [3]:
def tokenize_by_words(text):
    """Split ``text`` on whitespace and keep only the alphabetic part of each chunk.

    Every character that is not a lower-case Spanish letter
    (``a-z``, ``á é í ó ú ñ ü``) is removed from each whitespace-separated
    chunk; chunks that end up empty are dropped. Upper-case letters are
    removed as well, so the text should be lower-cased beforehand.

    Parameters
    ----------
    text : str
        Text to tokenize.

    Returns
    -------
    list of str
        The non-empty alphabetic tokens, in order of appearance.
    """
    # The original character class also contained '+' and '$' (a misplaced
    # "+$" quantifier/anchor), which let those symbols leak into the tokens.
    non_alphabetic = re.compile(r'[^a-záéíóúñü]')

    alphabetic_words = []
    for word in text.split():
        token = non_alphabetic.sub('', word)
        if token:
            alphabetic_words.append(token)

    return alphabetic_words

def tokenize_by_sents(text):
    """Split ``text`` into sentences and tokenize each sentence into words.

    Sentence boundaries come from the NLTK resource
    ``tokenizers/punkt/spanish.pickle``; every sentence is then reduced to
    its alphabetic tokens with ``tokenize_by_words``.

    Parameters
    ----------
    text : str
        Text to split.

    Returns
    -------
    list of list of str
        One list of word tokens per detected sentence (possibly empty lists).
    """
    sentence_splitter = nltk.data.load("tokenizers/punkt/spanish.pickle")
    return [tokenize_by_words(sentence) for sentence in sentence_splitter.tokenize(text)]

In [4]:
def remove_stop_words_from_sents(sents, path = './stopwords_es.txt'):
    """Drop stop words from every tokenized sentence.

    Parameters
    ----------
    sents : iterable of iterable of str
        Tokenized sentences.
    path : str, optional
        UTF-8 text file with one stop word per line; surrounding whitespace
        on each line is ignored.

    Returns
    -------
    list of list of str
        The sentences, in the same order, without the stop words. Sentences
        that become empty are kept as empty lists.
    """
    with open(path, encoding = 'utf-8') as f:
        # A set gives constant-time membership tests; the original list made
        # every lookup scan the whole stop-word list.
        stop_words = {line.strip() for line in f}

    return [[word for word in sent if word not in stop_words] for sent in sents]

In [5]:
def lemmatize_from_sents(text, path = './generate.txt'):
    """Replace every word of every sentence by its lemma, when one is known.

    The lemma dictionary is read from ``path`` (latin-1 encoded). For each
    non-blank line, the first whitespace-separated field, with every ``#``
    removed, is the word form and the last field is its lemma. When a form
    appears on several lines, the last line wins.

    Parameters
    ----------
    text : iterable of iterable of str
        Tokenized sentences.
    path : str, optional
        Location of the lemma dictionary file.

    Returns
    -------
    list of list of str
        The sentences with known forms replaced by their lemma; unknown
        words are kept unchanged.
    """
    lemmas = {}
    with open(path, encoding = 'latin1') as file:
        for line in file:
            fields = line.split()
            if not fields:
                # Blank (or whitespace-only) line.
                continue
            token = fields[0].replace('#', '')
            lemmas[token] = fields[-1]

    # dict.get falls back to the word itself when no lemma is known.
    return [[lemmas.get(word, word) for word in sent] for sent in text]

In [6]:
def normalize_by_sents(path):
    """Run the sentence-level normalization pipeline on the corpus at ``path``.

    Steps, in order: text extraction, sentence/word tokenization, stop-word
    removal and lemmatization. The stop-word and lemma files are read from
    the default locations of the respective helper functions.

    Parameters
    ----------
    path : str
        Directory holding the corpus files.

    Returns
    -------
    list of list of str
        The normalized sentences, each one a list of lemmatized words.
    """
    sentences = tokenize_by_sents(extract_text_from_corpus(path))
    sentences = remove_stop_words_from_sents(sentences)
    return lemmatize_from_sents(text = sentences)

In [7]:
def preprocessing(path = './../EXCELSIOR_100_files/', by_words = True, by_sents = False):
    """Normalize the corpus at ``path`` by word tokens or by sentence tokens.

    ``by_words`` takes precedence: ``by_sents`` is only considered when
    ``by_words`` is falsy.

    Parameters
    ----------
    path : str, optional
        Directory holding the corpus files.
    by_words : bool, optional
        Normalize with ``normalize_by_words``.
    by_sents : bool, optional
        Normalize with ``normalize_by_sents``.

    Returns
    -------
    The normalized text, or ``None`` when neither flag is set or when any
    exception is raised (the exception is printed, not propagated).
    """
    if not by_words and not by_sents:
        return None

    try:
        if by_words:
            # NOTE(review): normalize_by_words is not defined in the visible
            # part of this notebook - confirm it exists; otherwise this branch
            # only prints a NameError message and returns None.
            normalized = normalize_by_words(path)
            print('\033[1mNormalization by word tokens completed\033[0m')
        else:
            normalized = normalize_by_sents(path)
            print('\033[1mNormalization by sentence tokens completed\033[0m')
        return normalized
    except Exception as e:
        print('An error has occured: ', e)

#### Preprocessing with sentence tokens

In [8]:
# Run the sentence-level pipeline on the default corpus path.
# preprocessing() prints the error and returns None if anything fails.
preprocessed_text_sents = preprocessing(by_words = False, by_sents = True)

[1mNormalization by sentence tokens completed[0m


Next, we are going to build a vocabulary from the words of the preprocessed sentences.

In [9]:
# Inspect the preprocessed sentences (displays the whole nested list).
preprocessed_text_sents

[['emodhtm',
  'httpwwwexcelsiorcommxarthtml',
  'excelsior',
  'editorial',
  'martes',
  'abril',
  'monstruoso',
  'diferencia',
  'colosistas',
  'colosismo',
  'luis',
  'gutierrez',
  'gonzalez',
  'luis',
  'gutiérrez',
  'sotomayor',
  'federico',
  'arreola',
  'colosistas',
  'cabal',
  'según',
  'decir',
  'amigo',
  'luis',
  'donaldo'],
 ['ciertamente',
  'nombre',
  'circunstancia',
  'luis',
  'donaldo',
  'colosio',
  'llenar',
  'insistentemente',
  'volumen',
  'espacio',
  'medio',
  'comunicación'],
 ['renovar',
  'actualidad',
  'padecer',
  'frenético',
  'vaivén',
  'ficción',
  'judicial',
  'política',
  'integrar',
  'disgregar',
  'metafísicas',
  'metafísicas',
  'aún',
  'luis',
  'donaldo',
  'desprender',
  'envolver',
  'lado',
  'espejo'],
 ['dos',
  'año',
  'eterno',
  'insolvencias',
  'dar',
  'dar',
  'fantasía',
  'magia',
  'dónde',
  'quedar',
  'bolita',
  'traer',
  'pueblo',
  'hastío',
  'cansancio'],
 ['inminencia',
  'percibir',
  'ir',
 

In [10]:
def make_vocabulary(text):
    """Return the sorted list of distinct words found in ``text``.

    Parameters
    ----------
    text : iterable of iterable of str
        Tokenized sentences.

    Returns
    -------
    list of str
        Every distinct word exactly once, in ascending order.
    """
    distinct_words = {word for sent in text for word in sent}
    return sorted(distinct_words)

In [11]:
# Sorted list of the distinct words in the preprocessed sentences.
vocabulary = make_vocabulary(preprocessed_text_sents)

In [12]:
# Inspect the vocabulary (displays the whole list).
vocabulary

['abandonar',
 'abandono',
 'abarcar',
 'abarrotar',
 'abascal',
 'abasto',
 'abatir',
 'abbasso',
 'abdicar',
 'aberrantes',
 'abierto',
 'abitia',
 'abono',
 'abordaje',
 'aborigen',
 'abraham',
 'abrego',
 'abril',
 'abrir',
 'absalón',
 'absoluto',
 'absuelto',
 'absurdamente',
 'absurdo',
 'abuelo',
 'abundante',
 'abundar',
 'aburto',
 'abusar',
 'abuso',
 'ac',
 'acabado',
 'acabar',
 'acaparar',
 'acapulco',
 'acarreo',
 'acaso',
 'accesible',
 'acceso',
 'accidente',
 'accionar',
 'accionista',
 'acción',
 'acelerar',
 'acento',
 'acentuado',
 'aceptabilidad',
 'aceptar',
 'acercar',
 'acero',
 'acertado',
 'acierto',
 'aclaración',
 'aclarar',
 'acompañado',
 'acompañamiento',
 'acompañar',
 'aconsejable',
 'aconsejar',
 'acontecimiento',
 'acopio',
 'acoplamiento',
 'acoplar',
 'acorde',
 'acotar',
 'acrecentar',
 'acreditar',
 'acreedor',
 'actitud',
 'activar',
 'actividad',
 'activo',
 'acto',
 'actor',
 'actriz',
 'actuación',
 'actual',
 'actualidad',
 'actualizar',
 'a

In [13]:
# Number of distinct words in the vocabulary.
len(vocabulary)

5471

## Entropy and Mutual Information

Once the preprocessing is complete, it is time to define the functions that estimate the probability that a certain word appears in a sentence, the conditional probability that a word appears given that another word appears in the same sentence, and the joint probability of two words appearing in the same sentence. With those functions we will be able to calculate the entropy and the mutual information between words.

In [14]:
import numpy as np

In [15]:
def probability_word(text, word, ocurs = True):
    """Smoothed probability that ``word`` does (or does not) occur in a sentence.

    The estimate is ``(count + 0.5) / (N + 1)``, where ``count`` is the number
    of sentences containing ``word`` and ``N`` the number of sentences.

    Parameters
    ----------
    text : sequence of sequences of str
        Tokenized sentences.
    word : str
        Word whose occurrence is measured.
    ocurs : bool, optional
        If truthy return the probability of occurrence, otherwise its
        complement.

    Returns
    -------
    float
    """
    hits = sum(1 for sent in text if word in sent)
    smoothed = (hits + 0.5) / (len(text) + 1)
    return smoothed if ocurs else 1 - smoothed

def joint_probability(text, word1, word2, ocurs1 = True, ocurs2 = True):
    """Smoothed joint probability of the occurrence states of two words.

    Counts the sentences in which ``word1`` is present (``ocurs1`` truthy) or
    absent (``ocurs1`` falsy) and, likewise, ``word2`` is present or absent
    according to ``ocurs2``. The estimate is ``(count + 0.25) / (N + 1)``,
    with ``N`` the number of sentences.

    Parameters
    ----------
    text : sequence of sequences of str
        Tokenized sentences.
    word1, word2 : str
        The two words.
    ocurs1, ocurs2 : bool, optional
        Requested occurrence state of each word.

    Returns
    -------
    float
    """
    wanted1 = bool(ocurs1)
    wanted2 = bool(ocurs2)

    matches = 0
    for sent in text:
        if (word1 in sent) == wanted1 and (word2 in sent) == wanted2:
            matches += 1

    return (matches + 0.25) / (len(text) + 1)

def conditional_probability(text, word1, word2, ocurs1 = True, ocurs2 = True):
    """Probability of the state of ``word2`` given the state of ``word1``.

    Computed as ``joint_probability(...) / probability_word(text, word1, ocurs1)``.

    Parameters
    ----------
    text : sequence of sequences of str
        Tokenized sentences.
    word1 : str
        Conditioning word.
    word2 : str
        Word whose state is predicted.
    ocurs1, ocurs2 : bool, optional
        Occurrence state considered for each word.

    Returns
    -------
    float
        The conditional probability, or ``0`` when the probability of the
        conditioning state is zero.
    """
    marginal = probability_word(text, word1, ocurs1)
    if marginal == 0:
        return 0
    return joint_probability(text, word1, word2, ocurs1, ocurs2) / marginal
 
def entropy(text, word1, word2):
    """Conditional entropy, in bits, of the occurrence of ``word2`` given ``word1``.

    Computes ``sum_i P(word1=i) * sum_j -P(word2=j | word1=i) * log2 P(word2=j | word1=i)``
    over the occurrence states ``i, j`` in ``{True, False}``.

    Parameters
    ----------
    text : sequence of sequences of str
        Tokenized sentences.
    word1 : str
        Conditioning word.
    word2 : str
        Word whose uncertainty is measured.

    Returns
    -------
    numpy.float64
    """
    states = (True, False)
    total = 0
    for state1 in states:
        inner = 0
        for state2 in states:
            # Evaluate once: every call rescans the whole corpus, and the
            # original computed this same value twice per state pair.
            conditional = conditional_probability(text, word1, word2, state1, state2)
            inner = inner + (-conditional * np.log2(conditional))
        total = total + (probability_word(text, word1, state1) * inner)
    return total

def get_entropies(text, word1, vocabulary):
    """Compute ``entropy(text, word1, w)`` for every ``w`` in ``vocabulary`` and save it.

    The ``(word, value)`` pairs are sorted by value in descending order and
    written, one tuple per line, to
    ``./entropies/entropies_of_words_with_<word1>.txt`` (UTF-8). The output
    directory is created when it does not exist.

    Parameters
    ----------
    text : sequence of sequences of str
        Tokenized sentences.
    word1 : str
        Word every vocabulary entry is compared against; also part of the
        output file name.
    vocabulary : iterable of str
        Words to evaluate.

    Returns
    -------
    None
    """
    import os

    scores = {w: entropy(text, word1, w) for w in vocabulary}
    ranked = sorted(scores.items(), key = lambda item: item[1], reverse = True)

    out_dir = './entropies'
    # Without this, open() fails with FileNotFoundError on a fresh checkout.
    os.makedirs(out_dir, exist_ok = True)

    with open(out_dir + '/entropies_of_words_with_' + word1 + '.txt', 'w', encoding = 'utf-8') as f:
        for word, value in ranked:
            # float() keeps the written number a plain literal; the repr of a
            # NumPy scalar inside a tuple depends on the NumPy version.
            f.write(str((word, float(value))) + '\n')
            
def mutual_information(text, word1, word2):
    """Mutual information, in bits, between the occurrences of two words.

    Computes ``sum_{i,j} P(i, j) * log2(P(i, j) / (P(word1=i) * P(word2=j)))``
    over the occurrence states ``i, j`` in ``{True, False}``.

    Parameters
    ----------
    text : sequence of sequences of str
        Tokenized sentences.
    word1, word2 : str
        The two words.

    Returns
    -------
    numpy.float64
    """
    states = (True, False)

    # The marginals of word2 do not depend on the state of word1: compute
    # them once instead of on every inner iteration.
    marginals2 = {state: probability_word(text, word2, state) for state in states}

    total = 0
    for state1 in states:
        marginal1 = probability_word(text, word1, state1)
        for state2 in states:
            # Evaluate once: each call rescans the corpus, and the original
            # computed this same value twice per state pair.
            joint = joint_probability(text, word1, word2, state1, state2)
            total = total + joint * np.log2(joint / (marginal1 * marginals2[state2]))
    return total

def get_mutual_informations(text, word1, vocabulary):
    """Compute ``mutual_information(text, word1, w)`` for every ``w`` in ``vocabulary`` and save it.

    The ``(word, value)`` pairs are sorted by value in descending order and
    written, one tuple per line, to
    ``./mutual_informations/mutual_information_of_words_with_<word1>.txt``
    (UTF-8). The output directory is created when it does not exist.

    Parameters
    ----------
    text : sequence of sequences of str
        Tokenized sentences.
    word1 : str
        Word every vocabulary entry is compared against; also part of the
        output file name.
    vocabulary : iterable of str
        Words to evaluate.

    Returns
    -------
    None
    """
    import os

    scores = {w: mutual_information(text, word1, w) for w in vocabulary}
    ranked = sorted(scores.items(), key = lambda item: item[1], reverse = True)

    out_dir = './mutual_informations'
    # Without this, open() fails with FileNotFoundError on a fresh checkout.
    os.makedirs(out_dir, exist_ok = True)

    with open(out_dir + '/mutual_information_of_words_with_' + word1 + '.txt', 'w', encoding = 'utf-8') as f:
        for word, value in ranked:
            # float() keeps the written number a plain literal; the repr of a
            # NumPy scalar inside a tuple depends on the NumPy version.
            f.write(str((word, float(value))) + '\n')

#### Entropy estimations from preprocessed text with sentence tokens

In [16]:
# Entropy of every vocabulary word with respect to the target word `w`.
w = "empresa"
try:
    get_entropies(preprocessed_text_sents, w, vocabulary)
    print('Entropy estimations of \033[1m' + w + '\033[0m completed\n')
except Exception as e:
    # str(e) is required: concatenating the exception object itself to a
    # string raises TypeError and hides the original error.
    print('An error has occured: ' + str(e) + ' in word ' + w)

Entropy estimations of [1mempresa[0m completed



#### Mutual information estimations from preprocessed text with sentence tokens

In [17]:
# Mutual information of every vocabulary word with the target word `w`.
w = "empresa"
try:
    get_mutual_informations(preprocessed_text_sents, w, vocabulary)
    print('MI estimations of \033[1m' + w + '\033[0m completed\n')
except Exception as e:
    # str(e) is required: concatenating the exception object itself to a
    # string raises TypeError and hides the original error.
    print('An error has occured: ' + str(e) + ' in word ' + w)

MI estimations of [1mempresa[0m completed

