## Preprocessing

In [1]:
import dask.dataframe as dd
import pandas as pd
import glob, re, os

In [2]:
# Data directory, relative to the notebook's working directory like every
# other "data/..." path in this notebook (no machine-specific absolute path).
path = "data"
all_files = glob.glob(path + "/english_sentences*.csv")

# Delete CSV files left over from a previous run to avoid errors when they
# are written again.
for file in all_files:
    if os.path.exists(file):
        os.remove(file)

In [3]:
# Read the tab-separated corpus with dask, then keep only the "Text" column
# of the rows whose "Language" is English.
ddf = dd.read_csv("data/old-newspaper.tsv", sep="\t")
is_english = ddf["Language"] == "English"
df = ddf[is_english][["Text"]]

In [4]:
# https://stackoverflow.com/questions/47812785/remove-empty-partitions-in-dask
def cull_empty_partitions(df):
    """Return ``df`` without its zero-length partitions.

    The length of every partition is computed up front. If none is empty the
    input is returned untouched; otherwise a new frame is assembled from the
    delayed objects of the non-empty partitions, using the last empty
    partition found as ``meta``.
    """
    partition_sizes = list(df.map_partitions(len).compute())
    delayed_parts = df.to_delayed()

    kept_parts = []
    empty_template = None
    for position, size in enumerate(partition_sizes):
        if size == 0:
            # Remember an empty partition: it serves as meta for the rebuild.
            empty_template = df.get_partition(position)
            continue
        kept_parts.append(delayed_parts[position])

    if empty_template is None:
        # Nothing to drop.
        return df
    return dd.from_delayed(kept_parts, meta=empty_template)

# Drop the empty partitions, then write the remaining ones to CSV files
# named after the "english_sentences*.csv" pattern, without the index.
english_partitions = cull_empty_partitions(df)
english_partitions.to_csv("data/english_sentences*.csv", index=False)

['/home/brunosd/Documents/Insper/7o_Semestre/NLP/Autocomplete/data/english_sentences0.csv',
 '/home/brunosd/Documents/Insper/7o_Semestre/NLP/Autocomplete/data/english_sentences1.csv',
 '/home/brunosd/Documents/Insper/7o_Semestre/NLP/Autocomplete/data/english_sentences2.csv',
 '/home/brunosd/Documents/Insper/7o_Semestre/NLP/Autocomplete/data/english_sentences3.csv',
 '/home/brunosd/Documents/Insper/7o_Semestre/NLP/Autocomplete/data/english_sentences4.csv']

In [5]:
# Re-list the partition files now that they have been recreated.
# "[0-9]*" keeps the match to the numbered partition files, so the final
# "english_sentences.csv" written later in this notebook is never re-read
# when this cell is run again. sorted() makes the row order reproducible,
# because glob returns files in arbitrary order.
all_files = sorted(glob.glob(path + "/english_sentences[0-9]*.csv"))

# https://stackoverflow.com/questions/20906474/import-multiple-csv-files-into-pandas-and-concatenate-into-one-dataframe
li = [pd.read_csv(filename, index_col=None, header=0) for filename in all_files]

# Single frame with a fresh 0..n-1 index
data = pd.concat(li, axis=0, ignore_index=True)

In [6]:
# Make sure every entry of the column is a Python string
data["Text"] = data["Text"].apply(str)

In [7]:
# Removing urls
def limpa_url(texto):
    """Return ``texto`` with every URL-like substring removed.

    A match starts at a word boundary with ``http://`` / ``https://``,
    a ``www.`` style prefix, or a ``domain.tld/`` prefix, and extends over the
    following non-space characters (allowing balanced parentheses). It must
    end on a character that is not trailing punctuation. Matching is case
    insensitive; each match is replaced by the empty string.
    """
    # Regex taken from https://www.geeksforgeeks.org/python-check-url-string/
    url_pattern = r"""
        (?i)  # Ignore case.
        \b  # Inicio de palavra.
        (?:
            https?://
        |
            www
            \d{0,3}
            [.]
        |
            [a-z0-9.\-]+
            [.]
            [a-z]{2,4}
            /
        )
        (?:
            [^\s()<>]+
        |
            \(
            (?:
                [^\s()<>]+
            |
                \(
                [^\s()<>]+
                \)
            )*
            \)
        )+
        (?:
            \(
            (?:
                [^\s()<>]+
            |
                \(
                [^\s()<>]+
                \)
            )*
            \)
        |
            [^\s`!()\[\]{};:'\".,<>?«»“”‘’]
        )
    """
    # VERBOSE lets the pattern above carry whitespace and inline comments.
    return re.sub(url_pattern, "", texto, flags=re.VERBOSE)

# Strip URLs from every sentence
data["Text"] = data["Text"].apply(limpa_url)

In [8]:
# Dropping every sentence that contains one of: @ # $ % & * `
has_special_symbol = data["Text"].str.contains(r"[\@\#\$\%\&\*\`]")
data = data[~has_special_symbol]

In [9]:
def _normalise_punctuation(text):
    """Apply the three punctuation rewrites of this cell to one sentence."""
    # "?" and "!" become "." so all sentence ends look the same
    text = text.replace("?", ".").replace("!", ".")
    # Drop all punctuation except apostrophes and full stops
    text = re.sub(r"[^\w\s\'\.]", "", text)
    # Apostrophes become dashes so contractions stay one word (don't, i'm)
    return text.replace("\'", "-")


data["Text"] = data["Text"].apply(_normalise_punctuation)

In [10]:
# Cut each text at every "." (giving a list per row), then explode the lists
# so that each fragment gets its own row
data["Text"] = data["Text"].apply(str.split, args=(".",))
data = data.explode("Text")

In [11]:
# Strip leading/trailing whitespace, then collapse tabs, newlines and runs of
# spaces into one space. str.replace only does literal substitution, so a
# real regex substitution (re.sub) is required for this.
data["Text"] = data["Text"].apply(lambda t: re.sub(r"\s+", " ", t.strip()))

# Keep only non-empty sentences with 7 to 10 words (inclusive)
word_counts = data["Text"].str.split(" ").str.len()
data = data[(data["Text"] != "") & (word_counts >= 7) & (word_counts <= 10)]

In [12]:
# Flag the sentences that contain at least one digit character, then keep
# only the unflagged ones
def _contains_digit(text):
    """True when any character of ``text`` is a digit."""
    return any(map(str.isdigit, text))


data["hasNumbers"] = data["Text"].apply(_contains_digit)
data = data.loc[data["hasNumbers"] == False]

In [13]:
# Convert every sentence to lower case
data["Text"] = data["Text"].apply(str.lower)

In [14]:
# Renumber the rows from 0 and keep only the text column
data = data.reset_index(drop=True)[["Text"]]

# Visualize data
data.head(10)

Unnamed: 0,Text
0,is it an issue serious enough to merit their a...
1,will it definitely not make the situation worse
2,the revel casino hit the jackpot here at gover...
3,im just up there trying to make good contact
4,let your hair down it looks better
5,two greek words philautos and philargyros succ...
6,the protester seeks to curb the second phenomenon
7,comad with the harried office worker and the c...
8,childwellbeing indicators show disparities fro...
9,there are health disparities in infant mortali...


In [16]:
# Persist the cleaned sentences without the index column.
# This file name also matches the "english_sentences*.csv" glob used by the
# cleanup cell at the top, so that cell deletes it on a re-run.
data.to_csv(path_or_buf="data/english_sentences.csv", index=False)

## NLTK

In [17]:
# The CSV was written by pandas (UTF-8 by default), so read it back as UTF-8
# explicitly instead of relying on the platform's locale encoding.
with open("data/english_sentences.csv", "r", encoding="utf-8") as file:
    # Skip the header row; the default keeps an empty file from raising.
    next(file, None)
    # One entry per remaining line (trailing newline included)
    sentences = list(file)

In [18]:
from nltk import word_tokenize, FreqDist
from nltk.util import ngrams
from nltk.corpus import stopwords

unigrams = []
bigrams = []
trigrams = []
fourgrams = []
tokenized = []

stop_words = set(stopwords.words("english"))

for sentence in sentences:
    # Filter the "." tokens into a new list: removing items from a list
    # while iterating over it makes the loop skip the following token.
    words = [word for word in word_tokenize(sentence) if word != "."]

    # Everything below runs once per sentence. Running it once per word
    # would add the same sentence's n-grams len(words) times and inflate
    # every count.
    unigrams.extend(words)
    tokenized.append(words)

    bigrams.extend(ngrams(words, 2))
    trigrams.extend(ngrams(words, 3))
    fourgrams.extend(ngrams(words, 4))

In [19]:
def remove_stopwords(ngram: list):
    """Drop the n-grams that consist solely of stop words.

    ``ngram`` is an iterable of word sequences. A sequence is kept when at
    least one of its words is absent from the notebook-level ``stop_words``
    set; empty sequences are dropped. The result is a new list in the
    original order.
    """
    return [
        sequence
        for sequence in ngram
        if any(word not in stop_words for word in sequence)
    ]

# Discard the stop-word-only sequences from each n-gram list
bigrams, trigrams, fourgrams = (
    remove_stopwords(grams) for grams in (bigrams, trigrams, fourgrams)
)

## Probabilidades e Smoothing

Probabilidade de uma palavra no texto:

$\large{P(palavra) = \frac{n_{ocorrencias}}{N_{palavras}}}$

Se a palavra não aparece no texto, sua probabilidade seria zero. Para evitar isso, soma-se 1 à contagem de cada palavra do vocabulário (Laplace smoothing). Como somamos 1 para cada palavra única, o denominador precisa crescer em $N_{unicas}$, o número de palavras distintas do vocabulário, enquanto $N_{palavras}$ continua sendo o total de ocorrências no texto.

$\large{P(palavra) = \frac{n_{ocorrencias} + 1}{N_{palavras} + N_{unicas}}}$

In [40]:
from collections import Counter

# nK: occurrence count of every K-gram; sK: the distinct K-grams
n1, n2, n3, n4 = (
    Counter(grams) for grams in (unigrams, bigrams, trigrams, fourgrams)
)
s1, s2, s3, s4 = (
    set(grams) for grams in (unigrams, bigrams, trigrams, fourgrams)
)

In [41]:
# Laplace (add-one) smoothing: (count + 1) / (N_total + N_unique).
# N_total is the total number of unigram occurrences, i.e. the sum of all
# counts; len(n1) is only the number of distinct unigrams (same as len(s1)).
total1 = sum(n1.values())
p1 = [[n, (n1[n] + 1) / (total1 + len(s1))] for n in s1]
# Most probable first
p1.sort(key=lambda w: w[1], reverse=True)

In [42]:
# Laplace (add-one) smoothing: (count + 1) / (N_total + N_unique).
# N_total is the total number of bigram occurrences, i.e. the sum of all
# counts; len(n2) is only the number of distinct bigrams (same as len(s2)).
total2 = sum(n2.values())
p2 = [[n, (n2[n] + 1) / (total2 + len(s2))] for n in s2]
# Most probable first
p2.sort(key=lambda w: w[1], reverse=True)

In [43]:
# Laplace (add-one) smoothing: (count + 1) / (N_total + N_unique).
# N_total is the total number of trigram occurrences, i.e. the sum of all
# counts; len(n3) is only the number of distinct trigrams (same as len(s3)).
total3 = sum(n3.values())
p3 = [[n, (n3[n] + 1) / (total3 + len(s3))] for n in s3]
# Most probable first
p3.sort(key=lambda w: w[1], reverse=True)

In [44]:
# Laplace (add-one) smoothing: (count + 1) / (N_total + N_unique).
# N_total is the total number of four-gram occurrences, i.e. the sum of all
# counts; len(n4) is only the number of distinct four-grams (same as len(s4)).
total4 = sum(n4.values())
p4 = [[n, (n4[n] + 1) / (total4 + len(s4))] for n in s4]
# Most probable first
p4.sort(key=lambda w: w[1], reverse=True)

## Prediction

In [79]:
example = "buy"

def return_prediction(inp, n_words, model=2):
    """Suggest the most probable next words for ``inp``.

    Parameters
    ----------
    inp : str
        Text typed so far. It is lower-cased and its apostrophes are turned
        into dashes before tokenizing, mirroring the preprocessing applied
        to the training sentences.
    n_words : int
        Number of suggestions to return.
    model : int, default 2
        Size of the n-gram model: 2 uses ``p2``, 3 uses ``p3``, 4 uses ``p4``.

    Returns
    -------
    list of str or str
        A list of ``n_words`` entries: the last word of each matching n-gram
        in the order of the probability table (sorted most probable first),
        padded with "N/A" when there are fewer matches. When ``inp`` has
        fewer than ``model - 1`` tokens, an explanatory message string is
        returned instead.

    Raises
    ------
    ValueError
        If ``model`` is not 2, 3 or 4 and ``inp`` has enough tokens.
    """
    # Probability table for the requested n-gram size
    if model == 2:
        p = p2
    elif model == 3:
        p = p3
    elif model == 4:
        p = p4
    else:
        p = None

    # The training text is lower case with "'" replaced by "-"; apply the
    # same normalisation so the typed words can match the stored n-grams.
    tokens = word_tokenize(inp.lower().replace("'", "-"))

    if len(tokens) < model - 1:
        return f"Could not form a n-gram of size {model} with given input."

    if p is None:
        raise ValueError(f"Unsupported model size {model}; expected 2, 3 or 4.")

    # The last (model - 1) words are the context to complete
    context = tuple(tokens[-(model - 1):])

    pred = []
    for sequence, _probability in p:
        if len(pred) >= n_words:
            break
        # All words but the last must equal the context
        if sequence[:-1] == context:
            # Map the dash back to an apostrophe for display
            pred.append(sequence[-1].replace("-", "'"))

    # Pad up to the number of suggestions requested (not a fixed 5)
    pred += ["N/A"] * (n_words - len(pred))

    return pred

# Ask the trigram model for five suggestions for the example input
pred = return_prediction(
    example,
    n_words=5,
    model=3,
)
print(pred)

Could not form a n-gram of size 3 with given input.
