In [1]:
import dask.dataframe as dd
import pandas as pd
import glob
import re
import os

In [2]:
# Directory holding the corpus and every generated CSV. Kept relative to the
# notebook's working directory, like the other "data/..." paths used in this
# notebook, so the notebook is not tied to one machine's home directory.
path = "data"
all_files = glob.glob(path + "/english_sentences*.csv")

# Delete the CSV files left over from a previous run so that stale files are
# never mixed with the ones written by the export further down.
for file in all_files:
    # glob only returns existing paths; the check guards against a file
    # disappearing between the glob and the removal.
    if os.path.exists(file):
        os.remove(file)

In [3]:
# Read the tab-separated newspaper corpus with dask, then keep only the "Text"
# column of the rows whose "Language" value is "English".
ddf = dd.read_csv("data/old-newspaper.tsv", sep="\t")
df = ddf[ddf["Language"] == "English"][["Text"]]

In [4]:
# https://stackoverflow.com/questions/47812785/remove-empty-partitions-in-dask
def cull_empty_partitions(df):
    """Return ``df`` without its zero-length partitions.

    The length of every partition is computed eagerly. If no partition is
    empty, ``df`` is returned unchanged. Otherwise a new dask DataFrame is
    built from the delayed non-empty partitions, using the last empty
    partition as ``meta``.
    """
    partition_sizes = list(df.map_partitions(len).compute())
    delayed_parts = df.to_delayed()

    empty_positions = [pos for pos, size in enumerate(partition_sizes) if size == 0]
    if not empty_positions:
        return df

    non_empty_parts = [
        delayed_parts[pos] for pos, size in enumerate(partition_sizes) if size != 0
    ]
    return dd.from_delayed(non_empty_parts, meta=df.get_partition(empty_positions[-1]))

# Write the non-empty partitions to CSV; "*" in the file name is replaced per
# partition (see the file list in the cell output).
cull_empty_partitions(df).to_csv("data/english_sentences*.csv", index=False)

['/home/brunosd/Documents/Insper/7o_Semestre/NLP/Autocomplete/data/english_sentences0.csv',
 '/home/brunosd/Documents/Insper/7o_Semestre/NLP/Autocomplete/data/english_sentences1.csv',
 '/home/brunosd/Documents/Insper/7o_Semestre/NLP/Autocomplete/data/english_sentences2.csv',
 '/home/brunosd/Documents/Insper/7o_Semestre/NLP/Autocomplete/data/english_sentences3.csv',
 '/home/brunosd/Documents/Insper/7o_Semestre/NLP/Autocomplete/data/english_sentences4.csv']

In [5]:
# Re-list the partition files now that they have been recreated.
# - The pattern requires a digit right after the prefix, so only the numbered
#   partition files are read and a previously written "english_sentences.csv"
#   is never read back in.
# - sorted() makes the row order reproducible; glob returns files in
#   arbitrary order.
all_files = sorted(glob.glob(path + "/english_sentences[0-9]*.csv"))

# https://stackoverflow.com/questions/20906474/import-multiple-csv-files-into-pandas-and-concatenate-into-one-dataframe
# Read every partition file. A comprehension is used so that the global `df`
# (the dask frame used by the export cell) is not overwritten by a pandas frame.
li = [pd.read_csv(filename, index_col=None, header=0) for filename in all_files]

# Stack all partitions into a single frame with a fresh 0..n-1 index.
data = pd.concat(li, axis=0, ignore_index=True)

In [6]:
# Coerce every entry of the text column to its str() form
data["Text"] = data["Text"].map(str)

# Removing urls
def limpa_url(texto):
    """Return ``texto`` with every URL-like substring removed.

    A match starts at a word boundary with ``http://``/``https://``, a
    ``www.`` prefix, or a ``domain.tld/`` prefix, and is replaced by the empty
    string. Text around the match, including whitespace, is left as is.
    """
    # Regex taken from https://www.geeksforgeeks.org/python-check-url-string/
    url_regex = r"""
        (?i)  # Ignore case.
        \b  # Inicio de palavra.
        (?:
            https?://
        |
            www
            \d{0,3}
            [.]
        |
            [a-z0-9.\-]+
            [.]
            [a-z]{2,4}
            /
        )
        (?:
            [^\s()<>]+
        |
            \(
            (?:
                [^\s()<>]+
            |
                \(
                [^\s()<>]+
                \)
            )*
            \)
        )+
        (?:
            \(
            (?:
                [^\s()<>]+
            |
                \(
                [^\s()<>]+
                \)
            )*
            \)
        |
            [^\s`!()\[\]{};:'\".,<>?«»“”‘’]
        )
    """
    # The pattern is written in verbose mode, so re.VERBOSE is required.
    return re.sub(url_regex, "", texto, flags=re.VERBOSE)

# Strip URLs from every entry
data["Text"] = data["Text"].apply(limpa_url)

# Drop rows whose text contains any of: @ # $ % & * `
# .copy() makes the filtered frame independent, so the column assignments
# below do not raise SettingWithCopyWarning.
data = data[~data["Text"].str.contains(r"[\@\#\$\%\&\*\`]")].copy()

# Treat ? and ! as sentence terminators by turning them into "."
data["Text"] = data["Text"].str.replace(r"[?!]", ".", regex=True)

# Remove punctuation: keep only word characters, whitespace, apostrophes and "."
data["Text"] = data["Text"].str.replace(r"[^\w\s\'\.]", "", regex=True)

# Split on "." and explode so that every sentence gets its own row
data["Text"] = data["Text"].str.split(".")
data = data.explode("Text")

# Trim leading and trailing whitespace
data["Text"] = data["Text"].str.strip()

# Collapse tabs, newlines and runs of spaces into a single space.
# The earlier version passed the JavaScript-style literal "/\s\s+/g" to
# str.replace, which looks for that exact text and so never changed anything;
# a real regular expression is needed here.
data["Text"] = data["Text"].str.replace(r"\s+", " ", regex=True)

# Keep only non-empty sentences that have at least 8 space-separated words
data = data[(data["Text"] != "") & (data["Text"].str.split(" ").str.len() >= 8)].copy()

# Lowercase everything
data["Text"] = data["Text"].str.lower()

In [7]:
# Renumber the rows 0..n-1 and keep only the text column
data = data.reset_index(drop=True)[["Text"]]

# Preview the first rows
data.head(10)

Unnamed: 0,Text
0,workers had been making cars there since the o...
1,wsu's plans quickly became a hot topic on loca...
2,though most people applauded plans for the new...
3,and when it's often difficult to predict a law...
4,is it an issue serious enough to merit their a...
5,will it definitely not make the situation worse
6,there was a certain amount of scoffing going a...
7,its just another in a long line of failed atte...
8,the revel casino hit the jackpot here at gover...
9,but time and again in the report sullivan call...


In [8]:
# Persist the cleaned sentences, one per row, without the index column
data.to_csv(path_or_buf="data/english_sentences.csv", index=False)