In [1]:
import dask.dataframe as dd
import pandas as pd
import glob
import re
import os

In [2]:
# Folder holding the notebook's inputs and outputs, relative to the working
# directory -- the same "data/" prefix the other cells read from and write to.
path = "data"
all_files = glob.glob(os.path.join(path, "english_sentences*.csv"))

# Delete CSV files left by a previous run to avoid errors when overwriting.
# NOTE: `all_files` is a snapshot taken *before* the deletion, so after this
# loop it names files that no longer exist.
for file in all_files:
    try:
        os.remove(file)
    except FileNotFoundError:
        # Already gone (removed between the glob and now); nothing to clean up.
        pass

In [3]:
# Lazily load the tab-separated corpus with dask
ddf = dd.read_csv("data/old-newspaper.tsv", sep="\t")

# Keep only the rows labelled as English, and only their "Text" column
is_english = ddf["Language"] == "English"
df = ddf[is_english][["Text"]]

In [None]:
# https://stackoverflow.com/questions/47812785/remove-empty-partitions-in-dask
def cull_empty_partitions(df):
    """Return `df` without its zero-row partitions.

    The length of every partition is computed eagerly. When none of them is
    empty, `df` itself is returned unchanged. Otherwise a new collection is
    built with `dd.from_delayed` from the non-empty partitions only, passing
    the last empty partition of `df` as `meta`.
    """
    sizes = list(df.map_partitions(len).compute())
    parts = df.to_delayed()

    empty_positions = [pos for pos, size in enumerate(sizes) if size == 0]
    if not empty_positions:
        # Nothing to remove: hand back the very same object.
        return df

    kept = [parts[pos] for pos, size in enumerate(sizes) if size != 0]
    template = df.get_partition(empty_positions[-1])
    return dd.from_delayed(kept, meta=template)

# Drop the empty partitions, then export the rest using the "*" file-name
# pattern (no index column in the output).
non_empty_df = cull_empty_partitions(df)
non_empty_df.to_csv("data/english_sentences*.csv", index=False)

In [None]:
# https://stackoverflow.com/questions/20906474/import-multiple-csv-files-into-pandas-and-concatenate-into-one-dataframe

# Re-scan the output folder here: the `all_files` list built in the clean-up
# cell was taken before those files were deleted, so it does not describe the
# partitions written by the dask export. The "[0-9]" in the pattern keeps the
# final "english_sentences.csv" out of the list (assumes dask numbers the
# partition files 0, 1, 2, ... -- its documented default).
all_files = sorted(glob.glob("data/english_sentences[0-9]*.csv"))
if not all_files:
    raise FileNotFoundError(
        "No partition files matching data/english_sentences[0-9]*.csv; "
        "run the dask export cell first."
    )

# One pandas frame per partition file. A comprehension avoids rebinding `df`,
# which still has to refer to the dask frame if the export cell is re-run.
li = [pd.read_csv(filename, index_col=None, header=0) for filename in all_files]

data = pd.concat(li, axis=0, ignore_index=True)

In [None]:
# Coerce every entry of "Text" to a Python string via str()
data["Text"] = data["Text"].map(str)

In [None]:
# Drop rows whose text contains any of: @ # $ % & * `
has_symbol = data["Text"].str.contains(r"[\@\#\$\%\&\*\`]")
# .copy() makes the filtered frame independent, so the column assignment
# below does not raise pandas' SettingWithCopyWarning.
data = data[~has_symbol].copy()

# Replace every "?" and "!" with "." so that "." is the only sentence terminator
data["Text"] = data["Text"].str.replace(r"[?!]", ".", regex=True)

In [None]:
# Remove every character that is not a word character, whitespace,
# an apostrophe or a period
no_punct = data["Text"].str.replace(r"[^\w\s\'\.]", "", regex=True)

# Split on "." and give each resulting piece its own row.
# assign() builds a new frame rather than writing into a filtered one.
data = data.assign(Text=no_punct.str.split(".")).explode("Text")

# drop=True discards the old (now duplicated) row labels instead of storing
# them in an "index" column; without it, running this cell a second time
# fails because that column already exists.
data = data.reset_index(drop=True)

In [None]:
# Keep sentences that split into at least 8 pieces on single spaces.
# NOTE(review): split(" ") also counts empty pieces produced by leading or
# repeated spaces, so this is a token count rather than a strict word count.
token_counts = data["Text"].str.split(" ").str.len()

# Apply the length filter and keep only the "Text" column (drops the index column)
data = data.loc[token_counts >= 8, ["Text"]]

# Preview the first rows
data.head(10)

In [None]:
# Write the cleaned sentences to a single CSV, without the row index
data.to_csv(path_or_buf="data/english_sentences.csv", index=False)