In [1]:
import re
import pandas as pd
import spacy
import numpy as np

In [2]:
# Load the English pipeline with the "ner" and "parser" components disabled;
# only token-level attributes (lemma, stop-word flag) are used below.
# NOTE(review): the 'en' shortcut name only resolves on spaCy 2.x installs that
# have a shortcut link; spaCy 3.x needs a full package name — confirm which
# model 'en' points to before changing it.
nlp = spacy.load('en', disable=["ner", "parser"])

In [3]:
def cleaning(doc):
    """
    Lemmatize a processed document and drop its stopwords.

    :param doc: spacy Doc object processed by the pipeline
    :return: the lemmas of all non-stopword tokens joined by single spaces,
        or None when two or fewer such tokens remain
    """
    lemmas = []
    for token in doc:
        if token.is_stop:
            continue
        lemmas.append(token.lemma_)

    # Very short documents bring little benefit to training, so they are
    # rejected by returning None.
    if len(lemmas) <= 2:
        return None
    return ' '.join(lemmas)

In [4]:
from sqlitedict import SqliteDict

In [5]:
# Source store of raw records (table "value"), opened with flag="r"
# (sqlitedict's read-only mode). Each value is indexed below by "text" and
# "timestamp".
pua = SqliteDict("./../Sqlite/split_texts/center.sqlite", tablename="value", flag="r")

In [6]:
# Output store for the cleaned records (table "value"), created in the current
# working directory. journal_mode="OFF" is handed to SQLite to disable its
# rollback journal — NOTE(review): faster bulk writes, but an interrupted run
# can leave the file corrupt; confirm this trade-off is intended.
pua_clean = SqliteDict("center.sqlite_clean.sqlite", tablename="value", journal_mode="OFF")

In [7]:
# Lazy first-pass normalisation of every stored text: each run of characters
# other than ASCII letters and apostrophes becomes a single space, then the
# result is lower-cased. This is a generator, so it can be consumed only once.
brief_cleaning = (
    re.sub("[^A-Za-z']+", ' ', str(record["text"])).lower()
    for record in pua.values()
)

In [8]:
# Run the normalised texts through the spaCy pipeline in batches and lemmatize
# each resulting Doc; entries are None for documents cleaning() rejects as too
# short.
# `n_threads` was removed from the call: it is deprecated and ignored since
# spaCy 2.1 and is no longer accepted by `nlp.pipe` in spaCy 3.x.
# `brief_cleaning` is a one-shot generator: rebuild it before re-running this
# cell, otherwise `txt` comes back empty.
txt = [cleaning(doc) for doc in nlp.pipe(brief_cleaning, batch_size=32)]

In [9]:
# Pair every cleaned text with its source record and store it under the same
# key. `txt` was produced from pua.values(), so this relies on pua.items()
# iterating in that same order. Reading key and value together avoids a second
# lookup (and unpickle) per key just to fetch the timestamp.
# `text` may be None for documents cleaning() rejected as too short; those
# records are still written.
for text, (id_c, value) in zip(txt, pua.items()):
    pua_clean[id_c] = {"text": text, "timestamp": value["timestamp"]}

In [10]:
# Persist the records written to the output store, then close it.
# pua_clean cannot be used after this point without re-opening it.
pua_clean.commit()
pua_clean.close()

In [11]:
# Single-column frame of the cleaned texts; rows are None where cleaning()
# rejected the document as too short.
df_clean = pd.DataFrame({'clean': txt})

In [12]:
# Remove the None rows (rejected short documents) and exact duplicate texts.
# The index is not reset, so the surviving rows keep their original positions
# and the index has gaps.
df_clean = df_clean.dropna().drop_duplicates()

In [13]:
# Show the deduplicated frame as a visual sanity check.
df_clean

Unnamed: 0,clean
0,original interview available free want watch
3,good reason people trust computer human being ...
4,onwards ideological nonsense tell exactly go...
5,yeah kinda like fee robo adviser charge
7,haha love jab trump second
8,man real national treasure
9,guy voice year old
10,thank st jack lose father young age adopt jack...
11,sound young interesting
13,follow jack advice year sure glad listen mr bo...


In [14]:
# Serialize the cleaned frame to disk.
# NOTE(review): to_pickle writes pandas' pickle format even though the file name
# ends in ".csv" — load it with pd.read_pickle, not pd.read_csv. The path is
# left unchanged because other code may already open this file by name.
# The former f-string prefix was dropped: the literal has no placeholders.
df_clean.to_pickle("center.sqlite_clean.csv")