# Parse Tweets using spaCy

Reference: https://www.kaggle.com/pierremegret/gensim-word2vec-tutorial

In [1]:
!python -m spacy download en_core_web_sm
!python -m spacy validate

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[2K[38;5;2m✔ Loaded compatibility table[0m
[1m
[38;5;4mℹ spaCy installation:
/home/ubuntu/anaconda3/lib/python3.7/site-packages/spacy[0m

TYPE      NAME             MODEL            VERSION                            
package   en-core-web-sm   en_core_web_sm   [38;5;2m2.1.0[0m   [38;5;2m✔[0m
link      en               en_core_web_sm   [38;5;2m2.1.0[0m   [38;5;2m✔[0m



In [2]:
import pandas as pd
import spacy

# Read Dataset

In [3]:
def read_dataset(filename):
    """
    Load the tweet CSV and return its 'text' column.

    Malformed rows (e.g. rows with too many fields) are skipped with a
    warning instead of aborting the whole read.

    Args:
        filename (str): Path to the CSV file; it must contain a 'text' column.

    Returns:
        The 'text' column of the parsed CSV.

    Raises:
        KeyError: If the CSV has no 'text' column.
    """
    try:
        # pandas >= 1.3: `on_bad_lines` replaces `error_bad_lines`, which was
        # removed in pandas 2.0. 'warn' matches the old
        # `error_bad_lines=False` behaviour (skip the row, emit a warning).
        dataset_df = pd.read_csv(filename, on_bad_lines='warn')
    except TypeError:
        # pandas < 1.3 does not know `on_bad_lines`; use the legacy keyword.
        dataset_df = pd.read_csv(filename, error_bad_lines=False)

    return dataset_df['text']


In [4]:
# Path to the tweet CSV, relative to the notebook's working directory.
filename_dataset = './datasets/twitter_trump_2019_0101-2019_0531.csv'
# read_dataset returns only the 'text' column of the CSV.
raw_docs = read_dataset(filename_dataset)

In [5]:
len(raw_docs)

2125

# Preprocessor

In [16]:
def preprocess(doc):
    """
    Reduce a parsed document to the lemmas of its content tokens.

    Tokens tagged as punctuation or numbers, and stop words, are dropped;
    every remaining token contributes its lemma.

    Args:
        doc (spacy.tokens.doc.Doc): Parsed document to filter.

    Returns:
        list: Lemma strings of the kept tokens, in document order.
    """
    excluded_pos = ('PUNCT', 'NUM')

    lemmas = []
    for tok in doc:
        # Skip punctuation and numeric tokens.
        if tok.pos_ in excluded_pos:
            continue
        # Skip stop words.
        if tok.is_stop:
            continue
        lemmas.append(tok.lemma_)

    return lemmas

#### Define Normalizer

In [33]:
def brief_cleaning(raw_docs):
    """
    Lazily normalize raw documents by lower-casing each one.

    Args:
        raw_docs: Iterable of strings.

    Yields:
        str: The lower-cased form of each document, in input order.
    """
    lowered = (text.lower() for text in raw_docs)
    yield from lowered
        
# brief_cleaning_lambda = lambda raw_docs: (row.lower() for row in raw_docs)

print(type(brief_cleaning))

<class 'function'>


# Start Preprocess

In [14]:
# Load the small English model with the NER (Named Entity Recognizer) and
# parser (Dependency Parser) pipeline components disabled.
nlp = spacy.load('en_core_web_sm', disable=['ner', 'parser'])

In [15]:
nlp.pipeline

[('tagger', <spacy.pipeline.pipes.Tagger at 0x7f17386379b0>)]

#### Parse in batches using the `nlp.pipe()` API

In [34]:
%time texts = [preprocess(doc) for doc in nlp.pipe(brief_cleaning(raw_docs), batch_size=100, n_threads=-1)]

CPU times: user 1.76 s, sys: 71.9 ms, total: 1.83 s
Wall time: 1.83 s


In [35]:
print(texts[0])

['robert', 'mueller', 'come', 'oval', 'office', 'potential', 'candidate', 'seek', 'name', 'director', 'fbi', 'position', 'year', 'tell', 'day', 'name', 'special', 'counsel', 'total', 'conflict', 'interest', 'nice']
