# Parse Tweets
This notebook uses spaCy, instead of NLTK, to tokenize and lemmatize the documents.

Reference: https://www.kaggle.com/pierremegret/gensim-word2vec-tutorial

In [1]:
!python -m spacy download en_core_web_sm
!python -m spacy validate

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[2K[38;5;2m✔ Loaded compatibility table[0m
[1m
[38;5;4mℹ spaCy installation:
/home/ubuntu/anaconda3/lib/python3.7/site-packages/spacy[0m

TYPE      NAME             MODEL            VERSION                            
package   en-core-web-sm   en_core_web_sm   [38;5;2m2.1.0[0m   [38;5;2m✔[0m
link      en               en_core_web_sm   [38;5;2m2.1.0[0m   [38;5;2m✔[0m



In [2]:
import pandas as pd
import spacy

# Read Dataset

In [3]:
def read_dataset(filename):
    """Load the tweet texts from a CSV file.

    Malformed rows (e.g. rows with too many fields) are skipped with a
    warning instead of aborting the whole read.

    Args:
        filename (str): Path to a CSV file that contains a ``text`` column.

    Returns:
        pandas.Series: The ``text`` column of the parsed file.

    Raises:
        KeyError: If the parsed file has no ``text`` column.
    """
    try:
        # `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0;
        # on_bad_lines='warn' is the equivalent of error_bad_lines=False
        # (skip the bad row, emit a warning).
        dataset_df = pd.read_csv(filename, on_bad_lines='warn')
    except TypeError:
        # pandas < 1.3 does not know `on_bad_lines`; use the legacy flag.
        dataset_df = pd.read_csv(filename, error_bad_lines=False)

    return dataset_df['text']

In [4]:
# Path is relative to the directory the notebook is run from.
filename_dataset = './datasets/twitter_trump_2019_0101-2019_0531.csv'
# raw_docs holds the 'text' column returned by read_dataset.
raw_docs = read_dataset(filename_dataset)

In [5]:
len(raw_docs)

2125

In [6]:
raw_docs[0]

'Robert Mueller came to the Oval Office (along with other potential candidates) seeking to be named the Director of the FBI. He had already been in that position for 12 years I told him NO. The next day he was named Special Counsel - A total Conflict of Interest. NICE!'

In [7]:
raw_docs[1]

'“Comey and Brennan are turning on each other.”  @kilmeade'

# Test spaCy

### Tagger (Fine-Grained POS) + Parser + Named Entity Recognizer

In [8]:
nlp = spacy.load('en_core_web_sm')

In [9]:
nlp.pipeline

[('tagger', <spacy.pipeline.pipes.Tagger at 0x7f6cc0e60b38>),
 ('parser', <spacy.pipeline.pipes.DependencyParser at 0x7f6cc0d46048>),
 ('ner', <spacy.pipeline.pipes.EntityRecognizer at 0x7f6cc0d460a8>)]

In [10]:
index_doc = 0

doc = nlp(raw_docs[index_doc])

In [11]:
print([t.text for t in doc])

['Robert', 'Mueller', 'came', 'to', 'the', 'Oval', 'Office', '(', 'along', 'with', 'other', 'potential', 'candidates', ')', 'seeking', 'to', 'be', 'named', 'the', 'Director', 'of', 'the', 'FBI', '.', 'He', 'had', 'already', 'been', 'in', 'that', 'position', 'for', '12', 'years', 'I', 'told', 'him', 'NO', '.', 'The', 'next', 'day', 'he', 'was', 'named', 'Special', 'Counsel', '-', 'A', 'total', 'Conflict', 'of', 'Interest', '.', 'NICE', '!']


In [12]:
# show named entity
[(ent.text, ent.label_) for ent in doc.ents]

[('Robert Mueller', 'PERSON'),
 ('the Oval Office', 'ORG'),
 ('FBI', 'ORG'),
 ('12 years', 'DATE'),
 ('The next day', 'DATE'),
 ('Special Counsel - A', 'ORG'),
 ('Conflict of Interest', 'ORG')]

### Disable NER (Named Entity Recognizer)

In [13]:
nlp = spacy.load('en_core_web_sm', disable=['ner'])
nlp.pipeline

[('tagger', <spacy.pipeline.pipes.Tagger at 0x7f6cc37692b0>),
 ('parser', <spacy.pipeline.pipes.DependencyParser at 0x7f6cc0a31e88>)]

In [14]:
index_doc = 0

doc = nlp(raw_docs[index_doc])

In [15]:
[(ent.text, ent.label_) for ent in doc.ents]

[]

In [16]:
print([t.text for t in doc])

['Robert', 'Mueller', 'came', 'to', 'the', 'Oval', 'Office', '(', 'along', 'with', 'other', 'potential', 'candidates', ')', 'seeking', 'to', 'be', 'named', 'the', 'Director', 'of', 'the', 'FBI', '.', 'He', 'had', 'already', 'been', 'in', 'that', 'position', 'for', '12', 'years', 'I', 'told', 'him', 'NO', '.', 'The', 'next', 'day', 'he', 'was', 'named', 'Special', 'Counsel', '-', 'A', 'total', 'Conflict', 'of', 'Interest', '.', 'NICE', '!']


# Test Disabling Some Pipeline Components

### Disable Parser

In [17]:
nlp = spacy.load('en_core_web_sm', disable=['parser'])
nlp.pipeline

[('tagger', <spacy.pipeline.pipes.Tagger at 0x7f6cc0bfb940>),
 ('ner', <spacy.pipeline.pipes.EntityRecognizer at 0x7f6cc083b288>)]

In [18]:
index_doc = 0

doc = nlp(raw_docs[index_doc])

In [19]:
[(ent.text, ent.label_) for ent in doc.ents]

[('Robert Mueller', 'PERSON'),
 ('the Oval Office', 'ORG'),
 ('FBI', 'ORG'),
 ('12 years', 'DATE'),
 ('The next day', 'DATE'),
 ('Special Counsel - A', 'ORG'),
 ('Conflict of Interest', 'ORG')]

In [20]:
print([t.text for t in doc])

['Robert', 'Mueller', 'came', 'to', 'the', 'Oval', 'Office', '(', 'along', 'with', 'other', 'potential', 'candidates', ')', 'seeking', 'to', 'be', 'named', 'the', 'Director', 'of', 'the', 'FBI', '.', 'He', 'had', 'already', 'been', 'in', 'that', 'position', 'for', '12', 'years', 'I', 'told', 'him', 'NO', '.', 'The', 'next', 'day', 'he', 'was', 'named', 'Special', 'Counsel', '-', 'A', 'total', 'Conflict', 'of', 'Interest', '.', 'NICE', '!']


### Disable ALL

In [21]:
nlp = spacy.load('en_core_web_sm', disable=['tagger', 'ner', 'parser'])
nlp.pipeline

[]

In [22]:
index_doc = 0

doc = nlp(raw_docs[index_doc])

In [23]:
[(ent.text, ent.label_) for ent in doc.ents]

[]

In [24]:
print([t.text for t in doc])

['Robert', 'Mueller', 'came', 'to', 'the', 'Oval', 'Office', '(', 'along', 'with', 'other', 'potential', 'candidates', ')', 'seeking', 'to', 'be', 'named', 'the', 'Director', 'of', 'the', 'FBI', '.', 'He', 'had', 'already', 'been', 'in', 'that', 'position', 'for', '12', 'years', 'I', 'told', 'him', 'NO', '.', 'The', 'next', 'day', 'he', 'was', 'named', 'Special', 'Counsel', '-', 'A', 'total', 'Conflict', 'of', 'Interest', '.', 'NICE', '!']


# Implement Preprocessor

In [25]:
# Preprocessing pipeline: NER and the dependency parser are disabled, leaving only the tagger.
nlp = spacy.load('en_core_web_sm', disable=['ner', 'parser'])

#### Lemmatize

In [26]:
index_doc = 0

doc = nlp(raw_docs[index_doc])

# Lemmatize
text = [token.lemma_ for token in doc]

In [29]:
print(raw_docs[index_doc])

Robert Mueller came to the Oval Office (along with other potential candidates) seeking to be named the Director of the FBI. He had already been in that position for 12 years I told him NO. The next day he was named Special Counsel - A total Conflict of Interest. NICE!


In [28]:
print(text)

['Robert', 'Mueller', 'come', 'to', 'the', 'Oval', 'Office', '(', 'along', 'with', 'other', 'potential', 'candidate', ')', 'seek', 'to', 'be', 'name', 'the', 'Director', 'of', 'the', 'FBI', '.', '-PRON-', 'have', 'already', 'be', 'in', 'that', 'position', 'for', '12', 'year', '-PRON-', 'tell', '-PRON-', 'no', '.', 'the', 'next', 'day', '-PRON-', 'be', 'name', 'Special', 'Counsel', '-', 'A', 'total', 'conflict', 'of', 'Interest', '.', 'nice', '!']


#### Remove Stopwords

In [None]:
text = [token.lemma_ for token in doc if not token.is_stop]

In [31]:
print(text)

['Robert', 'Mueller', 'come', 'Oval', 'Office', '(', 'potential', 'candidate', ')', 'seek', 'name', 'Director', 'FBI', '.', 'position', '12', 'year', 'tell', '.', 'day', 'name', 'Special', 'Counsel', '-', 'total', 'conflict', 'Interest', '.', 'nice', '!']


# Define Function to Lemmatize and Remove Stopwords

In [35]:
def cleaning(doc_spacy):
    """Lemmatize a parsed document and drop its stop words.

    Args:
        doc_spacy (spacy.tokens.doc.Doc): Document whose tokens are iterated.

    Returns:
        list: ``token.lemma_`` of every token whose ``is_stop`` flag is
            false, in document order.
    """
    lemmas = []
    for tok in doc_spacy:
        if tok.is_stop:
            continue
        lemmas.append(tok.lemma_)
    return lemmas

### Performance with Pipe
Take advantage of spaCy's `.pipe()` method to speed up the cleaning process:

In [92]:
# brief_cleaning = (re.sub("[^A-Za-z']+", ' ', str(row)).lower() for row in raw_docs)

brief_cleaning = (row.lower() for row in raw_docs)

type(brief_cleaning)

# %time texts = [cleaning(doc) for doc in nlp.pipe(raw_docs)]
%time texts = [cleaning(doc) for doc in nlp.pipe(brief_cleaning, batch_size=100, n_threads=-1)]

CPU times: user 2.01 s, sys: 0 ns, total: 2.01 s
Wall time: 2.01 s


In [91]:
print(texts[0])

['robert', 'mueller', 'come', 'oval', 'office', '(', 'potential', 'candidate', ')', 'seek', 'name', 'director', 'fbi', '.', 'position', '12', 'year', 'tell', '.', 'day', 'name', 'special', 'counsel', '-', 'total', 'conflict', 'interest', '.', 'nice', '!']


### Performance without Pipe

In [62]:
def parse_forloop():
    """Clean every tweet with one ``nlp()`` call per document (no ``nlp.pipe``).

    Reads the notebook-level ``raw_docs``, ``nlp`` and ``cleaning``.

    Returns:
        list: One entry per document, each being the result of ``cleaning``
            applied to the lower-cased, parsed tweet.
    """
    return [cleaning(nlp(tweet.lower())) for tweet in raw_docs]

%time texts = parse_forloop()

CPU times: user 5.61 s, sys: 0 ns, total: 5.61 s
Wall time: 5.61 s


In [63]:
print(texts[0])

['robert', 'mueller', 'come', 'oval', 'office', '(', 'potential', 'candidate', ')', 'seek', 'name', 'director', 'fbi', '.', 'position', '12', 'year', 'tell', '.', 'day', 'name', 'special', 'counsel', '-', 'total', 'conflict', 'interest', '.', 'nice', '!']
