# Parse Tweets
This notebook uses spaCy, rather than NLTK, to tokenize and lemmatize the documents.

Reference: https://www.kaggle.com/pierremegret/gensim-word2vec-tutorial

In [1]:
!python -m spacy download en_core_web_sm
!python -m spacy validate

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[2K[38;5;2m✔ Loaded compatibility table[0m
[1m
[38;5;4mℹ spaCy installation:
/home/ec2-user/anaconda3/lib/python3.7/site-packages/spacy[0m

TYPE      NAME             MODEL            VERSION                            
package   en-core-web-sm   en_core_web_sm   [38;5;2m2.1.0[0m   [38;5;2m✔[0m



In [2]:
import pandas as pd
import spacy

# Read Dataset

In [3]:
def read_dataset(filename):
    """Load the tweet texts from a CSV file.

    Malformed rows (e.g. rows with too many fields) are skipped with a
    warning instead of aborting the whole load.

    Args:
        filename (str): Path to a CSV file that contains a 'text' column.

    Returns:
        pandas.Series: The 'text' column of the parsed file.

    Raises:
        KeyError: If the file has no 'text' column.
    """
    try:
        # Spelling for pandas >= 1.3. 'warn' matches the old
        # error_bad_lines=False behaviour (skip the row, emit a warning).
        dataset_df = pd.read_csv(filename, on_bad_lines='warn')
    except TypeError:
        # pandas < 1.3 only understands the legacy keyword.
        dataset_df = pd.read_csv(filename, error_bad_lines=False)

    return dataset_df['text']

In [4]:
filename_dataset = './datasets/twitter_trump_2019_0101-2019_0531.csv'
raw_docs = read_dataset(filename_dataset)

In [5]:
len(raw_docs)

2125

In [6]:
raw_docs[0]

'Robert Mueller came to the Oval Office (along with other potential candidates) seeking to be named the Director of the FBI. He had already been in that position for 12 years I told him NO. The next day he was named Special Counsel - A total Conflict of Interest. NICE!'

In [7]:
raw_docs[1]

'“Comey and Brennan are turning on each other.”  @kilmeade'

# Test Spacy

### Tagger (Fine-Grained POS) + Parser + Named Entity Recognizer

In [8]:
nlp = spacy.load('en_core_web_sm')

In [9]:
nlp.pipeline

[('tagger', <spacy.pipeline.pipes.Tagger at 0x7f7039a437b8>),
 ('parser', <spacy.pipeline.pipes.DependencyParser at 0x7f70375c4fa8>),
 ('ner', <spacy.pipeline.pipes.EntityRecognizer at 0x7f70375e1048>)]

In [10]:
index_doc = 0

doc = nlp(raw_docs[index_doc])

In [11]:
print([t.text for t in doc])

['Robert', 'Mueller', 'came', 'to', 'the', 'Oval', 'Office', '(', 'along', 'with', 'other', 'potential', 'candidates', ')', 'seeking', 'to', 'be', 'named', 'the', 'Director', 'of', 'the', 'FBI', '.', 'He', 'had', 'already', 'been', 'in', 'that', 'position', 'for', '12', 'years', 'I', 'told', 'him', 'NO', '.', 'The', 'next', 'day', 'he', 'was', 'named', 'Special', 'Counsel', '-', 'A', 'total', 'Conflict', 'of', 'Interest', '.', 'NICE', '!']


In [12]:
# show named entity
[(ent.text, ent.label_) for ent in doc.ents]

[('Robert Mueller', 'PERSON'),
 ('the Oval Office', 'ORG'),
 ('FBI', 'ORG'),
 ('12 years', 'DATE'),
 ('The next day', 'DATE'),
 ('Special Counsel - A', 'ORG'),
 ('Conflict of Interest', 'ORG')]

### Disable NER (Named Entity Recognizer)

In [13]:
nlp = spacy.load('en_core_web_sm', disable=['ner'])
nlp.pipeline

[('tagger', <spacy.pipeline.pipes.Tagger at 0x7f7037374860>),
 ('parser', <spacy.pipeline.pipes.DependencyParser at 0x7f70372cce28>)]

In [14]:
index_doc = 0

doc = nlp(raw_docs[index_doc])

In [15]:
[(ent.text, ent.label_) for ent in doc.ents]

[]

In [16]:
print([t.text for t in doc])

['Robert', 'Mueller', 'came', 'to', 'the', 'Oval', 'Office', '(', 'along', 'with', 'other', 'potential', 'candidates', ')', 'seeking', 'to', 'be', 'named', 'the', 'Director', 'of', 'the', 'FBI', '.', 'He', 'had', 'already', 'been', 'in', 'that', 'position', 'for', '12', 'years', 'I', 'told', 'him', 'NO', '.', 'The', 'next', 'day', 'he', 'was', 'named', 'Special', 'Counsel', '-', 'A', 'total', 'Conflict', 'of', 'Interest', '.', 'NICE', '!']


# Test Disabling Pipeline Components

### Disable Tagger (POS)

In [52]:
nlp = spacy.load('en_core_web_sm', disable=['tagger'])
nlp.pipeline

[('parser', <spacy.pipeline.pipes.DependencyParser at 0x7f7035a712e8>),
 ('ner', <spacy.pipeline.pipes.EntityRecognizer at 0x7f7035a71348>)]

In [53]:
index_doc = 0
doc = nlp(raw_docs[index_doc])

In [54]:
print([t.lemma_ for t in doc])

['Robert', 'Mueller', 'come', 'to', 'the', 'Oval', 'Office', '(', 'along', 'with', 'other', 'potential', 'candidate', ')', 'seek', 'to', 'be', 'name', 'the', 'Director', 'of', 'the', 'FBI', '.', 'He', 'have', 'already', 'be', 'in', 'that', 'position', 'for', '12', 'year', 'I', 'tell', 'him', 'NO', '.', 'The', 'next', 'day', 'he', 'be', 'name', 'Special', 'Counsel', '-', 'A', 'total', 'Conflict', 'of', 'Interest', '.', 'NICE', '!']


In [55]:
[(ent.text, ent.label_) for ent in doc.ents]

[('Robert Mueller', 'PERSON'),
 ('the Oval Office', 'ORG'),
 ('FBI', 'ORG'),
 ('12 years', 'DATE'),
 ('The next day', 'DATE'),
 ('Special Counsel - A', 'ORG'),
 ('Conflict of Interest', 'ORG')]

In [56]:
print([(token.text, token.pos_) for token in doc])

[('Robert', ''), ('Mueller', ''), ('came', ''), ('to', ''), ('the', ''), ('Oval', ''), ('Office', ''), ('(', ''), ('along', ''), ('with', ''), ('other', ''), ('potential', ''), ('candidates', ''), (')', ''), ('seeking', ''), ('to', ''), ('be', ''), ('named', ''), ('the', ''), ('Director', ''), ('of', ''), ('the', ''), ('FBI', ''), ('.', ''), ('He', ''), ('had', ''), ('already', ''), ('been', ''), ('in', ''), ('that', ''), ('position', ''), ('for', ''), ('12', ''), ('years', ''), ('I', ''), ('told', ''), ('him', ''), ('NO', ''), ('.', ''), ('The', ''), ('next', ''), ('day', ''), ('he', ''), ('was', ''), ('named', ''), ('Special', ''), ('Counsel', ''), ('-', ''), ('A', ''), ('total', ''), ('Conflict', ''), ('of', ''), ('Interest', ''), ('.', ''), ('NICE', ''), ('!', '')]


### Disable Parser

In [44]:
nlp = spacy.load('en_core_web_sm', disable=['parser'])
nlp.pipeline

[('tagger', <spacy.pipeline.pipes.Tagger at 0x7f7035927a90>),
 ('ner', <spacy.pipeline.pipes.EntityRecognizer at 0x7f7035514108>)]

In [18]:
type(nlp)

spacy.lang.en.English

In [19]:
index_doc = 0

doc = nlp(raw_docs[index_doc])

In [20]:
type(doc)

spacy.tokens.doc.Doc

In [45]:
print([t.lemma_ for t in doc])

['Robert', 'Mueller', 'come', 'to', 'the', 'Oval', 'Office', '(', 'along', 'with', 'other', 'potential', 'candidate', ')', 'seek', 'to', 'be', 'name', 'the', 'Director', 'of', 'the', 'FBI', '.', '-PRON-', 'have', 'already', 'be', 'in', 'that', 'position', 'for', '12', 'year', '-PRON-', 'tell', '-PRON-', 'no', '.', 'the', 'next', 'day', '-PRON-', 'be', 'name', 'Special', 'Counsel', '-', 'A', 'total', 'conflict', 'of', 'Interest', '.', 'nice', '!']


In [21]:
[(ent.text, ent.label_) for ent in doc.ents]

[('Robert Mueller', 'PERSON'),
 ('the Oval Office', 'ORG'),
 ('FBI', 'ORG'),
 ('12 years', 'DATE'),
 ('The next day', 'DATE'),
 ('Special Counsel - A', 'ORG'),
 ('Conflict of Interest', 'ORG')]

In [22]:
print([(token.text, token.pos_) for token in doc])

[('Robert', 'PROPN'), ('Mueller', 'PROPN'), ('came', 'VERB'), ('to', 'ADP'), ('the', 'DET'), ('Oval', 'PROPN'), ('Office', 'PROPN'), ('(', 'PUNCT'), ('along', 'ADP'), ('with', 'ADP'), ('other', 'ADJ'), ('potential', 'ADJ'), ('candidates', 'NOUN'), (')', 'PUNCT'), ('seeking', 'VERB'), ('to', 'PART'), ('be', 'VERB'), ('named', 'VERB'), ('the', 'DET'), ('Director', 'PROPN'), ('of', 'ADP'), ('the', 'DET'), ('FBI', 'PROPN'), ('.', 'PUNCT'), ('He', 'PRON'), ('had', 'VERB'), ('already', 'ADV'), ('been', 'VERB'), ('in', 'ADP'), ('that', 'DET'), ('position', 'NOUN'), ('for', 'ADP'), ('12', 'NUM'), ('years', 'NOUN'), ('I', 'PRON'), ('told', 'VERB'), ('him', 'PRON'), ('NO', 'INTJ'), ('.', 'PUNCT'), ('The', 'DET'), ('next', 'ADJ'), ('day', 'NOUN'), ('he', 'PRON'), ('was', 'VERB'), ('named', 'VERB'), ('Special', 'PROPN'), ('Counsel', 'PROPN'), ('-', 'PUNCT'), ('A', 'PROPN'), ('total', 'ADJ'), ('Conflict', 'NOUN'), ('of', 'ADP'), ('Interest', 'PROPN'), ('.', 'PUNCT'), ('NICE', 'INTJ'), ('!', 'PUNCT'

In [23]:
print([t.text for t in doc])

['Robert', 'Mueller', 'came', 'to', 'the', 'Oval', 'Office', '(', 'along', 'with', 'other', 'potential', 'candidates', ')', 'seeking', 'to', 'be', 'named', 'the', 'Director', 'of', 'the', 'FBI', '.', 'He', 'had', 'already', 'been', 'in', 'that', 'position', 'for', '12', 'years', 'I', 'told', 'him', 'NO', '.', 'The', 'next', 'day', 'he', 'was', 'named', 'Special', 'Counsel', '-', 'A', 'total', 'Conflict', 'of', 'Interest', '.', 'NICE', '!']


### Disable NER (Named Entity Recognizer)

In [57]:
nlp = spacy.load('en_core_web_sm', disable=['ner'])
nlp.pipeline

[('tagger', <spacy.pipeline.pipes.Tagger at 0x7f7035918160>),
 ('parser', <spacy.pipeline.pipes.DependencyParser at 0x7f703594d6a8>)]

In [58]:
index_doc = 0

doc = nlp(raw_docs[index_doc])

In [59]:
print([t.lemma_ for t in doc])

['Robert', 'Mueller', 'come', 'to', 'the', 'Oval', 'Office', '(', 'along', 'with', 'other', 'potential', 'candidate', ')', 'seek', 'to', 'be', 'name', 'the', 'Director', 'of', 'the', 'FBI', '.', '-PRON-', 'have', 'already', 'be', 'in', 'that', 'position', 'for', '12', 'year', '-PRON-', 'tell', '-PRON-', 'no', '.', 'the', 'next', 'day', '-PRON-', 'be', 'name', 'Special', 'Counsel', '-', 'A', 'total', 'conflict', 'of', 'Interest', '.', 'nice', '!']


In [60]:
[(ent.text, ent.label_) for ent in doc.ents]

[]

In [61]:
print([(token.text, token.pos_) for token in doc])

[('Robert', 'PROPN'), ('Mueller', 'PROPN'), ('came', 'VERB'), ('to', 'ADP'), ('the', 'DET'), ('Oval', 'PROPN'), ('Office', 'PROPN'), ('(', 'PUNCT'), ('along', 'ADP'), ('with', 'ADP'), ('other', 'ADJ'), ('potential', 'ADJ'), ('candidates', 'NOUN'), (')', 'PUNCT'), ('seeking', 'VERB'), ('to', 'PART'), ('be', 'VERB'), ('named', 'VERB'), ('the', 'DET'), ('Director', 'PROPN'), ('of', 'ADP'), ('the', 'DET'), ('FBI', 'PROPN'), ('.', 'PUNCT'), ('He', 'PRON'), ('had', 'VERB'), ('already', 'ADV'), ('been', 'VERB'), ('in', 'ADP'), ('that', 'DET'), ('position', 'NOUN'), ('for', 'ADP'), ('12', 'NUM'), ('years', 'NOUN'), ('I', 'PRON'), ('told', 'VERB'), ('him', 'PRON'), ('NO', 'INTJ'), ('.', 'PUNCT'), ('The', 'DET'), ('next', 'ADJ'), ('day', 'NOUN'), ('he', 'PRON'), ('was', 'VERB'), ('named', 'VERB'), ('Special', 'PROPN'), ('Counsel', 'PROPN'), ('-', 'PUNCT'), ('A', 'PROPN'), ('total', 'ADJ'), ('Conflict', 'NOUN'), ('of', 'ADP'), ('Interest', 'PROPN'), ('.', 'PUNCT'), ('NICE', 'INTJ'), ('!', 'PUNCT'

### Disable ALL

In [62]:
nlp = spacy.load('en_core_web_sm', disable=['tagger', 'ner', 'parser'])
nlp.pipeline

[]

In [63]:
index_doc = 0

doc = nlp(raw_docs[index_doc])

In [64]:
print([t.text for t in doc])

['Robert', 'Mueller', 'came', 'to', 'the', 'Oval', 'Office', '(', 'along', 'with', 'other', 'potential', 'candidates', ')', 'seeking', 'to', 'be', 'named', 'the', 'Director', 'of', 'the', 'FBI', '.', 'He', 'had', 'already', 'been', 'in', 'that', 'position', 'for', '12', 'years', 'I', 'told', 'him', 'NO', '.', 'The', 'next', 'day', 'he', 'was', 'named', 'Special', 'Counsel', '-', 'A', 'total', 'Conflict', 'of', 'Interest', '.', 'NICE', '!']


In [65]:
print([t.lemma_ for t in doc])

['Robert', 'Mueller', 'come', 'to', 'the', 'Oval', 'Office', '(', 'along', 'with', 'other', 'potential', 'candidate', ')', 'seek', 'to', 'be', 'name', 'the', 'Director', 'of', 'the', 'FBI', '.', 'He', 'have', 'already', 'be', 'in', 'that', 'position', 'for', '12', 'year', 'I', 'tell', 'him', 'NO', '.', 'The', 'next', 'day', 'he', 'be', 'name', 'Special', 'Counsel', '-', 'A', 'total', 'Conflict', 'of', 'Interest', '.', 'NICE', '!']


In [26]:
[(ent.text, ent.label_) for ent in doc.ents]

[]

In [27]:
print([(token.text, token.pos_) for token in doc])

[('Robert', ''), ('Mueller', ''), ('came', ''), ('to', ''), ('the', ''), ('Oval', ''), ('Office', ''), ('(', ''), ('along', ''), ('with', ''), ('other', ''), ('potential', ''), ('candidates', ''), (')', ''), ('seeking', ''), ('to', ''), ('be', ''), ('named', ''), ('the', ''), ('Director', ''), ('of', ''), ('the', ''), ('FBI', ''), ('.', ''), ('He', ''), ('had', ''), ('already', ''), ('been', ''), ('in', ''), ('that', ''), ('position', ''), ('for', ''), ('12', ''), ('years', ''), ('I', ''), ('told', ''), ('him', ''), ('NO', ''), ('.', ''), ('The', ''), ('next', ''), ('day', ''), ('he', ''), ('was', ''), ('named', ''), ('Special', ''), ('Counsel', ''), ('-', ''), ('A', ''), ('total', ''), ('Conflict', ''), ('of', ''), ('Interest', ''), ('.', ''), ('NICE', ''), ('!', '')]


In [28]:
print([t.text for t in doc])

['Robert', 'Mueller', 'came', 'to', 'the', 'Oval', 'Office', '(', 'along', 'with', 'other', 'potential', 'candidates', ')', 'seeking', 'to', 'be', 'named', 'the', 'Director', 'of', 'the', 'FBI', '.', 'He', 'had', 'already', 'been', 'in', 'that', 'position', 'for', '12', 'years', 'I', 'told', 'him', 'NO', '.', 'The', 'next', 'day', 'he', 'was', 'named', 'Special', 'Counsel', '-', 'A', 'total', 'Conflict', 'of', 'Interest', '.', 'NICE', '!']


# Implement Preprocessor

In [29]:
# Reload the model with NER and the dependency parser switched off; the
# cleaning steps below only read lemma_, pos_, is_stop and is_punct.
nlp = spacy.load('en_core_web_sm', disable=['ner', 'parser'])

#### Lemmatize

In [30]:
# Use the first tweet as the worked example for the steps below.
index_doc = 0

# Run the raw text through the loaded pipeline to get a spaCy Doc.
doc = nlp(raw_docs[index_doc])

# Lemmatize: take the lemma of every token (nothing is filtered out yet).
text = [token.lemma_ for token in doc]

In [31]:
print(raw_docs[index_doc])

Robert Mueller came to the Oval Office (along with other potential candidates) seeking to be named the Director of the FBI. He had already been in that position for 12 years I told him NO. The next day he was named Special Counsel - A total Conflict of Interest. NICE!


In [32]:
print(text)

['Robert', 'Mueller', 'come', 'to', 'the', 'Oval', 'Office', '(', 'along', 'with', 'other', 'potential', 'candidate', ')', 'seek', 'to', 'be', 'name', 'the', 'Director', 'of', 'the', 'FBI', '.', '-PRON-', 'have', 'already', 'be', 'in', 'that', 'position', 'for', '12', 'year', '-PRON-', 'tell', '-PRON-', 'no', '.', 'the', 'next', 'day', '-PRON-', 'be', 'name', 'Special', 'Counsel', '-', 'A', 'total', 'conflict', 'of', 'Interest', '.', 'nice', '!']


#### Remove Stopwords

In [33]:
# Same lemmas as before, but skip every token flagged as a stop word.
text = [token.lemma_ for token in doc if not token.is_stop]

In [34]:
print(text)

['Robert', 'Mueller', 'come', 'Oval', 'Office', '(', 'potential', 'candidate', ')', 'seek', 'name', 'Director', 'FBI', '.', 'position', '12', 'year', 'tell', '.', 'day', 'name', 'Special', 'Counsel', '-', 'total', 'conflict', 'Interest', '.', 'nice', '!']


#### Remove Punctuation

In [35]:
# Skip stop words and, via the is_punct flag, punctuation tokens as well.
text = [token.lemma_ for token in doc if not token.is_stop and not token.is_punct]

In [36]:
print(text)

['Robert', 'Mueller', 'come', 'Oval', 'Office', 'potential', 'candidate', 'seek', 'name', 'Director', 'FBI', 'position', '12', 'year', 'tell', 'day', 'name', 'Special', 'Counsel', 'total', 'conflict', 'Interest', 'nice']


In [37]:
# Alternative: filter punctuation by POS tag instead of the is_punct flag.
# Compare against the tag directly: ('PUNCT') without a trailing comma is just
# the string 'PUNCT', which would turn `in` into a substring test and silently
# drop any token whose pos_ is empty.
text = [token.lemma_ for token in doc if token.pos_ != 'PUNCT']
print(text)

['Robert', 'Mueller', 'come', 'to', 'the', 'Oval', 'Office', 'along', 'with', 'other', 'potential', 'candidate', 'seek', 'to', 'be', 'name', 'the', 'Director', 'of', 'the', 'FBI', '-PRON-', 'have', 'already', 'be', 'in', 'that', 'position', 'for', '12', 'year', '-PRON-', 'tell', '-PRON-', 'no', 'the', 'next', 'day', '-PRON-', 'be', 'name', 'Special', 'Counsel', 'A', 'total', 'conflict', 'of', 'Interest', 'nice']


# Define Function to Lemmatize and Remove Stopwords, Punctuation, and Numbers

In [38]:
def cleaning(doc_spacy):
    """Lemmatize a parsed document and drop uninformative tokens.

    Args:
        doc_spacy (spacy.tokens.doc.Doc): Parsed document; each token is
            read through its ``pos_``, ``is_stop`` and ``lemma_`` attributes.

    Returns:
        list: Lemmas, in document order, of the tokens that are not tagged
        'PUNCT' or 'NUM' and are not flagged as stop words.
    """
    lemmas = []
    for tok in doc_spacy:
        # Punctuation and numbers are rejected by POS tag first ...
        if tok.pos_ in ('PUNCT', 'NUM'):
            continue
        # ... then stop words by spaCy's own flag.
        if tok.is_stop:
            continue
        lemmas.append(tok.lemma_)

    return lemmas

In [39]:
spacy.explain('SYM')

'symbol'

### Performance with Pipe
Take advantage of spaCy's `nlp.pipe()` method, which processes the texts as a batched stream, to speed up the cleaning process:

In [40]:
# brief_cleaning = (re.sub("[^A-Za-z']+", ' ', str(row)).lower() for row in raw_docs)

brief_cleaning = (row.lower() for row in raw_docs)

type(brief_cleaning)

# %time texts = [cleaning(doc) for doc in nlp.pipe(raw_docs)]
%time texts = [cleaning(doc) for doc in nlp.pipe(brief_cleaning, batch_size=100, n_threads=-1)]

CPU times: user 2.39 s, sys: 116 ms, total: 2.51 s
Wall time: 2.51 s


In [41]:
print(texts[0])

['robert', 'mueller', 'come', 'oval', 'office', 'potential', 'candidate', 'seek', 'name', 'director', 'fbi', 'position', 'year', 'tell', 'day', 'name', 'special', 'counsel', 'total', 'conflict', 'interest', 'nice']


### Performance without Pipe

In [42]:
def parse_forloop():
    """Clean every tweet with one separate nlp() call per document.

    Reads the notebook-level ``raw_docs``, ``nlp`` and ``cleaning``; each
    document is lower-cased before it is parsed.

    Returns:
        list: One ``cleaning`` result per document, in the order of
        ``raw_docs``.
    """
    return [cleaning(nlp(tweet.lower())) for tweet in raw_docs]

%time texts = parse_forloop()

CPU times: user 5.36 s, sys: 0 ns, total: 5.36 s
Wall time: 5.36 s


In [43]:
print(texts[0])

['robert', 'mueller', 'come', 'oval', 'office', 'potential', 'candidate', 'seek', 'name', 'director', 'fbi', 'position', 'year', 'tell', 'day', 'name', 'special', 'counsel', 'total', 'conflict', 'interest', 'nice']
