# Set up

## libraries

In [22]:
# basic operations
import os
import logging
import re
from pprint import pprint as pp
# data analysis/management/manipulation
import numpy as np
import pandas as pd
# nlp pipeline
import spacy
import en_core_web_lg
import stanza 
# building corpus/dictionary
import gensim
from gensim import corpora
from gensim.models import Phrases
from gensim.corpora import Dictionary

## package version

In [21]:
# Report the versions of the two core NLP libraries used in this notebook.
version_report = """
spaCy version: {}
Gensim version: {}
"""
print(version_report.format(spacy.__version__, gensim.__version__))


spaCy version: 2.0.12
Gensim version: 3.8.0



## working directory

In [50]:
# Base directory for all file access: the kernel's current working directory.
PATH = os.getcwd()
# Sub-folder of PATH that holds the input CSV files read in the next section.
FOLDER = 'brexit'

# Read data

In [3]:
# Names of the three input CSV files, all located in PATH/FOLDER.
FILE1, FILE2, FILE3 = 'pr__sus_attr.csv', 'pr__sus_docs.csv', 'pr__sources.csv'

# Load them in order: attributes, documents, sources.
pr_attr, pr_docs, pr_sources = (
    pd.read_csv(os.path.join(PATH, FOLDER, csv_name))
    for csv_name in (FILE1, FILE2, FILE3)
)

In [4]:
# inspect the column names of the attributes table
pr_attr.columns

Index(['article', 'source', 'variable', 'value'], dtype='object')

In [17]:
# inspect the column names of the documents table
pr_docs.columns

Index(['source', 'text', 'article', 'start', 'id', 'sort'], dtype='object')

## clean the data

In [6]:
# Keep only the rows describing the 'date' attribute; after dropping the
# helper columns, each row is (article, value) with `value` holding the date.
pr_date = pr_attr[pr_attr.variable == 'date'].drop(['variable', 'source'], axis=1)

# the expected timespan is April 23, 2016 - August 23, 2016 (both included)
# NOTE(review): the bounds are compared as strings, which is only correct if
# `value` is stored as zero-padded 'YYYY-MM-DD HH:MM:SS' text - TODO confirm.
in_timespan = (pr_date.value >= '2016-04-23 00:00:00') & (pr_date.value < '2016-08-24 00:00:00')
pr_timespan = pr_date[in_timespan]

# get list of chosen article id
articles = pr_timespan.article.to_list()

# join pr_docs and pr_timespan on the single shared key (the original listed
# 'article' twice), rename the date column and sort chronologically without
# mutating a frame in place
pr = (
    pd.merge(pr_timespan, pr_docs, on='article')
    .rename(columns={'value': 'date'})
    .sort_values('date')
)
pr

Unnamed: 0,article,date,source,text,start,id,sort
445,647,2016-04-23 00:00:00,3,HIGHLIGHT: Ed Crooks is fascinated by a biogra...,HIGHLIGHT: Ed Crooks is fascinated by a biogra...,180,1
444,646,2016-04-23 00:00:00,3,HIGHLIGHT: \'If China could be persuaded to co...,HIGHLIGHT: \'If China could be persuaded to co...,225,1
443,645,2016-04-23 00:00:00,3,"After months of wrangling, eurozone finance mi...","After months of wrangling, eurozone finance mi...",20,1
442,644,2016-04-23 00:00:00,3,How can the world best combat global warming? ...,How can the world best combat global warming? ...,233,1
446,648,2016-04-26 00:00:00,3,"Zaoui & Co, the tiny European advisory firm se...","Zaoui & Co, the tiny European advisory firm se...",599,1
...,...,...,...,...,...,...,...
562,765,2016-08-17 00:00:00,3,Ministers have given the go-ahead for the worl...,Ministers have given the go-ahead for the worl...,322,1
441,619,2016-08-18 00:00:00,3,Sustainable palm oil buyers are still struggli...,Sustainable palm oil buyers are still struggli...,436,1
563,766,2016-08-18 00:00:00,3,Timing is everything when it comes to investin...,Timing is everything when it comes to investin...,545,1
564,767,2016-08-19 00:00:00,3,When the furnaces were turned off at South Aus...,When the furnaces were turned off at South Aus...,581,1


# NLP Pipeline

In [7]:
# Prepare the list of texts to pass through spaCy: trim surrounding
# whitespace, lower-case, then turn intra-word hyphens into underscores.
docs = [
    re.sub(r'\b-\b', '_', article.strip().lower())
    for article in pr.text
]

## simple 'web_lg'

In [44]:
# load spaCy model 'web_lg' from the installed `en_core_web_lg` package;
# the stop-word and tokenisation cells below use it through the name `nlp`
nlp = en_core_web_lg.load()

In [45]:
# Extra stopwords on top of spaCy's defaults: a stray control character,
# outlet/source abbreviations, reporting verbs, honorifics and 'inc'.
my_stopwords = [
    '\x1c',
    'ft', 'wsj', 'time', 'sec',
    'say', 'says', 'said',
    'mr.', 'mister', 'mr', 'miss', 'ms',
    'inc',
]

# Flag each of them as a stopword in the loaded model's vocabulary.
for stopword in my_stopwords:
    lexeme = nlp.vocab[stopword]
    lexeme.is_stop = True

In [46]:
# tokenize text: one list of lemmas per document
docs_tokens = []

for doc in docs:
    # drop stop words, whitespace, punctuation, number-/URL-/e-mail-like
    # tokens, currency symbols and out-of-vocabulary tokens
    docs_tokens.append([
        token.lemma_
        for token in nlp(doc)
        if not (token.is_stop
                or token.is_space
                or token.is_punct
                or token.like_num
                or token.like_url
                or token.like_email
                or token.is_currency
                or token.is_oov)
    ])

# scratch name is left bound to an empty list
tmp_tokens = []

In [47]:
# Train the phrase model on the tokenised documents: Phrases expects an
# iterable of token lists, whereas `docs` holds raw strings and would be
# scanned character by character.
phrases = Phrases(docs_tokens, min_count=30, progress_per=10000)

In [48]:
# Connector words that may appear inside a phrase without breaking it
# (passed through gensim's `common_terms` option).
common_terms = ['of', 'with', 'without', 'and', 'or', 'the', 'a',
                'not', 'be', 'to', 'this', 'who', 'in']

# find phrases as bigrams
bigram = Phrases(docs_tokens,
                 min_count=50,
                 threshold=5,
                 max_vocab_size=50000,
                 common_terms=common_terms)
# find phrases as trigrams (trained on the bigram-transformed corpus)
trigram = Phrases(bigram[docs_tokens],
                  min_count=50,
                  threshold=5,
                  max_vocab_size=50000,
                  common_terms=common_terms)
# manipulate docs: apply the bigram model, then the trigram model on top of
# it; the original applied only `bigram`, leaving `trigram` unused
docs_phrased = [trigram[bigram[line]] for line in docs_tokens]

In [53]:
# show the kernel's current working directory
os.getcwd()

'/Users/omoi/Documents/SMM694-NLP'

In [55]:
# Write the artefacts under <PATH>/corpus rather than a hard-coded,
# user-specific absolute path, and make sure that folder exists.
CORPUS_DIR = os.path.join(PATH, 'corpus')
os.makedirs(CORPUS_DIR, exist_ok=True)

# token <-> id mapping built from the phrased documents
pr_dictionary = Dictionary(docs_phrased)
pr_dictionary.save(os.path.join(CORPUS_DIR, 'pr_dictionary.dict'))

# bag-of-words representation of every document
pr_corpus = [pr_dictionary.doc2bow(doc) for doc in docs_phrased]

# persist the corpus in Matrix Market format
corpora.MmCorpus.serialize(os.path.join(CORPUS_DIR, 'pr_corpus.mm'), pr_corpus)

In [15]:
# Sanity check of the NLP pipeline: show the second document at each stage
# (prepared text, lemma tokens, phrased tokens).
report_template = '''
=============================================================================
published text: {}

=============================================================================
tokenized text: {}

=============================================================================
tri-grammed tokenized text: {}

'''
print(report_template.format(docs[1], docs_tokens[1], docs_phrased[1]))


published text: highlight: \'if china could be persuaded to construct energy_efficient new buildings, we might have an effective way of reducing global warming\'  how can the world best combat global warming? when asked that question, many people will talk about the need to close coal_fired power stations, embrace electric cars, recycle our rubbish and eschew air travel. some might also point out that it is not enough for reforms to occur only in the west - emerging economies are also crucial. but if the former us treasury secretary hank paulson is correct, there is another essential step that almost nobody is talking about: putting money into chinese housing. yes, you read that right. these days, the phrase "chinese real estate" is usually mentioned by economists who are worried about bubbles. but paulson is fascinated by another concern. recent environmental studies suggest that about "40 per cent of carbon emissions currently come from buildings", or so he told me over lunch earlie

## stanza

In [61]:
# get a list to pass through the Stanza pipeline
def cleaning(_string):
    '''
    Normalise a raw text value before it is passed to the Stanza pipeline.

    : argument _string : value to clean; non-string values are converted
                         with str()
    : return           : lower-cased text in which every run of characters
                         other than ASCII letters and apostrophes is replaced
                         by a single space; '' for a missing value (None/NaN)
    '''
    # a missing cell would otherwise be stringified into the literal word
    # 'none' / 'nan' and end up in the corpus as a token
    if _string is None or (isinstance(_string, float) and _string != _string):
        return ''
    # purge non alpha characters (apostrophes are kept on purpose)
    alpha = re.sub("[^A-Za-z']+", ' ', str(_string))
    return alpha.lower()


# Cleaned copy of every article text; note this rebinds `docs`.
docs = list(map(cleaning, pr.text))

In [57]:
# One-off download of the English models; uncomment on first use.
# stanza.download('en')

# Build the English neural pipeline; note this rebinds `nlp`.
nlp = stanza.Pipeline(lang='en')

2020-07-07 15:24:33 INFO: Loading these models for language: en (English):
| Processor | Package   |
-------------------------
| tokenize  | ewt       |
| pos       | ewt       |
| lemma     | ewt       |
| depparse  | ewt       |
| ner       | ontonotes |

2020-07-07 15:24:33 INFO: Use device: cpu
2020-07-07 15:24:33 INFO: Loading: tokenize
2020-07-07 15:24:33 INFO: Loading: pos
2020-07-07 15:24:33 INFO: Loading: lemma
2020-07-07 15:24:34 INFO: Loading: depparse
2020-07-07 15:24:34 INFO: Loading: ner
2020-07-07 15:24:35 INFO: Done loading processors!


In [70]:
# Disabled experiment, kept as a bare string literal so the cell executes
# nothing (the cell output is just this string's repr). The code inside runs
# every document through `nlp` sentence by sentence, first printing the tokens
# and then collecting them; per the note at the end of the string it was too
# slow to run without a GPU.
'''
for doc in docs:
    for i, sentence in enumerate(nlp(doc).sentences):
        print(f'___________ Sentence {i+1} tokens ____________')
        print(*[f'id: {token.id}\ttext: {token.text}' 
                for token in sentence.tokens], sep='\n')


tokens = []

for i, doc in enumerate(docs):
    for j, sentence in enumerate(nlp(doc).sentences):
        tmp_tokens = [t.text for t in sentence.tokens]
        tokens.append([i, [j, tmp_tokens]])



TOO MUCH TIME TO RUN WITHOUT GPU
'''

"\nfor doc in docs:\n    for i, sentence in enumerate(nlp(doc).sentences):\n        print(f'___________ Sentence {i+1} tokens ____________')\n        print(*[f'id: {token.id}\ttext: {token.text}' \n                for token in sentence.tokens], sep='\n')\n\n\ntokens = []\n\nfor i, doc in enumerate(docs):\n    for j, sentence in enumerate(nlp(doc).sentences):\n        tmp_tokens = [t.text for t in sentence.tokens]\n        tokens.append([i, [j, tmp_tokens]])\n\n\n\nTOO MUCH TIME TO RUN WITHOUT GPU\n"

## spacy_stanza

In [40]:
from spacy_stanza import StanzaLanguage

In [67]:
# Stanza pipeline restricted to the tokenize, pos and lemma processors.
snlp = stanza.Pipeline(
    lang="en",
    processors='tokenize,pos,lemma',
)
# Wrap the Stanza pipeline in spacy_stanza's StanzaLanguage; rebinds `nlp`.
nlp = StanzaLanguage(snlp)

2020-07-07 15:39:54 INFO: Loading these models for language: en (English):
| Processor | Package |
-----------------------
| tokenize  | ewt     |
| pos       | ewt     |
| lemma     | ewt     |

2020-07-07 15:39:54 INFO: Use device: cpu
2020-07-07 15:39:54 INFO: Loading: tokenize
2020-07-07 15:39:54 INFO: Loading: pos
2020-07-07 15:39:54 INFO: Loading: lemma
2020-07-07 15:39:54 INFO: Done loading processors!


In [72]:
# Disabled experiment, kept as a bare string literal so the cell executes
# nothing. The code inside feeds each document to `nlp.pipe` one at a time and
# collects the text of every token.
# NOTE(review): the KeyboardInterrupt recorded under this cell presumably dates
# from a run made before the code was turned into a string - confirm.
'''
for line in docs:
    doc = nlp.pipe([line])
    token_details = []

    for sents in doc:
        for tok in sents:
            token_details.append([tok.text])
            
'''

KeyboardInterrupt: 