<img src="images/thro.png" align="right"> 
# A2I2 - Natural Language Processing (NLP)

## <span style="color:red">Lecture - Part 1: Preprocessing</span>

## <span style="color:red">Exercise Solution</span>
---

#### Setup

In [1]:
import pickle
import string

import numpy as np
import pandas as pd
import nltk
from nltk.corpus import webtext
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [2]:
# Fetch the NLTK resources one after another. The final download stays a bare
# expression so that its return value remains the cell's displayed output.
for resource in ('webtext', 'punkt', 'stopwords'):
    nltk.download(resource)
nltk.download('wordnet')

[nltk_data] Downloading package webtext to /home/jovyan/nltk_data...
[nltk_data]   Unzipping corpora/webtext.zip.
[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

## <span style="color:blue">Exercise for Part 1</span>

In this exercise we will be looking at a corpus of tweets from Twitter:

In [3]:
from nltk.corpus import twitter_samples

In [4]:
# Fetch the corpus data behind the twitter_samples reader imported above.
nltk.download('twitter_samples')

[nltk_data] Downloading package twitter_samples to
[nltk_data]     /home/jovyan/nltk_data...
[nltk_data]   Unzipping corpora/twitter_samples.zip.


True

In [5]:
# Load the tweets of the sample file as plain text strings, then preview the
# first five of them.
tweets_raw = twitter_samples.strings('tweets.20150430-223406.json')
tweets_raw[:5]

['RT @KirkKus: Indirect cost of the UK being in the EU is estimated to be costing Britain £170 billion per year! #BetterOffOut #UKIP',
 'VIDEO: Sturgeon on post-election deals http://t.co/BTJwrpbmOY',
 'RT @LabourEoin: The economy was growing 3 times faster on the day David Cameron became Prime Minister than it is today.. #BBCqt http://t.co…',
 'RT @GregLauder: the UKIP east lothian candidate looks about 16 and still has an msn addy http://t.co/7eIU0c5Fm1',
 "RT @thesundaypeople: UKIP's housing spokesman rakes in £800k in housing benefit from migrants.  http://t.co/GVwb9Rcb4w http://t.co/c1AZxcLh…"]

In [6]:
# The default tokenizer for tweets is specialised for 'casual' text, and the
# tokenized() method returns a list of lists of tokens (one inner list per tweet).
tweets_tok = twitter_samples.tokenized('tweets.20150430-223406.json')
# Preview the first three tokenized tweets.
tweets_tok[:3]

[['RT',
  '@KirkKus',
  ':',
  'Indirect',
  'cost',
  'of',
  'the',
  'UK',
  'being',
  'in',
  'the',
  'EU',
  'is',
  'estimated',
  'to',
  'be',
  'costing',
  'Britain',
  '£',
  '170',
  'billion',
  'per',
  'year',
  '!',
  '#BetterOffOut',
  '#UKIP'],
 ['VIDEO',
  ':',
  'Sturgeon',
  'on',
  'post-election',
  'deals',
  'http://t.co/BTJwrpbmOY'],
 ['RT',
  '@LabourEoin',
  ':',
  'The',
  'economy',
  'was',
  'growing',
  '3',
  'times',
  'faster',
  'on',
  'the',
  'day',
  'David',
  'Cameron',
  'became',
  'Prime',
  'Minister',
  'than',
  'it',
  'is',
  'today',
  '..',
  '#BBCqt',
  'http://t.co…']]

**<span style="color:blue">Preprocess this corpus of twitter tweets!</span>**

#### Convert to lower case (downloading and tokenization have already been done)

In [7]:
# Lower-case every token; each tweet stays its own list of tokens.
tweets_lc = [list(map(str.lower, tweet)) for tweet in tweets_tok]
tweets_lc[:3]

[['rt',
  '@kirkkus',
  ':',
  'indirect',
  'cost',
  'of',
  'the',
  'uk',
  'being',
  'in',
  'the',
  'eu',
  'is',
  'estimated',
  'to',
  'be',
  'costing',
  'britain',
  '£',
  '170',
  'billion',
  'per',
  'year',
  '!',
  '#betteroffout',
  '#ukip'],
 ['video',
  ':',
  'sturgeon',
  'on',
  'post-election',
  'deals',
  'http://t.co/btjwrpbmoy'],
 ['rt',
  '@laboureoin',
  ':',
  'the',
  'economy',
  'was',
  'growing',
  '3',
  'times',
  'faster',
  'on',
  'the',
  'day',
  'david',
  'cameron',
  'became',
  'prime',
  'minister',
  'than',
  'it',
  'is',
  'today',
  '..',
  '#bbcqt',
  'http://t.co…']]

#### Remove stopwords (incl. special stopwords for tweets)

In [8]:
# The lower-cased tweets above still contain a lot of stop words, so let's
# remove these. First, take a look at NLTK's English stop word list.
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [9]:
# English stop words as provided by NLTK (kept as a list under its original name).
sw = stopwords.words('english')
# Every token of the corpus is tested for membership, so look it up in a set
# (constant time) rather than scanning the list for each token.
sw_lookup = set(sw)
# Drop the stop words while keeping the one-list-per-tweet structure.
tweets_nsw1 = [[word for word in tweet if word not in sw_lookup] for tweet in tweets_lc]

In [10]:
# Preview the first three tweets after stop word removal.
tweets_nsw1[:3]

[['rt',
  '@kirkkus',
  ':',
  'indirect',
  'cost',
  'uk',
  'eu',
  'estimated',
  'costing',
  'britain',
  '£',
  '170',
  'billion',
  'per',
  'year',
  '!',
  '#betteroffout',
  '#ukip'],
 ['video',
  ':',
  'sturgeon',
  'post-election',
  'deals',
  'http://t.co/btjwrpbmoy'],
 ['rt',
  '@laboureoin',
  ':',
  'economy',
  'growing',
  '3',
  'times',
  'faster',
  'day',
  'david',
  'cameron',
  'became',
  'prime',
  'minister',
  'today',
  '..',
  '#bbcqt',
  'http://t.co…']]

In [11]:
# Count how often each remaining token occurs across all tweets; the tokens
# are streamed into the distribution instead of being collected in a list first.
fdist = nltk.FreqDist(token for tweet in tweets_nsw1 for token in tweet)
print(fdist)
# Show the 50 most frequent tokens to spot further stop word candidates.
fdist.most_common(50)

<FreqDist with 20225 samples and 322813 outcomes>


[(':', 16631),
 ('rt', 13540),
 ('.', 12588),
 (',', 7546),
 ('…', 6371),
 ('miliband', 5318),
 ('snp', 4611),
 ('"', 4270),
 ('tories', 4112),
 ('ed', 2812),
 ('labour', 2624),
 ('#bbcqt', 2617),
 ('-', 2574),
 ('cameron', 2468),
 ('?', 2332),
 ("'", 2321),
 ('%', 2284),
 ('!', 1916),
 ('farage', 1823),
 ('ukip', 1773),
 ('tory', 1748),
 ('...', 1725),
 ('david', 1697),
 ('rather', 1537),
 ('vote', 1465),
 ('would', 1355),
 ('let', 1320),
 ('/', 1231),
 ('deal', 1227),
 ('(', 1198),
 ('&', 1182),
 ('#asknigelfarage', 1176),
 ('#ukip', 1125),
 ('http', 1104),
 (')', 1050),
 ('work', 1039),
 ('clegg', 1001),
 ('nigel', 984),
 ('support', 944),
 ('tonight', 927),
 ('people', 891),
 ('scotland', 847),
 ('lab', 838),
 ('going', 821),
 ('leader', 817),
 ('says', 812),
 ('w', 807),
 ('come', 787),
 ('man', 784),
 ('claiming', 772)]

In [12]:
# Additional tokens to drop for this Twitter corpus, chosen by hand from the
# most frequent tokens listed above: punctuation, single digits, retweet and
# URL remnants, and a few contractions.
twitter_sw = [':', 'rt', '.', ',', '…', '"', '-', '?', "'", '%', '!', '...', '/', '(', '&', 'http', ')', 'w', 
              '1', '2', '3', '4', '5', '6', '7', '8', '9', '0', ':/', '’', '*', '+', '..', "i'm", "he's" , "he'd"]
# Every token of the corpus is tested for membership, so look it up in a set
# (constant time) rather than scanning the list for each token.
twitter_sw_lookup = set(twitter_sw)
tweets_nsw2 = [[word for word in tweet if word not in twitter_sw_lookup] for tweet in tweets_nsw1]
# Recompute the token frequencies on the cleaned tweets and show the top 100.
fdist = nltk.FreqDist([token for tweet in tweets_nsw2 for token in tweet])
print(fdist)
fdist.most_common(100)

<FreqDist with 20189 samples and 236064 outcomes>


[('miliband', 5318),
 ('snp', 4611),
 ('tories', 4112),
 ('ed', 2812),
 ('labour', 2624),
 ('#bbcqt', 2617),
 ('cameron', 2468),
 ('farage', 1823),
 ('ukip', 1773),
 ('tory', 1748),
 ('david', 1697),
 ('rather', 1537),
 ('vote', 1465),
 ('would', 1355),
 ('let', 1320),
 ('deal', 1227),
 ('#asknigelfarage', 1176),
 ('#ukip', 1125),
 ('work', 1039),
 ('clegg', 1001),
 ('nigel', 984),
 ('support', 944),
 ('tonight', 927),
 ('people', 891),
 ('scotland', 847),
 ('lab', 838),
 ('going', 821),
 ('leader', 817),
 ('says', 812),
 ('come', 787),
 ('man', 784),
 ('claiming', 772),
 ('time', 721),
 ('#snp', 709),
 ('get', 696),
 ('@ukip', 695),
 ('@ed_miliband', 694),
 ('mps', 692),
 ('need', 688),
 ('times', 678),
 ('@nigel_farage', 674),
 ('like', 674),
 ('#ge2015', 658),
 ('wrote', 644),
 ('financial', 642),
 ('inequality', 641),
 ('preoccupied', 637),
 ('@nicolasturgeon', 636),
 ('want', 634),
 ('@tommy_colc', 632),
 ('audience', 587),
 ('government', 582),
 ('one', 561),
 ('sturgeon', 556),


In [13]:
# we could go through more words and remove even more stopwords, but let's leave it at this for now

#### Lemmatize the words

In [14]:
# Lemmatize every token with the lemmatizer's default settings; each tweet
# stays its own list of tokens.
wordnet_lemmatizer = WordNetLemmatizer()
tweets_lem = [list(map(wordnet_lemmatizer.lemmatize, tweet)) for tweet in tweets_nsw2]

In [15]:
# Build the frequency distribution of the lemmatized tokens tweet by tweet.
fdist = nltk.FreqDist()
for tweet in tweets_lem:
    fdist.update(tweet)
print(fdist)
# Show the 50 most frequent lemmas.
fdist.most_common(50)

<FreqDist with 19259 samples and 236064 outcomes>


[('tory', 5863),
 ('miliband', 5318),
 ('snp', 4615),
 ('ed', 2814),
 ('labour', 2651),
 ('#bbcqt', 2617),
 ('cameron', 2468),
 ('farage', 1823),
 ('ukip', 1773),
 ('david', 1697),
 ('vote', 1577),
 ('rather', 1537),
 ('deal', 1443),
 ('time', 1399),
 ('say', 1366),
 ('would', 1355),
 ('let', 1347),
 ('#asknigelfarage', 1176),
 ('#ukip', 1125),
 ('work', 1100),
 ('clegg', 1001),
 ('leader', 991),
 ('nigel', 984),
 ('support', 948),
 ('tonight', 945),
 ('people', 909),
 ('scotland', 853),
 ('lab', 840),
 ('come', 832),
 ('going', 821),
 ('question', 812),
 ('man', 787),
 ('get', 780),
 ('want', 772),
 ('claiming', 772),
 ('mp', 744),
 ('need', 736),
 ('#snp', 709),
 ('@ukip', 695),
 ('@ed_miliband', 694),
 ('like', 690),
 ('@nigel_farage', 674),
 ('#ge2015', 658),
 ('wrote', 644),
 ('financial', 642),
 ('inequality', 641),
 ('preoccupied', 637),
 ('@nicolasturgeon', 636),
 ('@tommy_colc', 632),
 ('poll', 604)]

In [16]:
# note that, for example, 'question' and 'questions' have been combined, as have 'says' and 'say'
# (forms such as 'going' are left unchanged, as the counts above show)

#### Finally, write the lemmatized tweets into a file so we can re-use them in the next notebook

In [17]:
# Persist the lemmatized tweets so that they can be loaded again later.
# pickle is imported in the notebook's import cell at the top.
with open('tweets_lem.data', 'wb') as filehandle:
    # store the data as binary data stream
    pickle.dump(tweets_lem, filehandle)

In [18]:
# EOF