# Importing Data for Data Science 2

# Importing Data

## Import Packages
Import a set of packages that we will use in order to perform text analysis. These are very commonly used Python packages.

In [None]:
import nltk # The best known Python natural language processing toolkit
import matplotlib # Python plotting library
import matplotlib.pyplot as plt # Easy syntax access to pyplot
%matplotlib inline 
matplotlib.rcParams.update({'font.size': 22})
import requests # for URL requests
from bs4 import BeautifulSoup # Import the BeautifulSoup package for web scraping
import feedparser  # For reading RSS feeds - install using ./conda install feedparser
import tweepy # for access to the twitter apis

## Accessing Data From Text Files

Accessing data from a text file is straightforward. 

In [None]:
# Load a simple text file into a list of lines. The with-statement guarantees
# the file handle is closed even if reading fails part-way through.
with open("data/test_text.txt", "r") as f:
    lines = f.readlines()

# Split the first line of the file into sentences and print each sentence
for line in nltk.sent_tokenize(lines[0]):
    print("****", line)

In [None]:
# Break the first line of the file into sentences, then print the word tokens
# of every sentence one per line, with a "****" marker ahead of each sentence
first_line_sentences = nltk.sent_tokenize(lines[0])
for sentence_text in first_line_sentences:
    print("****")
    for token in nltk.word_tokenize(sentence_text):
        print(token)

We can even load a text file across the Internet by using **requests.get** from the **requests** package instead of simply **open**. We use [Project Gutenberg](http://www.gutenberg.org) in this example.

In [None]:
# Define a URL to Alice in Wonderland on Project Gutenberg (www.gutenberg.org)
url='http://www.gutenberg.org/cache/epub/11/pg11.txt'

# Read the text from the URL. The timeout stops the cell hanging forever if
# the server never answers, and raise_for_status() turns an HTTP error into an
# exception instead of silently treating an error page as the book text.
response = requests.get(url, timeout=30)
response.raise_for_status()
text_page = response.text

# Print the first 1000 characters of the book
print(text_page[:1000])

We can even connect to an HTML file, but this starts to get messy.

In [None]:
# Connect to a URL and extract the HTML text. A timeout is passed so the cell
# fails with an exception rather than blocking indefinitely when the site
# does not respond.
url = "http://www.independent.ie/sport/soccer/international-soccer/neil-taylor-facing-longer-ban-for-seamus-coleman-horror-tackle-as-fifa-step-in-35578919.html"
text = requests.get(url, timeout=30).text

# Show only the first 1000 characters of the raw markup
print(text[:1000])

## Parsing HTML Files

Accessing data from web pages is straightforward. The tricky bit is extracting the useful information from the webpage. We can use the **BeautifulSoup4** (http://www.crummy.com/software/BeautifulSoup) package to make this easier.

In [None]:
# Read the HTML file. The timeout prevents an unresponsive server from hanging
# the cell, and raise_for_status() stops us from parsing an HTTP error page.
url = "http://www.independent.ie/sport/soccer/international-soccer/neil-taylor-facing-longer-ban-for-seamus-coleman-horror-tackle-as-fifa-step-in-35578919.html"
response = requests.get(url, timeout=30)
response.raise_for_status()
html = response.text

# Create a beautiful soup object from the text file so that we get at the article text
article_soup = BeautifulSoup(html, "html.parser")

# Extract the actual article text - this relies on knowing what the HTML looks
# like (an <article> element holding an <h1> headline and <p> paragraphs), so
# fail with a clear message if the page layout is different.
article = article_soup.find('article')
if article is None:
    raise ValueError("No <article> element found in the page at " + url)
headline = article.find('h1')
article_content = article.find_all('p')

# Collect the headline (when present) followed by every paragraph
text_parts = []
if headline is not None:
    text_parts.append(headline.get_text())
text_parts.extend(tag.get_text() for tag in article_content)

# Join the pieces with newlines so the last word of one paragraph is not fused
# with the first word of the next when the text is tokenised later
article_text = "\n".join(text_parts)

# Print the article content
print(article_text)

## Accessing RSS Feeds 

One way to access lots of news articles is to use an RSS feed. We can access RSS feeds easily in Python using the **feedparser** package.

In [None]:
# Download and parse the Irish Times news RSS feed
RSS_url = "https://www.irishtimes.com/cmlink/news-1.1319192"
it_feed = feedparser.parse(RSS_url)

# Number of entries in the parsed feed (displayed as the cell output)
len(it_feed["entries"])

In [None]:
# Walk over every entry in the feed and print its title followed by the URL
# of the article (taken from the first link attached to the entry)
for feed_item in it_feed.entries:
    article_title = feed_item['title']
    article_url = feed_item['links'][0]['href']
    print(article_title, article_url, sep="\n")

## Accessing Data From Twitter

Twitter is obviously a fun service to get text from. We can use the **Tweepy** package to access the Twitter API. Before using Tweepy you must have Twitter **OAuth credentials** available from https://apps.twitter.com/. Create a new application (using your own Twitter credentials) and then generate access tokens.

In [None]:
import os

# OAuth access details for getting at the Twitter API.
# Secrets must never be stored in the notebook itself, so they are read from
# environment variables. Set all four before running this cell:
#   TWITTER_CONSUMER_KEY, TWITTER_CONSUMER_SECRET,
#   TWITTER_ACCESS_TOKEN, TWITTER_ACCESS_TOKEN_SECRET
# NOTE(review): credentials that were previously committed in this cell should
# be treated as leaked and regenerated at https://apps.twitter.com/.
_required_vars = (
    "TWITTER_CONSUMER_KEY",
    "TWITTER_CONSUMER_SECRET",
    "TWITTER_ACCESS_TOKEN",
    "TWITTER_ACCESS_TOKEN_SECRET",
)
_missing_vars = [name for name in _required_vars if not os.environ.get(name)]
if _missing_vars:
    # Fail early with an actionable message instead of an opaque API error later
    raise RuntimeError(
        "Set these environment variables before connecting to Twitter: "
        + ", ".join(_missing_vars)
    )

consumer_key = os.environ["TWITTER_CONSUMER_KEY"]
consumer_secret = os.environ["TWITTER_CONSUMER_SECRET"]
access_token = os.environ["TWITTER_ACCESS_TOKEN"]
access_token_secret = os.environ["TWITTER_ACCESS_TOKEN_SECRET"]

# Connect to the Twitter API using authentication
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth)

In [None]:
# Request 25 tweets from the authenticated user's home timeline and print,
# for each one, the author's handle and display name followed by the text
coord_list = []
public_tweets = api.home_timeline(count=25)
for status in public_tweets:
    author = status.author
    print("@" + author.screen_name, "|", author.name)
    print(status.text)
    print()

Use the search API to find tweets based on a search term.

In [None]:
# Run a single search for the keyword "Dublin" (asking for 10 tweets) and
# print each hit as: author handle | author name, tweet text, blank line
results = api.search(q="Dublin", count=10)
for hit in results:
    hit_author = hit.author
    print("@" + hit_author.screen_name, "|", hit_author.name)
    print(hit.text)
    print()

Iteratively find more and more tweets

In [None]:
# Page backwards through search results for query_term, collecting up to
# 50 pages of (at most) 100 tweets each into tweets_found.
max_id_found = None        # id of the oldest tweet seen so far (None before the first page)
tweets_found = list()
query_term = 'machine'
for page in range(0, 50):
    search_args = {"q": query_term, "count": 100}
    if max_id_found is not None:
        # max_id is inclusive, so subtract 1 to ask only for tweets strictly
        # older than the oldest one already collected; otherwise every page
        # would start with a duplicate of the previous page's last tweet.
        search_args["max_id"] = max_id_found - 1

    # Search for recent tweets containing the keyword
    results = api.search(**search_args)
    if not results:
        # No older tweets are available; indexing results[-1] would fail
        break

    max_id_found = results[-1].id
    # extend() appends in place instead of copying the whole list every page
    tweets_found.extend(results)


Write tweets to files

In [None]:
import os
import io
import codecs

# Write every collected tweet to its own UTF-8 text file named <tweet id>.txt
tweets_dir = "data/tweets/"
# makedirs also creates the parent "data" directory when it is missing, and
# exist_ok=True makes the call a no-op when the directory is already there
os.makedirs(tweets_dir, exist_ok=True)
for tweet in tweets_found:
    tweet_path = os.path.join(tweets_dir, tweet.id_str + ".txt")
    try:
        # The with-statement closes the file even if one of the writes fails
        with open(tweet_path, "w", encoding="utf-8") as tweet_file:
            # The author line ends with a newline so the author's name is not
            # fused with the first word of the tweet when the file is tokenised
            tweet_file.write("@" + tweet.author.screen_name + " " + tweet.author.name + "\n")
            tweet_file.write(tweet.text)
    except (OSError, UnicodeError) as err:
        # Best effort: report which tweet could not be written and carry on
        print('skipped', tweet.id_str, err)

Print tweets

In [None]:
# Print every collected tweet: author handle | author name, then the tweet
# text, then a blank separator line
for found in tweets_found:
    found_author = found.author
    print("@" + found_author.screen_name, "|", found_author.name)
    print(found.text)
    print()

Read the tweets written out into an nltk corpus

In [None]:
# Build an NLTK plaintext corpus over every .txt file under data/tweets.
# The file pattern is a raw string so that "\." reaches the regex engine as
# written instead of being an invalid string escape sequence.
tweets_corpus = nltk.corpus.reader.plaintext.PlaintextCorpusReader(
    root='data/tweets',
    fileids=r'.*\.txt',
    encoding='utf8',
)

In [None]:
# Number of file ids in the tweets corpus (shown as the cell output)
len(tweets_corpus.fileids())

In [None]:
# Build a frequency distribution over the corpus words and show the 20 most
# common entries as the cell output
corpus_words = tweets_corpus.words()
tweet_fd = nltk.FreqDist(corpus_words)
tweet_fd.most_common(20)

Use a TweetTokenizer instead of the default word tokenizer.

In [None]:
# Rebuild the tweets corpus, this time passing a TweetTokenizer instance as
# the word tokenizer. The file pattern is a raw string so that "\." is not
# treated as an (invalid) string escape sequence.
tweets_corpus = nltk.corpus.reader.plaintext.PlaintextCorpusReader(
    root='./data/tweets',
    word_tokenizer=nltk.casual.TweetTokenizer(),
    fileids=r'.*\.txt',
    encoding='utf8',
)


In [None]:
# Display the words of the corpus as produced by the configured word tokenizer
tweets_corpus.words()

In [None]:
# Recompute the frequency distribution from the re-tokenised corpus and show
# the 20 most common entries as the cell output
retokenised_words = tweets_corpus.words()
tweet_fd = nltk.FreqDist(retokenised_words)
tweet_fd.most_common(20)

In [None]:
# Plot the frequency distribution, limited to 25 samples
tweet_fd.plot(25)

Build a unigram language model for tweets

In [None]:
# Unigram model: word counts over the whole corpus, wrapped in a
# Laplace probability distribution
tweet_freq_dist = nltk.probability.FreqDist(tweets_corpus.words())
unigram_model = nltk.probability.LaplaceProbDist(
    tweet_freq_dist,
)

Build a bigram language model for tweets

In [None]:
# Bigram model: count (previous word -> next word) pairs over the corpus,
# requesting padding on both ends of the word sequence, then turn the counts
# for each previous word into a Laplace probability distribution
padded_bigrams = nltk.bigrams(
    tweets_corpus.words(),
    pad_left=True,
    pad_right=True,
)
bigram_dist = nltk.probability.ConditionalFreqDist(padded_bigrams)

bigram_model = nltk.probability.ConditionalProbDist(
    bigram_dist,
    nltk.probability.LaplaceProbDist,
)

Build a trigram language model for tweets

In [None]:
# Generate trigrams over the corpus words (padding requested on both ends)
# and reshape each one into ((first word, second word), third word) so the
# leading word pair can serve as the condition for the final word
trigrams = nltk.trigrams(
    tweets_corpus.words(),
    pad_left=True,
    pad_right=True,
)
trigram_pairs = [
    ((first, second), third)
    for first, second, third in trigrams
]

In [None]:
# Trigram model: count the third word conditioned on the preceding word pair,
# then turn each condition's counts into a Laplace probability distribution
cfdist = nltk.probability.ConditionalFreqDist(
    trigram_pairs,
)
cpdist = nltk.probability.ConditionalProbDist(
    cfdist,
    nltk.probability.LaplaceProbDist,
)

Generate a sentence

In [None]:
# Seed the sentence: draw the first word from the unigram model and the
# second from the bigram model conditioned on the first
word1 = unigram_model.generate()
word2 = bigram_model[word1].generate()
sentence = [word1]

# Repeat 20 times: record the pending word, draw its successor from the
# trigram model conditioned on the last two words, then slide the window
for step in range(20):
    sentence.append(word2)
    word3 = cpdist[(word1, word2)].generate()
    word1, word2 = word2, word3

display(sentence)