In [1]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import sys
import json
from datetime import datetime
from tqdm import tqdm
%matplotlib inline

# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

# training and Testing directories
training_dir = os.path.join("Datasets", "Training")
testing_dir = os.path.join("Datasets", "Testing")
if not os.path.isdir(training_dir):
    raise Exception("ERROR: training dataset not found")
if not os.path.isdir(testing_dir):
    raise Exception("ERROR: testing dataset not found")

## Establish Locations

In [2]:
# iterate over all hashtag files 
for root, dirs, files in os.walk(training_dir, topdown=False):
    for file in files:
        print(os.path.join(root, file))

Datasets/Training/tweets_#nfl.txt
Datasets/Training/tweets_#superbowl.txt
Datasets/Training/tweets_#sb49.txt
Datasets/Training/tweets_#patriots.txt
Datasets/Training/tweets_#gohawks.txt
Datasets/Training/tweets_#gopatriots.txt


In [3]:
# Look at the gopatriots dataset for testing
# change to only_superbowl = ['superbowl'] for actual running

only_superbowl = ['superbowl']
only_gopatriots = ['gopatriots'] 

In [4]:
# Testing with Smaller File
locations = []

# iterate over all hashtag files 
for root, dirs, files in os.walk(training_dir, topdown=False):
    for file in files:
        filename = os.path.splitext(file)[0].replace('tweets_#', '')
          
        # CHANGE TO only_superbowl for final submission
        # Only Look at the gopatriots Data file for testing
#         if not filename in only_superbowl:
        if not filename in only_gopatriots:
            continue
        
        print('Parsing {}...'.format(filename))
        
        # only extracting specific features from the tweet json objects
        citation_dates = []
        
        # open the file and read all lines:
        with open(os.path.join(root, file), "r", encoding="utf-8") as hashtag:
            # read line-by-line
            for line in hashtag:
                json_obj = json.loads(line)
                
                # get citation date
                citation_date = json_obj['citation_date']
                citation_dates.append(citation_date)
                
                # get locations 
                location = json_obj['tweet']['user']['location']
                locations.append(location)
        
        # processing citation feature
        print('\t'+'-'*10)
        citation_dates = np.array(citation_dates)
        print('\tnumber of tweets in period: {}'.format(len(citation_dates)))
        min_date = np.min(citation_dates)
        max_date = np.max(citation_dates)

        # processing all locations
        print('\t'+'-'*10)
        print('\tnumber of locations in {} dataset: {}'.format(filename, len(locations)))
        
        print('\t'+'-'*10)
        print('')

Parsing gopatriots...
	----------
	number of tweets in period: 23511
	----------
	number of locations in gopatriots dataset: 23511
	----------



In [5]:
unique_locations = []

for location in locations:
    if location in unique_locations: # Avoid duplicates
        continue

    unique_locations.append(location)
    
print('number of unique locations is: {}'.format(len(unique_locations)))

number of unique locations is: 7477


## Determine if in MA or WA

In [6]:
# Define Locations

MA_WA = [' MA',' WA', 'Massachusetts', 'Washington', 'Boston', 'Seattle'] # Look for all possible MA and WA matches
# NOTE: added 'space' before MA and WA to avoid problems such as IOWA; Could do this differently

all_MA_WA = [location for location in unique_locations if any(place in location for place in MA_WA)] # Create list of all places in MA and WA

DC = ['DC', 'D.C.', 'D.C']

all_MA_WA_no_DC = [location for location in all_MA_WA if not any(place in location for place in DC)] # remove DC and D.C.

MA = [' MA', 'Massachusetts', 'Boston'] # Get just locations in Massachusetts
WA = [' WA', 'Washington', 'Seattle'] # Get just locations in Washington

only_MA = [location for location in all_MA_WA_no_DC if any(place in location for place in MA)] # Get just locations in Massachusetts
only_WA = [location for location in all_MA_WA_no_DC if any(place in location for place in WA)] # Get just locations in Washington

print('number of unique locations in MA or WA : {}'.format(len(all_MA_WA)))
print('-'*10)
print('number of unique locations in MA or WA after removing DC: {}'.format(len(all_MA_WA_no_DC)))
print('-'*10)
print('number of locations in MA: {}'.format(len(only_MA)))
print('-'*10)
print('number of locations in WA: {}'.format(len(only_WA)))

print('-'*10)
print('Locations in MA and WA: ','\n', *all_MA_WA_no_DC, sep = "\n")
print('-'*10)
print('Locations in MA: ','\n', *only_MA, sep = "\n")
print('-'*10)
print('Locations in WA: ','\n', *only_WA, sep = "\n")

number of unique locations in MA or WA : 349
----------
number of unique locations in MA or WA after removing DC: 339
----------
number of locations in MA: 305
----------
number of locations in WA: 35
----------
Locations in MA and WA: 


Boston, Massachusetts
Massachusetts
North of Boston
Boston, MA
Boston 
Providence, Boston
Boston
Whidbey Island, WA
314 Shawmut Avenue, Boston, MA
Hotlanta/Massachusetts 
Boston, MA 
Needham, MA
Seattle, WA
Boston,MA
Boston Strong 
Boston, MASS
Boylston Street, Boston MA 
Brookline, MA
Boston - Gonaives
Greater Boston
Everett, MA 
Charlestown, MA
Boston, MA ~ New England
Methuen MA
Boston and the South Shore
Boston, Mass
Harvard Square / Cambridge, MA
Boston, Ma 
Boston/ L.A.
Holyoke, MA
Massachusetts, USA
Pittsfield, MA
MAKE PEACE NOT WAR ❤️  #SHALOM
oh Boston you're my home
Cambridge, MA
Boston & San Diego
Braintree, MA
#BostonFinest #PatriotNation
North Bend, WA
Holden, MA
Waltham, MA
Boston. 617
Boston MA
Ohhh Boston, you're my home!
Foxboro, Mass

## Extract all tweets from WA or MA

In [7]:
# Extract all tweets from WA or MA

# Testing with Smaller File

# Store textual data and location labels 0 if MA 1 if WA 
tweet_textual_data = [] 
tweet_location_labels = []

# Initialize counting variables to keep track of number of tweets from MA and WA
num_tweets_MA = 0
num_tweets_WA = 0

for root, dirs, files in os.walk(training_dir, topdown=False):
    for file in files:
        filename = os.path.splitext(file)[0].replace('tweets_#', '')
          
        # CHANGE TO only_superbowl for final submission
        # Only Look at the gopatriots Data file for testing
#         if not filename in only_superbowl:
        if not filename in only_gopatriots:
            continue
        
        print('Parsing {}...'.format(filename))
        
        # only extracting specific features from the tweet json objects
        
        # open the file and read all lines:
        with open(os.path.join(root, file), "r", encoding="utf-8") as hashtag:
            # read line-by-line
            for line in hashtag:
                json_obj = json.loads(line)
                
                # get tweets that are only in MA and WA (not DC)
                location = json_obj['tweet']['user']['location']
                
                if not any(MAWA in location for MAWA in all_MA_WA_no_DC):
                    continue
                
                if any (loc in location for loc in only_MA):
                    # get textual data
                    text = json_obj['tweet']['text'] 
                    tweet_textual_data.append(text)
                    
                    # add location is in MA (0)
                    tweet_location_labels.append(0)
                    
                    num_tweets_MA += 1
                    
                    print('\t', location)
                    
                if any (loc in location for loc in only_WA):
                    if any (loc in location for loc in DC): # Check if contains DC
                        continue
                        
                    # get textual data
                    text = json_obj['tweet']['text'] 
                    tweet_textual_data.append(text)
                
                    # add location is in WA (1)
                    tweet_location_labels.append(1)
                                         
                    num_tweets_WA += 1    
                    
                    print('\t', location)
                
        # process textual data
        print('\t'+'-'*10)
        print('\tNumber of texts in {} dataset: {}'.format(filename, len(tweet_textual_data)))
        
        # Process MA and WA locations
        print('\t'+'-'*10)
        print('\tNumber of tweets from MA and WA is: {}'.format(len(tweet_location_labels)))
        
        print('\t'+'-'*10)
        print('\tNumber of tweets from MA is: ', num_tweets_MA)
        
        print('\t'+'-'*10)
        print('\tNumber of tweets from WA is: ', num_tweets_WA)
        
        print('\t'+'-'*10)
        print('\tTextual data looks like: ', *tweet_textual_data, sep = "\n")
        
        print('\t'+'-'*10)
        print('\tLabels look like: ', tweet_location_labels)
        

Parsing gopatriots...
	 Boston, Massachusetts
	 Boston, Massachusetts
	 Massachusetts
	 North of Boston
	 Boston, MA
	 Boston, Massachusetts
	 Boston, MA
	 Boston, Massachusetts
	 Boston, Massachusetts
	 Boston, Massachusetts
	 Boston 
	 Boston, MA
	 Providence, Boston
	 Boston, Massachusetts
	 Massachusetts
	 Boston
	 Whidbey Island, WA
	 Massachusetts
	 314 Shawmut Avenue, Boston, MA
	 Hotlanta/Massachusetts 
	 Boston, MA 
	 Massachusetts
	 Boston, Massachusetts
	 Boston, MA
	 Needham, MA
	 Boston
	 Boston, Massachusetts
	 Boston, MA 
	 Whidbey Island, WA
	 Seattle, WA
	 Boston,MA
	 Boston Strong 
	 Boston, MASS
	 Boston, Massachusetts
	 Boston, Massachusetts
	 Boylston Street, Boston MA 
	 Boston, Massachusetts
	 Brookline, MA
	 Boston, Massachusetts
	 Massachusetts
	 Boston, MA
	 Boston, MA
	 Boston, Massachusetts
	 Boston, Massachusetts
	 Boston - Gonaives
	 Greater Boston
	 Everett, MA 
	 Whidbey Island, WA
	 Whidbey Island, WA
	 Charlestown, MA
	 Boston, MA ~ New England
	 Bosto

	 Boston, Massachusetts
	 Berkshire County, MA
	 Boston, Massachusetts
	 Boston, Massachusetts
	 Boston, Massachusetts
	 Boston, MA
	 Savannah GA. // Boston MA. R.S
	 Boston, MA
	 Boston
	 Outside Boston
	 Somerset, MA
	 Beverly, MA
	 Beverly, MA
	 Boston
	 Boston MA
	 New Bedford, MA
	 South Boston
	 Whidbey Island, WA
	 Boston
	 Massachusetts
	 Foxboro, MA 02035
	 Boston, MA
	 Whidbey Island, WA
	 Boston, Massachusetts
	 Boston, Massachusetts
	 Boston, Massachusetts
	 Boston, Massachusetts
	 Massachusetts
	 Boston, Massachusetts
	 Mansfield, MA
	 Boston, MA
	 Boston, Massachusetts
	 New Bedford, MA
	 Boston, Massachusetts
	 Boston
	 Outside Boston
	 Massachusetts 
	 Boston
	 Boston, MA
	 Boston MA
	 Southcoast, Massachusetts
	 Boston, Massachusetts
	 North Shore, MA
	 New Bedford, MA
	 Whidbey Island, WA
	 South Boston, MA
	 Massachusetts
	 Lowell, MA
	 Boston, MA
	 Roxbury, MA
	 New Bedford, MA
	 New Bedford, MA
	 Boston 
	 Boston, Massachusetts
	 Boston
	 Boston
	 Cambridge, MA
	 B

	 Boston
	 Boston, MA
	 FL girl ☀ but ❤'s  in Boston 
	 Malden, MA
	 Boston MA
	 Andover, MA
	 Boston
	 Dudley, Massachusetts
	 South of Boston, MA
	 Boston, MA
	 Boston, MA
	 Boston
	 Leominster MA
	 Boston Native, Midwest Convert
	 Boston
	 Boston, MA
	 Marshfield, MA
	 Rome/Boston
	 Lawrence, MA
	 Boston, MA
	 Worcester Massachusetts
	 Boston, MA
	 Boston, MA | Sorocaba, SP
	 Boston, MA
	 LA. Boston. Bacolod. 
	 Cambridge, MA, EEUU
	 CT, NH, MA
	 Atlanta ✈ Boston 
	 Boston 
	 Boston ma.
	 Berkshires, MA
	 Boston
	 Boston, MA
	 Boston
	 Boston
	 Boston / Obregón
	 Boston
	 Boston 
	 Cape Cod/Boston
	 Gardner, MA
	 Malden, MA
	 DIECISIETE DE MAYO
	 Boston, MA
	 Boston, Ma
	 CT, NH, MA
	 Methuen, Massachusetts 
	 Boston
	 Olympia Washington
	 Boston, MA
	 Boston ma.
	 Cambridge, MA
	 Boston
	 CT, NH, MA
	 Boston, Ma 
	 CT, NH, MA
	 Boston,ma
	 Billerica MA
	 Boston
	 Boston
	 Boston, MA
	 Massachusetts
	 Atlanta ✈ Boston 
	 Methuen, Massachusetts 
	 Brazil✈️Boston 
	 Massachusetts, USA

@HarveyWCVB @WCVB they are all awsome tonight! #GoPatriots
Is there a mercy rule in the NFL? My god what a nightmare for Indy! I love it! #GoPatriots @NashIcon989
4th qtr. 45-7... only right!💯 #GoPatriots
Another interception #AFCChampion #gopatriots
Oops!!!! So not a good #Luck night for the #Colts but #GoPatriots
JIMMY G!!!!!! #DoYourJob #GoPatriots
@LawyerMilloy looks like you got your wish from Santa!!! #GoPatriots #superbowlbound
#Embarrassmentofriches I love my family, friends, colleagues, and sports scene! #GoPatriots
Bring it Seahawks! #SuperBowlXLIX #GoPatriots 👏👏👏
#thankyoubrady #TomBrady #GoPatriots #PatriotsNation
@BuckinBoston so jealous have fun in AZ!  #GOPATRIOTS
YAAAAAAYYYY #GoPatriots #SuperBowlXLIX
Time to go to the Super Bowl! #GoPatriots
OMG.. I'm so happy!!!!  I love our team :D  #GoPatriots #PatriotsNation
And there we have it! It will be the New England Patriots vs the Seahawks in the latest Superbowl! #GoPatriots!
#OnToTheSuperBowl!!!!!!! #PatriotsNation #GoPat

In [8]:
tweet_location_labels = np.asarray(tweet_location_labels)

In [9]:
# # To use geography package, run pip install geograpy

# import geograpy
# url = 'http://www.bbc.com/news/world-europe-26919928'
# places = geograpy.get_place_context(url=url)

# from geograpy import extraction

# Doesn't seem to load