In [3]:
# Imports all the libraries required
import os
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from matplotlib.colors import rgb2hex
from descartes import PolygonPatch
from shapely.geometry import Polygon, MultiPolygon

In [6]:
# Originally based on https://www.thetopsites.net/article/53409156.shtml,
# rewritten to decode incrementally instead of re-reading the file and seeking.

# Returns a generator that streams the JSON documents in the file one at a time.
def stream_read_json(filename, chunk_size=65536, encoding='utf-8'):
    """Lazily yield every JSON document stored back-to-back in ``filename``.

    The file may contain any number of JSON documents, separated by optional
    JSON whitespace (newlines included) or by nothing at all. The file is read
    in chunks and decoded incrementally, so memory use is bounded by the chunk
    size plus the largest single document rather than by the file size.

    The file position is never moved with ``seek``: in text mode a character
    count is not a valid seek offset once the data contains multi-byte
    characters, which is common in tweet text.

    Parameters
    ----------
    filename : str or path-like
        Path of the file to read.
    chunk_size : int, optional
        Number of characters requested per read. Must be at least 1.
    encoding : str, optional
        Text encoding used to open the file.

    Yields
    ------
    object
        Each decoded document (dict, list, str, number, bool or None), in
        file order.

    Raises
    ------
    ValueError
        If ``chunk_size`` is smaller than 1.
    json.JSONDecodeError
        If the file contains malformed JSON or ends in the middle of a
        document. Documents preceding the bad one are still yielded first.

    Because this is a generator, all errors (including those from opening the
    file) surface when it is first iterated, not when it is called.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    decoder = json.JSONDecoder()
    whitespace = ' \t\n\r'  # the only whitespace JSON allows between tokens
    buffer = ''             # text read from the file but not yet consumed
    pos = 0                 # index in ``buffer`` of the next undecoded character
    eof = False
    read_size = chunk_size

    with open(filename, 'r', encoding=encoding) as f:
        while True:
            # raw_decode does not skip leading whitespace, so do it here.
            while pos < len(buffer) and buffer[pos] in whitespace:
                pos += 1

            if pos < len(buffer):
                try:
                    obj, end = decoder.raw_decode(buffer, pos)
                except json.JSONDecodeError:
                    # Most likely the document continues in the next chunk;
                    # only at end-of-file is the failure definitive.
                    if eof:
                        raise
                else:
                    # A value that touches the end of the buffer may be a
                    # truncated number (e.g. "12" of "123"), so it is only
                    # accepted once more text follows it or the file is done.
                    if eof or end < len(buffer):
                        yield obj
                        pos = end
                        read_size = chunk_size
                        continue
            elif eof:
                return

            chunk = f.read(read_size)
            if chunk:
                # Drop the consumed prefix so the buffer stays small.
                buffer = buffer[pos:] + chunk
                pos = 0
                # Grow reads while a single document keeps needing more data,
                # which keeps the re-buffering cost linear for huge documents.
                read_size *= 2
            else:
                eof = True

In [9]:
### The reader above was observed to crash after a couple of reads, so only the first tweet is read here to explore its structure. ###
for item in stream_read_json('geotagged_tweets_20160812-0912.jsons'):
    tweet_dict = item
    # Stop after the first object; the loop exists only to pull one item from the generator.
    break

In [10]:
# Display the first tweet's raw dictionary to inspect its structure.
tweet_dict

{'created_at': 'Fri Aug 12 10:04:00 +0000 2016',
 'id': 764039724818272256,
 'id_str': '764039724818272256',
 'text': '@theblaze @realDonaldTrump https://t.co/TY9DlZ584c',
 'source': '<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>',
 'truncated': False,
 'in_reply_to_status_id': 764038820476051456,
 'in_reply_to_status_id_str': '764038820476051456',
 'in_reply_to_user_id': 10774652,
 'in_reply_to_user_id_str': '10774652',
 'in_reply_to_screen_name': 'theblaze',
 'user': {'id': 366636488,
  'id_str': '366636488',
  'name': 'GIL DUPUY',
  'screen_name': 'DUPUY77',
  'location': 'Miami',
  'url': 'http://ggm-dupuy.com',
  'description': "Fashion photographer, love action and adventure, care for the less fortunate, don't tolerate any kind of racism regardless of race or religion",
  'protected': False,
  'verified': False,
  'followers_count': 186,
  'friends_count': 446,
  'listed_count': 19,
  'favourites_count': 1708,
  'statuses_count': 17620,
  'cre

### Things to notice: ###  
  
1. In the cell above we can see that the data for a Twitter entry is a nested dictionary (i.e. the values of some keys are dictionaries themselves). Therefore, a well-formed pandas DataFrame cannot be created directly from this data; see the trial below to witness the malformed pandas DataFrame.
2. We seem to have data on 1) the creation of the tweet, 2) the user that created the tweet, 3) the place where it was created (including a polygon bounding box of coordinates) and 4) some data on other entities connected to the tweet (links, likes, retweets, etc)

In [11]:
# Naively builds a DataFrame straight from the nested tweet dictionary to show
# the malformed result: in the output below the row index is made of nested
# keys ('id', 'id_str', 'name', ...) and the scalar fields repeat on every row.
pd.DataFrame(tweet_dict)

Unnamed: 0,created_at,id,id_str,text,source,truncated,in_reply_to_status_id,in_reply_to_status_id_str,in_reply_to_user_id,in_reply_to_user_id_str,...,retweet_count,favorite_count,entities,extended_entities,favorited,retweeted,possibly_sensitive,filter_level,lang,timestamp_ms
id,Fri Aug 12 10:04:00 +0000 2016,764039724818272256,764039724818272256,@theblaze @realDonaldTrump https://t.co/TY9DlZ...,"<a href=""http://twitter.com/download/iphone"" r...",False,764038820476051456,764038820476051456,10774652,10774652,...,0,0,,,False,False,False,low,und,1470996240225
id_str,Fri Aug 12 10:04:00 +0000 2016,764039724818272256,764039724818272256,@theblaze @realDonaldTrump https://t.co/TY9DlZ...,"<a href=""http://twitter.com/download/iphone"" r...",False,764038820476051456,764038820476051456,10774652,10774652,...,0,0,,,False,False,False,low,und,1470996240225
name,Fri Aug 12 10:04:00 +0000 2016,764039724818272256,764039724818272256,@theblaze @realDonaldTrump https://t.co/TY9DlZ...,"<a href=""http://twitter.com/download/iphone"" r...",False,764038820476051456,764038820476051456,10774652,10774652,...,0,0,,,False,False,False,low,und,1470996240225
screen_name,Fri Aug 12 10:04:00 +0000 2016,764039724818272256,764039724818272256,@theblaze @realDonaldTrump https://t.co/TY9DlZ...,"<a href=""http://twitter.com/download/iphone"" r...",False,764038820476051456,764038820476051456,10774652,10774652,...,0,0,,,False,False,False,low,und,1470996240225
location,Fri Aug 12 10:04:00 +0000 2016,764039724818272256,764039724818272256,@theblaze @realDonaldTrump https://t.co/TY9DlZ...,"<a href=""http://twitter.com/download/iphone"" r...",False,764038820476051456,764038820476051456,10774652,10774652,...,0,0,,,False,False,False,low,und,1470996240225
url,Fri Aug 12 10:04:00 +0000 2016,764039724818272256,764039724818272256,@theblaze @realDonaldTrump https://t.co/TY9DlZ...,"<a href=""http://twitter.com/download/iphone"" r...",False,764038820476051456,764038820476051456,10774652,10774652,...,0,0,,,False,False,False,low,und,1470996240225
description,Fri Aug 12 10:04:00 +0000 2016,764039724818272256,764039724818272256,@theblaze @realDonaldTrump https://t.co/TY9DlZ...,"<a href=""http://twitter.com/download/iphone"" r...",False,764038820476051456,764038820476051456,10774652,10774652,...,0,0,,,False,False,False,low,und,1470996240225
protected,Fri Aug 12 10:04:00 +0000 2016,764039724818272256,764039724818272256,@theblaze @realDonaldTrump https://t.co/TY9DlZ...,"<a href=""http://twitter.com/download/iphone"" r...",False,764038820476051456,764038820476051456,10774652,10774652,...,0,0,,,False,False,False,low,und,1470996240225
verified,Fri Aug 12 10:04:00 +0000 2016,764039724818272256,764039724818272256,@theblaze @realDonaldTrump https://t.co/TY9DlZ...,"<a href=""http://twitter.com/download/iphone"" r...",False,764038820476051456,764038820476051456,10774652,10774652,...,0,0,,,False,False,False,low,und,1470996240225
followers_count,Fri Aug 12 10:04:00 +0000 2016,764039724818272256,764039724818272256,@theblaze @realDonaldTrump https://t.co/TY9DlZ...,"<a href=""http://twitter.com/download/iphone"" r...",False,764038820476051456,764038820476051456,10774652,10774652,...,0,0,,,False,False,False,low,und,1470996240225


To get to know the data more intimately, we can look at the keys and their values present in the dictionary.

In [46]:
# List the top-level keys of the tweet dictionary.
tweet_dict.keys()

dict_keys(['created_at', 'id', 'id_str', 'text', 'source', 'truncated', 'in_reply_to_status_id', 'in_reply_to_status_id_str', 'in_reply_to_user_id', 'in_reply_to_user_id_str', 'in_reply_to_screen_name', 'user', 'geo', 'coordinates', 'place', 'contributors', 'is_quote_status', 'retweet_count', 'favorite_count', 'entities', 'extended_entities', 'favorited', 'retweeted', 'possibly_sensitive', 'filter_level', 'lang', 'timestamp_ms'])

In [47]:
# Show the values stored under the top-level keys; some of them (e.g. the
# value of 'user') are themselves dictionaries.
tweet_dict.values()

dict_values(['Fri Aug 12 10:04:00 +0000 2016', 764039724818272256, '764039724818272256', '@theblaze @realDonaldTrump https://t.co/TY9DlZ584c', '<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>', False, 764038820476051456, '764038820476051456', 10774652, '10774652', 'theblaze', {'id': 366636488, 'id_str': '366636488', 'name': 'GIL DUPUY', 'screen_name': 'DUPUY77', 'location': 'Miami', 'url': 'http://ggm-dupuy.com', 'description': "Fashion photographer, love action and adventure, care for the less fortunate, don't tolerate any kind of racism regardless of race or religion", 'protected': False, 'verified': False, 'followers_count': 186, 'friends_count': 446, 'listed_count': 19, 'favourites_count': 1708, 'statuses_count': 17620, 'created_at': 'Fri Sep 02 14:54:17 +0000 2011', 'utc_offset': None, 'time_zone': None, 'geo_enabled': True, 'lang': 'en', 'contributors_enabled': False, 'is_translator': False, 'profile_background_color': '131516', 'profile_backgro

In [48]:
# Summarises the first tweet: who wrote it, how the author describes
# themselves, where it was posted from, and what it says.
first_tweet_user = tweet_dict['user']
first_tweet_place = tweet_dict['place']

print("Name: ", first_tweet_user['name'])
print("User description: ", first_tweet_user['description'])
print("Location: ", first_tweet_place['full_name'], first_tweet_place['country'])
print("Tweet text: ", tweet_dict['text'])


Name:  GIL DUPUY
User description:  Fashion photographer, love action and adventure, care for the less fortunate, don't tolerate any kind of racism regardless of race or religion
Location:  Frontenac, MO United States
Tweet text:  @theblaze @realDonaldTrump https://t.co/TY9DlZ584c


In [49]:
# Same summary as for the first tweet, this time for the next tweet in the file.

# Walk the stream, remembering the latest tweet, and stop once two have been read.
tweets_read = 0
for item in stream_read_json('geotagged_tweets_20160812-0912.jsons'):
    tweets_read += 1
    tweet_dict2 = item
    if tweets_read == 2:
        break

# Author, author description, location and text of that tweet.
second_tweet_user = tweet_dict2['user']
second_tweet_place = tweet_dict2['place']

print("Name: ", second_tweet_user['name'])
print("User description: ", second_tweet_user['description'])
print("Location: ", second_tweet_place['full_name'], second_tweet_place['country'])
print("Tweet text: ", tweet_dict2['text'])

Name:  Red Octopus
User description:  AWSCWI Pipefitter USAF NRA I remember USA, Hunting Fishing, HarleyDavidsons- a time when we HS students could carry guns in our trks 2 school
Location:  Baton Rouge, LA United States
Tweet text:  @BarackObama 
@FBI
@LORETTALYNCH 
ALL IN COLLUSION TOGETHER 

#NOJUSTICE 

@realDonaldTrump 
#TrumpPence 

https://t.co/5GMNZq40V3


### Things to ask ourselves ###
  
1. Is there a way we can stream the data without it crashing?
Wenhao: I think it works on my machine; I made a test file called test.json containing the first 10 tweets. Loading the whole dataset makes my computer freeze. Could this be a decoding problem?  

2. What is all the relevant information we need to have per tweet?
Wenhao: In my opinion, it depends on the insights we hope to get. First, we should focus on one main goal, such as the vote percentage per state. For that, we should start with the text and place (coordinates) fields.

3. Can we store the data in a pandas DataFrame?
Wenhao: I think this is tree-structured data; maybe we can add the parent key as a prefix to each nested key and then convert it.

---------------------
In response to Q3, only the keys 'user', 'place', 'entities' and 'extended_entities' seem to contain nested dictionaries. Maybe we can try to create a dataframe without these keys first?

Afterwards, the following Stack Overflow question describes a method that I feel could work well for the nested keys: https://stackoverflow.com/questions/13575090/construct-pandas-dataframe-from-items-in-nested-dictionary