# Get Tweets of the @CTA Twitter handle

In [68]:
import tweepy
import os

# Twitter credentials are read from the environment so that no secret is stored
# in the notebook. os.getenv() returns None for an unset variable, which would
# otherwise only surface later as an opaque authentication failure, so check
# for missing values up front and say exactly which variables need to be set.
required_env_vars = (
    'twitter_CONSUMER_KEY',
    'twitter_CONSUMER_SECRET',
    'twitter_ACCESS_TOKEN',
    'twitter_ACCESS_TOKEN_SECRET',
)
missing_env_vars = [name for name in required_env_vars if not os.getenv(name)]
if missing_env_vars:
    raise RuntimeError(
        'Missing Twitter credentials; set these environment variables: '
        + ', '.join(missing_env_vars)
    )

consumer_key = os.getenv('twitter_CONSUMER_KEY')
consumer_secret = os.getenv('twitter_CONSUMER_SECRET')
access_token = os.getenv('twitter_ACCESS_TOKEN')
access_token_secret = os.getenv('twitter_ACCESS_TOKEN_SECRET')

# Create the OAuth handler from the application (consumer) credentials ...
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
# ... and attach the saved user access token to it.
auth.set_access_token(access_token, access_token_secret)

# Construct the authenticated API instance used by the rest of the notebook.
api = tweepy.API(auth)

In [69]:
# Page through the @cta timeline and keep the raw JSON payload (a dict) of
# every status returned.
#
# - `screen_name` is the documented way to select an account by handle; it is
#   given without the leading "@".
# - `count=200` asks for the largest page the endpoint allows, so the same
#   timeline is fetched with far fewer requests than the default page size.
#
# NOTE(review): the timeline endpoint only exposes a bounded window of an
# account's most recent tweets, so this is not the full history of the account.
timeline = tweepy.Cursor(api.user_timeline, screen_name="cta", count=200)
tweets = [status._json for status in timeline.items()]

In [70]:
# Number of raw tweet payloads collected from the timeline.
len(tweets)

3225

In [71]:
# List the top-level fields available in one raw tweet payload (the second one collected).
print(tweets[1].keys())

dict_keys(['created_at', 'id', 'id_str', 'text', 'truncated', 'entities', 'source', 'in_reply_to_status_id', 'in_reply_to_status_id_str', 'in_reply_to_user_id', 'in_reply_to_user_id_str', 'in_reply_to_screen_name', 'user', 'geo', 'coordinates', 'place', 'contributors', 'is_quote_status', 'retweet_count', 'favorite_count', 'favorited', 'retweeted', 'lang'])


In [72]:
# Show the full raw payload of the last tweet in the collected list.
tweets[-1]

{'created_at': 'Wed Aug 29 15:42:07 +0000 2018',
 'id': 1034828552217272327,
 'id_str': '1034828552217272327',
 'text': 'Blue Line service has been temporarily suspended between Cicero and Forest Park due to a police investigation. Shut… https://t.co/i33sIa9uCo',
 'truncated': True,
 'entities': {'hashtags': [],
  'symbols': [],
  'user_mentions': [],
  'urls': [{'url': 'https://t.co/i33sIa9uCo',
    'expanded_url': 'https://twitter.com/i/web/status/1034828552217272327',
    'display_url': 'twitter.com/i/web/status/1…',
    'indices': [117, 140]}]},
 'source': '<a href="http://www.transitchicago.com/updates" rel="nofollow">ctaUpdates</a>',
 'in_reply_to_status_id': None,
 'in_reply_to_status_id_str': None,
 'in_reply_to_user_id': None,
 'in_reply_to_user_id_str': None,
 'in_reply_to_screen_name': None,
 'user': {'id': 342782636,
  'id_str': '342782636',
  'name': 'cta',
  'screen_name': 'cta',
  'location': 'Chicago, IL',
  'description': 'Chicago Transit Authority updates/news/more. N

In [73]:
import pandas as pd

# Keep only the fields used downstream and parse the `created_at` strings
# into pandas datetimes in the same step.
cols = ['created_at', 'text', 'retweet_count', 'favorite_count']
new_tweets_df = pd.DataFrame(tweets, columns=cols).assign(
    created_at=lambda frame: pd.to_datetime(frame['created_at'])
)

In [74]:
# Check column dtypes and non-null counts of the freshly pulled tweets.
new_tweets_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3225 entries, 0 to 3224
Data columns (total 4 columns):
created_at        3225 non-null datetime64[ns]
text              3225 non-null object
retweet_count     3225 non-null int64
favorite_count    3225 non-null int64
dtypes: datetime64[ns](1), int64(2), object(1)
memory usage: 100.9+ KB


In [75]:
# Preview the first rows of the freshly pulled tweets.
new_tweets_df.head()

Unnamed: 0,created_at,text,retweet_count,favorite_count
0,2018-11-20 02:10:32,[Significant Delays] O'Hare-bound Blue Line tr...,3,4
1,2018-11-20 02:07:04,[Minor Delays / Reroute] 7 Harrison buses tem...,4,2
2,2018-11-20 01:55:12,[Normal Service*] The temporary reroute of 3 K...,3,1
3,2018-11-20 01:12:48,[Significant Delays] Blue Line trains are oper...,3,0
4,2018-11-20 01:03:59,[Significant Delays] Blue Line trains are stan...,3,1


## Open Saved Tweets

Open the existing file of tweets saved from prior pulls with `tweepy`.

In [76]:
# Load the tweets persisted by earlier runs of this notebook.
saved_tweets_df = pd.read_feather('cta_tweets_sample.feather')
# Newest timestamp already on file; used as the cut-off for what counts as "new".
last_tweet = saved_tweets_df['created_at'].max()

  return feather.read_dataframe(path, nthreads=nthreads)


## Clean the new tweets

Some of the new tweets scraped from the API may overlap with tweets in the existing `saved_tweets_df` dataframe. We need to filter these out of `new_tweets_df`. To do so, we keep only the new tweets that are time stamped after the latest (max) timestamp in `saved_tweets_df`.

In [77]:
# Report the size before filtering, keep only tweets strictly newer than the
# latest saved one, then report the size again.
print(new_tweets_df.shape) # before subsetting
is_newer_than_saved = new_tweets_df['created_at'].gt(last_tweet)
new_tweets_df = new_tweets_df.loc[is_newer_than_saved]
print(new_tweets_df.shape) # after subsetting
# Summary of the remaining timestamps (count, range, repeated values).
new_tweets_df['created_at'].describe()

(3225, 4)
(2184, 4)


count                    2184
unique                   2182
top       2018-09-28 22:54:15
freq                        2
first     2018-09-22 08:00:12
last      2018-11-20 02:10:32
Name: created_at, dtype: object

In [78]:
# Preview the first ten rows that remain after filtering.
new_tweets_df.head(10)

Unnamed: 0,created_at,text,retweet_count,favorite_count
0,2018-11-20 02:10:32,[Significant Delays] O'Hare-bound Blue Line tr...,3,4
1,2018-11-20 02:07:04,[Minor Delays / Reroute] 7 Harrison buses tem...,4,2
2,2018-11-20 01:55:12,[Normal Service*] The temporary reroute of 3 K...,3,1
3,2018-11-20 01:12:48,[Significant Delays] Blue Line trains are oper...,3,0
4,2018-11-20 01:03:59,[Significant Delays] Blue Line trains are stan...,3,1
5,2018-11-20 00:20:56,[Significant Delays] Blue Line trains are oper...,2,1
6,2018-11-20 00:07:53,[Major Delays] Blue Line trains are operating ...,2,1
7,2018-11-20 00:01:08,[Major Delays] Blue Line trains are standing a...,4,1
8,2018-11-19 23:51:32,[Significant Delays] Blue Line trains are stan...,4,2
9,2018-11-19 23:29:24,[Significant Delays] Some Howard-bound Red Lin...,2,3


## Append the new tweets to the existing tweets DataFrame

In [59]:
# Print the shapes of the saved and new frames (in that order) before combining them.
for frame in (saved_tweets_df, new_tweets_df):
    print(frame.shape)

(3234, 4)
(2184, 4)


In [79]:
# Stack the saved and new tweets, discarding both original indexes.
combined_tweets = pd.concat([saved_tweets_df, new_tweets_df], ignore_index=True)
# Order chronologically (oldest first) and renumber the rows from 0.
updated_tweets_df = combined_tweets.sort_values('created_at').reset_index(drop=True)
print(updated_tweets_df.shape)

(5418, 4)


## Write the Updated Dataframe of Tweets to File

In [80]:
# Persist the combined tweets, overwriting the same file that was read earlier in the notebook.
updated_tweets_df.to_feather('cta_tweets_sample.feather')