# Get Tweets of the @CTA Twitter handle

In [1]:
import tweepy
import os

# Twitter API credentials are read from environment variables so that no secret
# is stored in the notebook. os.environ[...] raises a KeyError naming the
# missing variable, rather than handing None to tweepy and failing later.
consumer_key = os.environ['twitter_CONSUMER_KEY']
consumer_secret = os.environ['twitter_CONSUMER_SECRET']
access_token = os.environ['twitter_ACCESS_TOKEN']
access_token_secret = os.environ['twitter_ACCESS_TOKEN_SECRET']

# Create the OAuth handler from the application's consumer key and secret
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
# Attach the saved access token and secret to the handler
auth.set_access_token(access_token, access_token_secret)

# Construct the authenticated API instance
api = tweepy.API(auth)

In [2]:
# Collect the raw JSON dictionary of every status the cursor yields from the
# @cta timeline.
tweets = []

# `screen_name` replaces the legacy `id` keyword, which newer Tweepy releases
# no longer accept; the handle is given without the leading "@".
for status in tweepy.Cursor(api.user_timeline, screen_name="cta").items():
    # A plain loop (not a comprehension) keeps the tweets gathered so far if
    # the download is interrupted part-way through.
    tweets.append(status._json)

In [3]:
# Number of tweets collected from the timeline
len(tweets)

3221

In [4]:
# Show the field names available on one raw tweet dictionary (the second one collected)
print(tweets[1].keys())

dict_keys(['created_at', 'id', 'id_str', 'text', 'truncated', 'entities', 'source', 'in_reply_to_status_id', 'in_reply_to_status_id_str', 'in_reply_to_user_id', 'in_reply_to_user_id_str', 'in_reply_to_screen_name', 'user', 'geo', 'coordinates', 'place', 'contributors', 'is_quote_status', 'retweet_count', 'favorite_count', 'favorited', 'retweeted', 'lang'])


In [6]:
# Inspect the last tweet collected. A negative index is used instead of a
# hard-coded position so the cell still works when the number of tweets changes.
tweets[-1]

{'created_at': 'Mon Aug 27 23:00:02 +0000 2018',
 'id': 1034213981299200000,
 'id_str': '1034213981299200000',
 'text': '@intoxicatdfumes Cars‘ AC is checked before going into service/thru day, but can fail during a trip; if a unit stop… https://t.co/8SAumTZtPI',
 'truncated': True,
 'entities': {'hashtags': [],
  'symbols': [],
  'user_mentions': [{'screen_name': 'intoxicatdfumes',
    'name': 'big daddy',
    'id': 1671731496,
    'id_str': '1671731496',
    'indices': [0, 16]}],
  'urls': [{'url': 'https://t.co/8SAumTZtPI',
    'expanded_url': 'https://twitter.com/i/web/status/1034213981299200000',
    'display_url': 'twitter.com/i/web/status/1…',
    'indices': [117, 140]}]},
 'source': '<a href="https://www.hootsuite.com" rel="nofollow">Hootsuite Inc.</a>',
 'in_reply_to_status_id': 1034204635228037121,
 'in_reply_to_status_id_str': '1034204635228037121',
 'in_reply_to_user_id': 1671731496,
 'in_reply_to_user_id_str': '1671731496',
 'in_reply_to_screen_name': 'intoxicatdfumes',
 '

In [24]:
import pandas as pd

# Build a DataFrame holding only the four fields used below, then parse the
# `created_at` strings into pandas datetimes.
new_tweets_df = pd.DataFrame(
    tweets,
    columns=['created_at', 'text', 'retweet_count', 'favorite_count'],
).assign(created_at=lambda frame: pd.to_datetime(frame['created_at']))

In [25]:
# Summarize column dtypes and non-null counts of the new tweets
new_tweets_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3221 entries, 0 to 3220
Data columns (total 4 columns):
created_at        3221 non-null datetime64[ns]
text              3221 non-null object
retweet_count     3221 non-null int64
favorite_count    3221 non-null int64
dtypes: datetime64[ns](1), int64(2), object(1)
memory usage: 100.7+ KB


In [26]:
# Preview the first rows of the new tweets
new_tweets_df.head()

Unnamed: 0,created_at,text,retweet_count,favorite_count
0,2018-11-16 02:41:07,[Significant Delays] Red Line trains are opera...,1,2
1,2018-11-16 02:36:35,[Significant Delays] Red Line trains are stand...,1,3
2,2018-11-16 02:24:52,[Significant Delays] Some 95th-bound Red Line ...,0,1
3,2018-11-16 02:15:54,[Significant Delays] 95th-bound Red Line trai...,0,1
4,2018-11-16 01:33:16,[Significant Delays] Howard-bound Red Line tra...,0,1


## Open Saved Tweets

Load the tweets saved to disk by earlier `tweepy` pulls.

In [27]:
# Load the previously saved tweets and record the newest timestamp among them
tweets_original = pd.read_feather('cta_tweets_sample.feather')
last_tweet = tweets_original['created_at'].max()

  return feather.read_dataframe(path, nthreads=nthreads)


## Clean the new tweets

Some of the tweets just pulled from the API may overlap with tweets already stored in the `tweets_original` DataFrame. We need to filter these out of `new_tweets_df` by keeping only tweets created after the most recent saved one.

In [28]:
# Keep only the tweets created after the newest tweet already on file.
# The filter is applied to `new_tweets_df` itself; the notebook never defines
# a `tweets_df`, so referencing it fails on a fresh kernel.
print(new_tweets_df.shape) # before subsetting
new_tweets_df = new_tweets_df[new_tweets_df.created_at > last_tweet]
print(new_tweets_df.shape) # after subsetting
new_tweets_df.created_at.describe()

(3221, 4)
(2078, 4)


count                    2078
unique                   2077
top       2018-09-28 22:54:15
freq                        2
first     2018-09-22 08:00:12
last      2018-11-16 02:41:07
Name: created_at, dtype: object

In [34]:
# Preview the first ten rows of the filtered tweets
new_tweets_df.head(10)

Unnamed: 0,created_at,text,retweet_count,favorite_count
0,2018-11-16 02:41:07,[Significant Delays] Red Line trains are opera...,1,2
1,2018-11-16 02:36:35,[Significant Delays] Red Line trains are stand...,1,3
2,2018-11-16 02:24:52,[Significant Delays] Some 95th-bound Red Line ...,0,1
3,2018-11-16 02:15:54,[Significant Delays] 95th-bound Red Line trai...,0,1
4,2018-11-16 01:33:16,[Significant Delays] Howard-bound Red Line tra...,0,1
5,2018-11-16 01:27:47,[Significant Delays] Howard-bound Red Line tra...,1,3
6,2018-11-16 00:19:33,[Significant Delays] Forest Park-bound Blue Li...,2,2
7,2018-11-16 00:17:09,[Significant Delays] Forest Park-bound Blue Li...,2,2
8,2018-11-16 00:04:36,[Significant Delays] Howard-bound Red Line tra...,1,2
9,2018-11-15 23:56:19,[Significant Delays] Howard-bound Red Line tra...,3,6


## Append the new tweets to the existing tweets DataFrame

## Write to file

In [79]:
# Append the new tweets to the previously saved ones before writing, so the
# file keeps its history. `tweets_df` is built here because the notebook does
# not define it anywhere else.
# NOTE(review): assumes `tweets_original` has the same columns as
# `new_tweets_df` -- confirm against the saved file.
# ignore_index=True gives the combined frame a fresh 0..n-1 index.
tweets_df = pd.concat([tweets_original, new_tweets_df], ignore_index=True)
tweets_df.to_feather('cta_tweets_sample.feather')