# Data Access

This notebook explores the Twitter API using Tweepy and pulls the timeline tweets of 10 selected Twitter accounts. At the end of the notebook, the collected data is saved to a CSV file for use in subsequent notebooks.

In [1]:
#access keys removed after running

import os
import tweepy as tw
import pandas as pd

# Credentials are read from environment variables so real keys never have to be
# typed into (and later scrubbed out of) the notebook. An unset variable falls
# back to '', which matches the blank placeholders this cell used to carry.
consumer_key = os.environ.get('TWITTER_CONSUMER_KEY', '')
consumer_key_secret = os.environ.get('TWITTER_CONSUMER_KEY_SECRET', '')
access_token = os.environ.get('TWITTER_ACCESS_TOKEN', '')
access_token_secret = os.environ.get('TWITTER_ACCESS_TOKEN_SECRET', '')

# Not passed to the OAuth handler below; kept next to the other keys.
Bearer_token = os.environ.get('TWITTER_BEARER_TOKEN', '')

# Authenticate with the consumer key pair plus the user access token pair.
auth = tw.OAuthHandler(consumer_key, consumer_key_secret)
auth.set_access_token(access_token, access_token_secret)

# `api` is the shared client used by the cells below.
# NOTE(review): wait_on_rate_limit=True is expected to make Tweepy wait out
# rate limits instead of raising - confirm against the installed Tweepy version.
api = tw.API(auth, wait_on_rate_limit=True)

In [57]:
# Accounts were chosen by cross-referencing https://www.music-map.com/ and Spotify
# to find related artists. Every account is verified and has at least 5k tweets.
screen_names = [
    'awonderland',
    'wearegalantis',
    'sanholobeats',
    'theknocks',
    'pekingduk',
    'porterrobinson',
    'WhatSoNot',
    'LouisTheChild',
    'NGHTMRE',
    'JaiWolfx',
]

In [62]:
#https://gist.github.com/yanofsky/5436496 referenced for get_timeline_tweets function
#https://gist.github.com/jaymcgrath/367c521f1dd786bc5a05ec3eeeb1cb04 used to view list of tweepy status object attributes

def get_timeline_tweets(screen_name):
    """Download every timeline tweet the API will return for one account.

    The timeline is paged from newest to oldest, 200 tweets per request. After
    each page, ``max_id`` is set to one less than the id of the oldest tweet
    received, so the next request continues strictly below it. Paging stops at
    the first empty page.

    NOTE(review): the Twitter timeline endpoint is documented to expose only
    the most recent ~3,200 tweets of an account, so this is not necessarily the
    account's full history - confirm against the API reference.

    Parameters
    ----------
    screen_name : str
        Twitter handle (without the leading ``@``) of the account to pull.

    Returns
    -------
    pandas.DataFrame
        One row per tweet with the columns ``screen_name``, ``reply_to``,
        ``is_quote_status``, ``text``, ``created_at``, ``favorites``,
        ``retweets``, ``follower_count``, ``entities``, ``is_truncated`` and
        ``statuses_count``. If the account has no retrievable tweets, the frame
        is empty but still has these columns.
    """
    tweets = []
    # max_id is added only after the first page, so the first request starts at
    # the newest tweet.
    request_args = {'screen_name': screen_name, 'count': 200}

    while True:
        page = api.user_timeline(**request_args)
        if not page:
            # An empty page means the timeline is exhausted, or was empty from
            # the start; either way there is no "last tweet" to index into.
            break
        tweets.extend(page)
        # Subtract 1 so the oldest tweet of this page is not returned again.
        request_args['max_id'] = page[-1].id - 1

    # Flatten each status object into a plain row. `text` can be cut short
    # when `truncated` is True, which is why that flag is kept as a column.
    tweet_data = [[tweet.user.screen_name, tweet.in_reply_to_screen_name, tweet.is_quote_status, tweet.text,
                   tweet.created_at, tweet.favorite_count, tweet.retweet_count, tweet.author.followers_count,
                   tweet.entities, tweet.truncated, tweet.author.statuses_count] for tweet in tweets]

    columns = ['screen_name', 'reply_to', 'is_quote_status', 'text', 'created_at', 'favorites', 'retweets',
               'follower_count', 'entities', 'is_truncated', 'statuses_count']

    return pd.DataFrame(data=tweet_data, columns=columns)

In [81]:
# Pull each account's timeline, then stitch the frames together in one step.
# DataFrame.append was removed in pandas 2.0 and re-copied the growing frame on
# every iteration; a single pd.concat does the same job once.
timeline_frames = [get_timeline_tweets(screen_name) for screen_name in screen_names]

# The per-account row labels are kept on purpose (no ignore_index), so index
# labels repeat across accounts exactly as they did with append.
# pd.concat raises on an empty list, so fall back to an empty frame in that case.
tweet_data = pd.concat(timeline_frames) if timeline_frames else pd.DataFrame()

In [84]:
# Summary statistics for the numeric columns of the combined frame.
tweet_data.describe()

Unnamed: 0,favorites,retweets,follower_count,statuses_count
count,31680.0,31680.0,31680.0,31680.0
mean,325.249937,503.165467,191415.243813,20866.750789
std,1376.256152,9623.661166,188751.388703,11501.246595
min,0.0,0.0,20066.0,7299.0
25%,0.0,0.0,59551.0,11829.0
50%,9.0,3.0,95724.0,19498.0
75%,112.0,26.0,254074.0,26470.0
max,61951.0,762868.0,678602.0,44948.0


In [85]:
# Peek at the first rows to sanity-check the extracted columns.
tweet_data.head()

Unnamed: 0,screen_name,reply_to,is_quote_status,text,created_at,favorites,retweets,follower_count,entities,is_truncated,statuses_count
0,awonderland,DannyjClayton,False,@DannyjClayton Bet,2021-02-01 11:16:51,0,0,326241,"{'hashtags': [], 'symbols': [], 'user_mentions...",False,44948
1,awonderland,i_am_jonjon,False,@i_am_jonjon @dmuoasl @daddydactyl @JustJasmin...,2021-02-01 06:40:46,5,0,326241,"{'hashtags': [], 'symbols': [], 'user_mentions...",False,44948
2,awonderland,awonderland,False,@dmuoasl @daddydactyl @JustJasminexxx @flipsic...,2021-02-01 06:30:36,10,1,326241,"{'hashtags': [], 'symbols': [], 'user_mentions...",False,44948
3,awonderland,dmuoasl,False,@dmuoasl @daddydactyl @JustJasminexxx @flipsic...,2021-02-01 06:27:03,7,1,326241,"{'hashtags': [], 'symbols': [], 'user_mentions...",False,44948
4,awonderland,JustJasminexxx,False,@JustJasminexxx @flipsickle Okay,2021-02-01 06:26:47,4,1,326241,"{'hashtags': [], 'symbols': [], 'user_mentions...",False,44948


In [86]:
# Column dtypes and non-null counts for the combined frame.
tweet_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 31680 entries, 0 to 3111
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   screen_name      31680 non-null  object        
 1   reply_to         11167 non-null  object        
 2   is_quote_status  31680 non-null  bool          
 3   text             31680 non-null  object        
 4   created_at       31680 non-null  datetime64[ns]
 5   favorites        31680 non-null  int64         
 6   retweets         31680 non-null  int64         
 7   follower_count   31680 non-null  int64         
 8   entities         31680 non-null  object        
 9   is_truncated     31680 non-null  bool          
 10  statuses_count   31680 non-null  int64         
dtypes: bool(2), datetime64[ns](1), int64(4), object(4)
memory usage: 2.5+ MB


Exploring the is_truncated attribute.

In [88]:
# is_truncated is a boolean column, so it can be used directly as the row mask.
tweet_data[tweet_data['is_truncated']].head()

Unnamed: 0,screen_name,reply_to,is_quote_status,text,created_at,favorites,retweets,follower_count,entities,is_truncated,statuses_count
204,awonderland,DevonESawa,False,@DevonESawa @NinjaAssassin75 I believe vanilla...,2021-01-26 05:38:46,13,0,326241,"{'hashtags': [], 'symbols': [], 'user_mentions...",True,44948
243,awonderland,,False,I’ve always appreciated every time I’ve steppe...,2021-01-22 22:52:37,2063,180,326241,"{'hashtags': [], 'symbols': [], 'user_mentions...",True,44948
368,awonderland,,False,yeh sex is great but \n\nhave you ever heard a...,2021-01-11 00:38:48,14330,2036,326241,"{'hashtags': [], 'symbols': [], 'user_mentions...",True,44948
431,awonderland,,False,Due to the increase of Covid in LA we made the...,2021-01-05 23:07:10,983,76,326241,"{'hashtags': [], 'symbols': [], 'user_mentions...",True,44948
441,awonderland,ohamberlinaaaa,False,@ohamberlinaaaa @KayzoMusic @SaidTheSky @iamda...,2021-01-05 02:54:39,13,0,326241,"{'hashtags': [], 'symbols': [], 'user_mentions...",True,44948


In [91]:
# Full stored text of the tweet at position 243, one of the rows flagged as truncated.
tweet_data['text'].iloc[243]

'I’ve always appreciated every time I’ve stepped on stage to play a show but I will never ever take anything for gra… https://t.co/IvQhYXwzlW'

In [101]:
# Hashtag list stored in the entities dict of the tweet at position 240.
# NOTE(review): the previous cell inspects position 243, this one 240 - confirm that is intended.
tweet_data['entities'].iloc[240]['hashtags']

[]

In [116]:
# Persist the combined frame for the follow-up notebooks. The index is written too,
# and its labels restart for every account, so they are not unique row ids.
tweet_data.to_csv('./tweet_data_Feb1.csv')