In [8]:
import time
import tweepy
import pandas as pd
import json
import yaml
import pprint

ModuleNotFoundError: No module named 'yaml'

In [7]:
!pip install pyyaml

Collecting pyyaml
  Downloading PyYAML-6.0-cp310-cp310-macosx_10_9_x86_64.whl (197 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m197.6/197.6 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m MB/s[0m eta [36m0:00:01[0m
[?25hInstalling collected packages: pyyaml
Successfully installed pyyaml-6.0


## Twitter API Example

- Interact with [Twitter API](https://developer.twitter.com/en/docs.html). The main endpoints return tweets, users and followers. 
- It is necessary to generate an API key to obtain access to the endpoints.
- You need to sign in on [Twitter](https://twitter.com) and then follow these steps: https://developer.twitter.com/en/docs/basics/authentication/guides/access-tokens.html

API keys and tokens are needed to initialize the Python Twitter wrapper ([tweepy](https://tweepy.readthedocs.io/en/3.7.0/api.html)). It is a best practice to store the keys in a separate **configuration file** that should be kept secret and not shared (e.g., on GitHub).

In [2]:
# Parse the local YAML configuration file and keep only its
# "twitter_credentials" section.
# NOTE(review): if this file only ever holds a plain mapping of strings,
# yaml.safe_load would be the more restrictive loader — confirm before switching.
with open(r'./twitter_API.yaml') as config_file:
    config = yaml.full_load(config_file)
cred = config["twitter_credentials"]

In [3]:
# Pull the four credential values out of the loaded configuration;
# a missing entry raises KeyError here rather than failing later during authentication.
consumer_key, consumer_secret = cred['consumer_key'], cred['consumer_secret']
access_token, access_token_secret = cred['access_token'], cred['access_token_secret']

In [4]:
# Build the OAuth handler from the consumer key/secret, then attach the
# access token pair to it.
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
# API client used for every request made in the rest of the notebook.
twitter = tweepy.API(auth)

1) Get the last N_MAX tweets of a specific user and store them in a pandas DataFrame.

In [5]:
def save_tweet(data):
    """Flatten a tweet object into a plain dict of the fields kept for analysis.

    Parameters
    ----------
    data : object
        Tweet-like object exposing ``id``, ``user.screen_name``,
        ``coordinates``, ``place``, ``full_text``, ``created_at``,
        ``retweet_count``, ``favorite_count`` and ``lang``.

    Returns
    -------
    dict
        Always contains ``id_post``, ``username``, ``text``, ``timestamp``,
        ``retweets``, ``likes`` and ``lang``. ``lat``/``long`` are present only
        when the tweet has coordinates, and ``place`` only when it has a place.
    """
    # Work on the argument itself; the function must not depend on a global
    # variable that happens to be called `tweet` in the calling cell.
    tweet = data

    tw = {}
    tw['id_post'] = tweet.id
    tw['username'] = tweet.user.screen_name

    if tweet.coordinates is not None:
        # The pair is read as [longitude, latitude]: index 1 -> lat, index 0 -> long.
        coor = tweet.coordinates['coordinates']
        tw['lat'] = coor[1]
        tw['long'] = coor[0]

    if tweet.place is not None:
        tw['place'] = tweet.place.name

    tw['text'] = tweet.full_text
    tw['timestamp'] = tweet.created_at
    tw['retweets'] = tweet.retweet_count
    tw['likes'] = tweet.favorite_count
    tw['lang'] = tweet.lang

    return tw

In [6]:
N_MAX = 100
username = 'polimi'

TWEET_COLUMNS = ['id_post','username','lat', 'long', 'place','text','timestamp','retweets','likes','lang']

# Collect one dict per tweet and build the DataFrame once at the end.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and it
# copied the whole frame on every call (quadratic in the number of tweets).
tweet_rows = []
for tweet in tweepy.Cursor(twitter.user_timeline, screen_name=username, tweet_mode='extended').items(N_MAX):
    tweet_rows.append(save_tweet(tweet))

# `columns=` fixes the column order and fills fields missing from a record
# (e.g. lat/long/place) with NaN.
tweets_df = pd.DataFrame(tweet_rows, columns=TWEET_COLUMNS)

2) Retrieve user account information

In [7]:
# Look up the account by screen name and pretty-print the raw JSON payload
# attached to the returned user object.
u = twitter.get_user(screen_name=username)
profile_json = u._json
pprint.pprint(profile_json)

{'contributors_enabled': False,
 'created_at': 'Wed Feb 16 15:13:32 +0000 2011',
 'default_profile': False,
 'default_profile_image': False,
 'description': 'Since 1863. Technology, Creativity, Culture',
 'entities': {'description': {'urls': []},
              'url': {'urls': [{'display_url': 'polimi.it',
                                'expanded_url': 'http://www.polimi.it',
                                'indices': [0, 22],
                                'url': 'http://t.co/ogI6rYvBii'}]}},
 'favourites_count': 1387,
 'follow_request_sent': False,
 'followers_count': 38738,
 'following': False,
 'friends_count': 233,
 'geo_enabled': False,
 'has_extended_profile': False,
 'id': 253099487,
 'id_str': '253099487',
 'is_translation_enabled': False,
 'is_translator': False,
 'lang': None,
 'listed_count': 501,
 'location': 'Milano',
 'name': 'Politecnico Milano',
 'notifications': False,
 'profile_background_color': '00437A',
 'profile_background_image_url': 'http://abs.twimg.com/image

3) Save **follow** relationship. 

This is the most expensive operation, since the number of followers can be extremely large. For this reason, we need to define a function to handle **API rate limits**: after a certain number of requests, which depends on the resource, the API blocks further calls for **15 minutes** (more details [here](https://developer.twitter.com/en/docs/basics/rate-limits)).

In [8]:
def limit_handled(cursor, wait_seconds=15 * 60):
    """Yield items from ``cursor``, pausing and retrying when the rate limit is hit.

    Parameters
    ----------
    cursor : iterator
        Any iterator (e.g. the result of ``tweepy.Cursor(...).items()``);
        it is advanced with the built-in ``next()``.
    wait_seconds : int or float, optional
        How long to sleep after a ``tweepy.RateLimitError`` before retrying.
        Defaults to 15 minutes.

    Yields
    ------
    object
        Each item produced by ``cursor``, in order, until it is exhausted.
    """
    while True:
        try:
            item = next(cursor)
        except StopIteration:
            # Cursor exhausted: end the generator cleanly.
            return
        # NOTE(review): tweepy 4.x appears to have renamed this exception to
        # TooManyRequests — confirm against the installed tweepy version.
        except tweepy.RateLimitError:
            print ('API Rate Limit exceeded. Waiting...')

            # wait for the rate-limit window to reset, then retry the same fetch
            time.sleep(wait_seconds)
            continue
        # Yield outside the try block so only the fetch itself is guarded.
        yield item

In [9]:
id_user = u.id

# One record per follower id, built into a DataFrame in a single step.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and it
# copied the whole frame on every iteration.
follow_rows = [
    {'id_following': follower, 'id_followed': id_user}
    for follower in limit_handled(tweepy.Cursor(twitter.followers_ids, screen_name=username).items(200))
]

# `columns=` keeps the expected schema even when no follower was returned.
follow = pd.DataFrame(follow_rows, columns=['id_following', 'id_followed'])

In [10]:
# Preview the first rows of the follower edge list.
follow.head(n=5)

Unnamed: 0,id_following,id_followed
0,372084104,253099487
1,1238120666949353473,253099487
2,1397890657633681412,253099487
3,1442838293649952770,253099487
4,430882302,253099487


In [11]:
# Cast both id columns to 64-bit integers in a single call, then preview the result.
follow = follow.astype({'id_following': 'int64', 'id_followed': 'int64'})
follow.head()

Unnamed: 0,id_following,id_followed
0,372084104,253099487
1,1238120666949353473,253099487
2,1397890657633681412,253099487
3,1442838293649952770,253099487
4,430882302,253099487


In [12]:
# Save the follower edge list to CSV without the row index.
# `index` is documented as a bool; the previous `index=None` only worked
# because None is falsy.
follow.to_csv("./followers.csv", index=False)