# LOAD LIBRARY

In [1]:
!pip install twitter

Collecting twitter
[?25l  Downloading https://files.pythonhosted.org/packages/85/e2/f602e3f584503f03e0389491b251464f8ecfe2596ac86e6b9068fe7419d3/twitter-1.18.0-py2.py3-none-any.whl (54kB)
[K     |██████                          | 10kB 19.2MB/s eta 0:00:01[K     |████████████                    | 20kB 4.4MB/s eta 0:00:01[K     |██████████████████              | 30kB 5.4MB/s eta 0:00:01[K     |████████████████████████        | 40kB 3.8MB/s eta 0:00:01[K     |██████████████████████████████▏ | 51kB 4.6MB/s eta 0:00:01[K     |████████████████████████████████| 61kB 5.0MB/s 
[?25hInstalling collected packages: twitter
Successfully installed twitter-1.18.0


In [0]:
import json
import time
import os
import re
import pandas as pd
from datetime import datetime, timedelta
from twitter import Twitter, OAuth, TwitterHTTPError, TwitterStream
from time import mktime

In [0]:
# SET KEYWORDS
# Search terms joined with ' OR '. This one string is passed to search() as the query and is
# also split on ' OR ' by extract_field() to label each tweet, so keep that exact separator.
keyword_list = 'earthquake OR flood OR fire OR tsunami OR typhoon OR storm OR tornado'

# TWITTER API

Reference for the search endpoint used below (standard search API, `GET search/tweets`): https://developer.twitter.com/en/docs/tweets/search/api-reference/get-search-tweets.html

In [0]:
# Credentials are read from environment variables so real tokens never have to be saved
# inside the notebook. When a variable is not set, the original placeholder is kept, so the
# value can still be filled in by hand for a quick local run.
ACCESS_TOKEN = os.environ.get('TWITTER_ACCESS_TOKEN', 'Enter your token here')
ACCESS_SECRET = os.environ.get('TWITTER_ACCESS_SECRET', 'Enter your token here')
CONSUMER_KEY = os.environ.get('TWITTER_CONSUMER_KEY', 'Enter your token here')
CONSUMER_SECRET = os.environ.get('TWITTER_CONSUMER_SECRET', 'Enter your token here')

# DEFINE METHOD

## TWITTER AUTHENTICATION

In [0]:
def authen(ACCESS_TOKEN, ACCESS_SECRET, CONSUMER_KEY, CONSUMER_SECRET):
    """Create a Twitter API client from the four OAuth credentials.

    The credentials are wrapped in an ``OAuth`` object, which is handed to
    ``Twitter`` as its ``auth`` argument. The resulting client is returned.
    """
    credentials = OAuth(ACCESS_TOKEN, ACCESS_SECRET, CONSUMER_KEY, CONSUMER_SECRET)
    return Twitter(auth=credentials)

## SEARCH

In [0]:
def search(min_id, keyword_list):
    """Fetch one page of up to 100 recent tweets matching ``keyword_list``.

    Uses the module-level ``twitter`` client, which must exist before calling.

    Parameters:
        min_id: smallest tweet id collected so far, or -1 when nothing has
            been fetched yet. For any other value the request is sent with
            ``max_id = min_id - 1`` so the page continues below that id.
        keyword_list: query string passed as ``q``.

    Returns:
        The ``'statuses'`` entry of the API response.
    """
    query = {
        'q': keyword_list,
        'count': 100,
        'result_type': 'recent',
        'tweet_mode': 'extended',
    }
    # -1 is the "first page" sentinel: only later pages carry an upper id bound.
    if min_id != -1:
        query['max_id'] = min_id - 1

    response = twitter.search.tweets(**query)
    return response['statuses']

## REPORT

In [0]:
def report(status_list, min_id):
    """Print a short summary of one page of tweets and return the new lower id bound.

    Parameters:
        status_list: list of tweet dicts; only the ``'id'`` key is read.
        min_id: the previous lower bound, returned unchanged when the page is empty.

    Returns:
        The smallest ``'id'`` in ``status_list``, or ``min_id`` if the list is empty.
    """
    this_count = len(status_list)
    print('Tweet Count: ', this_count)

    if this_count > 0:
        tweet_id_list = [x['id'] for x in status_list]
        min_id = min(tweet_id_list)
        # The per-tweet timestamp parsing that used to live here only fed a commented-out
        # print. It was dropped because it could raise (missing key, locale-dependent
        # %a/%b parsing) without contributing anything to the result.
        print('\tID Min:', min_id, '\t|\tID Max:', max(tweet_id_list))

    return min_id

## EXTRACT FIELD & DATA

In [0]:
# Pattern for the "RT @<screen_name>: " prefix at the start of a retweet's text.
reg = r'RT @\w+: '


def _tweet_text(payload):
    """Return the tweet body, preferring 'full_text' and falling back to 'text'."""
    # The searches in this notebook request tweet_mode='extended', which yields 'full_text'.
    # The fallback keeps payloads that only carry 'text' usable instead of raising KeyError.
    if 'full_text' in payload:
        return payload['full_text']
    return payload['text']


def extract_field(status, keyword_list, created_at = None):
    """Flatten one tweet into rows, one row per keyword found in its text.

    Parameters:
        status: tweet dict. Reads 'id', 'full_text' (or 'text'), 'user',
            'retweet_count', 'favorite_count', 'lang' and, when present,
            'retweeted_status', 'geo' and 'place'.
        keyword_list: keywords joined by ' OR '; each one is matched as a
            case-insensitive substring of the tweet text.
        created_at: timestamp to store in the rows. When None, the tweet's own
            'created_at' is used.

    Returns:
        (rows, created_at). Each row is a 20-item list in the column order used by
        extract_data(), ending with the matched keyword. ``rows`` is empty when no
        keyword occurs in the text. ``created_at`` is the value stored in the rows.
    """
    if created_at is None:
        created_at = status['created_at']

    twitter_id = str(status['id'])

    text = _tweet_text(status)
    if 'retweeted_status' in status:
        # For retweets, use the original tweet's text and keep the "RT @user: " prefix
        # when the retweet's text starts with one.
        original_text = _tweet_text(status['retweeted_status'])
        prefix = re.match(reg, text)
        text = original_text if prefix is None else prefix.group(0) + original_text

    user = status['user']

    latitude = None
    longitude = None
    geo = status.get('geo')
    if geo is not None:
        latitude = geo['coordinates'][0]
        longitude = geo['coordinates'][1]

    place_type = None
    place_name = None
    place_full_name = None
    country_code = None
    country = None
    place = status.get('place')
    if place is not None:
        place_type = place['place_type']
        place_name = place['name']
        place_full_name = place['full_name']
        country_code = place['country_code']
        country = place['country']

    # Everything except the trailing disaster_type is identical for every matched keyword,
    # so it is built once outside the keyword loop.
    base_row = [
        created_at,
        twitter_id,
        text,

        user['id'],
        user['name'],
        user['screen_name'],
        user['followers_count'],
        user['friends_count'],
        user['profile_image_url'],

        status['retweet_count'],
        status['favorite_count'],

        status['lang'],

        latitude,
        longitude,
        place_type,
        place_name,
        place_full_name,
        country_code,
        country,
    ]

    lowered_text = text.lower()
    this_data_list = [
        base_row + [keyword]
        for keyword in keyword_list.split(' OR ')
        if keyword.lower() in lowered_text
    ]
    return this_data_list, created_at
    

def extract_data(status_list, keyword_list):
    """Turn a page of tweets into a DataFrame with one row per (tweet, matched keyword).

    Each status is passed through extract_field(). When a status has a non-null
    'quoted_status', the quoted tweet is extracted as well and its rows are stamped
    with the quoting tweet's ``created_at``.

    Parameters:
        status_list: list of tweet dicts.
        keyword_list: keywords joined by ' OR ', forwarded to extract_field().

    Returns:
        DataFrame with the 20 extracted columns plus 'date'. 'created_at' holds naive
        datetimes carrying the wall-clock values from the '+0000' timestamps, and 'date'
        is its calendar date. An input that yields no rows gives an empty frame with the
        same columns.
    """
    data_list = []

    for status in status_list:
        rows, created_at = extract_field(status, keyword_list)
        data_list.extend(rows)

        quoted = status.get('quoted_status')
        if quoted is not None:
            quoted_rows, _ = extract_field(quoted, keyword_list, created_at)
            data_list.extend(quoted_rows)

    df = pd.DataFrame(
        data_list,
        columns=[
            'created_at',
            'twitter_id',
            'text',

            'user_id',
            'user_name',
            'screen_name',
            'followers_count',
            'friends_count',
            'profile_image',

            'retweet_count',
            'favorite_count',

            'lang',

            'latitude',
            'longitude',
            'place_type',
            'place_name',
            'place_full_name',
            'country_code',
            'country',

            'disaster_type'
        ]
    )

    # Parse the timestamp fields directly instead of going through mktime()/fromtimestamp():
    # that round trip passes through the machine's local time zone and can shift values
    # around DST transitions. to_datetime() also returns a datetime column for an empty
    # frame, so the .dt accessor below keeps working when no tweet matched.
    df['created_at'] = pd.to_datetime(df['created_at'], format='%a %b %d %H:%M:%S +0000 %Y')
    df['date'] = df['created_at'].dt.date

    return df

## WRITE DATA

In [0]:
def write_data(df, status_list):
    """Append the extracted rows to per-day CSV files and the raw batch to twitter.json.

    Parameters:
        df: DataFrame with a 'date' column. Rows are grouped by that value and appended
            to '<date>.csv' in the current directory. The header row is written only
            when the file does not exist yet.
        status_list: the raw statuses of this batch. They are appended to 'twitter.json'
            as a single JSON array followed by a newline, so the file holds one batch
            per line.
    """
    for d in df['date'].unique():
        file_name = '{0}.csv'.format(d)
        header = not os.path.exists(file_name)
        df[df['date'] == d].to_csv(
            file_name,
            mode = 'a',
            header = header,
            index = False
        )

    # Written once per batch, outside the date loop. Inside the loop the same batch was
    # dumped once per distinct date and skipped entirely when df had no rows.
    with open('twitter.json', 'a') as f:
        json.dump(status_list, f)
        f.write('\n')

# RUN

In [0]:
# INITIAL PARAMETERS
# -1 tells search() that nothing has been fetched yet, so the first request has no max_id.
min_id = -1

## SET KEYWORDS
# keyword_list is taken from the configuration cell near the top of the notebook.

# AUTHENTICATION
# search() reads this module-level `twitter` client, so it has to exist before the loop runs.
twitter = authen(ACCESS_TOKEN, ACCESS_SECRET, CONSUMER_KEY, CONSUMER_SECRET)

# Page through the search results until a request returns no statuses.
while True:
    status_list = search(min_id, keyword_list)
    if not status_list:
        break

    # report() prints the page summary and returns its smallest id, which bounds the next request.
    min_id = report(status_list, min_id)

    df = extract_data(status_list, keyword_list)
    write_data(df, status_list)

    # Pause between requests.
    time.sleep(5)

Tweet Count:  100
	ID Min: 1171828218946609157 	|	ID Max: 1171828278660870150
Tweet Count:  100
	ID Min: 1171828164030599168 	|	ID Max: 1171828218879520768
Tweet Count:  100
	ID Min: 1171828108946751495 	|	ID Max: 1171828163707637761
Tweet Count:  100
	ID Min: 1171828053833437184 	|	ID Max: 1171828108028047360
Tweet Count:  100
	ID Min: 1171828002210091008 	|	ID Max: 1171828053611335681
Tweet Count:  100
	ID Min: 1171827952985788417 	|	ID Max: 1171828002122022912
Tweet Count:  100
	ID Min: 1171827881921540096 	|	ID Max: 1171827952365051904
Tweet Count:  100
	ID Min: 1171827824753332225 	|	ID Max: 1171827880252366850


KeyboardInterrupt: ignored

In [0]:
!ls -al