# import packages

In [2]:
import tweepy
import yaml
import json
import sqlite3
from datetime import datetime
import pandas as pd

# Read twitter Authentication Keys

In [3]:
# yaml file reader funtion
def read_yaml(file_path):
    with open(file_path, "r") as f:
        return yaml.safe_load(f)

# yaml config file path
file_path = "twitter_api_key_config.yaml"
# read from config file
api_credential = read_yaml(file_path)

# Create Twitter Authentication

In [4]:
# API authentication
auth = tweepy.OAuthHandler(api_credential["api_key"], \
                           api_credential["api_secret_token"])
auth.set_access_token(api_credential["access_token"], \
                      api_credential["access_token_secret"])
api = tweepy.API(auth, wait_on_rate_limit=True)

# Create the Database and Required Tables

In [71]:
# establish a database connection
conn = sqlite3.connect('tbt_table.db')
cur = conn.cursor()
create_tweet_info_table = """CREATE TABLE tweet_info(tweet_id BIGINT PRIMARY KEY, \
                                          user_id BIGINT, \
                                          tweet_lang TEXT, \
                                          tweet_time TEXT, \
                                          source TEXT, \
                                          tweet_text TEXT,\
                                          quote_count TEXT, \
                                          reply_count INT, \
                                          retweet_count INT,\
                                          tweet_favorite_count INT, \
                                          hashtags TEXT, \
                                          short_urls TEXT, \
                                          expanded_urls TEXT, \
                                          user_mentions TEXT, \
                                          id_str TEXT, \
                                          truncated INT, \
                                          in_reply_to_status_id BIGINT, \
                                          in_reply_to_status_id_str TEXT, \
                                          in_reply_to_user_id BIGINT, \
                                          in_reply_to_user_id_str TEXT, \
                                          in_reply_to_screen_name TEXT, \
                                          coordinates TEXT, \
                                          place TEXT
                                          );"""

create_user_info_table =  """CREATE TABLE user_info(user_id BIGINT PRIMARY KEY, \
                                          user_screen_name TEXT, \
                                          user_name TEXT, \
                                          user_language TEXT, \
                                          location TEXT, \
                                          profile_url TEXT, \
                                          description TEXT, \
                                          protected TEXT, \
                                          verified TEXT, \
                                          created_at TEXT, \
                                          friends_count BIGINT, \
                                          followers_count BIGINT,\
                                          favorites_count BIGINT, \
                                          statuses_count BIGINT, \
                                          id_str TEXT, \
                                          url TEXT);"""

cur.execute(create_tweet_info_table)
cur.execute(create_user_info_table)
conn.close()

# check if a table exists in the database

In [72]:
conn = sqlite3.connect('tbt_table.db')
cur = conn.cursor()

table_exist_query = ''' SELECT count(*) FROM sqlite_master WHERE type='table' AND name='tweet_info' '''
cur.execute(table_exist_query)
exist_result = cur.fetchone()

if exist_result[0]==1:
    print("tweet_info table exists.")
else:
    print("tweet_info table does not exist.")
conn.close()

tweet_info table exists.


# define a StreamListener Object

In [73]:
# override tweepy.StreamListener to add logic to on_status
class MyStreamListener(tweepy.StreamListener):
    def __init__(self, listen_time=60):
        super(MyStreamListener, self).__init__()
        self.counter = 0
        print("Initialized Tweepy StreamListener.")
        self.start_time = datetime.now()
        self.current_time = datetime.now()
        self.listen_time = listen_time
        self.unique_user_id_set = set([])
        # adding database connection code
        self.conn = sqlite3.connect('tbt_table.db')
        self.cur  = self.conn.cursor()
        
    def insert_data(self, data):
        tweet_object=json.loads(data) # convert "string-line" into json
        # check if json object has a key id. Otherwise continue to next.
        if 'id' in tweet_object.keys(): 
            
            ##### Stuff I changed ######
            
            # tweet object information
            tweet_id        = tweet_object['id']                                #14
            user_id         = tweet_object['user']['id']                        #
            tweet_lang      = tweet_object['lang']                              #
            tweet_time      = str(pd.to_datetime(tweet_object['created_at']))   # created_at duplicate
            source          = tweet_object['source']                            #
            tweet_text      = tweet_object['text']                              #

            # tweet numeric information
            quote_count = tweet_object['quote_count']                           #
            reply_count = tweet_object['reply_count']                           #
            retweet_count = tweet_object['retweet_count']                       #
            tweet_favorite_count = tweet_object['favorite_count']               #

            # meta-content information
            hashtags = [str(hashtag['text']) for hashtag in tweet_object['entities']['hashtags']]  #
            hashtags = ",".join(hashtags)
            short_urls = [str(url['url']) for url in tweet_object['entities']['urls']]             #
            short_urls = ",".join(short_urls)
            expanded_urls = []                                                                     #
            try:
                expanded_urls = [str(url['expanded_url']) for url in tweet_object['entities']['urls']]
            except:
                print('Error Message: No Expanded URL.')
            expanded_urls = ",".join(expanded_urls)

            # user interaction based informations    
            user_mentions = [str(user_mentions['id'])\
                        for user_mentions in tweet_object['entities']['user_mentions']]
            user_mentions = ",".join(user_mentions)                                                 #


            # New Parameters
            # used try/except to ensure that program does not fail 

            # id_str
            try:
                id_str = tweet_object['id_str'] 
            except:
                id_str = None

            # truncated 
            try:
                truncated = tweet_object['truncated'] 
            except:
                truncated = None

            # in_reply_to_status_id 
            try:
                in_reply_to_status_id = tweet_object['in_reply_to_status_id'] 
            except:
                in_reply_to_status_id = None

            # in_reply_to_status_id_str 
            try:
                in_reply_to_status_id_str = tweet_object['in_reply_to_status_id_str'] 
            except:
                in_reply_to_status_id_str = None

            # in_reply_to_user_id  
            try:
                in_reply_to_user_id  = tweet_object['in_reply_to_user_id'] 
            except:
                in_reply_to_user_id  = None

            # in_reply_to_user_id_str 
            try:
                in_reply_to_user_id_str = tweet_object['in_reply_to_user_id_str'] 
            except:
                in_reply_to_user_id_str = None
            
            # in_reply_to_screen_name 
            try:
                in_reply_to_screen_name = tweet_object['in_reply_to_screen_name'] 
            except:
                in_reply_to_screen_name = None

            # coordinates 
            try:
                coordinates = tweet_object['coordinates'] 
            except:
                coordinates = None

            # place 
            try:
                place = tweet_object['place'] 
            except:
                place = None


            tweet_info = (tweet_id, user_id, tweet_lang,\
                    tweet_time, source, tweet_text,\
                    quote_count, reply_count, retweet_count,\
                    tweet_favorite_count, hashtags, short_urls,\
                    expanded_urls, user_mentions, id_str, truncated,\
                    in_reply_to_status_id, in_reply_to_status_id_str,\
                    in_reply_to_user_id, in_reply_to_user_id_str,\
                    in_reply_to_screen_name, coordinates, place)
            self.cur.execute("INSERT INTO tweet_info \
            VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?);", tweet_info)

            ########## End of my changes #############
            
            # user profile information

            if user_id in self.unique_user_id_set:
                pass
            else:
                self.unique_user_id_set.add(user_id)
                
                user_screen_name      = tweet_object['user']['screen_name']
                user_name             = tweet_object['user']['name']
                user_language         = tweet_object['user']['lang']  
                location              = tweet_object['user']['location']
                profile_url           = tweet_object['user']['url']
                description           = tweet_object['user']['description']
                protected             = tweet_object['user']['protected']
                verified              = tweet_object['user']['verified']
                created_at            = str(pd.to_datetime(tweet_object['user']['created_at']))
                friends_count         = tweet_object['user']['friends_count']
                followers_count       = tweet_object['user']['followers_count']
                favorites_count       = tweet_object['user']['favourites_count']
                statuses_count        = tweet_object['user']['statuses_count']
                
                try:
                    id_str = tweet_object['user']['id_str'] 
                except Exception as e:
                    print('Exception: id_str')
                    print(e)
                    id_str = None
                    
                try:
                    url = tweet_object['user']['url'] 
                except Exception as e:
                    print('Exception: url')
                    print(e)
                    url = None
                    
                    
                
                    
            

                user_information = (user_id, user_screen_name, user_name,\
                       user_language, location, profile_url,\
                       description, protected, verified, created_at,\
                       friends_count, followers_count,\
                       favorites_count, statuses_count, id_str, url)
                self.cur.execute(" INSERT INTO user_info VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?);", user_information)
            
            self.conn.commit()
        
    def on_data(self, data):
        self.current_time = datetime.now()
        time_elapsed = (self.current_time - self.start_time).total_seconds()
        if time_elapsed < self.listen_time:
            try:
                self.counter += 1
                """
                Changing the code here.
                Previously, we saved to Text file.
                Now, we will pass this to the Database insertor method.
                """
                # -- self.output_file.write(str(data))
                print(f"Tweet Processed: {self.counter}\n")
                self.insert_data(str(data))
                
            except Exception as e:
                print(f"On data Exception:{e}.")
        else:
            print(f"Stream listen time period ended. Total listen time: {self.listen_time} seconds.\n\n")
            print(f"Total Tweet processed: {self.counter}")
            self.conn.close()
            return False

    # handling Errors
    def on_error(self, status_code):
        print(f"status_code: {status_code}")
        if status_code == 420:
            #returning False in on_error disconnects the stream
            return False

# create a stream

In [74]:
myStreamListener = MyStreamListener(listen_time=90000)
myStream = tweepy.Stream(api.auth, myStreamListener)

Initialized Tweepy StreamListener.


# start the streamer

In [75]:
keywords = ['#tbt']
try:
    print("Stream Filter")
    myStream.filter(track=keywords)
    print("DONE")
except Exception as e:
    print(f"error in stream filter {e}")

Stream Filter
Tweet Processed: 1

Tweet Processed: 2

Tweet Processed: 3

Tweet Processed: 4

Tweet Processed: 5

Tweet Processed: 6

Tweet Processed: 7

Tweet Processed: 8

Tweet Processed: 9

Tweet Processed: 10

On data Exception:Error binding parameter 22 - probably unsupported type..
Tweet Processed: 11



KeyboardInterrupt: 

(1438344497218326532, 192832526, 'en', '2021-09-16 03:30:25+00:00', '<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>', 'Its called PRIVATE LIFE for a reason #thursdaymorning #Tbt https://t.co/17uTnZCSbY', '0', 0, 0, 0, 'thursdaymorning,Tbt', '', '', '')



(192832526, 'sucrepie_', 'Judith', None, 'Hungary', None, 'Agriculture| Plant based food $ smoothies recipes | Relationships| Public Relations | God fearing | Model | Research| Realist | I love love and love to be loved', '0', '0', '2010-09-20 08:21:45+00:00', 156, 133, 5347, 2320)


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)






user_info_column_names: ['user_id', 'user_screen_name', 'user_name', 'user_language', 'location', 'profile_url', 'description', 'protected', 'verified', 'created_at', 'friends_count', 'followers_count', 'favorites_count', 'statuses_count']


# fetch the data into a pandas dataframe

In [76]:
conn = sqlite3.connect('tbt_table.db')
cur = conn.cursor()

cur.execute("SELECT * FROM tweet_info")
tweet_info_all_result = cur.fetchall()
tweet_info_column_names = [description[0] for description in cur.description]
tweet_info_dataframe = pd.DataFrame(tweet_info_all_result, \
                                   columns=tweet_info_column_names)
print(f"No of tweet stored: {len(tweet_info_all_result)}, {tweet_info_dataframe.shape[0]}\n")



cur.execute("SELECT * FROM user_info")
user_info_all_result = cur.fetchall()
user_info_column_names = [description[0] for description in cur.description]
user_info_dataframe = pd.DataFrame(user_info_all_result, \
                                   columns=user_info_column_names)
print(f"No of user info stored: {len(user_info_all_result)}, {user_info_dataframe.shape[0]}\n")

conn.close()

No of tweet stored: 10, 10

No of user info stored: 10, 10



In [77]:
tweet_info_dataframe.head()

Unnamed: 0,tweet_id,user_id,tweet_lang,tweet_time,source,tweet_text,quote_count,reply_count,retweet_count,tweet_favorite_count,...,user_mentions,id_str,truncated,in_reply_to_status_id,in_reply_to_status_id_str,in_reply_to_user_id,in_reply_to_user_id_str,in_reply_to_screen_name,coordinates,place
0,1458925192646909956,1213521040392912896,en,2021-11-11 22:30:46+00:00,"<a href=""http://twitter.com/download/android"" ...",RT @Randomshii101: #tbt It’s The Aesthetic For...,0,0,0,0,...,1.335550888774152e+18,1458925192646909956,0,,,,,,,
1,1458925192194019328,157440591,es,2021-11-11 22:30:45+00:00,"<a href=""http://gainapp.com"" rel=""nofollow"">Ga...",Compartamos juntos estos momentos que lo son t...,0,0,0,0,...,,1458925192194019328,1,,,,,,,
2,1458925195683631106,920772517526212610,es,2021-11-11 22:30:46+00:00,"<a href=""https://www.later.com"" rel=""nofollow""...",#TBT Los bebés #Maluma y #Camilo\n😀 😀 https://...,0,0,0,0,...,,1458925195683631106,0,,,,,,,
3,1458925197810098188,17054509,pt,2021-11-11 22:30:47+00:00,"<a href=""http://twitter.com/download/android"" ...",Eu tomo uns sustos na quinta quando vejo posts...,0,0,0,0,...,,1458925197810098188,0,,,,,,,
4,1458925199492005891,1083767420970508288,en,2021-11-11 22:30:47+00:00,"<a href=""https://www.later.com"" rel=""nofollow""...",#tbt to my first weighable derby fish! I was s...,0,0,0,0,...,,1458925199492005891,1,,,,,,,


In [79]:
user_info_dataframe

Unnamed: 0,user_id,user_screen_name,user_name,user_language,location,profile_url,description,protected,verified,created_at,friends_count,followers_count,favorites_count,statuses_count,id_str,url
0,1213521040392912896,roxburysfinest_,Durand Bernarr Stan Account,,Boston,https://linktr.ee/Karim95,Boston Based singer-Songwriter and everything ...,0,0,2020-01-04 18:02:30+00:00,634,736,10293,11980,1213521040392912896,https://linktr.ee/Karim95
1,157440591,deliriocali,Delirio Cali,,"Cali, Colombia",http://www.delirio.com.co,Salsa + Circo + Orquesta.,0,1,2010-06-19 20:43:36+00:00,853,8747,1575,5687,157440591,http://www.delirio.com.co
2,920772517526212610,iHeartLatino,iHeartLatino,,United States,http://www.iHeartLatino.com,La casa de tus artistas y su música.,0,0,2017-10-18 22:04:10+00:00,117,1551,78,1870,920772517526212610,http://www.iHeartLatino.com
3,17054509,little_dee,Wendy G S,,Brazil,,"That's all, folks!",0,0,2008-10-29 22:58:45+00:00,274,158,1158,1966,17054509,
4,1083767420970508288,OutdoorsAlexa,Alexa Tetrault,,"Milton, VT",http://instagram.com/alexade_vivo,🎣 Angler living the dream in VT🏔 Contact me fo...,0,0,2019-01-11 16:47:43+00:00,373,549,816,506,1083767420970508288,http://instagram.com/alexade_vivo
5,1137080751135100928,harmony91417950,harmony,,United States,,"Justin followed on September 14, 2019 at 1:20p...",0,0,2019-06-07 19:35:51+00:00,224,309,133980,104118,1137080751135100928,
6,1458286664732794886,OhHey_Jorge,Jorge Luis,,,,Chooser of violence.,0,0,2021-11-10 04:13:43+00:00,7,3,0,3,1458286664732794886,
7,37853837,piratedjsivan,Ivan,,"ÜT: 37.658732,-122.437416",,Pirate DJs | True S.F. | The Pirate Sector | C...,0,0,2009-05-05 04:08:41+00:00,312,1336,464,8398,37853837,
8,1242086408740835329,tm_thamy,Thamy🖤👸🏾,,Carioca Rj,,Enzo Gabriel👩‍👦 Minimal Prog👽💛 ...,0,0,2020-03-23 13:51:01+00:00,508,504,17253,22429,1242086408740835329,
9,630524565,Ale_Garcia2000,Ale García,,,,Malaguista 💙,0,0,2012-07-08 19:04:18+00:00,2477,187,6218,6180,630524565,


['user_id', 'user_screen_name', 'friends_count', 'followers_count', 'favorites_count', 'statuses_count']


Unnamed: 0,user_id,user_screen_name,friends_count,followers_count,favorites_count,statuses_count
0,192832526,sucrepie_,156,133,5347,2320
1,37161598,HomeboyzRadio,2906,334684,169051,618003
2,243906097,guiicostavieira,641,952,81330,73372
3,1849788637,gabrielsancarr,1078,924,475409,32393
4,1151926051892805632,RichWeberr,2356,2338,119108,25881


Returned entries: 5636
# rows user_info_filtered_dataframe: 5636


Unnamed: 0,user_id,followers_count
0,192832526,133
1,37161598,334684
2,243906097,952
3,1849788637,924
4,1151926051892805632,2338


# close the database connection

In [18]:
conn.close()