### Historic data to dict by year
- {key, value}: {year, text}

In [None]:
# couchbd_settings
address = 'localhost:15984'
username = 'admin'
password = '123456'
# tweets = 'raw_tweets'
tweets = 'test'
user = 'user_list'

In [2]:
from typing import Optional
from collections import defaultdict
import couchdb
import re
class CouchDBHandler:
    def __init__(self, username, password, address, dbname):
        self.db = couchdb.Server('http://' + username + ':' + password + '@' + address)[dbname]
        
    def get_tweets(self, max_size):
        tweet_historic_dict = defaultdict(list)
        for i, doc_id in enumerate(self.db.view('_all_docs')):
            if max_size:
                if i > max_size:
                    break
            id_ = doc_id['id']
            try:
                year = self.db[id_]['historic']['created_at']
                if year[:2] == '20':
                    yr_created = year[:4]
                elif year[-4:-2] == '20':
                    yr_created = year[-4:]
                else:  
                    yr_created = 'Unknown' 
                
                tweet_historic_dict[yr_created].append(self.db[id_]['historic']['text'])
            except:
                print("error:", self.db[id_])
        return tweet_historic_dict 
    
    #def send_tweets(tweets: dict[str, str]):
        # self.db.save(tweets)

In [4]:
db_handler = CouchDBHandler(username, password, address, tweets)

In [5]:
tweet_historic_dict = db_handler.get_tweets(max_size=341556)

### Timeline & Stream data to dict by year
- {key, value}: {year, text}

In [25]:
# couchbd_settings
address = 'localhost:5984'
username = 'admin'
password = '123456'
tweets2 = 'raw_tweets'
user = 'user_list'

In [26]:
from typing import Optional
from collections import defaultdict
import couchdb
import re
class CouchDBHandler:
    def __init__(self, username, password, address, dbname):
        self.db = couchdb.Server('http://' + username + ':' + password + '@' + address)[dbname]
        
    def get_tweets(self, max_size):
        tweet_timeline_dict = defaultdict(list)
        tweet_stream_dict = defaultdict(list)
        for i, doc_id in enumerate(self.db.view('_all_docs')):
            if max_size:
                if i > max_size:
                    break
            id_ = doc_id['id']
            try:
                year = self.db[id_]['stream']['created_at']
                if year[:2] == '20':
                    yr_created = year[:4]
                elif year[-4:-2] == '20':
                    yr_created = year[-4:]
                else:  
                    yr_created = 'Unknown' 
                
                tweet_stream_dict[yr_created].append(self.db[id_]['stream']['text'])
            except:
                year = self.db[id_]['timeline']['created_at']
                
                if year[:2] == '20':
                    yr_created = year[:4]
                else:  #close to today
                    yr_created = '2022' 
                
                tweet_timeline_dict[yr_created].append(self.db[id_]['timeline']['text'])
        return tweet_timeline_dict, tweet_stream_dict
    
    #def send_tweets(tweets: dict[str, str]):
        # self.db.save(tweets)

In [27]:
db_handler2 = CouchDBHandler(username, password, address, tweets2)

In [None]:
tweet_timeline_dict,tweet_stream_dict = db_handler2.get_tweets(max_size=145112)

### top n counts by hashtags & occurrence words by year

In [9]:
import nltk
nltk.download('punkt')
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
import string
from collections import Counter

[nltk_data] Downloading package punkt to /home/azureuser/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/azureuser/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
import re

def f(s, pat):
    pat = r'(\w*%s\w*)' % pat 
    return re.findall(pat, s)

#### historic data

In [42]:
for key in tweet_historic_dict.keys():
    lst = []
    lst_after_removes = []
    for text in tweet_historic_dict[key]:
        lst = lst + f(text, "#")
        #if lst: # a list of words contains #
            #print(lst)
        
        removes = set(stopwords.words('english') + list(string.punctuation)+ ['http'])
        lst_after_removes = lst_after_removes + [i for i in word_tokenize(text.lower()) if i not in removes]
    
    c = Counter(lst)
    c2 = Counter(lst_after_removes)
    print("common hash words:", c.most_common(50))
    print("common words:", c2.most_common(30))

common hash words: [('#Melbourne', 3245), ('#melbourne', 1893), ('#auspol', 869), ('#Job', 753), ('#Jobs', 559), ('#victraffic', 479), ('#TweetMyJobs', 424), ('#', 378), ('#australia', 361), ('#Australia', 333), ('#firewatch', 293), ('#AUSvIND', 286), ('#qanda', 259), ('#vicvotes', 253), ('#NashsNewVideo', 246), ('#vote5sos', 235), ('#coffee', 226), ('#love', 223), ('#springst', 218), ('#FollowMeNash', 218), ('#illridewithyou', 203), ('#foodporn', 201), ('#food', 191), ('#vscocam', 181), ('#SaveDallas', 162), ('#krazykristmas', 161), ('#travel', 157), ('#AFLGF', 157), ('#photo', 156), ('#BusinessMgmt', 155), ('#Christmas', 153), ('#summer', 153), ('#christmas', 151), ('#jackfollowme', 148), ('#sydneysiege', 146), ('#TheBachelorAU', 128), ('#nofilter', 127), ('#beach', 127), ('#Sales', 126), ('#GetCamTo3Mill', 122), ('#sunset', 119), ('#art', 118), ('#subscribetokianandjc', 118), ('#yum', 117), ('#spring', 117), ('#stkilda', 116), ('#SmallzyWelcomesHome5SOS', 116), ('#ihgjobs', 115), ('

In [17]:
tweet_historic_dict.keys()

dict_keys(['2014', '2015'])

#### timeline & stream data combined

In [10]:
for key in tweet_timeline_dict.keys():
    lst = []
    lst_after_removes = []
    for text in tweet_timeline_dict[key]:
        lst = lst + f(text, "#")
        #if lst: # a list of words contains #
            #print(lst)
        
        removes = set(stopwords.words('english') + list(string.punctuation)+ ['http'])
        lst_after_removes = lst_after_removes + [i for i in word_tokenize(text.lower()) if i not in removes]
    
    c = Counter(lst)
    c2 = Counter(lst_after_removes)
    print("common hash words:", c.most_common(70))
    print("common words:", c2.most_common(50))

common hash words: [('#melbourne', 95), ('#vinyl', 89), ('#vinylrecords', 89), ('#vinylcollection', 88), ('#vinylclub', 88), ('#vinyljunkie', 84), ('#vinylart', 82), ('#records', 78), ('#IAMCARLTON', 77), ('#NAVYBLUES', 76), ('#recordcollection', 75), ('#UNITED', 74), ('#music', 74), ('#Batman', 72), ('#musiclover', 70), ('#fishergang', 58), ('#musictragic', 57), ('#Savetheoceans', 56), ('#gratitude', 55), ('#PurpleHaze', 54), ('#GetOnUp', 54), ('#bungeefitness', 54), ('#jiujitsu', 51), ('#haydenjoshuacharliefc', 51), ('#fisherfamily', 50), ('#dailygratitude', 50), ('#haydenjoshuacharlie', 49), ('#grateful', 42), ('#jiujitsuforeveryone', 42), ('#bungeeworkout', 40), ('#halfthaiaussie', 40), ('#Melbourne', 36), ('#nowspinning', 35), ('#BoundByBlue', 35), ('#gracie', 34), ('#personaltrainer', 34), ('#bungeedance', 33), ('#GratitudePractice', 32), ('#bungeejumping', 30), ('#Maximumeffort', 28), ('#sunnykingsup', 27), ('#', 26), ('#3rrrfm', 25), ('#classicalbums', 24), ('#bungeefly', 24), 