In [2]:
import os
import re
import string
from collections import defaultdict

import couchdb
import nltk
import numpy as np
import pandas as pd
# from couchdb_settings import *
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from textblob import TextBlob
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /Users/weimin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/weimin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
# couchdb_settings
# Connection details can be overridden through environment variables so real
# credentials never need to be committed with the notebook. The literals are
# only fallbacks, kept so the notebook behaves as before when nothing is set.
address = os.environ.get('COUCHDB_ADDRESS', '172.26.130.201:5984')
username = os.environ.get('COUCHDB_USERNAME', 'grp5admin')
password = os.environ.get('COUCHDB_PASSWORD', 'password')
# database names
tweets = 'raw_tweets'
user = 'user_list'

In [5]:
def db_connect(dbname):
    """
    Open a database on the CouchDB server, creating it when it does not exist
    params: dbname - name of the database to open
    return: the existing or newly created database
    return type: database
    """

    couchserver = couchdb.Server('http://' + username + ':' + password + '@' + address)
    try:
        db = couchserver[dbname]
    except couchdb.http.ResourceNotFound:
        # Only a missing database should trigger creation. Authentication or
        # network failures must surface instead of being masked by a create.
        db = couchserver.create(dbname)

    return db

In [6]:
def fetch_DB(dbname):
    """
    Build a handle to a CouchDB database and attach the login credentials
    params: dbname - name of the database on the server at `address`
    return: database handle with `username`/`password` set on its resource
    return type: database
    """

    database_url = ''.join(['http://', address, '/', dbname])
    handle = couchdb.Database(database_url)
    handle.resource.credentials = (username, password)
    return handle

In [7]:
# Open a handle to the raw tweets database (name taken from the `tweets`
# setting); the query cells below pass this handle to their functions.
tweet_db = fetch_DB(tweets)

In [43]:
def delete_docs(topic, save_db):
    """
    Delete every document listed by the topic's `all` view in one bulk update
    params: topic - design document name; the view queried is '<topic>/all'
            save_db - database whose documents are deleted
    return: None
    """

    # TODO: once the DB is completely ready, restrict the deletion to recent
    # documents only (e.g. those whose doc['year'] is >= 2018).
    docs = []
    for row in save_db.view(topic + '/all', include_docs=True):
        doc = row['doc']
        doc['_deleted'] = True  # flag the doc for removal in the bulk update
        docs.append(doc)

    # Submit the batch once, after it is complete. Updating inside the loop
    # re-sent every previously collected document on each iteration.
    if docs:
        save_db.update(docs)

In [9]:
def now_trending(db, N):
    """
    Extract the top N most used hashtags from the 'hashtags/trending' view
    params: db - raw_tweets database exposing the 'hashtags/trending' view
                 (the time window, e.g. the past 14 days, is defined by that
                 view, not by this function);
            N - number of hashtags to extract
    return: the N hashtags with the highest counts, ordered from lowest to
            highest count; hashtags are lowercased and counts of hashtags
            differing only by case are summed; empty dict when N <= 0
    return type: dict - {hashtag:count}
    render: wordcloud
    """

    hashtags = {}

    for item in db.view('hashtags/trending', group = True, group_level = 1):
        tag = item.key.lower()
        hashtags[tag] = hashtags.get(tag, 0) + item.value

    # Guard explicitly: a slice of [-0:] would return *all* hashtags.
    if N <= 0:
        return {}

    ranked = sorted(hashtags.items(), key=lambda pair: pair[1])

    return dict(ranked[-N:])

# Show the 10 most used hashtags from the raw tweets database as the cell output.
now_trending(tweet_db, 10)

{'afl': 107,
 'alboforpm': 111,
 'gopies': 117,
 'armukrainenow': 124,
 'slavaukraini': 147,
 'victraffic': 150,
 'melbourne': 172,
 'ausvotes': 190,
 'auspol': 511,
 'امپورٹڈ_حکومت_نامنظور': 918}

In [10]:
def read_langCode(langCode_path):
    """
    Read the language lookup file: one entry per line, language name first and
    language code as the last whitespace-separated token (e.g. "Spanish es")
    param: language code file path
    return: {language_code: language_name} - language code dictionary
    raises: ValueError when a non-blank line has no separate code token
    """

    langCode = {}
    with open(langCode_path, 'r', encoding= 'utf-8') as f:
        for line_no, line in enumerate(f, start=1):
            if not line.strip():
                continue  # tolerate blank lines (e.g. a trailing newline)
            # Split from the right so multi-word language names stay intact.
            parts = line.rsplit(None, 1)
            if len(parts) != 2:
                raise ValueError(
                    'line %d of %s is not "<language name> <code>": %r'
                    % (line_no, langCode_path, line)
                )
            name, code = parts
            langCode[code] = name.strip()
    return langCode

In [13]:
# Language lookup file. Despite the .json extension, read_langCode reads it as
# plain text with one whitespace-separated entry per line.
langCode_path = 'Data/langCode.json'

# read_langCode(langCode_path)

In [15]:
def top_n_lang_count(db, langCode_path, N):
    """
    Extract top N languages other than English in which tweets were made
    params: raw_tweets database (must expose the 'lang/lang-count' view);
            path to langCode.json file;
            number of languages to extract
    return: top N most tweeted languages other than English, highest count
            first; codes missing from the language lookup are ignored
    return type: dict - {language name: count}
    render: Bar chart?
    """
    counts = defaultdict(int)

    for item in db.view('lang/lang-count', group = True, group_level = 1):
        if item.key == 'en':
            continue
        # 'in' is treated as an alias of 'id'; sum both so neither count
        # overwrites the other.
        code = 'id' if item.key == 'in' else item.key
        counts[code] += item.value

    langCode = read_langCode(langCode_path)

    # Resolve names before truncating so that unknown codes cannot use up
    # top-N slots and shrink the result.
    named = [(langCode[code], count) for code, count in counts.items() if code in langCode]
    named.sort(key=lambda pair: pair[1], reverse=True)

    return dict(named[:max(N, 0)])

# Show the 10 most tweeted non-English languages as the cell output.
top_n_lang_count(tweet_db, langCode_path, 10)

{'Indonesian': 23201,
 'Spanish': 22505,
 'Japanese': 17309,
 'Arabic': 12625,
 'Turkish': 9132,
 'Tagalog': 8936,
 'French': 6675,
 'Portuguese': 4943,
 'Chinese': 4317,
 'Thai': 4232}

In [16]:
def top_n_birth_country(file_path, N):
    """
    Rank countries of birth by population in the census extract
    params: file_path - path to census data download from AURIN - 'country_of_birth.csv';
            N - number of countries to keep
    return: the N largest countries after removing the aggregate/excluded
            columns listed below and folding Hong Kong into China
    return type: dict - {country name: (total population, percentage of 'tot_p' total)}
    render: Pareto chart? Bar chart?
    """
    census = pd.read_csv(file_path)

    # person-count columns are the ones ending in '_p'
    person_cols = [col for col in census.columns if col.endswith('_p')]
    counts = pd.DataFrame(census[person_cols])
    counts.columns = [col.strip() for col in person_cols]

    overall = counts['tot_p'].sum(axis = 0)
    per_country = counts.sum(axis = 0)
    share = per_country/overall * 100

    birth_country = pd.DataFrame(per_country, columns = ['country_total'])
    birth_country['percentage'] = share

    excluded = ['hong_kong_sar_china_p', 'born_elsewhere_p', 'tot_p', 'os_visitors_p', 'country_birth_not_stated_p', 'australia_p', 'new_zealand_p', 'united_states_america_p', 'united_kingdom_ci_im_p', 'fiji_p', 'south_africa_p']

    # fold Hong Kong into the China row before its own row is dropped
    birth_country.loc['china_excl_sars_taiwan_p'] += birth_country.loc['hong_kong_sar_china_p']
    birth_country = birth_country.T.drop(excluded, axis = 1)
    birth_country = birth_country.rename({'china_excl_sars_taiwan_p' : 'china_p', 'sri_lanka_p' : 'srilanka_p'}, axis = 1)

    # display name = first underscore-separated token, capitalised
    birth_country.columns = [label.split('_')[0].capitalize() for label in birth_country.columns]
    birth_country = birth_country.rename({'Srilanka' : 'Sri Lanka'}, axis = 1).T
    top = birth_country.sort_values(by = ['country_total'], ascending = False)[:N]

    return {
        country: (total, pct)
        for country, total, pct in zip(top.index, top.country_total.values, top.percentage.values)
    }

In [17]:
# Census extract with country-of-birth counts; show the 10 largest countries
# of birth as the cell output.
filepath = 'Data/AURIN/country_of_birth.csv'
top_n_birth_country(filepath, 10)

{'China': (174418.0, 3.85545140819977),
 'India': (160058.0, 3.5380284230620616),
 'Vietnam': (78036.0, 1.7249596147775874),
 'Italy': (61521.0, 1.3599010772045204),
 'Sri Lanka': (52658.0, 1.1639874339402094),
 'Malaysia': (45852.0, 1.0135430859703463),
 'Greece': (43881.0, 0.9699747918403725),
 'Philippines': (43642.0, 0.9646917769763117),
 'Pakistan': (19127.0, 0.42279592177778097),
 'Germany': (17511.0, 0.38707478361743725)}

In [23]:
def top_n_lang_spoken_at_home(file_path, langCode_path, N):
    """
    Extract top N languages other than English spoken at home
    params: path to census data download from AURIN - 'lang_at_home.csv';
            path to langCode.json file;
            number of languages other than English to extract
    return: for each of the top N languages: total population count,
            percentage of that count over the summed 'SOL_Tot_P' column, and
            percentage of that count over the summed 'Total_P' column
    return type: dict - {language name: [number, percentage_SOL, percentage_Total]}
    raises: KeyError if a required column is missing or if no language name in
            the lookup contains a derived column token
    render: Pareto chart? Bar chart?
    """
    
    data = pd.read_csv(file_path)

    # keep only the columns whose name ends in '_P'
    match_cols = []
    new_cols = []
    col_names = data.columns
    for name in col_names:
        if name.endswith('_P'):
            match_cols.append(name)   
            new_cols.append(name.strip())

    ext_data = pd.DataFrame(data[match_cols])
    ext_data.columns = new_cols

    # denominators for the two percentage columns computed further down
    SOL_tot = ext_data['SOL_Tot_P'].sum(axis = 0)
    tot = ext_data['Total_P'].sum(axis = 0)
    # NOTE(review): SOL_perc is computed but never used or returned
    SOL_perc = SOL_tot/tot * 100

    # columns removed before ranking; every one of them must exist in the file
    # (presumably aggregates, "other" buckets and the Chinese sub-language
    # breakdown - TODO confirm against the AURIN schema)
    drop_columns = ['SOL_Other_P', 'SOL_Samoan_P', 'SOL_Assyrian_P','SOL_Iran_Lan_Tot_P', 
                    'SOL_Irani_Lan_Othr_P', 'SOL_Se_As_A_L_Othr_P', 'SOL_Aus_Indig_Lang_P', 
                    'SOL_In_Ar_Lang_Othr_P', 'SOL_In_Ar_Lang_Tot_P', 'SOL_Se_As_A_L_Tot_P', 
                    'Language_spoken_home_ns_P', 'SOL_Tot_P', 'Total_P', 
                    'SOL_Chin_lang_Mand_P', 'SOL_Chin_lang_Other_P', 'SOL_Chin_lang_Cant_P']
    
    ext_data = ext_data.drop(drop_columns, axis = 1)
    lang_tot = ext_data.sum(axis = 0)
    
    # derive a short language token from each remaining column name; which
    # underscore-separated piece is taken depends on the column's group prefix
    columns = []
    for index in lang_tot.index:
        idx = index.split('_')
        if 'Se_As' in index or 'In_Ar' in index or 'Ir_Lang' in index:
            # grouped columns: token is the piece just before the trailing 'P'
            columns.append(idx[-2])
        elif 'Ir_La' in index:
            # token is the fourth piece (renamed from 'Pe' to 'Persian' below)
            columns.append(idx[3])
        else:
            # default: token is the piece right after the first prefix
            columns.append(idx[1])

    lang_tot.index = columns
    lang_data = pd.DataFrame(lang_tot, columns = ['number']).T
    lang_data = lang_data.rename({'Pe' : 'Persian'}, axis = 1).T
    lang_data['percentage_SOL'] = lang_data['number']/SOL_tot * 100
    lang_data['percentage_Total'] = lang_data['number']/tot * 100

    langCode = read_langCode(langCode_path)
    # map each token to a language name that contains it as a substring; when
    # several names contain the same token, the last one iterated wins
    langdict = {k:v for v in langCode.values() for k in lang_data.index if k in v}

    # replace tokens by full language names (KeyError if a token matched none)
    idx = []
    for i in lang_data.index:
        name = langdict[i]
        idx.append(name)

    lang_data.index = idx
    lang_data = lang_data.sort_values(by = ['number'], ascending = False)[:N]
    
    # convert each row to a plain list of floats
    spoken = {}
    for i in range(len(lang_data)):
        array = np.array([lang_data.number.values[i], lang_data.percentage_SOL.values[i], lang_data.percentage_Total.values[i]])
        spoken[lang_data.index[i]] = array.tolist()

    return spoken

In [24]:
# NOTE: `filepath` is rebound here from the country-of-birth file to the
# language-at-home extract; show the 10 most spoken languages as cell output.
filepath = 'Data/AURIN/lang_at_home.csv'
top_n_lang_spoken_at_home(filepath, langCode_path, 10)

{'Chinese': [189854.0, 16.326190148596588, 4.746370172073231],
 'Greek': [113424.0, 9.753714914694552, 2.835612051351218],
 'Italian': [112665.0, 9.68844592735278, 2.8166369707071257],
 'Vietnamese': [85122.0, 7.319929829389103, 2.1280590442509384],
 'Arabic': [65454.0, 5.628611722619703, 1.6363569545170566],
 'Hindi': [31607.0, 2.717993258117777, 0.7901783582580226],
 'Turkish': [31257.0, 2.6878955696202533, 0.7814283210703645],
 'Punjabi': [29517.0, 2.5382670610897082, 0.7379281361945789],
 'Macedonian': [29378.0, 2.526313979086406, 0.734453121425766],
 'Spanish': [27632.0, 2.376169510181618, 0.6908029359124777]}

In [25]:
def topic_switch(topic):
    """
    Resolve the view names and the text database belonging to a topic
    params: topic - topic of selection
    return: (count view 'text/<topic>-count', text view 'text/<topic>',
             database '<topic>_text' obtained through db_connect)
    """

    view_base = 'text/' + topic
    return view_base + '-count', view_base, db_connect(topic + '_text')

In [32]:
def topic_trend(db, topic):
    """
    Count tweets on the selected topic per year and relate them to all tweets of that year
    params: db - raw_tweets database (queried for the topic count view and 'time/by-year-count');
            topic - the topic of selection
    return: tuple of three dicts
    return type: dict - {year : number of tweets on the selected topic made in that year}
                 dict - {year : total number of tweets made in that year}
                 dict - {year : percentage of tweets on selected topic over total number of tweets made in that year}
    render: Dual axes, line and column (combine with topic sentiment as the line)
    """

    count_view, _topic_view, _topic_db = topic_switch(topic)
    grouping = {'group': True, 'group_level': 1}

    year_topic = {row.key: row.value for row in db.view(count_view, **grouping)}
    year_total = {row.key: row.value for row in db.view('time/by-year-count', **grouping)}

    # every year that has topic tweets is looked up in the yearly totals
    percent = {year: hits/year_total[year] * 100 for year, hits in year_topic.items()}

    return year_topic, year_total, percent

In [35]:
# Topics analysed in this notebook; the per-topic loops below iterate over this list.
topics = ['housing', 'cost', 'transportation']

# Yearly counts, totals and percentages for one topic.
trend = topic_trend(tweet_db, 'transportation')

In [34]:
# Display the (topic counts, yearly totals, percentages) tuple from topic_trend.
trend

({2019: 1, 2021: 4, 2022: 37},
 {2014: 36319,
  2015: 1,
  2018: 9586,
  2019: 15241,
  2020: 35793,
  2021: 86473,
  2022: 99981},
 {2019: 0.006561249261859458,
  2021: 0.00462572132341887,
  2022: 0.03700703133595383})

In [44]:
def topic_wordcloud(query_db, topic):
    """
    Build a cleaned, lowercased corpus per year for the selected topic and store it
    params: query_db - raw_tweets database exposing the 'text/<topic>' view
                       (row keys are used as years, row values as tweet text);
            topic - topic of selection
    return: None - one document {'year': year, 'text': corpus} is saved per
            year into the topic's text database, after the documents already
            stored there have been cleared (best effort)
    render: wordcloud
    """

    _, topic_view, save_db = topic_switch(topic)

    try:
        delete_docs(topic, save_db)
    except Exception:
        # Best effort on purpose: clearing may fail (presumably when the
        # '<topic>/all' view does not exist yet - TODO confirm) and the corpus
        # should still be rebuilt in that case.
        pass

    yearly_tweets = defaultdict(list)
    for item in query_db.view(topic_view):
        yearly_tweets[item.key].append(item.value)

    tokenizer = TweetTokenizer()
    # Replaces @mentions, every character that is not an ASCII letter, digit,
    # space or tab, and URLs. Raw string avoids invalid-escape warnings.
    noise = re.compile(r"(@[A-Za-z0-9_]+)|([^0-9A-Za-z \t])|(\w+://\S+)")
    # Load the stop-word list once; fetching it per token re-reads the corpus
    # and does a linear list search every time.
    stop_words = set(stopwords.words('english'))

    for year, texts in yearly_tweets.items():
        cleaned = [' '.join(noise.sub(' ', text).split()) for text in texts]
        tokens = tokenizer.tokenize(' '.join(cleaned))

        kept = []
        for token in tokens:
            lowered = token.lower()
            if lowered not in stop_words and lowered not in string.punctuation:
                kept.append(lowered)

        save_db.save({'year': year, 'text': ' '.join(kept)})

In [45]:
# Rebuild the stored per-year corpus for every topic; each call clears and
# rewrites that topic's text database.
for topic in topics:
    topic_wordcloud(tweet_db, topic)

In [48]:
def topic_sentiment(topic):
    """
    Extract topic related sentiment
    params: topic - topic of selection; its text database is resolved through
            topic_switch and read via the '<topic>/text' view
    return: polarity of each year's whole stored text; years whose text is
            empty are omitted
    return type: dict - {year : sentiment score}
    render: Dual axes, line and column (combine with topic trend as the columns)
    """

    _, _, db = topic_switch(topic)

    yearly_tweets = {}
    for item in db.view(topic + '/text'):
        yearly_tweets[item.key] = item.value

    yearly_sentiment = {}
    for year, text in yearly_tweets.items():
        if not text or not text.strip():
            continue
        # Score the text as a whole. Looping over sentences and assigning each
        # polarity to the same key kept only the last sentence's score.
        yearly_sentiment[year] = TextBlob(text).sentiment.polarity

    return yearly_sentiment

In [49]:
# Print the {year: sentiment score} dict of every topic, one line per topic.
for topic in topics:
    print(topic_sentiment(topic))

{2014: 0.2710676351140269, 2018: 0.10800000000000001, 2019: 0.13429682929682932, 2020: 0.10835478680611424, 2021: 0.10943601545630192, 2022: 0.10005737447910346}
{2014: 0.16666666666666666, 2019: 0.1375, 2021: 0.02727272727272726, 2022: 0.05202380952380951}
{2014: -0.015116756259613354, 2018: 0.2530917280917281, 2019: 0.05319404621352675, 2020: 0.08493145743145744, 2021: 0.06922077922077922, 2022: -0.014736664490401105}
