In [567]:
from json_utils import read_jsonl
import pandas as pd
import plotly.express as px
from collections import Counter, defaultdict

In [568]:
# Load the raw tweets. The result is handed to pd.DataFrame in prepare_dataframe,
# so read_jsonl presumably returns a list of per-tweet dicts — TODO confirm in json_utils.
data = read_jsonl("../data/dkpol_tweets.jsonl")

Reading ../data/dkpol_tweets.jsonl


In [603]:
def create_datetype_col(df, datetype):
    """Add a string period column derived from ``created_at`` to ``df``.

    ``"M"`` fills a ``month`` column and ``"W"`` fills a ``week`` column; each
    value is the string form of the pandas period the timestamp falls in.
    Any other ``datetype`` prints ``Wrong datetype.`` and adds nothing.

    The column is assigned on ``df`` itself, and that same object is returned.
    """
    # (frequency code, name of the column it populates)
    for code, column in (("M", "month"), ("W", "week")):
        if datetype == code:
            as_periods = df["created_at"].dt.to_period(datetype)
            df[column] = as_periods.astype(str)
            return df

    print("Wrong datetype.")
    return df


def prepare_dataframe(data):
    """Build the frame of non-reply tweets with ``month`` and ``week`` columns.

    Parameters
    ----------
    data
        Tweet records accepted by ``pd.DataFrame``. The resulting frame must
        contain ``in_reply_to_user_id`` and ``created_at`` columns.

    Returns
    -------
    pd.DataFrame
        Only the rows whose ``in_reply_to_user_id`` is missing, with
        ``created_at`` parsed by ``pd.to_datetime`` and the string ``month``
        and ``week`` columns added by ``create_datetype_col``.
    """
    all_tweets = pd.DataFrame(data)

    # Take an explicit copy of the filtered rows: assigning new columns on a
    # bare boolean-filtered slice is what triggers SettingWithCopyWarning.
    df = all_tweets.loc[all_tweets["in_reply_to_user_id"].isna()].copy()

    df["created_at"] = pd.to_datetime(df["created_at"])
    df = create_datetype_col(df, "M")
    df = create_datetype_col(df, "W")

    return df
    
    
def get_months(df):
    """Return the distinct values of the ``month`` column as an ascending list."""
    distinct_months = list(df["month"].unique())
    distinct_months.sort()
    return distinct_months


def get_weeks(df):
    """Return the distinct values of the ``week`` column as an ascending list."""
    week_labels = [*df["week"].unique()]
    week_labels.sort()
    return week_labels

In [604]:
# Build the filtered tweet frame, then collect the sorted distinct month and
# week labels that the per-period counting cells iterate over.
df = prepare_dataframe(data)
months = get_months(df)
weeks = get_weeks(df)


Converting to PeriodArray/Index representation will drop timezone information.



In [576]:
def get_entities(tweet: dict) -> dict:
    """Return the ``entities`` mapping of a tweet, or ``{}`` when it has none.

    The previous version looked the value up but never returned it, so every
    call produced ``None``.

    Parameters
    ----------
    tweet
        A tweet record; its ``"entities"`` value is expected to be a mapping
        (other cells index it with ``"mentions"`` and ``"hashtags"``).

    Returns
    -------
    dict
        ``tweet["entities"]`` when it is present and is a dict, otherwise an
        empty dict. Missing keys, non-subscriptable tweets and non-dict
        values (for example NaN from a DataFrame row) all yield ``{}``.
    """
    try:
        entities = tweet["entities"]
    except (KeyError, TypeError):
        # No "entities" key, or `tweet` is not subscriptable by a string key.
        return {}

    return entities if isinstance(entities, dict) else {}

In [577]:
def get_tweet_entity(entities, entity_key):
    """Collect the lower-cased ``entity_key`` value of every item in ``entities``.

    Items are visited in order, and the result has one string per item.
    """
    lowered = []
    for item in entities:
        lowered.append(item[entity_key].lower())
    return lowered

In [578]:
def get_top_n_counts(A, n, exclude=("dkpol",)):
    """Count the items of ``A`` and return the ``n`` most frequent ones.

    Parameters
    ----------
    A
        Iterable of hashable items (anything ``collections.Counter`` accepts).
    n
        Maximum number of ``(item, count)`` pairs to return.
    exclude
        Items dropped before ranking. Defaults to ``("dkpol",)``, the value
        that used to be hard-coded. A single string is treated as one item.

    Returns
    -------
    list of (item, count) tuples
        Sorted by descending count; items with equal counts keep the order
        in which they were first seen in ``A``.
    """
    if isinstance(exclude, str):
        # Guard against iterating a bare string character by character.
        exclude = (exclude,)

    counter = Counter(A)
    for label in exclude:
        # pop with a default replaces the old try / bare-except around pop.
        counter.pop(label, None)

    return sorted(counter.items(), key=lambda item: item[1], reverse=True)[:n]

In [579]:
# Number of distinct week labels and month labels in the filtered tweets.
len(weeks), len(months)

(25, 19)

In [612]:
def _entity_values(tweet_entities, group, entity_key):
    """Return the lower-cased ``entity_key`` values under ``tweet_entities[group]``, or ``[]`` if that data is missing or malformed."""
    try:
        return get_tweet_entity(tweet_entities[group], entity_key)
    except (KeyError, TypeError, AttributeError):
        # KeyError: the group or the entity key is absent.
        # TypeError: entities is not a mapping (e.g. NaN) or the group is not iterable.
        # AttributeError: the stored value has no .lower().
        return []


def get_tags_and_mentions_counts_per_period(df, periods, period_type, top_n=50):
    """Rank the most used hashtags and mentions for each period.

    Parameters
    ----------
    df
        Tweet frame with a ``period_type`` column and, normally, an
        ``entities`` column holding per-tweet mappings with ``"hashtags"``
        (items keyed by ``"tag"``) and ``"mentions"`` (items keyed by
        ``"username"``).
    periods
        Period labels to report on, in output order.
    period_type
        Name of the column in ``df`` that is compared against each label.
    top_n
        How many entries to keep per period (default 50, the value that
        used to be hard-coded).

    Returns
    -------
    (tags_per_period, mentions_per_period)
        Two lists aligned with ``periods``. Each element is the list of
        ``(name, count)`` tuples returned by ``get_top_n_counts``.

    Notes
    -----
    Tweets whose hashtags or mentions are missing or malformed contribute
    nothing for that group, as before. Only those data-shape errors are
    skipped now; the old bare ``except: pass`` blocks also hid genuine bugs.
    """
    tags_per_period = []
    mentions_per_period = []

    # Without an "entities" column every period simply yields empty rankings.
    has_entities = "entities" in df.columns

    for period in periods:
        in_period = df.loc[df[period_type] == period]
        # Only the entities column is read, so skip converting whole rows to dicts.
        entities_in_period = in_period["entities"] if has_entities else []

        tags, mentions = [], []
        for tweet_entities in entities_in_period:
            mentions.extend(_entity_values(tweet_entities, "mentions", "username"))
            tags.extend(_entity_values(tweet_entities, "hashtags", "tag"))

        tags_per_period.append(get_top_n_counts(tags, top_n))
        mentions_per_period.append(get_top_n_counts(mentions, top_n))

    return tags_per_period, mentions_per_period
# Per-period top hashtag and mention counts, computed once per week and once per month.
tags_per_week, mentions_per_week = get_tags_and_mentions_counts_per_period(df, weeks, "week")
tags_per_month, mentions_per_month = get_tags_and_mentions_counts_per_period(df, months, "month")

In [617]:
# Sanity check: each result list should hold one entry per week / per month.
len(tags_per_week), len(mentions_per_week), len(tags_per_month), len(mentions_per_month)

(25, 25, 19, 19)

In [636]:
def create_word_count_dataframe(word_counts, periods):
    """Turn per-period ``(word, count)`` lists into a period x word table.

    Parameters
    ----------
    word_counts
        One list of ``(word, count)`` pairs per period, aligned with
        ``periods`` by position.
    periods
        Period labels; they become the row index of the result.

    Returns
    -------
    pd.DataFrame
        One row per period and one column per word that occurs in any
        period's list. A cell holds that word's count for the period, or 0
        when the word is not in that period's list. If no words occur at
        all, the frame is empty.

    Raises
    ------
    IndexError
        If ``periods`` is longer than ``word_counts`` and at least one word
        exists.
    """
    # Build each period's lookup once; it used to be rebuilt for every
    # (word, period) pair.
    lookups = [dict(pairs) for pairs in word_counts]

    # dict.fromkeys de-duplicates while keeping first-seen order, so each word
    # is processed once instead of once per period it appears in.
    vocabulary = list(dict.fromkeys(word for lookup in lookups for word in lookup))

    data = {}
    if vocabulary:
        for idx, period in enumerate(periods):
            lookup = lookups[idx]
            data[period] = {word: lookup.get(word, 0) for word in vocabulary}

    # Outer keys (periods) become columns, so transpose to get periods as rows.
    return pd.DataFrame(data).transpose()

# Period x word count tables:
#   df1 = hashtags per week,  df2 = hashtags per month,
#   df3 = mentions per week,  df4 = mentions per month.
df1 = create_word_count_dataframe(tags_per_week, weeks)
df2 = create_word_count_dataframe(tags_per_month, months)
df3 = create_word_count_dataframe(mentions_per_week, weeks)
df4 = create_word_count_dataframe(mentions_per_month, months)

In [637]:
def line_plot(df):
    """Build a Plotly Express line chart from ``df`` and display it.

    Returns whatever the figure's ``show()`` call returns.
    """
    figure = px.line(df)
    shown = figure.show()
    return shown

In [638]:
# One line chart per table, in the order the tables were built:
# weekly hashtags, monthly hashtags, weekly mentions, monthly mentions.
line_plot(df1)
line_plot(df2)
line_plot(df3)
line_plot(df4)

In [402]:
# Every hashtag from each month's top list, flattened in month order.
# A hashtag appears once for every month whose list contains it.
all_words = [word for alist in tags_per_month for word in list(dict(alist).keys())]

In [418]:
# month -> {hashtag: count}, with 0 for hashtags missing from that month's top list.
data_ = defaultdict(lambda: defaultdict(int))

# Build each month's lookup dict once; doing it inside the word loop repeated
# the same conversion for every (word, month) pair.
month_lookups = [dict(tags_per_month[idx]) for idx in range(len(months))]

# dict.fromkeys skips repeated words while keeping first-seen order. A repeated
# word would only re-assign the same counts, so the result is unchanged.
for word in dict.fromkeys(all_words):
    for month, month_top_50 in zip(months, month_lookups):
        data_[month][word] = month_top_50.get(word, 0)
        

In [515]:
# Outer keys of data_ (months) become columns; hashtags become the row index.
# NOTE: this rebinds df1, which an earlier cell assigned from
# create_word_count_dataframe(tags_per_week, weeks).
df1 = pd.DataFrame(data_)

In [516]:
# Preview the first rows (hashtags) of the month-column table.
df1.head()

Unnamed: 0,2018-01,2018-02,2018-03,2018-06,2018-09,2018-11,2019-03,2019-06,2019-09,2019-11,2020-03,2020-06,2020-09,2020-11,2021-03,2021-06,2021-08,2021-09,2021-10
dkmedier,49,38,26,45,60,539,57,87,92,127,195,166,178,80,175,123,694,3500,2080
kvotekonger,44,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
svpol,34,32,20,16,16,173,10,12,8,33,41,48,21,6,20,17,74,322,166
eupol,22,10,0,2,17,107,0,9,16,10,0,7,8,4,12,3,18,122,116
dkbiz,18,4,2,6,10,183,2,4,37,29,76,50,55,19,37,32,169,862,496


In [550]:
# Flip the table so each row is a month and each column is a hashtag.
# NOTE: this rebinds df2, which an earlier cell assigned from
# create_word_count_dataframe(tags_per_month, months).
df2 = df1.transpose()
df2.head()

Unnamed: 0,dkmedier,kvotekonger,svpol,eupol,dkbiz,migpol,sundpol,iran,freeallprotesters,uddpol,...,ecocide,boycottfaroeislands,dagpenge,boycottdenmark,rigsret,rvlm21,rammevilkår,df2021,skolevalg,vlm21
2018-01,49,44,34,22,18,18,17,16,16,14,...,0,0,0,0,0,0,0,0,0,0
2018-02,38,0,32,10,4,0,6,0,0,4,...,0,0,0,0,0,0,0,0,0,0
2018-03,26,0,20,0,2,3,3,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2018-06,45,0,16,2,6,0,6,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2018-09,60,0,16,17,10,0,8,0,0,3,...,0,0,0,0,0,0,0,0,0,0


In [551]:
# Cast the index labels (months) to str.
df2.index = df2.index.astype(str)

In [552]:
# Plot the monthly hashtag counts in df2 as a line chart.
px.line(df2)