In [224]:
import os
import json
import pandas as pd

In [225]:
filepath = './data/'
all_files = os.listdir(filepath)

In [226]:
all_files

['final_datasets',
 'events.json',
 'annotation',
 'coins',
 'labelled_data',
 'events.csv',
 'cryptonews',
 'investing.com',
 'old_annotation_data_dont_use']

In [227]:
events = pd.read_csv(filepath + all_files[5], sep=';')
events = events[['event_id', 'start', 'end', 'event_type']]
events

Unnamed: 0,event_id,start,end,event_type
0,1,05/03/2022,04/04/2022,Positive
1,2,27/01/2022,16/02/2022,Positive
2,3,23/02/2022,07/03/2022,Positive
3,4,01/06/2022,01/07/2022,Negative
4,5,27/04/2022,19/05/2022,Negative
5,6,12/01/2022,30/01/2022,Negative


In [228]:
print(events.to_latex(index=False))  


\begin{tabular}{rlll}
\toprule
 event\_id &      start &        end & event\_type \\
\midrule
        1 & 05/03/2022 & 04/04/2022 &   Positive \\
        2 & 27/01/2022 & 16/02/2022 &   Positive \\
        3 & 23/02/2022 & 07/03/2022 &   Positive \\
        4 & 01/06/2022 & 01/07/2022 &   Negative \\
        5 & 27/04/2022 & 19/05/2022 &   Negative \\
        6 & 12/01/2022 & 30/01/2022 &   Negative \\
\bottomrule
\end{tabular}




In future versions `DataFrame.to_latex` is expected to utilise the base implementation of `Styler.to_latex` for formatting and rendering. The arguments signature may therefore change. It is recommended instead to use `DataFrame.style.to_latex` which also contains additional functionality.



In [229]:
filepath = './data/final_datasets/'
all_files = os.listdir(filepath)

In [230]:
filepath

'./data/final_datasets/'

In [231]:
all_files

['positive_zscores.csv',
 'negative_zscores.csv',
 'NoEvent_zscores.csv',
 'aggregated_dataset.tsv',
 'aggregated_dataset.csv',
 'top_bigrams.csv',
 'bot_annotated_dataset.csv',
 'FINAL_zscores.tsv']

In [232]:
df = pd.read_csv(filepath + 'FINAL_zscores.tsv', sep=';')

In [233]:
df.columns

Index(['date', 'conflict', 'similarity', 'knowledge', 'power', 'status',
       'support', 'identity', 'romance', 'trust', 'fun', 'conflict_sd',
       'similarity_sd', 'knowledge_sd', 'power_sd', 'status_sd', 'support_sd',
       'identity_sd', 'romance_sd', 'trust_sd', 'fun_sd', 'conflict_n_posts',
       'similarity_n_posts', 'knowledge_n_posts', 'power_n_posts',
       'status_n_posts', 'support_n_posts', 'identity_n_posts',
       'romance_n_posts', 'trust_n_posts', 'fun_n_posts', 'event_type',
       'period_type', 'event_id'],
      dtype='object')

In [234]:
df = df[['date', 'conflict','event_type','period_type','event_id']]
df.columns = ['date', 'conflict_zscore','event_type','period_type','event_id']

In [235]:
df.shape

(252, 5)

In [236]:
df

Unnamed: 0,date,conflict_zscore,event_type,period_type,event_id
0,2022-01-05,0.020890,0,0,0
1,2022-01-06,0.011768,0,0,0
2,2022-01-07,0.377078,0,0,0
3,2022-01-08,0.377078,0,0,0
4,2022-01-09,0.092948,0,0,0
...,...,...,...,...,...
247,2022-09-10,-1.654389,0,0,0
248,2022-09-11,-0.931077,0,0,0
249,2022-09-12,-0.925457,0,0,0
250,2022-09-13,-1.061681,0,0,0


In [237]:
import pandas as pd
import matplotlib.pyplot as plt
import dtale
import numpy as np

# Import threshold data from data/annotation/thresholds.json
thresholds = pd.read_json("data/annotation/new_thresholds.json", orient="index")
# Rename column as threshold
thresholds = thresholds.rename(columns={0: "threshold"})

# Rolling average window size
num_days_rolling_average = 7

# Load in the aggregated dataset
dataset_raw = pd.read_csv("data/final_datasets/aggregated_dataset.tsv", sep=";")

# Select subset of columns
dataset = dataset_raw[
    ["date", "High", "Low", "Mean", "event_id", "event_type", "period_type"]
].drop_duplicates()

# Initialize dimensions
dims = [
    "conflict",
    "similarity",
    "knowledge",
    "power",
    "status",
    "support",
    "identity",
    "romance",
    "trust",
    "fun",
]

# Initialize dictionary with eventtype dataframes
corr_dfs = {}

# Saving all event encordings
events = (
    dataset_raw[["date", "event_type", "period_type", "event_id"]]
    .drop_duplicates(subset=["date"], keep="first")
    .fillna(0)  # All non-event days are filled with 0
)

# Make event_id into integer
events["event_id"] = events["event_id"].astype(int)

# Iterate over dimensions and binarize based on the threshold defined in the top (for Bence)
for dim in dims:
    # Index threshold from thresholds dataframe
    threshold = thresholds.loc[dim]["threshold"]
    # Overwrites each dimension with a binary value
    dataset_raw[dim] = dataset_raw[dim].apply(lambda x: 1 if x > threshold else 0)

# Count number of scores above threshold for each day for each dimension
posts_per_day = dataset_raw.groupby(["date"]).sum()
# Remove columns that are not dimensions
posts_per_day = posts_per_day.drop(["High", "Low", "Mean", "event_id"], axis=1)

# Group event posts by date (summed)
# Basically assigns a count of posts containing a given dimension for each day
cols = ["date"] + dims
dataset_raw = dataset_raw[cols].groupby("date").mean().sort_index()


# Apply rolling average, essentially, iterating over the days,
# taking the average over the previous 3 days and 3 next days
# and assigning this value
# The first and last 3 days will be empty as they don't have enough preceding og succeeding days.
#### These days are dropped
dataset_out = (
    dataset_raw.rolling(num_days_rolling_average, center=True)
    .mean()
    .dropna()
    .sort_index()
)

# Create standard deviation dataframe for each dimension
dataset_out_sd = (
    dataset_raw.rolling(num_days_rolling_average, center=True)
    .std()
    .dropna()
    .sort_index()
)

# Iterating over the dimensions, calculating the mean and std in order to calculate the z-score
# z_score(x) = (x-mean(X))/std(X)
for dim in dims:
    # Get mu and std for each dimension
    mu = dataset_out[dim].mean()
    std = dataset_out[dim].std()
    # Get mu and std for standard deviation dataframe
    mu_sd = dataset_out_sd[dim].mean()
    std_sd = dataset_out_sd[dim].std()

    # Check if any values over the threshold for a dimension - if not we would accidentally divide by zero
    # Only an issue for identity
    if std:
        # Calculate z-score and overwrite value for each dimension
        dataset_out[dim] = dataset_out[dim].apply(lambda x: (x - mu) / std)
        # Calculate z-score for standard deviation dataframe
        dataset_out_sd[dim] = dataset_out_sd[dim].apply(lambda x: (x - mu_sd) / std_sd)

    else:
        # Drop empty dimensions
        dataset_out.drop(dim, axis=1, inplace=True)
        dataset_out_sd.drop(dim, axis=1, inplace=True)
        posts_per_day.drop(dim, axis=1, inplace=True)
        print("Dropping", dim)

# Merge with standard deviation dataframe
dataset_out = dataset_out.merge(
    dataset_out_sd, on="date", suffixes=("", "_sd"), how="left"
)

# Merge with posts per day dataframe
dataset_out = dataset_out.merge(
    posts_per_day, on="date", suffixes=("", "_n_posts"), how="left"
)

# Merge with event data
dataset_out = dataset_out.merge(events, on="date", how="left")

# Save the dataset



Columns (17,18) have mixed types. Specify dtype option on import or set low_memory=False.



In [238]:
dataset_raw = dataset_raw.reset_index()
dataset_raw.head()

Unnamed: 0,date,conflict,similarity,knowledge,power,status,support,identity,romance,trust,fun
0,2022-01-02,0.050562,0.078652,0.752809,0.016854,0.073034,0.140449,0.011236,0.0,0.157303,0.061798
1,2022-01-03,0.0325,0.0725,0.6475,0.0325,0.0575,0.09,0.0225,0.005,0.1325,0.055
2,2022-01-04,0.03,0.0675,0.6,0.0225,0.04,0.1225,0.03,0.0,0.1425,0.05
3,2022-01-05,0.0575,0.0625,0.605,0.0275,0.05,0.1425,0.03,0.0,0.16,0.0625
4,2022-01-06,0.0675,0.08,0.6725,0.0225,0.06,0.105,0.0525,0.005,0.13,0.0675


In [239]:
final_zscores = df.merge(dataset_raw, on='date')[['date','conflict','conflict_zscore','event_type','event_id']]

In [240]:
final_zscores.sample(n=5, random_state=1)

Unnamed: 0,date,conflict,conflict_zscore,event_type,event_id
67,2022-03-13,0.033333,-0.785828,positive,1
251,2022-09-14,0.059701,-0.912274,0,0
231,2022-08-25,0.036036,0.684217,0,0
161,2022-06-16,0.0925,0.864157,negative,4
91,2022-04-06,0.041885,-0.926173,0,0


In [241]:


print(final_zscores.sample(n=5, random_state=1).to_latex(index=False))  

\begin{tabular}{lrrlr}
\toprule
      date &  conflict &  conflict\_zscore & event\_type &  event\_id \\
\midrule
2022-03-13 &  0.033333 &        -0.785828 &   positive &         1 \\
2022-09-14 &  0.059701 &        -0.912274 &          0 &         0 \\
2022-08-25 &  0.036036 &         0.684217 &          0 &         0 \\
2022-06-16 &  0.092500 &         0.864157 &   negative &         4 \\
2022-04-06 &  0.041885 &        -0.926173 &          0 &         0 \\
\bottomrule
\end{tabular}




In future versions `DataFrame.to_latex` is expected to utilise the base implementation of `Styler.to_latex` for formatting and rendering. The arguments signature may therefore change. It is recommended instead to use `DataFrame.style.to_latex` which also contains additional functionality.



In [242]:
df_raw = pd.read_table('full_sample.tsv')


Columns (69) have mixed types. Specify dtype option on import or set low_memory=False.



In [243]:
df_raw.shape


(13955, 103)

In [244]:
df_raw

Unnamed: 0,all_awardings,allow_live_comments,author,author_flair_background_color,author_flair_css_class,author_flair_richtext,author_flair_text,author_flair_text_color,author_flair_type,author_fullname,...,date,author_cakeday,banned_by,edited,event_end,event_is_live,event_start,poll_data,gilded,text_length
0,[],False,Suitable_Advisor7013,,noob,"[{'e': 'text', 't': 'redditor for 3 weeks'}]",redditor for 3 weeks,dark,richtext,t2_soht6lvt,...,2022_10_19,,,,,,,,,23
1,[],False,Suitable_Advisor7013,,noob,"[{'e': 'text', 't': 'redditor for 3 weeks'}]",redditor for 3 weeks,dark,richtext,t2_soht6lvt,...,2022_10_19,,,,,,,,,20
2,[],False,April-180,,,[],,,text,t2_n8fh62ey,...,2022_10_19,,,,,,,,,22
3,[],False,BerryInitial,,,[],,,text,t2_bkqewhn7,...,2022_10_19,,,,,,,,,25
4,[],False,paymefordoingnothing,,,[],,,text,t2_drdbd5e1,...,2022_10_19,,,,,,,,,22
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13950,[],False,coachhunter,,Gentleman,"[{'e': 'text', 't': 'XRP Hodler'}]",XRP Hodler,dark,richtext,t2_1hjxby03,...,2022_04_12,,,,,,,,,23
13951,[],False,kabrinjohn,,,[],,,text,t2_hbazjsp7,...,2022_04_12,,,,,,,,,41
13952,[],False,Responsible_Fig_878,,,[],,,text,t2_365si1dt,...,2022_04_12,,,,,,,,,22
13953,[],False,Just_Serve4239,,,[],,,text,t2_j176clvt,...,2022_04_12,,,,,,,,,36


In [245]:
from datetime import datetime
df_raw[['corpus','author','knowledge','date']]
df_raw['date'] =  pd.to_datetime(df_raw['date'], format='%Y_%m_%d')
#  - [ ]  post text
#- [ ]  userID
#- [ ]  Ten dimensions
#- [ ]  date
#- [ ]  Bitcoin closing price
#- [ ]  bot-categorical

In [246]:
bitcoin_prices = pd.read_csv('./data/coins/coinmarket-BTC.csv')
bitcoin_prices['Date'] =  pd.to_datetime(bitcoin_prices['Date'], format='%Y-%m-%d')

bitcoin_prices.columns.tolist()

['Unnamed: 0', 'Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Market Cap']

In [247]:
merged = df_raw.merge(bitcoin_prices, left_on='date', right_on='Date')

In [248]:
final_raw = merged[['id','corpus','author','knowledge','date','Close']].sample(n=8, random_state=1)


In [249]:
bots_detection = pd.read_csv('data/final_datasets/bot_annotated_dataset.csv',sep=';')


Columns (19,20,21,22,23) have mixed types. Specify dtype option on import or set low_memory=False.



In [250]:
bots_detection.columns

Index(['date', 'id', 'corpus', 'text_type', 'conflict', 'similarity',
       'knowledge', 'power', 'status', 'support', 'identity', 'romance',
       'trust', 'fun', 'High', 'Low', 'Mean', 'Close', 'event_id',
       'event_type', 'period_type', 'lnbot', 'zelcore', 'mod'],
      dtype='object')

In [251]:
bots_detection = bots_detection[['id','lnbot']]

In [252]:
final_raw = final_raw.merge(bots_detection, on='id')

In [253]:
final_raw = final_raw[['corpus','author','knowledge','date','Close','lnbot']]


In [254]:
final_raw.columns = ['text','userID','knowledge','date','Close','lnbot']
final_raw['text'] = final_raw['text'].str[:24]


In [255]:


print(final_raw.to_latex(index=False))  


\begin{tabular}{llrlrl}
\toprule
                    text &               userID &  knowledge &       date &        Close & lnbot \\
\midrule
what is your opinion abo & InternationalSilver1 &   0.898398 & 2022-04-25 & 40458.308965 & False \\
 1 year ago today el sal &    kamaldeepsinghSEO &   0.995376 & 2022-06-10 & 29083.804721 & False \\
how often do you transfe &               2ez305 &   0.917202 & 2022-03-21 & 41077.997992 & False \\
you know what they say a &            Phemmy988 &   0.656172 & 2022-01-24 & 36654.329183 & False \\
based on the current mar &            hey\_oh\_hi &   0.964692 & 2022-03-04 & 39137.603639 & False \\
i have withdrawn some bi & Any-Philosopher-4061 &   0.976274 & 2022-02-19 & 40122.154766 &  True \\
\bottomrule
\end{tabular}




In future versions `DataFrame.to_latex` is expected to utilise the base implementation of `Styler.to_latex` for formatting and rendering. The arguments signature may therefore change. It is recommended instead to use `DataFrame.style.to_latex` which also contains additional functionality.



In [256]:
dataset_raw

Unnamed: 0,date,conflict,similarity,knowledge,power,status,support,identity,romance,trust,fun
0,2022-01-02,0.050562,0.078652,0.752809,0.016854,0.073034,0.140449,0.011236,0.000000,0.157303,0.061798
1,2022-01-03,0.032500,0.072500,0.647500,0.032500,0.057500,0.090000,0.022500,0.005000,0.132500,0.055000
2,2022-01-04,0.030000,0.067500,0.600000,0.022500,0.040000,0.122500,0.030000,0.000000,0.142500,0.050000
3,2022-01-05,0.057500,0.062500,0.605000,0.027500,0.050000,0.142500,0.030000,0.000000,0.160000,0.062500
4,2022-01-06,0.067500,0.080000,0.672500,0.022500,0.060000,0.105000,0.052500,0.005000,0.130000,0.067500
...,...,...,...,...,...,...,...,...,...,...,...
253,2022-09-13,0.018634,0.031056,0.745342,0.018634,0.055901,0.105590,0.074534,0.000000,0.204969,0.062112
254,2022-09-14,0.059701,0.044776,0.632836,0.035821,0.038806,0.086567,0.056716,0.005970,0.110448,0.062687
255,2022-09-16,0.037383,0.009346,0.691589,0.018692,0.056075,0.074766,0.028037,0.000000,0.093458,0.056075
256,2022-09-17,0.016000,0.064000,0.736000,0.040000,0.040000,0.056000,0.040000,0.000000,0.208000,0.048000
