In [None]:
import pickle
from matplotlib import pyplot as plt

In [None]:
# Read the pickled election dataset from disk.
# NOTE: unpickling can execute arbitrary code, so only load files you trust.
with open('data/twitter/election_dataset.pickle', mode='rb') as dataset_file:
    twitter = pickle.load(dataset_file)

In [None]:
# Pull the four tweet collections out of the loaded dataset.
biden = twitter['biden']
trump = twitter['trump']
# The `_fe` collections are the ones labelled "foreign" in the plot titles below.
biden_fe = twitter['biden_fe']
trump_fe = twitter['trump_fe']

In [None]:
def _split_on_mention(entries, keyword):
    """Partition `entries` by whether the text at index 1 contains `keyword`.

    The check is a case-insensitive substring test (the text is lower-cased
    first, so `keyword` must be lower-case). Returns the pair
    ``(mentioning, remaining)``; both lists keep the input order.
    """
    mentioning, remaining = [], []
    for entry in entries:
        if keyword in entry[1].lower():
            mentioning.append(entry)
        else:
            remaining.append(entry)
    return mentioning, remaining


# Always split the untouched lists stored in `twitter`, never the already
# filtered `biden` / `trump` variables: that keeps this cell idempotent, so
# re-running it cannot silently empty `combi`.
_biden_both, biden = _split_on_mention(twitter['biden'], 'trump')
_trump_both, trump = _split_on_mention(twitter['trump'], 'biden')
# Tweets naming both candidates: Biden-collection hits first, then Trump-collection hits.
combi = _biden_both + _trump_both

# Same split for the `_fe` collections.
_biden_fe_both, biden_fe = _split_on_mention(twitter['biden_fe'], 'trump')
_trump_fe_both, trump_fe = _split_on_mention(twitter['trump_fe'], 'biden')
combi_fe = _biden_fe_both + _trump_fe_both

In [None]:
import sys
import os
from embedding import BertHuggingface

In [None]:
# Shared encoder instance used for every embedding computed below.
# NOTE(review): the meaning of the first positional argument (8) is defined by
# BertHuggingface in the local `embedding` module; confirm it there.
bert = BertHuggingface(
    8,
    batch_size=8,
    model_name='bert-base-multilingual-cased',
)

In [None]:
def embed(data):
    """Embed the texts of `data` with the global `bert` encoder.

    `data` is an iterable of ``(time, text)`` pairs. All texts are passed to
    ``bert.embed`` in a single call, and each result is paired with the time
    of the entry it came from.

    Returns a list of ``(time, embedding)`` pairs in input order.
    """
    times, tweets = zip(*data)
    return list(zip(times, bert.embed(tweets)))

In [None]:
pickle_path = 'data/twitter/biden_{}embeddings.pickle'

# Biden tweets: reuse the cached embeddings when present, otherwise compute
# them once and write the cache.
biden_cache = pickle_path.format('')
if os.path.exists(biden_cache):
    with open(biden_cache, 'rb') as handle:
        embs_biden = pickle.load(handle)
else:
    embs_biden = embed(biden)
    with open(biden_cache, 'wb') as handle:
        pickle.dump(embs_biden, handle)
print('biden embedding done...')

# Same caching scheme for the `_fe` Biden tweets.
# NOTE: this cell used to write `embs_biden` into the `fe_` cache file, so a
# 'data/twitter/biden_fe_embeddings.pickle' produced by the old code holds the
# wrong embeddings. Delete that file once so it is regenerated.
biden_fe_cache = pickle_path.format('fe_')
if os.path.exists(biden_fe_cache):
    with open(biden_fe_cache, 'rb') as handle:
        embs_biden_fe = pickle.load(handle)
else:
    embs_biden_fe = embed(biden_fe)
    with open(biden_fe_cache, 'wb') as handle:
        pickle.dump(embs_biden_fe, handle)
print('biden_fe embedding done...')

In [None]:
pickle_path = 'data/twitter/trump_{}embeddings.pickle'

# Trump tweets: reuse the cached embeddings when present, otherwise compute
# them once and write the cache.
trump_cache = pickle_path.format('')
if os.path.exists(trump_cache):
    with open(trump_cache, 'rb') as handle:
        embs_trump = pickle.load(handle)
else:
    embs_trump = embed(trump)
    with open(trump_cache, 'wb') as handle:
        pickle.dump(embs_trump, handle)
print('trump embedding done...')

# Same caching scheme for the `_fe` Trump tweets.
trump_fe_cache = pickle_path.format('fe_')
if os.path.exists(trump_fe_cache):
    with open(trump_fe_cache, 'rb') as handle:
        embs_trump_fe = pickle.load(handle)
else:
    embs_trump_fe = embed(trump_fe)
    with open(trump_fe_cache, 'wb') as handle:
        pickle.dump(embs_trump_fe, handle)
# Report the stage that actually finished (previously repeated the non-fe message).
print('trump_fe embedding done...')

In [None]:
pickle_path = 'data/twitter/combi_{}embeddings.pickle'

# Tweets naming both candidates: compute and cache on the first run, load afterwards.
combi_cache = pickle_path.format('')
if not os.path.exists(combi_cache):
    embs_combi = embed(combi)
    with open(combi_cache, 'wb') as handle:
        pickle.dump(embs_combi, handle)
else:
    with open(combi_cache, 'rb') as handle:
        embs_combi = pickle.load(handle)
print('combi embedding done...')

# Same caching scheme for the `_fe` variant.
combi_fe_cache = pickle_path.format('fe_')
if not os.path.exists(combi_fe_cache):
    embs_combi_fe = embed(combi_fe)
    with open(combi_fe_cache, 'wb') as handle:
        pickle.dump(embs_combi_fe, handle)
else:
    with open(combi_fe_cache, 'rb') as handle:
        embs_combi_fe = pickle.load(handle)
print('combi foreign embedding done...')

In [None]:
import datetime
import numpy as np

def normalized(a, axis=-1, order=2):
    """Scale `a` to unit norm along `axis`.

    The norm of the given `order` is taken along `axis`. Zero norms are
    replaced by 1, so all-zero vectors come back unchanged rather than as NaN.
    The norms are re-expanded along `axis` before dividing, so a 1-D input
    returns with an extra leading dimension of size 1.
    """
    norms = np.atleast_1d(np.linalg.norm(a, ord=order, axis=axis))
    safe_norms = np.where(norms == 0, 1, norms)
    return a / np.expand_dims(safe_norms, axis)

def create_moving_average(dataset, timeframe='hours'):
    """Average direction of the embeddings in consecutive time bins.

    Args:
        dataset: sequence of ``(timestamp, embedding)`` pairs.
        timeframe: ``'hours'`` for one-hour bins; any other value gives
            24-hour bins.

    Returns:
        A list with one unit-normalised vector per non-empty bin, in
        chronological order. Empty bins are skipped, so a list index equals
        the bin number only when no earlier bin was empty. An empty `dataset`
        yields an empty list.

    Bins start at the earliest timestamp and are half-open,
    ``[start, start + width)``, so an entry exactly on a bin edge (including
    the earliest entry) is counted exactly once. The bin count comes from the
    whole days between the earliest and latest timestamp, so entries in a
    trailing partial day are not covered.
    """
    if len(dataset) == 0:
        return []

    hourly = timeframe == 'hours'
    bin_width = datetime.timedelta(hours=1 if hourly else 24)
    stamps = [entry[0] for entry in dataset]
    min_date, max_date = min(stamps), max(stamps)
    n_bins = (max_date - min_date).days * (24 if hourly else 1)

    # One pass over the data: keep a running sum of normalised vectors per bin
    # instead of rescanning the whole dataset for every bin.
    totals = [None] * n_bins
    for entry in dataset:
        slot = (entry[0] - min_date) // bin_width
        if slot >= n_bins:
            continue
        unit = normalized(entry[1])
        totals[slot] = unit if totals[slot] is None else totals[slot] + unit

    return [normalized(total) for total in totals if total is not None]

def count_entries(dataset, timeframe='hours', log=False):
    """Count the entries of `dataset` in consecutive time bins.

    Args:
        dataset: sequence of entries whose first element is a timestamp.
        timeframe: ``'hours'`` for one-hour bins; any other value gives
            24-hour bins.
        log: when true, each non-zero count is replaced by its natural
            logarithm; zero counts stay 0.

    Returns:
        A list with one value per bin, in chronological order. An empty
        `dataset` yields an empty list.

    Bins start at the earliest timestamp and are half-open,
    ``[start, start + width)``, so an entry exactly on a bin edge (including
    the earliest entry) is counted exactly once. The bin count comes from the
    whole days between the earliest and latest timestamp, so entries in a
    trailing partial day are not counted.
    """
    if len(dataset) == 0:
        return []

    hourly = timeframe == 'hours'
    bin_width = datetime.timedelta(hours=1 if hourly else 24)
    stamps = [entry[0] for entry in dataset]
    min_date, max_date = min(stamps), max(stamps)
    n_bins = (max_date - min_date).days * (24 if hourly else 1)

    # One pass over the timestamps instead of rescanning the dataset per bin.
    tallies = [0] * n_bins
    for stamp in stamps:
        slot = (stamp - min_date) // bin_width
        if slot < n_bins:
            tallies[slot] += 1

    if not log:
        return tallies
    return [np.log(tally) if tally > 0 else tally for tally in tallies]

In [None]:
import sklearn
# Import the submodule explicitly: `import sklearn` alone is not guaranteed to
# expose `sklearn.metrics.pairwise` on a fresh kernel.
import sklearn.metrics.pairwise


def compute_cosine_similarities(X):
    """Cosine similarity of every element of `X` to its first element.

    Each element of `X` is passed directly to scikit-learn's
    ``cosine_similarity`` together with ``X[0]``, and the result is reduced
    with ``.item()``, so every call must yield exactly one value.

    Returns a list of Python floats with the same length as `X`; the first
    entry is the similarity of ``X[0]`` to itself. An empty `X` gives ``[]``.
    """
    if len(X) == 0:
        return []

    reference = X[0]
    return [
        sklearn.metrics.pairwise.cosine_similarity(vector, reference).item()
        for vector in X
    ]

### Tweet count by time

In [None]:
# Biden tweet volume: daily counts in the top panel, hourly counts below.
fig, axs = plt.subplots(nrows=2, ncols=1, figsize=(8,8), constrained_layout=True)
panels = (
    ('days', 'Count of Biden tweets by day'),
    ('hours', 'Count of Biden tweets by hour'),
)
for ax, (frame, title) in zip(axs, panels):
    ax.plot(count_entries(biden, timeframe=frame))
    ax.set_title(title)
plt.show()

In [None]:
# Trump tweet volume: daily counts in the top panel, hourly counts below.
fig, axs = plt.subplots(nrows=2, ncols=1, figsize=(8,8), constrained_layout=True)
panels = (
    ('days', 'Count of Trump tweets by day'),
    ('hours', 'Count of Trump tweets by hour'),
)
for ax, (frame, title) in zip(axs, panels):
    ax.plot(count_entries(trump, timeframe=frame))
    ax.set_title(title)
plt.show()

In [None]:
# Foreign Biden tweet volume: daily counts in the top panel, hourly counts below.
fig, axs = plt.subplots(nrows=2, ncols=1, figsize=(8,8), constrained_layout=True)
panels = (
    ('days', 'Count of foreign Biden tweets by day'),
    ('hours', 'Count of foreign Biden tweets by hour'),
)
for ax, (frame, title) in zip(axs, panels):
    ax.plot(count_entries(biden_fe, timeframe=frame))
    ax.set_title(title)
plt.show()

In [None]:
# Foreign Trump tweet volume: daily counts in the top panel, hourly counts below.
fig, axs = plt.subplots(nrows=2, ncols=1, figsize=(8,8), constrained_layout=True)
panels = (
    ('days', 'Count of foreign Trump tweets by day'),
    ('hours', 'Count of foreign Trump tweets by hour'),
)
for ax, (frame, title) in zip(axs, panels):
    ax.plot(count_entries(trump_fe, timeframe=frame))
    ax.set_title(title)
plt.show()

In [None]:
# Volume of tweets naming both candidates: daily counts on top, hourly counts below.
fig, axs = plt.subplots(nrows=2, ncols=1, figsize=(8,8), constrained_layout=True)
panels = (
    ('days', 'Count of Combi tweets by day'),
    ('hours', 'Count of Combi tweets by hour'),
)
for ax, (frame, title) in zip(axs, panels):
    ax.plot(count_entries(combi, timeframe=frame))
    ax.set_title(title)
plt.show()

In [None]:
# Volume of foreign tweets naming both candidates: daily counts on top, hourly counts below.
fig, axs = plt.subplots(nrows=2, ncols=1, figsize=(8,8), constrained_layout=True)
panels = (
    ('days', 'Count of foreign Combi tweets by day'),
    ('hours', 'Count of foreign Combi tweets by hour'),
)
for ax, (frame, title) in zip(axs, panels):
    ax.plot(count_entries(combi_fe, timeframe=frame))
    ax.set_title(title)
plt.show()

In [None]:
# Log-scaled tweet counts of all six collections overlaid: daily on top, hourly below.
fig, axs = plt.subplots(nrows=2, ncols=1, figsize=(8,8), constrained_layout=True)
# (collection, line colour), listed in the order the lines are drawn.
coloured_collections = (
    (biden, 'xkcd:blue'),
    (trump, 'xkcd:red'),
    (biden_fe, 'xkcd:light blue'),
    (trump_fe, 'xkcd:light red'),
    (combi, 'xkcd:green'),
    (combi_fe, 'xkcd:light green'),
)
panels = (
    ('days', 'Count of tweets by day'),
    ('hours', 'Count of tweets by hour'),
)
for ax, (frame, title) in zip(axs, panels):
    for collection, shade in coloured_collections:
        ax.plot(count_entries(collection, timeframe=frame, log=True), color=shade)
    ax.set_title(title)
plt.show()


# Interesting points in time:
#### Day 20: Day of the election (2020-11-04)
What to expect:
- More references to voting and appeals to go and vote
- More "I voted (blue | red | democrat | republican | biden | trump)", "Vote (biden | trump) today!", etc.

#### Day 8: Last TV debate (2020-10-23 01:00) (GMT-2 apparently)
- More references to TV and debating
- More references to points and topics that came up in the debate, such as Corona, Iran & Russia (election collusion), corruption (Ukraine), North Korea, healthcare, immigration, racism, and climate (source [here](https://www.dw.com/en/trump-and-biden-square-off-in-final-debate-how-it-went/a-55364624))

### Cosine similarities by time

In [None]:
# Mean embedding per hour and per day for the Biden tweets.
biden_ma_hours = create_moving_average(embs_biden, timeframe='hours')
biden_ma_days = create_moving_average(embs_biden, timeframe='days')

# Similarity of every bin to the first bin of its own series.
biden_cos_hours = compute_cosine_similarities(biden_ma_hours)
biden_cos_days = compute_cosine_similarities(biden_ma_days)

In [None]:
# Biden similarity curves: daily bins on top, hourly bins below.
fig, axs = plt.subplots(nrows=2, ncols=1, figsize=(8,8), constrained_layout=True)
panels = (
    (biden_cos_days, 'Similarity of Biden embeddings by day'),
    (biden_cos_hours, 'Similarity of Biden embeddings by hour'),
)
for ax, (series, title) in zip(axs, panels):
    ax.plot(series)
    ax.set_title(title)
plt.show()

In [None]:
# Mean embedding per hour and per day for the foreign Biden tweets.
biden_ma_hours_fe = create_moving_average(embs_biden_fe, timeframe='hours')
biden_ma_days_fe = create_moving_average(embs_biden_fe, timeframe='days')

# Similarity of every bin to the first bin of its own series.
biden_cos_hours_fe = compute_cosine_similarities(biden_ma_hours_fe)
biden_cos_days_fe = compute_cosine_similarities(biden_ma_days_fe)

In [None]:
# Foreign Biden similarity curves: daily bins on top, hourly bins below.
fig, axs = plt.subplots(nrows=2, ncols=1, figsize=(8,8), constrained_layout=True)
panels = (
    (biden_cos_days_fe, 'Similarity of foreign Biden embeddings by day'),
    (biden_cos_hours_fe, 'Similarity of foreign Biden embeddings by hour'),
)
for ax, (series, title) in zip(axs, panels):
    ax.plot(series)
    ax.set_title(title)
plt.show()

## Trump embeddings

In [None]:
# Mean embedding per hour and per day for the Trump tweets.
trump_ma_hours = create_moving_average(embs_trump, timeframe='hours')
trump_ma_days = create_moving_average(embs_trump, timeframe='days')

# Similarity of every bin to the first bin of its own series.
trump_cos_hours = compute_cosine_similarities(trump_ma_hours)
trump_cos_days = compute_cosine_similarities(trump_ma_days)

In [None]:
# Trump similarity curves: daily bins on top, hourly bins below.
fig, axs = plt.subplots(nrows=2, ncols=1, figsize=(8,8), constrained_layout=True)
panels = (
    (trump_cos_days, 'Similarity of Trump embeddings by day'),
    (trump_cos_hours, 'Similarity of Trump embeddings by hour'),
)
for ax, (series, title) in zip(axs, panels):
    ax.plot(series)
    ax.set_title(title)
plt.show()

In [None]:
# Mean embedding per hour and per day for the foreign Trump tweets.
trump_ma_hours_fe = create_moving_average(embs_trump_fe, timeframe='hours')
trump_ma_days_fe = create_moving_average(embs_trump_fe, timeframe='days')

# Similarity of every bin to the first bin of its own series.
trump_cos_hours_fe = compute_cosine_similarities(trump_ma_hours_fe)
trump_cos_days_fe = compute_cosine_similarities(trump_ma_days_fe)

In [None]:
# Foreign Trump similarity curves: daily bins on top, hourly bins below.
fig, axs = plt.subplots(nrows=2, ncols=1, figsize=(8,8), constrained_layout=True)
panels = (
    (trump_cos_days_fe, 'Similarity of foreign Trump embeddings by day'),
    (trump_cos_hours_fe, 'Similarity of foreign Trump embeddings by hour'),
)
for ax, (series, title) in zip(axs, panels):
    ax.plot(series)
    ax.set_title(title)
plt.show()

## Combined tweet similarities

In [None]:
# Mean embedding per hour and per day for the tweets naming both candidates.
combi_ma_hours = create_moving_average(embs_combi, timeframe='hours')
combi_ma_days = create_moving_average(embs_combi, timeframe='days')

# Similarity of every bin to the first bin of its own series.
combi_cos_hours = compute_cosine_similarities(combi_ma_hours)
combi_cos_days = compute_cosine_similarities(combi_ma_days)

In [None]:
# Combi similarity curves: daily bins on top, hourly bins below.
fig, axs = plt.subplots(nrows=2, ncols=1, figsize=(8,8), constrained_layout=True)
panels = (
    (combi_cos_days, 'Similarity of Combi embeddings by day'),
    (combi_cos_hours, 'Similarity of Combi embeddings by hour'),
)
for ax, (series, title) in zip(axs, panels):
    ax.plot(series)
    ax.set_title(title)
plt.show()

In [None]:
# Mean embedding per hour and per day for the foreign tweets naming both candidates.
combi_ma_hours_fe = create_moving_average(embs_combi_fe, timeframe='hours')
combi_ma_days_fe = create_moving_average(embs_combi_fe, timeframe='days')

# Similarity of every bin to the first bin of its own series.
combi_cos_hours_fe = compute_cosine_similarities(combi_ma_hours_fe)
combi_cos_days_fe = compute_cosine_similarities(combi_ma_days_fe)

In [None]:
# Foreign Combi similarity curves: daily bins on top, hourly bins below.
fig, axs = plt.subplots(nrows=2, ncols=1, figsize=(8,8), constrained_layout=True)
panels = (
    (combi_cos_days_fe, 'Similarity of foreign Combi embeddings by day'),
    (combi_cos_hours_fe, 'Similarity of foreign Combi embeddings by hour'),
)
for ax, (series, title) in zip(axs, panels):
    ax.plot(series)
    ax.set_title(title)
plt.show()

## All in one

In [None]:
# All six similarity curves overlaid: daily bins on top, hourly bins below.
fig, axs = plt.subplots(nrows=2, ncols=1, figsize=(8,8), constrained_layout=True)
# (series, line colour) per panel, listed in the order the lines are drawn.
daily_lines = (
    (biden_cos_days, 'xkcd:blue'),
    (trump_cos_days, 'xkcd:red'),
    (biden_cos_days_fe, 'xkcd:light blue'),
    (trump_cos_days_fe, 'xkcd:light red'),
    (combi_cos_days, 'xkcd:green'),
    (combi_cos_days_fe, 'xkcd:light green'),
)
hourly_lines = (
    (biden_cos_hours, 'xkcd:blue'),
    (trump_cos_hours, 'xkcd:red'),
    (biden_cos_hours_fe, 'xkcd:light blue'),
    (trump_cos_hours_fe, 'xkcd:light red'),
    (combi_cos_hours, 'xkcd:green'),
    (combi_cos_hours_fe, 'xkcd:light green'),
)
panels = (
    (daily_lines, 'Similarity of tweets by day'),
    (hourly_lines, 'Similarity of tweets by hour'),
)
for ax, (lines, title) in zip(axs, panels):
    for series, shade in lines:
        ax.plot(series, color=shade)
    ax.set_title(title)
plt.show()


#### inspection

In [None]:
# Positions in the hourly Trump series of the first and third value below 0.9825.
below_threshold = [value for value in trump_cos_hours if value < 0.9825]
print(trump_cos_hours.index(below_threshold[0]))
print(trump_cos_hours.index(below_threshold[2]))

In [None]:
# Turn two hour offsets into timestamps, counted from the earliest Trump embedding.
first_trump_time = min(entry[0] for entry in embs_trump)
for hour_offset in (192, 486):
    print(first_trump_time + datetime.timedelta(hours=hour_offset))

In [None]:
def find_max_tweets(data, amount=5):
    """Return the positions of the `amount` largest values in ``data[1:]``.

    Position 0 is never returned (the callers in this notebook store a
    reference value there). The returned indices refer to `data` itself, so
    they are always >= 1, and are ordered from largest to smallest value.

    Ties keep their original order and every index appears at most once.
    If fewer than `amount` candidates exist, all of them are returned instead
    of raising.
    """
    candidates = range(1, len(data))
    # sorted() is stable even with reverse=True, so equal values stay in index order.
    ranked = sorted(candidates, key=lambda position: data[position], reverse=True)
    return ranked[:max(amount, 0)]

In [None]:
# Cosine similarity of every Trump tweet in the one-hour window starting 100
# hours after the first embedding, measured against entry 100 of the hourly
# mean series, shown as a histogram.
# NOTE(review): create_moving_average skips empty hours, so entry 100 matches
# hour offset 100 only if none of the earlier hours was empty.
d = min([x[0] for x in embs_trump]) + datetime.timedelta(hours=100)
window_end = d + datetime.timedelta(hours=1)
points = [normalized(x[1]) for x in embs_trump if x[0] > d and x[0] < window_end]
# Position 0 holds the reference vector; the tweets of the window follow.
combi_100 = [trump_ma_hours[100]]
combi_100.extend(points)
trump_100 = compute_cosine_similarities(combi_100)
plt.hist(trump_100[1:])

# Print the tweets with the highest similarity to the reference.
# Assumes `trump` and `embs_trump` hold the same tweets in the same order.
hour_tweets = [x for x in trump if x[0] > d and x[0] < window_end]
five_max = find_max_tweets(trump_100, amount=5)
for position in five_max:
    # trump_100 is shifted by one (the reference sits at position 0), so
    # similarity `position` belongs to tweet `position - 1`.
    print(hour_tweets[position - 1][1], trump_100[position],'\n')

In [None]:
# Cosine similarity of every Trump tweet in the one-hour window starting 480
# hours after the first embedding, measured against entry 480 of the hourly
# mean series, shown as a histogram.
# NOTE(review): create_moving_average skips empty hours, so entry 480 matches
# hour offset 480 only if none of the earlier hours was empty.
d = min([x[0] for x in embs_trump]) + datetime.timedelta(hours=480)
window_end = d + datetime.timedelta(hours=1)
points = [normalized(x[1]) for x in embs_trump if x[0] > d and x[0] < window_end]
# Position 0 holds the reference vector; the tweets of the window follow.
combi_480 = [trump_ma_hours[480]]
combi_480.extend(points)
trump_480 = compute_cosine_similarities(combi_480)
plt.hist(trump_480[1:])

# Print the tweets with the highest similarity to the reference.
# Assumes `trump` and `embs_trump` hold the same tweets in the same order.
hour_tweets = [x for x in trump if x[0] > d and x[0] < window_end]
five_max = find_max_tweets(trump_480, amount=5)
for position in five_max:
    # trump_480 is shifted by one (the reference sits at position 0), so
    # similarity `position` belongs to tweet `position - 1`.
    print(hour_tweets[position - 1][1], trump_480[position],'\n')


##### same with biden

In [None]:
# Cosine similarity of every Biden tweet in the one-hour window starting 100
# hours after the first embedding, measured against entry 100 of the hourly
# mean series, shown as a histogram.
# NOTE(review): create_moving_average skips empty hours, so entry 100 matches
# hour offset 100 only if none of the earlier hours was empty.
d = min([x[0] for x in embs_biden]) + datetime.timedelta(hours=100)
window_end = d + datetime.timedelta(hours=1)
points = [normalized(x[1]) for x in embs_biden if x[0] > d and x[0] < window_end]
# Position 0 holds the reference vector; the tweets of the window follow.
combi_100 = [biden_ma_hours[100]]
combi_100.extend(points)
biden_100 = compute_cosine_similarities(combi_100)
plt.hist(biden_100[1:])

# Print the tweets with the highest similarity to the reference.
# Assumes `biden` and `embs_biden` hold the same tweets in the same order.
hour_tweets = [x for x in biden if x[0] > d and x[0] < window_end]
five_max = find_max_tweets(biden_100, amount=5)
for position in five_max:
    # biden_100 is shifted by one (the reference sits at position 0), so
    # similarity `position` belongs to tweet `position - 1`.
    print(hour_tweets[position - 1][1], biden_100[position],'\n')

In [None]:
# Cosine similarity of every Biden tweet in the one-hour window starting 480
# hours after the first embedding, measured against entry 480 of the hourly
# mean series, shown as a histogram.
# NOTE(review): create_moving_average skips empty hours, so entry 480 matches
# hour offset 480 only if none of the earlier hours was empty.
d = min([x[0] for x in embs_biden]) + datetime.timedelta(hours=480)
window_end = d + datetime.timedelta(hours=1)
points = [normalized(x[1]) for x in embs_biden if x[0] > d and x[0] < window_end]
# Position 0 holds the reference vector; the tweets of the window follow.
combi_480 = [biden_ma_hours[480]]
combi_480.extend(points)
biden_480 = compute_cosine_similarities(combi_480)
plt.hist(biden_480[1:])

# Print the tweets with the highest similarity to the reference.
# Assumes `biden` and `embs_biden` hold the same tweets in the same order.
hour_tweets = [x for x in biden if x[0] > d and x[0] < window_end]
five_max = find_max_tweets(biden_480, amount=5)
for position in five_max:
    # biden_480 is shifted by one (the reference sits at position 0), so
    # similarity `position` belongs to tweet `position - 1`.
    print(hour_tweets[position - 1][1], biden_480[position], '\n')

#### some sentiment analysis

In [None]:
import tensorflow_datasets as tfds
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split
import numpy as np

def load_sent_dataset(split = "train"):
    """Load the requested split of the `sentiment140` dataset via tensorflow_datasets.

    File shuffling is switched off. The object returned by ``tfds.load`` is
    passed back unchanged.
    """
    return tfds.load('sentiment140', split=split, shuffle_files=False)

def train_model(X,y,name= "Sent_net"):
    """Return the sentiment network stored at `name`, training it first if needed.

    Args:
        X: array of input samples; the network's input shape is taken from
            one sample.
        y: array of targets, one row per sample.
        name: path under which the Keras model is saved and looked up.

    Returns:
        The Keras model. If `name` already exists on disk the saved model is
        loaded and `X` / `y` are not used.

    A fresh model is a small dense network (100 -> 50 -> 1 units, sigmoid
    output) trained for 5 epochs with RMSprop and MSE loss on a fixed 80/20
    train/validation split (``random_state=42``), then saved to `name`.
    """
    if os.path.exists(name):
        # A saved model exists: skip the split and the training entirely.
        print("loading model", name)
        return keras.models.load_model(name)

    print("training model", name)
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

    model = keras.Sequential()
    model.add(keras.Input(shape=(X_train[0].shape)))
    model.add(layers.Dense(100, activation="relu"))
    model.add(layers.Dense(50, activation="relu"))
    model.add(layers.Dense(1, activation = "sigmoid"))
    model.compile(optimizer="rmsprop",loss='mse')
    model.fit(X_train, y_train, epochs = 5, validation_data = (X_val, y_val), batch_size = 64, verbose = 3)

    model.save(name)
    return model
    
def _as_text(value):
    """Convert a dataset text field to a plain ``str``.

    Objects with a ``.numpy()`` method (such as eager tensors) are unwrapped
    first, and byte strings are decoded as UTF-8. Calling ``str()`` on those
    directly would give their repr (e.g. ``"b'...'"``) instead of the text.
    """
    if hasattr(value, 'numpy'):
        value = value.numpy()
    if isinstance(value, bytes):
        return value.decode('utf-8', errors='replace')
    return str(value)


def preprocess_dataset(ds, encoder, limit=100000, cache_path='data/embeded_twitter_ds.pkl'):
    """Embed the sentiment dataset, caching the result on disk.

    Args:
        ds: dataset exposing ``take(n)``, yielding dict-like rows with
            ``"text"`` and ``"polarity"`` entries.
        encoder: object with an ``embed(list_of_str)`` method.
        limit: maximum number of rows taken from `ds`.
        cache_path: pickle file holding ``[X, y, X_text]``. If it exists, it
            is loaded and `ds` / `encoder` are not touched.

    Returns:
        ``(X, y)``. When computed, both are numpy arrays: `X` holds the
        embeddings and `y` one-element rows with ``polarity / 4.0``.
    """
    if os.path.exists(cache_path):
        # NOTE: only unpickle cache files you created yourself.
        with open(cache_path, 'rb') as file:
            X, y, X_text = pickle.load(file)
        return X, y

    print("preprocessing dataset")
    X_text = []
    y = []
    for dicto in ds.take(limit):
        X_text.append(_as_text(dicto["text"]))
        y.append([float(dicto["polarity"])/4.0])

    X = encoder.embed(X_text)
    X, y = np.asarray(X), np.asarray(y)
    # The context manager closes the file even if pickling fails.
    with open(cache_path, 'wb') as file:
        pickle.dump([X, y, X_text], file)
    return X, y

In [None]:
# Reuse the trained sentiment model when it is already on disk; otherwise
# build the training data, train (requesting the first GPU) and save it.
if os.path.exists('Sent_net_trained'):
    sentiment_model = keras.models.load_model('Sent_net_trained')
else:
    ds = load_sent_dataset()
    X, Y = preprocess_dataset(ds, bert)
    with tf.device('/GPU:0'):
        sentiment_model = train_model(X, Y)
    sentiment_model.save('Sent_net_trained')

In [None]:
def predict_sentiment(data, model):
    """Run `model` on the embeddings in `data` and pair results with their times.

    Args:
        data: sequence of ``(time, embedding)`` pairs (the notebook passes the
            output of ``embed``).
        model: object with a ``predict(array)`` method.

    Returns:
        A list of ``(time, prediction)`` pairs in input order; ``[]`` for
        empty input.

    The embeddings are stacked into one numpy array before prediction, the
    same layout ``preprocess_dataset`` produces for training. Passing the raw
    tuple of per-tweet arrays would be read by Keras as many separate inputs.
    """
    if len(data) == 0:
        return []

    times, embeddings = zip(*data)
    features = np.asarray(embeddings)
    predictions = model.predict(features)
    return list(zip(times, predictions))


In [None]:
# Report how many GPU devices TensorFlow can see.
visible_gpus = tf.config.experimental.list_physical_devices('GPU')
print("Num GPUs Available: ", len(visible_gpus))


In [None]:
# Use one path for the existence check, the read and the write. The check
# used to look at 'data/twitter_sentiments.pickle' while the file lives under
# 'data/twitter/', so the cache was never picked up.
sentiments_path = 'data/twitter/twitter_sentiments.pickle'

if os.path.exists(sentiments_path):
    with open(sentiments_path, 'rb') as handle:
        biden_sentiment, trump_sentiment = pickle.load(handle)
else:
    print('sentimenting biden...')
    biden_sentiment = predict_sentiment(embs_biden, sentiment_model)
    print('sentimenting trump...')
    trump_sentiment = predict_sentiment(embs_trump, sentiment_model)
    with open(sentiments_path, 'wb') as handle:
        pickle.dump((biden_sentiment, trump_sentiment), handle)
    

In [None]:
# Display the first entry of the Trump sentiment list as a quick sanity check
# (predict_sentiment builds these entries as (time, prediction) pairs).
trump_sentiment[0]

### Makeup of the data

In [None]:
# Size of each tweet collection after the split into single-candidate and combined sets.
labelled_collections = (
    ('# Trump tweets:', trump),
    ('# Biden tweets:', biden),
    ('# Both tweets:', combi),
    ('# foreign Trump tweets:', trump_fe),
    ('# foreign Biden tweets:', biden_fe),
    ('# foreign Both tweets:', combi_fe),
)
for label, collection in labelled_collections:
    print(label, len(collection))