In [None]:
import pickle
from matplotlib import pyplot as plt

In [None]:
# Load the pickled tweet collection; later cells read its 'biden' and 'trump' entries.
# NOTE(review): pickle.load can execute arbitrary code - only open files from a trusted source.
with open('data/twitter/election_dataset.pickle', 'rb') as handle:
    twitter = pickle.load(handle)

In [None]:
# Pull the two per-candidate tweet lists out of the loaded collection.
biden = twitter['biden']
trump = twitter['trump']

In [None]:
# Drop every tweet whose text (element 1) mentions the other candidate,
# compared case-insensitively, so each list only covers a single candidate.
biden = list(filter(lambda entry: 'trump' not in entry[1].lower(), biden))
trump = list(filter(lambda entry: 'biden' not in entry[1].lower(), trump))

In [None]:
import sys
import os
from embedding import BertHuggingface

In [None]:
# Embedding model shared by the `embed` helper below (class comes from the local `embedding` module).
# NOTE(review): the meaning of the first positional argument (8) is not visible here - confirm against BertHuggingface.
bert = BertHuggingface(8, model_name='bert-base-multilingual-cased', batch_size=8)

In [None]:
def embed(data):
    """Embed the text of every (time, tweet) pair with the global `bert` model.

    Args:
        data: non-empty sequence of two-element (time, tweet) items.

    Returns:
        list of (time, embedding) tuples, pairing each time with the entry at
        the same position of `bert.embed`'s result.
    """
    times, tweets = zip(*data)
    return list(zip(times, bert.embed(tweets)))

# Embedding is expensive, so the result is cached on disk and reused when present.
if not os.path.exists('data/twitter/biden_embeddings.pickle'):
    embs_biden = embed(biden)
    with open('data/twitter/biden_embeddings.pickle', 'wb') as handle:
        pickle.dump(embs_biden, handle)
else:
    with open('data/twitter/biden_embeddings.pickle', 'rb') as handle:
        embs_biden = pickle.load(handle)
print('biden embedding done...')

In [None]:
import datetime
import numpy as np

def normalized(a, axis=-1, order=2):
    """Scale `a` to unit norm along `axis`.

    Args:
        a: array-like of numbers.
        axis: axis along which the norm is taken.
        order: norm order forwarded to `np.linalg.norm` (2 = Euclidean).

    Returns:
        `a` divided by its norm along `axis`. Because the norms are forced to
        be at least 1-D before being expanded, a 1-D input of length d comes
        back with shape (1, d).
    """
    norms = np.atleast_1d(np.linalg.norm(a, ord=order, axis=axis))
    # A zero norm would divide by zero; using 1 instead leaves all-zero vectors unchanged.
    safe_norms = np.where(norms == 0, 1, norms)
    return a / np.expand_dims(safe_norms, axis)

def create_moving_average(dataset, timeframe='hours'):
    """Average the normalized embeddings of `dataset` per time window.

    Windows are consecutive, start at the earliest timestamp in `dataset`, and
    are one hour long for ``timeframe == 'hours'`` and 24 hours long for any
    other value. Each window is half-open, ``[start, start + length)``, so a
    point lying exactly on a window start (including the earliest point) is
    counted once instead of being dropped.

    Args:
        dataset: sequence of items whose element 0 is a datetime and whose
            element 1 is an embedding accepted by `normalized`.
        timeframe: 'hours' for hourly windows, anything else for daily ones.

    Returns:
        list with one `normalized` sum of normalized embeddings per non-empty
        window, in chronological order. Empty windows are skipped, so a list
        index is NOT the number of elapsed windows once any window was empty.
        Only whole days between the earliest and latest timestamp are covered;
        points after the last whole day are ignored.
    """
    if len(dataset) == 0:
        return []

    window = datetime.timedelta(hours=1 if timeframe == 'hours' else 24)
    stamps = [x[0] for x in dataset]
    min_date = min(stamps)
    max_date = max(stamps)
    n_windows = (max_date - min_date).days * (24 if timeframe == 'hours' else 1)

    # Single pass: drop every point into its window instead of rescanning the
    # whole dataset once per window. Dataset order inside a window is kept.
    buckets = {}
    for x in dataset:
        window_index = (x[0] - min_date) // window
        if window_index < n_windows:
            buckets.setdefault(window_index, []).append(normalized(x[1]))

    return [normalized(sum(buckets[window_index])) for window_index in sorted(buckets)]

In [None]:
# Import the submodule explicitly: a bare `import sklearn` does not guarantee
# that `sklearn.metrics.pairwise` has been loaded.
import sklearn.metrics.pairwise


def compute_cosine_similarities(X):
    """Cosine similarity of every entry of `X` to the first entry.

    Args:
        X: sequence of 2-D arrays with a single row each (the shape
           `cosine_similarity` needs for `.item()` to yield one scalar).

    Returns:
        list of floats, one per entry of `X`; entry 0 compares `X[0]` with
        itself. An empty `X` gives an empty list.
    """
    cosine_similarity = sklearn.metrics.pairwise.cosine_similarity
    return [cosine_similarity(vector, X[0]).item() for vector in X]

In [None]:
biden_ma_hours = create_moving_average(embs_biden)
biden_cos_hours = compute_cosine_similarities(biden_ma_hours)
# The x axis counts non-empty windows, not elapsed hours: create_moving_average
# leaves out windows that contain no tweets.
plt.plot(biden_cos_hours)
plt.title('Biden: hourly mean embedding vs. first window')
plt.xlabel('non-empty hourly window index')
plt.ylabel('cosine similarity to first window')
plt.show()

In [None]:
biden_ma_days = create_moving_average(embs_biden, timeframe='days')
biden_cos_days = compute_cosine_similarities(biden_ma_days)
# The x axis counts non-empty windows, not elapsed days: create_moving_average
# leaves out windows that contain no tweets.
plt.plot(biden_cos_days)
plt.title('Biden: daily mean embedding vs. first window')
plt.xlabel('non-empty daily window index')
plt.ylabel('cosine similarity to first window')
plt.show()

In [None]:
# Embedding is expensive, so the result is cached on disk and reused when present.
if not os.path.exists('data/twitter/trump_embeddings.pickle'):
    embs_trump = embed(trump)
    with open('data/twitter/trump_embeddings.pickle', 'wb') as handle:
        pickle.dump(embs_trump, handle)
else:
    with open('data/twitter/trump_embeddings.pickle', 'rb') as handle:
        embs_trump = pickle.load(handle)
print('trump embedding done...')

In [None]:
trump_ma_hours = create_moving_average(embs_trump)
trump_cos_hours = compute_cosine_similarities(trump_ma_hours)
# The x axis counts non-empty windows, not elapsed hours: create_moving_average
# leaves out windows that contain no tweets.
plt.plot(trump_cos_hours)
plt.title('Trump: hourly mean embedding vs. first window')
plt.xlabel('non-empty hourly window index')
plt.ylabel('cosine similarity to first window')
plt.show()
# small dips are always somewhat around 5-8am 

In [None]:
trump_ma_days = create_moving_average(embs_trump, timeframe='days')
trump_cos_days = compute_cosine_similarities(trump_ma_days)
# The x axis counts non-empty windows, not elapsed days: create_moving_average
# leaves out windows that contain no tweets.
plt.plot(trump_cos_days, marker='x')
plt.title('Trump: daily mean embedding vs. first window')
plt.xlabel('non-empty daily window index')
plt.ylabel('cosine similarity to first window')
plt.show()

#### inspection

In [None]:
# Position of the first hourly window whose similarity to the first window
# falls below each threshold; prints None instead of raising when no window does.
for threshold in (0.975, 0.94):
    print(next((idx for idx, sim in enumerate(trump_cos_hours) if sim < threshold), None))

In [None]:
# Turn the hour offsets 465 and 480, counted from the earliest Trump timestamp, into datetimes.
# NOTE(review): presumably these offsets come from the threshold search in the previous cell - confirm.
print(min(x[0] for x in embs_trump) + datetime.timedelta(hours=465))
print(min(x[0] for x in embs_trump) + datetime.timedelta(hours=480))

In [None]:
def find_max_tweets(data, amount=5):
    """Indices of the `amount` largest values in `data`, ignoring entry 0.

    Entry 0 is skipped because callers put the reference vector's own score
    there. Ranking positions rather than values means tied scores map to
    distinct indices, and index 0 is never returned even when another entry
    equals `data[0]`.

    Args:
        data: sequence of comparable scores; not modified.
        amount: maximum number of indices to return.

    Returns:
        list of indices into `data` (all >= 1), best score first and ties in
        original order. Shorter than `amount` when `data` has fewer than
        `amount` entries after the first.
    """
    candidates = range(1, len(data))
    ranked = sorted(candidates, key=lambda idx: data[idx], reverse=True)
    return ranked[:max(amount, 0)]

In [None]:
# Cosine similarity of every Trump tweet in the hour starting 100 h after the
# earliest Trump timestamp to the hourly mean at index 100, shown as a histogram.
# NOTE(review): trump_ma_hours[100] is only the mean of this very hour if no
# earlier hourly window was empty (create_moving_average skips empty windows).
d = min([x[0] for x in embs_trump]) + datetime.timedelta(hours=100)
window_end = d + datetime.timedelta(hours=1)
points = [normalized(x[1]) for x in embs_trump if x[0] > d and x[0] < window_end]
combi_100 = [trump_ma_hours[100]]
combi_100.extend(points)
trump_100 = compute_cosine_similarities(combi_100)
plt.hist(trump_100[1:])

five_max = find_max_tweets(trump_100, amount=5)
# Assumes `trump` and `embs_trump` list the same tweets in the same order.
tweets_in_window = [x for x in trump if x[0] > d and x[0] < window_end]
for idx in five_max:
    # Entry 0 of trump_100 is the mean itself, so score idx belongs to tweet idx - 1.
    print(tweets_in_window[idx - 1][1], trump_100[idx],'\n')

In [None]:
# Cosine similarity of every Trump tweet in the hour starting 480 h after the
# earliest Trump timestamp to the hourly mean at index 480, shown as a histogram.
# NOTE(review): trump_ma_hours[480] is only the mean of this very hour if no
# earlier hourly window was empty (create_moving_average skips empty windows).
d = min([x[0] for x in embs_trump]) + datetime.timedelta(hours=480)
window_end = d + datetime.timedelta(hours=1)
points = [normalized(x[1]) for x in embs_trump if x[0] > d and x[0] < window_end]
combi_480 = [trump_ma_hours[480]]
combi_480.extend(points)
trump_480 = compute_cosine_similarities(combi_480)
plt.hist(trump_480[1:])

five_max = find_max_tweets(trump_480, amount=5)
# Assumes `trump` and `embs_trump` list the same tweets in the same order.
tweets_in_window = [x for x in trump if x[0] > d and x[0] < window_end]
for idx in five_max:
    # Entry 0 of trump_480 is the mean itself, so score idx belongs to tweet idx - 1.
    print(tweets_in_window[idx - 1][1], trump_480[idx],'\n')


##### same with biden

In [None]:
# Cosine similarity of every Biden tweet in the hour starting 100 h after the
# earliest Biden timestamp to the hourly mean at index 100, shown as a histogram.
# NOTE(review): biden_ma_hours[100] is only the mean of this very hour if no
# earlier hourly window was empty (create_moving_average skips empty windows).
d = min([x[0] for x in embs_biden]) + datetime.timedelta(hours=100)
window_end = d + datetime.timedelta(hours=1)
points = [normalized(x[1]) for x in embs_biden if x[0] > d and x[0] < window_end]
combi_100 = [biden_ma_hours[100]]
combi_100.extend(points)
biden_100 = compute_cosine_similarities(combi_100)
plt.hist(biden_100[1:])
five_max = find_max_tweets(biden_100, amount=5)
# Assumes `biden` and `embs_biden` list the same tweets in the same order.
tweets_in_window = [x for x in biden if x[0] > d and x[0] < window_end]
for idx in five_max:
    # Entry 0 of biden_100 is the mean itself, so score idx belongs to tweet idx - 1.
    print(tweets_in_window[idx - 1][1], biden_100[idx],'\n')

In [None]:
# Cosine similarity of every Biden tweet in the hour starting 480 h after the
# earliest Biden timestamp to the hourly mean at index 480, shown as a histogram.
# NOTE(review): biden_ma_hours[480] is only the mean of this very hour if no
# earlier hourly window was empty (create_moving_average skips empty windows).
d = min([x[0] for x in embs_biden]) + datetime.timedelta(hours=480)
window_end = d + datetime.timedelta(hours=1)
points = [normalized(x[1]) for x in embs_biden if x[0] > d and x[0] < window_end]
combi_480 = [biden_ma_hours[480]]
combi_480.extend(points)
biden_480 = compute_cosine_similarities(combi_480)
plt.hist(biden_480[1:])
five_max = find_max_tweets(biden_480, amount=5)
# Assumes `biden` and `embs_biden` list the same tweets in the same order.
tweets_in_window = [x for x in biden if x[0] > d and x[0] < window_end]
for idx in five_max:
    # Entry 0 of biden_480 is the mean itself, so score idx belongs to tweet idx - 1.
    print(tweets_in_window[idx - 1][1], biden_480[idx], '\n')

#### some sentiment analysis

In [None]:
import tensorflow_datasets as tfds
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split
import numpy as np

def load_sent_dataset(split = "train"):
    """Load the requested split of the 'sentiment140' dataset via tensorflow_datasets.

    File shuffling is switched off, and whatever `tfds.load` returns is handed back unchanged.
    """
    return tfds.load('sentiment140', split=split, shuffle_files=False)

def train_model(X, y, name="Sent_net"):
    """Return the sentiment network stored under `name`, training it first if needed.

    When a saved model exists at `name` it is loaded and the data is not
    touched. Otherwise a small dense network (100 -> 50 -> 1, sigmoid output,
    MSE loss, rmsprop) is trained for 5 epochs on an 80/20 split of `X`/`y`
    and saved to `name`.

    Args:
        X: indexable collection of feature arrays; `X[0].shape` sets the input shape.
        y: targets aligned with `X`.
        name: path the model is loaded from / saved to.

    Returns:
        the loaded or freshly trained Keras model.
    """
    print("training model", name)

    # Splitting is only needed for training, so a cached model short-circuits first.
    if os.path.exists(name):
        return keras.models.load_model(name)

    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

    model = keras.Sequential()
    model.add(keras.Input(shape=(X_train[0].shape)))
    model.add(layers.Dense(100, activation="relu"))
    model.add(layers.Dense(50, activation="relu"))
    model.add(layers.Dense(1, activation="sigmoid"))
    model.compile(optimizer="rmsprop", loss='mse')
    model.fit(X_train, y_train, epochs=5, validation_data=(X_val, y_val), batch_size=64, verbose=3)

    model.save(name)
    return model
    
def preprocess_dataset(ds, encoder):
    """Build, or load from cache, embedded features and targets for training.

    On a cache miss the first 100000 records of `ds` are read, their text is
    embedded with `encoder.embed_bert`, and the result is pickled to
    'data/embeded_twitter_ds.pkl'. On a cache hit the pickle is loaded and
    `ds` / `encoder` are not used.

    NOTE: a cache written before the text-decoding fix below holds embeddings
    of tensor reprs, not of tweet text - delete it to rebuild.

    Args:
        ds: dataset yielding dicts with "text" and "polarity" tensors and
            supporting `.take(n)`.
        encoder: object exposing `embed_bert(list_of_str)`.

    Returns:
        (X, y): embedded texts and the polarity targets divided by 4.0, each
        wrapped in a one-element row.
    """
    cache_path = 'data/embeded_twitter_ds.pkl'

    if os.path.exists(cache_path):
        # NOTE(review): pickle.load can execute arbitrary code - only use trusted cache files.
        with open(cache_path, 'rb') as file:
            X, y, X_text = pickle.load(file)
        return X, y

    print("preprocessing dataset")
    X_text = []
    y = []
    for dicto in ds.take(100000):
        # str() on an eager string tensor yields its repr ("tf.Tensor(b'...', ...)");
        # decode the raw bytes to get the actual tweet text.
        X_text.append(dicto["text"].numpy().decode("utf-8", errors="replace"))
        y.append([float(dicto["polarity"])/4.0])
    # Sanity output kept from the original: True if any collected text is not a str.
    print(any(not isinstance(x, str) for x in X_text))
    X = encoder.embed_bert(X_text)
    X, y = np.asarray(X), np.asarray(y)
    with open(cache_path, 'wb') as file:
        pickle.dump([X, y, X_text], file)
    return X, y

In [None]:
# Train the sentiment network once and reuse the saved copy afterwards.
# NOTE(review): `embedder` is not defined in any visible cell (only `bert` is) -
# on a fresh kernel this branch raises NameError; confirm which encoder is meant.
# NOTE(review): train_model already saves the model under its default name
# 'Sent_net', so the model ends up stored twice.
if not os.path.exists('Sent_net_trained'):
    ds = load_sent_dataset()
    X, Y = preprocess_dataset(ds, embedder)
    with tf.device('/GPU:0'):
        sentiment_model = train_model(X, Y)
    sentiment_model.save('Sent_net_trained')
else:
    sentiment_model = keras.models.load_model('Sent_net_trained')

In [None]:
def predict_sentiment(data, model):
    """Run `model.predict` over the embeddings of (time, embedding) pairs.

    Args:
        data: sequence of two-element (time, embedding) items, e.g. the output
            of `embed`; each embedding is a 1-D vector or a single-row 2-D array.
        model: object exposing `predict(array)` that returns one result per row.

    Returns:
        list of (time, prediction) tuples in input order; empty for empty input.
    """
    if len(data) == 0:
        return []
    times, embeddings = zip(*data)
    # Stack into one 2-D batch: a bare tuple of arrays is not a single input
    # array and may be read by Keras as one input per element.
    predictions = model.predict(np.vstack(embeddings))
    return list(zip(times, predictions))


In [None]:
# Report how many GPU devices TensorFlow can see.
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))


In [None]:
# NOTE(review): in the visible notebook `biden_sentiment` / `trump_sentiment` are
# first assigned in the NEXT cell, so on a fresh kernel this cell raises NameError -
# and because the file is opened with 'wb' first, an existing cache is truncated
# before the error. The next cell already writes the same file; this cell looks
# like a leftover that should be removed or moved below it.
with open('data/twitter/twitter_sentiments.pickle', 'wb') as handle:
    pickle.dump((biden_sentiment, trump_sentiment), handle)

In [None]:
# Per-tweet sentiment predictions, cached on disk.
# The existence check uses the same path that is read and written below, so the
# cache is actually hit; an empty (truncated) file is treated as a miss.
if (os.path.exists('data/twitter/twitter_sentiments.pickle')
        and os.path.getsize('data/twitter/twitter_sentiments.pickle') > 0):
    with open('data/twitter/twitter_sentiments.pickle', 'rb') as handle:
        biden_sentiment, trump_sentiment = pickle.load(handle)
else:
    print('sentimenting biden...')
    biden_sentiment = predict_sentiment(embs_biden, sentiment_model)
    print('sentimenting trump...')
    trump_sentiment = predict_sentiment(embs_trump, sentiment_model)
    with open('data/twitter/twitter_sentiments.pickle', 'wb') as handle:
        pickle.dump((biden_sentiment, trump_sentiment), handle)
    

In [None]:
# Display the first Trump sentiment entry as a quick sanity check.
trump_sentiment[0]

### Makeup of the data

In [None]:
# Share of ALL Trump tweets that also mention Biden. This must use the unfiltered
# twitter['trump']: `trump` already had those tweets removed, which would always give 0.
sum('biden' in x[1].lower() for x in twitter['trump']) / len(twitter['trump'])

In [None]:
# Share of ALL Biden tweets that also mention Trump. This must use the unfiltered
# twitter['biden']: `biden` already had those tweets removed, which would always give 0.
sum('trump' in x[1].lower() for x in twitter['biden']) / len(twitter['biden'])

In [None]:
# Tweets that mention both candidates: Biden-list tweets naming Trump,
# followed by Trump-list tweets naming Biden.
both = (
    [tweet for tweet in twitter['biden'] if 'trump' in tweet[1].lower()]
    + [tweet for tweet in twitter['trump'] if 'biden' in tweet[1].lower()]
)

In [None]:
# Number of tweets that mention both candidates.
len(both)

In [None]:
# Embedding is expensive, so the result is cached on disk and reused when present.
if not os.path.exists('data/twitter/both_embeddings.pickle'):
    embs_both = embed(both)
    with open('data/twitter/both_embeddings.pickle', 'wb') as handle:
        pickle.dump(embs_both, handle)
else:
    with open('data/twitter/both_embeddings.pickle', 'rb') as handle:
        embs_both = pickle.load(handle)

both_ma_hours = create_moving_average(embs_both)
both_cos_hours = compute_cosine_similarities(both_ma_hours)
# The x axis counts non-empty windows, not elapsed hours: create_moving_average
# leaves out windows that contain no tweets.
plt.plot(both_cos_hours)
plt.title('Both candidates: hourly mean embedding vs. first window')
plt.xlabel('non-empty hourly window index')
plt.ylabel('cosine similarity to first window')
plt.show()

In [None]:
both_ma_days = create_moving_average(embs_both, timeframe='days')
both_cos_days = compute_cosine_similarities(both_ma_days)
# The x axis counts non-empty windows, not elapsed days: create_moving_average
# leaves out windows that contain no tweets.
plt.plot(both_cos_days, marker='x')
plt.title('Both candidates: daily mean embedding vs. first window')
plt.xlabel('non-empty daily window index')
plt.ylabel('cosine similarity to first window')
plt.show()