In [2]:
from pathlib import Path
import sqlite3, pandas as pd

def load_sql(db_name, tbl_name):
    """Read an entire SQLite table into a pandas DataFrame.

    Parameters
    ----------
    db_name : str
        Stem of the database file. The file opened is
        ``database/{db_name}.db``, relative to the current working directory.
    tbl_name : str
        Table to read with ``SELECT *``. The name is interpolated directly
        into the SQL text, so it must come from trusted code, never from
        user input.

    Returns
    -------
    pandas.DataFrame
        Every row and column of the requested table.
    """
    con = sqlite3.connect(f"database/{db_name}.db")
    try:
        # try/finally guarantees the connection is released even when the
        # query itself fails (for example, when the table does not exist).
        return pd.read_sql(f"SELECT * FROM {tbl_name}", con)
    finally:
        con.close()

In [4]:
# The database file and the table inside it share the same name.
TWEETS_SOURCE = "tweets_v7"
df = load_sql(TWEETS_SOURCE, TWEETS_SOURCE)

# Preview five randomly chosen rows rather than dumping the whole frame.
df.sample(n=5)

Unnamed: 0,id_x,created_at,text,user_location,place_name,place_id,place_full_name,hashtags,postcode,country,longitude,latitude,region,district,county,rulebased_sent,nb_sent,svm_sent,dl_sent
1666,1587495934644076548,2022-11-01,reminder bp stand british petroleum owned gove...,,Garston,12f69ad404352073,"Garston, England",,L19 5NB,England,-2.886266,53.358869,North West,Liverpool,,0,0,0,0
2803,1590093953483018240,2022-11-08,sane thinking mind uk approach different count...,,Dewsbury,43dacd36a372f0d8,"Dewsbury, England",refugees,WF17 7JZ,England,-1.63652,53.701937,Yorkshire and The Humber,Kirklees,,0,0,0,0
417,1588984083375620096,2022-11-05,earlier afternoon arrived edinburgh driving ai...,"14 Royal Terrace, Edinburgh",Queensferry,49f93f5ee9d57aff,"Queensferry, Scotland",,EH30 9NF,Scotland,-3.399317,55.987232,,City of Edinburgh,,0,1,0,0
1173,1580098894339657729,2022-10-12,far biggest contributor military financial hum...,"Cardiff, Wales",Cardiff,68f3012fe4848e35,"Cardiff, Wales",,CF14 3UU,Wales,-3.194773,51.496873,,Cardiff,,0,0,0,0
229,1585694728841986070,2022-10-27,russia good ukraine bad corrupt country bring ...,"Stoke Poges, South East",South East,06168d1feda43857,"South East, England",,RG30 2DQ,England,-0.993347,51.451211,South East,Reading,,0,0,0,0


In [None]:
# ----------------------------------------------------------- #
#                     t-SNE clustering                        #
# ----------------------------------------------------------- #
# This code has been adapted from:
# Prabhakaran, S. (2018). Topic modeling visualisation - How to present the results of LDA models? [online] Machinelearningplus.com. Available at: https://www.machinelearningplus.com/nlp/topic-modeling-visualization-how-to-present-results-lda-models/ [Accessed 2 Dec. 2022].

# from sklearn.manifold import TSNE
# from bokeh.plotting import figure, output_file, show
# from bokeh.models import Label
# from bokeh.io import output_notebook
# import matplotlib.colors as mcolors

# # Get topic weights
# # topic_weights = []
# # for row_list in lda_para_model.components_:
# #     topic_weights.append([w for w in row_list])

# topic_weights = []
# for row_list in lda_para_model.components_:
#     topic_weights.append([w for w in row_list])
    
# # Array of topic weights
# arr = pd.DataFrame(topic_weights).fillna(0).values

# # Keep the well separated points (optional)
# arr = arr[np.amax(arr, axis=1) > 0.35]

# # Dominant topic number in each tweet
# topic_num = np.argmax(arr, axis=1)

# # tSNE dimension reduction
# tsne_model = TSNE(n_components=2, verbose=1, random_state=0, angle=.99, init='pca')
# tsne_lda = tsne_model.fit_transform(arr)

# # plot the topic clusters using Bokeh
# output_notebook()
# n_topics = 4
# mycolors = np.array([color for name, color in mcolors.TABLEAU_COLORS.items()])
# plot = figure(title="t-SNE Clustering of {} LDA Topics".format(n_topics), 
#               plot_width=900, plot_height=700)
# plot.scatter(x=tsne_lda[:,0], y=tsne_lda[:,1], color=mycolors[topic_num])
# show(plot)

In [None]:
# ----------------------------------------------------------- #
#    Calculating Topic Distribution of Documents Over Time    #
# ----------------------------------------------------------- #

In [None]:
import numpy as np

days_data = []
# Per-document dates, kept as an array so each day's mask has one entry per row of df.
created_at = np.asarray(df['created_at'])
days = np.unique(created_at)

for day in days:
  # Select the documents posted on `day`. The mask has to be built from the
  # per-document dates: comparing the de-duplicated `days` array with `day`
  # yields one flag per *unique day*, which does not line up with the rows of
  # the document-term matrix.
  # NOTE(review): assumes count_para_vectors rows are in the same order as df rows — confirm.
  W_day = lda_para_model.transform(count_para_vectors[created_at == day])
  # Share of each topic on this day, as a percentage of the day's total topic weight.
  days_data.append([day] + list(W_day.sum(axis=0) / W_day.sum() * 100.0))

In [None]:
# Label every LDA topic with its two highest-weighted vocabulary terms.
voc = count_para_vectorizer.get_feature_names_out()
topic_names = []

for topic in lda_para_model.components_:
  ranked = topic.argsort()  # ascending: the strongest terms sit at the end
  best, runner_up = ranked[-1], ranked[-2]
  topic_names.append(f"Topic {voc[best]} {voc[runner_up]}")

In [None]:
from matplotlib import pyplot as plt

# One row per day, one column per topic; values are the percentage shares from days_data.
df_days = pd.DataFrame(days_data, columns=['day'] + topic_names).set_index('day')

# Work with the Axes returned by pandas instead of the implicit pyplot state.
ax = df_days.plot.area(figsize=(16, 6))
ax.set_title('Topics Distribution over Time')

# savefig does not create missing directories, so make sure the target exists first.
Path("figures").mkdir(parents=True, exist_ok=True)
ax.figure.savefig("figures/topics_distribution.jpg")

In [None]:
# ----------------------------------------------------------- #
#                    Training SVC model                       #
# ----------------------------------------------------------- #

In [None]:
from sklearn.model_selection import train_test_split

# Stratified 80/20 hold-out split of the tweet text against the rule-based sentiment labels.
tweet_texts = df['text']
rulebased_labels = df['rulebased_sent']

split_options = dict(
    test_size=0.2,
    train_size=0.8,
    random_state=42,
    shuffle=True,
    stratify=rulebased_labels,
)

X_train, X_test, y_train, y_test = train_test_split(
    tweet_texts, rulebased_labels, **split_options)

In [None]:
# Imported here so this cell does not depend on another cell having brought
# TfidfVectorizer into scope (required for a clean Restart & Run All).
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()

# Fit on the training texts only; the test texts are transformed with the
# already-fitted vectorizer.
X_train_tf = tfidf.fit_transform(X_train)
X_test_tf = tfidf.transform(X_test)

In [None]:
from sklearn import svm

# Linear-kernel support vector classifier trained on the TF-IDF features.
# fit() returns the estimator itself, so construction and training can be chained.
model = svm.SVC(kernel='linear').fit(X_train_tf, y_train)

# Predicted labels for the held-out TF-IDF features.
Y_pred = model.predict(X_test_tf)

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns

# print("Accuracy score: ", accuracy_score(y_test, Y_pred))
# print(classification_report(y_test, Y_pred))
# print(confusion_matrix(y_test, Y_pred))

def create_confusion_matrix(y_test, Y_pred, title="Confusion Matrix"):
    """Plot a normalised confusion-matrix heatmap and save it as a JPEG.

    Parameters
    ----------
    y_test : array-like
        True labels.
    Y_pred : array-like
        Predicted labels, in the same order as ``y_test``.
    title : str, optional
        Title drawn above the heatmap. With spaces replaced by underscores it
        is also used as the file name: ``figures/{title}.jpg``.

    Returns
    -------
    None
        The figure is left open so the notebook can render it inline.

    Notes
    -----
    The tick labels are hard-coded to "Negative"/"Positive", so this only
    makes sense for a two-class problem.
    NOTE(review): assumes the lower-valued label means negative sentiment — confirm.
    """
    cm = confusion_matrix(y_test, Y_pred)
    # Normalise so each cell is a fraction of all samples (rendered as % below).
    cm = cm / np.sum(cm)

    # Use a dedicated figure: drawing on the implicit current axes would stack
    # heatmaps and colorbars when the function is called more than once in the
    # same cell or script.
    fig, ax = plt.subplots()
    sns.heatmap(cm, annot=True, cmap="Greens", fmt=".2%", ax=ax)

    ax.set_title(f"{title}\n")
    ax.set_xlabel("Predicted Values")
    ax.set_ylabel("Actual Values")

    ax.xaxis.set_ticklabels(["Negative", "Positive"])
    ax.yaxis.set_ticklabels(["Negative", "Positive"])

    # savefig does not create missing directories.
    Path("figures").mkdir(parents=True, exist_ok=True)
    file_stem = title.replace(" ", "_")
    fig.savefig(f"figures/{file_stem}.jpg")


create_confusion_matrix(y_test, Y_pred, title="SVC Confusion Matrix")

In [None]:
# ----------------------------------------------------------- #
#            Tuning BERTweet deep learning model              #
# ----------------------------------------------------------- #

In [None]:
# Split the dataset into input texts (X) and target sentiment labels (y), as plain lists.
X = list(df['text'])
y = list(df['rulebased_sent'])

# Seeded so the split is reproducible across kernel restarts; uses the same
# seed as the SVC split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

In [None]:
from transformers import AutoTokenizer

# Checkpoint identifier shared by the tokenizer here and the model loaded later.
model_name = "vinai/bertweet-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Both splits are encoded with exactly the same options.
encode_options = {"truncation": True, "padding": True}
train_encodings = tokenizer(X_train, **encode_options)
test_encodings = tokenizer(X_test, **encode_options)

In [None]:
import tensorflow as tf

# Pair each split's tokenizer output (converted to a plain dict) with its labels.
train_features, test_features = dict(train_encodings), dict(test_encodings)

train_dataset = tf.data.Dataset.from_tensor_slices((train_features, y_train))
test_dataset = tf.data.Dataset.from_tensor_slices((test_features, y_test))

In [None]:
from datasets import load_metric

# Metric objects already loaded, keyed by metric name. Kept at module level so
# repeated evaluations reuse them instead of calling load_metric again.
_LOADED_METRICS = {}


def compute_metrics(eval_pred):
    """Compute accuracy, recall, precision and F1 for one evaluation pass.

    Parameters
    ----------
    eval_pred : tuple
        ``(logits, labels)`` pair. Predicted classes are taken as the argmax
        of ``logits`` over the last axis.

    Returns
    -------
    dict
        Maps each of ``'accuracy'``, ``'recall'``, ``'precision'`` and
        ``'f1'`` to the value the corresponding metric reports under that key.
    """
    metric_names = ('accuracy', 'recall', 'precision', 'f1')

    # Load each metric at most once; later calls hit the cache.
    for name in metric_names:
        if name not in _LOADED_METRICS:
            _LOADED_METRICS[name] = load_metric(name)

    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    return {
        name: _LOADED_METRICS[name].compute(
            predictions=predictions, references=labels)[name]
        for name in metric_names
    }

In [None]:
from transformers import TFTrainingArguments

# Fine-tuning hyper-parameters plus logging/evaluation settings, gathered in
# one mapping so they can be read (and tweaked) at a glance.
training_config = {
    "output_dir": './results',
    "num_train_epochs": 3,
    "per_device_train_batch_size": 4,
    "per_device_eval_batch_size": 8,
    "warmup_steps": 20,
    "weight_decay": 0.01,
    "logging_dir": './logs',
    "logging_steps": 3,
    "logging_strategy": 'epoch',
    "evaluation_strategy": 'epoch',
    "eval_steps": 3,
}

training_args = TFTrainingArguments(**training_config)

In [None]:
from transformers import TFAutoModelForSequenceClassification, TFTrainer

# The model is instantiated inside the scope of the strategy carried by the training arguments.
with training_args.strategy.scope():
    model = TFAutoModelForSequenceClassification.from_pretrained(model_name)

# The test split doubles as the evaluation set during training.
trainer_options = dict(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)
trainer = TFTrainer(**trainer_options)

trainer.train()

In [None]:
# Evaluate BERTweet model
trainer.evaluate(test_dataset)

output = trainer.predict(test_dataset)
# `output` is a (predictions, label_ids, metrics) tuple. Index 1 holds the
# *true* labels, so using it as the prediction would score the labels against
# themselves. The predicted class is the argmax of the logits in `predictions`.
predictions = np.argmax(output.predictions, axis=-1)

In [None]:
# output[1] is label_ids (the ground truth), not the model's predictions, so
# reporting on it would always look perfect. Derive the predicted classes from
# the logits here so this cell is correct on its own.
bertweet_pred = np.argmax(output.predictions, axis=-1)

print(classification_report(y_test, bertweet_pred))
create_confusion_matrix(y_test, bertweet_pred, "BERTweet Confusion Matrix")