In [None]:
import sqlite3, pandas as pd
from pathlib import Path
from datetime import datetime

# Function to load SQL table to DataFrame
def load_sql(db_name, tbl_name):
  """Load every row of one table from a SQLite database into a DataFrame.

  Args:
    db_name: Database file stem; the file opened is ``database/<db_name>.db``.
    tbl_name: Name of the table to read. It is interpolated into the SQL
      text as-is, so it must come from trusted code, never from user input.

  Returns:
    A pandas DataFrame with the result of ``SELECT * FROM <tbl_name>``.
  """
  con = sqlite3.connect(f'database/{db_name}.db')
  try:
    # try/finally releases the connection even when the query fails
    # (for example because the table does not exist).
    return pd.read_sql(f"SELECT * FROM {tbl_name}", con)
  finally:
    con.close()

def timestamp():
  """Return the current date and time as a compact string, e.g. 20221107_123045."""
  return f"{datetime.now():%Y%m%d_%H%M%S}"

def save_csv(df, filename):
  """Write a dataframe to a timestamped CSV file under ``datasets/``.

  The file is named ``<filename>_<timestamp()>.csv``; its parent directory
  is created first when it does not exist yet.
  """
  stamp = timestamp()
  destination = Path(f'datasets/{filename}_{stamp}.csv')
  destination.parent.mkdir(parents=True, exist_ok=True)
  df.to_csv(destination)

### Load the Dataset

In [None]:
# Load the Twitter data: table 'tweets_v7' of the 'tweets_v7' database (see load_sql)
df = load_sql('tweets_v7', 'tweets_v7')
# Preview the text of 10 sampled tweets; no random_state is passed, so the rows can differ between runs
df[['text']].sample(10)

In [None]:
# Save the dataset as a CSV snapshot; save_csv adds a timestamp to the file name
save_csv(df, "tweets_v7")

### Count the number of tokens

In [None]:
from collections import Counter
import nltk

# Running token frequencies over all tweets. Starting from an empty Counter
# (instead of None) keeps `counter` usable even when `df` has no rows.
counter = Counter()

# Tokens that are never counted, whatever their length.
EXCLUDED_TOKENS = frozenset(['...', "n't", 'it', 'do'])

def validate(token):
  """Return True when a token should be counted.

  A token is kept when it is longer than one character and is not one of
  the excluded tokens.
  """
  return len(token) > 1 and token not in EXCLUDED_TOKENS

# Iterate over the text column directly: only that column is read, so
# building a full row per tweet with iterrows() is wasted work.
for text in df['text']:
  counter.update(
    token for token in nltk.tokenize.casual_tokenize(text) if validate(token)
  )

### Word Frequency Analysis

In [None]:
# The 20 most frequent tokens together with their counts
counter.most_common(20)

In [None]:
# Turn the token counter into a frequency table, keeping only tokens
# that occur at least min_freq times
min_freq = 10
freq_df = (
  pd.DataFrame.from_dict(counter, orient='index', columns=['freq'])
  .query('freq >= @min_freq')
  .rename_axis('token')
)

# Preview the five most frequent tokens
freq_df.sort_values(by='freq', ascending=False).head(5)

In [None]:
# Frequency diagram: horizontal bars for the 30 highest-frequency tokens in freq_df
ax = freq_df.sort_values(by='freq', ascending=False).head(30).plot(kind='barh', width=0.8, figsize=(10,7))
ax.invert_yaxis()  # reverse the y-axis (intended to put the most frequent token at the top)
ax.set(xlabel='Frequency', ylabel='Token', title='Most Common Words')

In [None]:
# Horizontal bars for the 20 lowest-frequency tokens that remain in freq_df
# (freq_df is sorted in descending order, then its last 20 rows are taken)
ax = freq_df.sort_values(by='freq', ascending=False).tail(20).plot(kind='barh', width=0.9, figsize=(6,8))
ax.invert_yaxis()  # reverse the y-axis direction
ax.set(xlabel='Frequency', ylabel='Token', title='Least Common Words')

In [None]:
# Word clouds
from wordcloud import WordCloud
from matplotlib import pyplot as plt

def wordcloud(word_freq, title=None, max_words=100, stopwords=None):
  """Draw a word cloud from a token -> frequency mapping on the current figure.

  Args:
    word_freq: Mapping of token to frequency (e.g. a ``Counter``).
    title: Optional title shown above the cloud.
    max_words: Maximum number of words drawn.
    stopwords: Optional collection of tokens to leave out of the cloud.
  """
  wc = WordCloud(width=800, height=600,
                 background_color="white", colormap="Paired",
                 max_font_size=200, max_words=max_words)

  # The frequencies are handed over ready-made, so stop words have to be
  # removed from the mapping here before the cloud is generated.
  if stopwords is not None:
    excluded = set(stopwords)
    word_freq = {token: freq for token, freq in word_freq.items()
                 if token not in excluded}

  # generate the word cloud
  wc.generate_from_frequencies(word_freq)
  plt.imshow(wc, interpolation='bilinear')
  if title:
    plt.title(title)
  plt.axis('off')

In [None]:
# Word cloud of the overall token frequencies, limited to 300 words
wordcloud(counter, max_words=300)

In [None]:
# Show the 30 least common tokens: most_common() with no argument lists every
# token from most to least frequent, so the last 30 entries are the rarest
counter.most_common()[-30:]

### Topic Modelling

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Token-count vectors for the tweets, one row per entry of df['text'].
# min_df / max_df restrict the vocabulary by document frequency
# (presumably: at least 5 tweets, at most 70% of tweets — confirm against the CountVectorizer docs).
count_vectorizer = CountVectorizer(min_df=5, max_df=0.7)
count_vectors = count_vectorizer.fit_transform(df['text'])

from sklearn.decomposition import LatentDirichletAllocation

# Fit a 10-component LDA topic model on the token counts; random_state is fixed so reruns match
lda_model = LatentDirichletAllocation(n_components=10, random_state=42)
W_lda_matrix = lda_model.fit_transform(count_vectors)  # transformed tweets returned by the fit
H_lda_matrix = lda_model.components_  # fitted topic components (iterated per topic by display_topics)

In [None]:
def display_topics(model, features, no_top_words=5):
  """Print the highest-weighted words of every topic in a fitted model.

  Each word is printed with its share of the topic's total weight, as a
  percentage.

  Args:
    model: Fitted topic model exposing ``components_``, iterated as one
      weight vector per topic.
    features: Sequence mapping a column index to its word.
    no_top_words: Number of words printed per topic. When a topic has
      fewer words than this, all of them are printed.
  """
  for topic, word_vector in enumerate(model.components_):
    total = word_vector.sum()
    # Slicing (rather than indexing 0..no_top_words-1) cannot run past the
    # end of a vocabulary that is smaller than no_top_words.
    largest = word_vector.argsort()[::-1][:max(no_top_words, 0)]
    print("\nTopic %02d" % topic)

    for word_idx in largest:
      print(" %s (%2.2f)" % (features[word_idx], word_vector[word_idx] * 100.0 / total))

In [None]:
# Print the top words (default: 5 per topic) of the LDA model
display_topics(lda_model, count_vectorizer.get_feature_names_out())

In [None]:
def worldcloud_topic(model, features, no_top_words=40):
  """Draw and save one word cloud per topic of a fitted topic model.

  Args:
    model: Fitted topic model exposing ``components_``, iterated as one
      weight vector per topic.
    features: Sequence mapping a column index to its word.
    no_top_words: Number of highest-weighted words used per cloud. When a
      topic has fewer words than this, all of them are used.

  Each figure is saved as ``figures/<topic index>.png``.
  """
  # Make sure the output directory exists before the first savefig().
  Path('figures').mkdir(parents=True, exist_ok=True)

  for topic, words in enumerate(model.components_):
    # Slicing cannot run past the end of a small vocabulary, unlike
    # indexing positions 0..no_top_words-1.
    largest = words.argsort()[::-1][:max(no_top_words, 0)]
    size = {features[i]: abs(words[i]) for i in largest}

    wc = WordCloud(background_color='white', max_words=100, width=960, height=540)
    wc.generate_from_frequencies(size)

    plt.figure(figsize=(12,12))
    plt.imshow(wc, interpolation='bilinear')
    plt.axis('off')
    plt.savefig(f'figures/{topic}.png')

In [None]:
# Draw and save one word cloud per LDA topic
worldcloud_topic(lda_model, count_vectorizer.get_feature_names_out())

### Using Clustering to Uncover Structure of Text

In [None]:
from sklearn.cluster import KMeans
import numpy as np

# Cluster the tweets' token-count vectors into 10 groups; random_state is fixed so reruns match
k_means_text = KMeans(n_clusters=10, random_state=42)
k_means_text.fit(count_vectors)

KMeans(n_clusters=10, random_state=42)

In [None]:
# Number of tweets assigned to each cluster. The cluster count is read from
# the fitted model instead of repeating the literal 10, so this cell stays
# correct when n_clusters changes; minlength keeps empty clusters as 0.
cluster_sizes = np.bincount(k_means_text.labels_, minlength=k_means_text.n_clusters)

sizes = [{"cluster": i, "size": size} for i, size in enumerate(cluster_sizes)]

# Bar chart of the cluster sizes, largest cluster first
pd.DataFrame(sizes).set_index("cluster").sort_values(by='size', ascending=False).plot.bar(figsize=(16,9))

In [None]:
def wordcloud_clusters(model, vectors, features, no_top_words=40):
  """Draw and save one word cloud per cluster of a fitted clustering model.

  Args:
    model: Fitted clustering model exposing ``labels_`` (one label per row
      of ``vectors``).
    vectors: Document-term matrix whose rows were clustered.
    features: Sequence mapping a column index to its word.
    no_top_words: Number of most frequent words used per cloud. When the
      vocabulary is smaller than this, all words are used.

  Each figure is saved as ``figures/cluster<label>.png``.
  """
  # Make sure the output directory exists before the first savefig().
  Path('figures').mkdir(parents=True, exist_ok=True)

  for cluster in np.unique(model.labels_):
    # Column sums over the rows of this cluster. np.asarray(...).ravel()
    # flattens the result whether sum() returns a matrix or a plain array,
    # unlike the matrix-only `.A` attribute.
    words = np.asarray(vectors[model.labels_ == cluster].sum(axis=0)).ravel()
    largest = words.argsort()[::-1][:max(no_top_words, 0)]
    size = {features[i]: abs(words[i]) for i in largest}

    wc = WordCloud(background_color="white", max_words=100, width=960, height=540)
    wc.generate_from_frequencies(size)

    plt.figure(figsize=(12,12))
    plt.imshow(wc, interpolation='bilinear')
    plt.axis('off')

    plt.savefig(f"figures/cluster{cluster}.png")

In [None]:
# Draw and save one word cloud per k-means cluster
wordcloud_clusters(k_means_text, count_vectors, count_vectorizer.get_feature_names_out())

### Topic Distribution Over Time

In [None]:
days_data = []
days = np.unique(df['created_at'])
# Per-row values as a plain array, aligned with the rows of count_vectors
# (count_vectors was built from df['text'], one row per tweet).
created_at_values = df['created_at'].to_numpy()

# NOTE(review): rows are grouped by the exact values stored in created_at; if
# that column holds full timestamps rather than dates, every distinct
# timestamp becomes its own "day" — confirm the column's granularity.
for day in days:
  # The mask must come from the per-row column. `days` holds only the unique
  # values, so `days == day` has the wrong length for indexing the rows.
  W_day = lda_model.transform(count_vectors[created_at_values == day])
  # Share of each topic on this day, in percent
  days_data.append([day] + list(W_day.sum(axis=0) / W_day.sum() * 100.0))

In [None]:
# Name every topic after its two highest-weighted words, strongest first
voc = count_vectorizer.get_feature_names_out()
topic_names = []

for topic in lda_model.components_:
  runner_up, strongest = topic.argsort()[-2:]
  topic_names.append(f"Topic {voc[strongest]} {voc[runner_up]}")

In [None]:
# Stacked-area chart of the per-day topic shares, one band per topic
df_days = pd.DataFrame(days_data, columns=['day'] + topic_names).set_index('day')
df_days.plot.area(figsize=(16,6))

plt.title('Topics Distribution over Time')
# Make sure the output directory exists before saving
Path('figures').mkdir(parents=True, exist_ok=True)
plt.savefig("figures/topics_distribution.png")

### Relationships Between Categorical Variables

In [None]:
# Relationships between categorical variables
import pandas as pd

# Temporal rule-based sentiment
# A notebook cell only renders its last expression, so this first table is
# shown explicitly with display(); otherwise it is computed and discarded.
display(pd.crosstab(df.rulebased_sent, df.created_at))

# Rule-based sentiment by area, e.g., country, region, county, district
pd.crosstab(df.rulebased_sent, df.country)

In [None]:
# Contingency table: nb_sent labels (the 'MNB' series in the time-series legend) by country
pd.crosstab(df.nb_sent, df.country)

In [None]:
# Contingency table: svm_sent labels (the 'SVM' series in the time-series legend) by country
pd.crosstab(df.svm_sent, df.country)

In [None]:
# Contingency table: dl_sent labels (the 'BERTweet' series in the time-series legend) by country
pd.crosstab(df.dl_sent, df.country)

### Time Series Analysis

In [None]:
import matplotlib.pyplot as plt

def timeseries_plot(df, x='created_at', y='rulebased_sent', title='Rule-based Sentiment'):
  """Plot, per value of ``x``, how many rows are positive and how many negative.

  Args:
    df: DataFrame containing the columns named by ``x`` and ``y``.
    x: Column to group by; drawn on the horizontal axis.
    y: Sentiment column; rows equal to 1 are counted as positive and rows
      equal to 0 as negative.
    title: Figure title.

  The figure is saved to ``figures/timeseries_<x>_<y>.png``.
  """
  def count_per_x(label):
    """Count the rows whose ``y`` equals ``label`` for every value of ``x``."""
    matching = df.loc[df[y] == label, [x, y]]
    return matching.groupby(x).count().reset_index()

  positive_df = count_per_x(1)
  negative_df = count_per_x(0)

  plt.figure(figsize=(12,6))
  plt.xlabel('Date')
  plt.ylabel('Sentiment')
  plt.title(title)
  plt.plot(positive_df[x], positive_df[y], color='green')
  plt.plot(negative_df[x], negative_df[y], color='red')

  ax = plt.gca()
  # Rotate the tick labels matplotlib generated itself. Overwriting them via
  # set_xticklabels() without fixing the tick positions first attaches the
  # labels to whatever ticks happen to exist, which mislabels the axis.
  ax.tick_params(axis='x', labelrotation=90)
  ax.legend(['Positive', 'Negative'])

  # Make sure the output directory exists before saving.
  Path('figures').mkdir(parents=True, exist_ok=True)
  plt.savefig(f"figures/timeseries_{x}_{y}.png")

In [None]:
def timeseries_analysis(df, sent=1, title='Temporal Analysis', country=None):
  """Plot the per-day number of tweets with a given sentiment, one line per classifier.

  Args:
    df: DataFrame with the columns ``created_at``, ``country``,
      ``rulebased_sent``, ``nb_sent``, ``svm_sent`` and ``dl_sent``.
      It is not modified.
    sent: Sentiment value to count (1 is treated as positive, anything
      else as negative when naming the output file).
    title: Figure title.
    country: When given, only rows with this ``country`` value are counted.

  The figure is saved to ``figures/timeseries_<positive|negative>.png``, with
  ``_<country>`` added before the extension when ``country`` is given so a
  per-country plot does not overwrite the all-country one.
  """
  columns = ['rulebased_sent', 'nb_sent', 'svm_sent', 'dl_sent']
  colours = ['#E3000E', '#92F22A', '#EE543A', '#2C82C9']

  # Reduce created_at to calendar dates on a copy; assigning into the
  # argument would silently change the caller's dataframe.
  df = df.assign(created_at=pd.to_datetime(df['created_at']).dt.date)

  plt.figure(figsize=(12,10))
  plt.xlabel('Date')
  plt.ylabel('Sentiments')
  plt.title(title)

  for column, colour in zip(columns, colours):
    plot = df[['created_at', column, 'country']]
    plot = plot.loc[plot[column] == sent]

    if country:
      plot = plot.loc[plot['country'] == country]

    # groupby() returns its keys (the dates) in ascending order
    plot = plot[['created_at', column]].groupby('created_at').count().reset_index()
    plt.plot(plot['created_at'], plot[column], color=colour)

  # One tick per date present in the (unfiltered) data, in ascending order
  xlabels = df[['created_at']].groupby('created_at').count().reset_index()

  ax = plt.gca()
  ax.set_xticks(xlabels['created_at'])
  ax.set_xticklabels(labels=xlabels['created_at'], rotation=90)
  # Legend entries follow the order of `columns`
  ax.legend(['VADER', 'MNB', 'SVM', 'BERTweet'])

  sentiment = 'positive' if sent == 1 else 'negative'
  suffix = f"_{country}" if country else ""
  # Make sure the output directory exists before saving.
  Path('figures').mkdir(parents=True, exist_ok=True)
  plt.savefig(f"figures/timeseries_{sentiment}{suffix}.png")

In [None]:
# Per-day counts over all countries: positive (sent=1), then negative (sent=0)
timeseries_analysis(df, sent=1, title='Positive Sentiment')
timeseries_analysis(df, sent=0, title='Negative Sentiment')

In [None]:
# The same two plots restricted to rows whose country is 'England'
timeseries_analysis(df, sent=1, title='Positive Sentiment in England', country='England')
timeseries_analysis(df, sent=0, title='Negative Sentiment in England', country='England')

### Chi-2 Analysis

In [None]:
from scipy.stats import chi2_contingency
# Chi-squared test on the contingency table of rule-based sentiment by country
chi2_contingency(pd.crosstab(df.rulebased_sent, df.country))

### Check negation words