Clone the project repo containing the Twitter data

In [None]:
# Fetch the project repository; git clones it into a local `project` directory,
# and a later cell reads the Africa tweet CSV from inside that directory.
!git clone https://github.com/Data-Mining-2021/project.git

Cloning into 'project'...
remote: Enumerating objects: 194, done.[K
remote: Counting objects: 100% (194/194), done.[K
remote: Compressing objects: 100% (132/132), done.[K
remote: Total 194 (delta 81), reused 116 (delta 43), pack-reused 0[K
Receiving objects: 100% (194/194), 19.98 MiB | 10.82 MiB/s, done.
Resolving deltas: 100% (81/81), done.


Get all imports at once

In [None]:
import pandas as pd
!pip install langdetect
from langdetect import detect
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn import metrics
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from joblib import dump, load
import nltk
nltk.download('punkt')
import re

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Generate a k-means cluster file from the TF-IDF vectorization of the tweets

In [None]:
def tokenize_tweet(text):
    """Split a tweet into lowercase tokens, discarding tokens without letters.

    The text is first segmented into sentences with ``nltk.sent_tokenize`` and
    each sentence is then split with ``nltk.word_tokenize``; the intent of the
    two-step split is that punctuation is emitted as its own token. Every
    token is lowercased, and only tokens containing at least one ASCII letter
    are kept (so purely numeric tokens and raw punctuation are dropped).

    Args:
        text: The raw tweet text.

    Returns:
        list: The surviving lowercase tokens, in their original order.
    """
    lowered = []
    for sentence in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sentence):
            lowered.append(word.lower())

    # keep a token only if it contains at least one letter
    return [tok for tok in lowered if re.search('[a-zA-Z]', tok)]


# Load the filtered Africa tweets; only the 'Text' column is vectorized here,
# while 'Username' and 'Country' are used by a later cell.
africa_df = pd.read_csv('/content/project/regions/Africa/africa_tweets_filtered.csv')

africa_text = africa_df['Text']

# Vectorize into TF-IDF format (1-grams and 2-grams) using the custom tweet
# tokenizer. max_df=0.8 drops terms that occur in more than 80% of the tweets,
# and max_features caps the vocabulary size.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=200000,
    max_df=0.8,
    tokenizer=tokenize_tweet,
    use_idf=True,
    stop_words='english',
)
tfidf_matrix = vectorizer.fit_transform(africa_text)

# Cluster with k-means (k-means++ is scikit-learn's default initialisation).
# The centroid initialisation is random, so the seed is fixed: without it the
# cluster numbering and membership change from run to run and the per-cluster
# tables printed further down cannot be reproduced.
num_clusters = 5
RANDOM_SEED = 42
km = KMeans(n_clusters=num_clusters, random_state=RANDOM_SEED)
km.fit(tfidf_matrix)
clusters = km.labels_.tolist()

# Export the fitted model so later cells can reload it without re-clustering.
dump(km, 'doc_cluster.pkl')

['doc_cluster.pkl']

Organize the TF-IDF clusters into a dataframe with the corresponding tweet data (username, country)

In [None]:
# Restore the fitted k-means model from disk and extract one cluster label per tweet.
km = load('doc_cluster.pkl')
clusters = km.labels_.tolist()

# Pair each tweet's cluster label with its author and country.
tweets = {
    'cluster': clusters,
    'username': africa_df['Username'].tolist(),
    'country': africa_df['Country'].tolist(),
}
# The frame is indexed by the cluster labels; a later cell selects the rows of
# one cluster with cluster_df.loc[i].
cluster_df = pd.DataFrame(
    tweets,
    index=[clusters],
    columns=['cluster', 'username', 'country'],
)

# Tip: cluster_df['cluster'].value_counts() gives the number of tweets per cluster.

# Count, for every country, how many tweets fall into each cluster.
print('Cluster occurance based on country:')
grouped = cluster_df['cluster'].groupby(cluster_df['country'])
grouped.value_counts()

Cluster occurance based on country:


country       cluster
Liberia       0            30
Nigeria       0          1005
              4            33
South Africa  0          4926
              4          1216
              1           350
              2           214
              3           135
Name: cluster, dtype: int64

Display cluster data

In [None]:
# Print, for every cluster, its highest-weighted terms plus the distinct
# usernames and countries of the tweets assigned to it.
num_words = 10
print(f'Top {num_words} terms per cluster:\n')

# For each centroid, order the feature indices by descending centroid weight,
# so the first entries of a row are that cluster's most characteristic terms.
order_centroids = km.cluster_centers_.argsort()[:, ::-1]

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older installations.
if hasattr(vectorizer, 'get_feature_names_out'):
  terms = vectorizer.get_feature_names_out()
else:
  terms = vectorizer.get_feature_names()

for i in range(num_clusters):
  print(f'Cluster {i} words:', end='')
  # Slice with num_words: the original used an undefined name `n` here, which
  # raises NameError on a fresh kernel.
  for index in order_centroids[i, :num_words]:
    print(f' {terms[index]},', end='')

  # cluster_df is indexed by cluster label, so .loc[i] selects this cluster's rows.
  print(f'\nCluster {i} usernames:', end='')
  for u in cluster_df.loc[i]['username'].unique():
    print(f' {u},', end='')

  print(f'\nCluster {i} countries:', end='')
  for c in cluster_df.loc[i]['country'].unique():
    print(f' {c},', end='')

  print('\n')

Top 10 terms per cluster:

Cluster 0 words: people, south, covid19, s, africa, need, country, government, amp, work,
Cluster 0 usernames: CyrilRamaphosa, DrZweliMkhize, GeorgeWeahOff, MBuhari, femigbaja,
Cluster 0 countries: South Africa, Liberia, Nigeria,

Cluster 1 words: total number, total, number, number confirmed, number deaths, confirmed, deaths, cases total, recoveries, confirmed covid19,
Cluster 1 usernames: DrZweliMkhize,
Cluster 1 countries: South Africa,

Cluster 2 words: app, sa, sa app, covid alert, alert sa, alert, covid, use covid, app protect, ones community,
Cluster 2 usernames: DrZweliMkhize,
Cluster 2 countries: South Africa,

Cluster 3 words: statistics, covid19 statistics, covid19, statistics sa, sa, statistics south, sa august, july, sa july, august,
Cluster 3 usernames: CyrilRamaphosa, DrZweliMkhize,
Cluster 3 countries: South Africa,

Cluster 4 words: health, covid19, dr, workers, healthcare, minister, listentotheexperts, vaccine, says, mkhize,
Cluster 4 userna