Clone the project repo containing the Twitter data

In [1]:
!git clone https://github.com/Data-Mining-2021/project.git

fatal: destination path 'project' already exists and is not an empty directory.


Get all imports at once

In [2]:
import pandas as pd
!pip install langdetect
from langdetect import detect
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn import metrics
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from joblib import dump, load
import nltk
nltk.download('punkt')
import re

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Filter Text

In [3]:
# https://stackoverflow.com/a/49146722/330558
def remove_emoji(string):
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002500-\U00002BEF"  # chinese char
                               u"\U00002702-\U000027B0"
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               u"\U0001f926-\U0001f937"
                               u"\U00010000-\U0010ffff"
                               u"\u2640-\u2642"
                               u"\u2600-\u2B55"
                               u"\u200d"
                               u"\u23cf"
                               u"\u23e9"
                               u"\u231a"
                               u"\ufe0f"  # dingbats
                               u"\u3030"
                               "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', string)

# filter out non-english tweets - also checks for NaN and tweets only containing emojis
def filter_tweets(tweets_df):
    for index in tweets_df.index:
        currText = tweets_df.loc[index, 'Text']
        if currText != currText or not remove_emoji(currText).strip() or detect(currText) != 'en':
            tweets_df = tweets_df.drop([index])
    return tweets_df

Load in the data and then filter the data

In [4]:
nontrump_df = pd.read_csv('/content/project/Results/all_tweets_notrump_blanksremoved.csv')
alltweets_df = pd.read_csv('/content/project/Results/all_tweets.csv')
trump_df = pd.read_csv('/content/project/Results/trump_tweets.csv')
#nontrump_df = pd.read_csv('/content/project/all_tweets_notrump_blanksremoved.csv') #use later when clone isn't being a bitch

#filter the dataframe for each data set
nontrump_filtered_df = filter_tweets(nontrump_df)
nontrump_text = nontrump_filtered_df['Text']

alltweets_filtered_df = filter_tweets(alltweets_df)
alltweets_text = alltweets_filtered_df['Text']

trump_filtered_df = filter_tweets(trump_df)
trump_text = trump_filtered_df['Text']


Generate kmeans cluster file from TFIDF vectorization of tweets

In [5]:
def tokenize_tweet(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
    tokens = [word.lower() for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token) and len(token) > 3:
            filtered_tokens.append(token)
    return filtered_tokens

# vectorize into tfidf format (1-grams and 2-grams)
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=200000, max_df=0.8, tokenizer=tokenize_tweet, use_idf=True, stop_words='english')


In [6]:
#perform Kmeans for the non-trump tweets
tfidf_matrix_nontrump = vectorizer.fit_transform(nontrump_text)

# cluster using k-means++
num_clusters_nontrump = 7
km = KMeans(n_clusters=num_clusters_nontrump)
km.fit(tfidf_matrix_nontrump)
clusters_nontrump = km.labels_.tolist()

# export cluster data
dump(km, 'nontrump_clusters_7.pkl')

['nontrump_clusters_7.pkl']

In [7]:
#perform Kmeans for the trump tweets with 7 and 10 clusters
tfidf_matrix_trump = vectorizer.fit_transform(trump_text)

# cluster using k-means++ with 7 clusters
num_clusters_trump1 = 7
km = KMeans(n_clusters=num_clusters_trump1)
km.fit(tfidf_matrix_trump)
clusters_trump1 = km.labels_.tolist()
# export cluster data
dump(km, 'trump_clusters_7.pkl')


# cluster using k-means++ with 10 clusters
num_clusters_trump2 = 10
km = KMeans(n_clusters=num_clusters_trump2)
km.fit(tfidf_matrix_trump)
clusters_trump2 = km.labels_.tolist()
# export cluster data
dump(km, 'trump_clusters_10.pkl')

['trump_clusters_10.pkl']

In [8]:
#perform Kmeans for the all tweets
tfidf_matrix_all = vectorizer.fit_transform(alltweets_text)
dump(tfidf_matrix_all, 'all_tweets_vectorized.pkl')

# cluster using k-means++
num_clusters_all = 7
km = KMeans(n_clusters=num_clusters_all)
km.fit(tfidf_matrix_all)
clusters_all = km.labels_.tolist()

# export cluster data
dump(km, 'alltweets_clusters_7.pkl')

['alltweets_clusters_7.pkl']

Organize TFIDF clusters into a dataframe w/ corresponding tweet data (username, country)

Also print the results for each data set

In [9]:
#show cluster occurrence by country for non trump tweets
# reload the cluster data
km_nontrump = load('nontrump_clusters_7.pkl')
clusters_nontrump = km_nontrump.labels_.tolist()

#define the vocabulary
nontrump_vocab = vectorizer.vocabulary_

# enter tfidf cluster data and corresponding twitter users into dataframe
tweets_nontrump = { 'cluster': clusters_nontrump, 'username': nontrump_filtered_df['Username'].tolist(), 'country': nontrump_filtered_df['Country'].tolist() }
cluster_nontrump_df = pd.DataFrame(tweets_nontrump, index = [clusters_nontrump], columns = ['cluster', 'username', 'country'])

print('Cluster occurrence based on country:')
grouped_nontrump = cluster_nontrump_df['cluster']
grouped_nontrump = cluster_nontrump_df['cluster'].groupby(cluster_nontrump_df['country'])
grouped_nontrump.value_counts()

Cluster occurrence based on country:


country       cluster
Australia     0          1001
              2           223
              5             1
              6             1
Canada        0          4645
              6           892
              2           479
              5           106
Chile         0           308
              2            42
India         0           534
              2             6
Israel        0          2944
              2           230
              6           205
              5             1
Liberia       0            29
              2             1
New Zealand   0           308
              2            14
Nigeria       0           984
              2            53
South Africa  0          5556
              2           708
              4           357
              1           209
              5             4
South Korea   0           198
              2             1
UN            0          2572
              2           138
USA           0          8109
              2   

In [10]:
#display the cluster data for nontrump tweets
num_words = 10
print(f'Top {num_words} terms per cluster:\n')

# sort cluster centers by proximity to centroid
order_centroids = km_nontrump.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()

for i in range(num_clusters_nontrump):
  print(f'Cluster {i} words:', end='')
  for index in order_centroids[i, :num_words]:
    print(f' {terms[index]},', end='')

  print(f'\nCluster {i} usernames:', end='')
  for u in cluster_nontrump_df.loc[i]['username'].unique():
    print(f' {u},', end='')

  print(f'\nCluster {i} countries:', end='')
  for c in cluster_nontrump_df.loc[i]['country'].unique():
    print(f' {c},', end='')

  print('\n')

Top 10 terms per cluster:

Cluster 0 words: priority- encouraged, global covax, trained respected, parties usually, reached 50th, work public, future storms, fight york, touch member, times realistic,
Cluster 0 usernames: JoeBiden, SpeakerPelosi, LeaderMcConnell, NYGovCuomo, GovRonDeSantis, Canada, JustinTrudeau, CanadianPM, fordnation, CyrilRamaphosa, DrZweliMkhize, GeorgeWeahOff, MBuhari, femigbaja, PresidentRuvi, IsraeliPM, nsitharaman, TheBlueHouseENG, ScottMorrisonMP, GregHuntMP, jacindaardern, AndrewLittleMP, mbachelet, antonioguterres, AminaJMohammed, volkan_bozkir,
Cluster 0 countries: USA, Canada, South Africa, Liberia, Nigeria, Israel, India, South Korea, Australia, New Zealand, Chile, UN,

Cluster 1 words: fulfil, ahppc, fugitive, aimed, place challenging, succesful conclusion, country second, reimbursements, reach health, veterans guide,
Cluster 1 usernames: DrZweliMkhize,
Cluster 1 countries: South Africa,

Cluster 2 words: nation especially, businesses local, nation faith

In [11]:
#show cluster occurrence by country for trump tweets
# reload the cluster data
km_trump = load('trump_clusters_7.pkl')
clusters_trump = km_trump.labels_.tolist()

#define the vocabulary
trump_vocab = vectorizer.vocabulary_

# enter tfidf cluster data and corresponding twitter users into dataframe
tweets_trump = { 'cluster': clusters_trump, 'username': trump_filtered_df['Username'].tolist(), 'country': trump_filtered_df['Country'].tolist() }
cluster_trump_df = pd.DataFrame(tweets_trump, index = [clusters_trump], columns = ['cluster', 'username', 'country'])

print('Cluster occurrence based on country:')
grouped_trump = cluster_trump_df['cluster']
grouped_trump = cluster_trump_df['cluster'].groupby(cluster_trump_df['country'])
grouped_trump.value_counts()


Cluster occurrence based on country:


country  cluster
Trump    5          3648
         0           459
         1           301
         3           237
         2           192
         6            93
         4            27
Name: cluster, dtype: int64

In [12]:
#display the cluster data for trump tweets
num_words = 10
print(f'Top {num_words} terms per cluster:\n')

# sort cluster centers by proximity to centroid
order_centroids = km_trump.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()

for i in range(num_clusters_trump1):
  print(f'Cluster {i} words:', end='')
  for index in order_centroids[i, :num_words]:
    print(f' {terms[index]},', end='')

  print(f'\nCluster {i} usernames:', end='')
  for u in cluster_trump_df.loc[i]['username'].unique():
    print(f' {u},', end='')

  print(f'\nCluster {i} countries:', end='')
  for c in cluster_trump_df.loc[i]['country'].unique():
    print(f' {c},', end='')

  print('\n')

Top 10 terms per cluster:

Cluster 0 words: arguing, faith family, islamabad president-elect, isixhosa, madam, coming suburbs, maddox prize, fake media, muafak, musa mthombheni,
Cluster 0 usernames: theRealDonaldTrump,
Cluster 0 countries: Trump,

Cluster 1 words: correct automobile, corporations provide, great victory, fully support, green deal, expertise, environments, experts, boris johnson, fattah al-burhan,
Cluster 1 usernames: theRealDonaldTrump,
Cluster 1 countries: Trump,

Cluster 2 words: mmjoshi murli_manohar_joshi, following necessary, mobilized communities, ahead today, israel wear, ahead path, israelelections israelex4, keeping promise, helped wuhan, help stop,
Cluster 2 usernames: theRealDonaldTrump,
Cluster 2 countries: Trump,

Cluster 3 words: campaign finance, cooperate need, campaign powered, monitor ongoing, monitor developments, administered doses, limiting, limited vaccine, moving c_mulroney, message appeared,
Cluster 3 usernames: theRealDonaldTrump,
Cluster 3 coun

In [13]:
#show cluster occurrence by country for all tweets
# reload the cluster data
km_alltweets = load('alltweets_clusters_7.pkl')
clusters_alltweets = km_alltweets.labels_.tolist()

#define the vocabulary
alltweets_vocab = vectorizer.vocabulary_

# enter tfidf cluster data and corresponding twitter users into dataframe
tweets_alltweets = { 'cluster': clusters_alltweets, 'username': alltweets_filtered_df['Username'].tolist(), 'country': alltweets_filtered_df['Country'].tolist() }
cluster_alltweets_df = pd.DataFrame(tweets_alltweets, index = [clusters_alltweets], columns = ['cluster', 'username', 'country'])

print('Cluster occurrence based on country:')
grouped_alltweets = cluster_alltweets_df['cluster']
grouped_alltweets = cluster_alltweets_df['cluster'].groupby(cluster_alltweets_df['country'])
grouped_alltweets.value_counts()

Cluster occurrence based on country:


country       cluster
Australia     1          1224
              4             1
              5             1
Canada        1          4933
              6           985
              3           188
              4            11
Chile         1           350
India         1           540
Israel        1          2565
              4           813
              3             1
Liberia       1            30
New Zealand   1           320
              4             2
Nigeria       1          1037
              3             1
South Africa  1          6253
              0           367
              2           209
              5             4
              4             1
South Korea   1           199
Trump         1          4912
              3            31
              4             3
              5             2
UN            1          2709
              3             1
              4             1
USA           1          8046
              3           649
              5   

In [14]:
#display the cluster data for all tweets
num_words = 10
print(f'Top {num_words} terms per cluster:\n')

# sort cluster centers by proximity to centroid
order_centroids = km_alltweets.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()

for i in range(num_clusters_all):
  print(f'Cluster {i} words:', end='')
  for index in order_centroids[i, :num_words]:
    print(f' {terms[index]},', end='')

  print(f'\nCluster {i} usernames:', end='')
  for u in cluster_alltweets_df.loc[i]['username'].unique():
    print(f' {u},', end='')

  print(f'\nCluster {i} countries:', end='')
  for c in cluster_alltweets_df.loc[i]['country'].unique():
    print(f' {c},', end='')

  print('\n')

Top 10 terms per cluster:

Cluster 0 words: total, total number, number, yesterday, today, number confirmed, number deaths, confirmed, confirmed covid19, deaths,
Cluster 0 usernames: NYGovCuomo, DrZweliMkhize,
Cluster 0 countries: USA, South Africa,

Cluster 1 words: people, covid19, today, thank, health, great, need, president, country, work,
Cluster 1 usernames: JoeBiden, SpeakerPelosi, LeaderMcConnell, NYGovCuomo, GovRonDeSantis, Canada, JustinTrudeau, CanadianPM, fordnation, CyrilRamaphosa, DrZweliMkhize, GeorgeWeahOff, MBuhari, femigbaja, PresidentRuvi, IsraeliPM, nsitharaman, TheBlueHouseENG, ScottMorrisonMP, GregHuntMP, jacindaardern, AndrewLittleMP, mbachelet, antonioguterres, AminaJMohammed, volkan_bozkir, theRealDonaldTrump,
Cluster 1 countries: USA, Canada, South Africa, Liberia, Nigeria, Israel, India, South Korea, Australia, New Zealand, Chile, UN, Trump,

Cluster 2 words: covid alert, alert, covid, alert protect, ones community, start using, community start, protect loved

In [15]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
cluster_colors = {0: '#ff0000', 1: '#ffff00', 2: '#00ff00', 3: '#00ffff', 4: '#0000ff', 5: '#ff00ff', 6: '#888888'}


import os  # for os.path.basename

import matplotlib.pyplot as plt
import matplotlib as mpl

from sklearn.manifold import MDS

MDS()

# convert two components as we're plotting points in a two-dimensional plane
# "precomputed" because we provide a distance matrix
# we will also specify `random_state` so the plot is reproducible.
mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)

pos = mds.fit_transform(1-cosine_similarity(tfidf_matrix_all))  # shape (n_components, n_samples)

xs, ys = pos[:, 0], pos[:, 1]
print()
print()
clusters = km_alltweets.labels_.tolist()
#some ipython magic to show the matplotlib plots inline
%matplotlib inline 



In [None]:
xs.shape

In [None]:
ys.shape

In [None]:
#create data frame that has the result of the MDS plus the cluster numbers and titles
df = pd.DataFrame(dict(x=xs, y=ys, label=clusters)) 

#group by cluster
groups = df.groupby('label')


# set up plot
fig, ax = plt.subplots(figsize=(17, 9)) # set size
ax.margins(0.05) # Optional, just adds 5% padding to the autoscaling

#iterate through groups to layer the plot
#note that I use the cluster_name and cluster_color dicts with the 'name' lookup to return the appropriate color/label
for name, group in groups:
    ax.plot(group.x, group.y, marker='o', linestyle='', ms=12, 
            label=cluster_names[name], color=cluster_colors[name], 
            mec='none')
    ax.set_aspect('auto')
    ax.tick_params(\
        axis= 'x',          # changes apply to the x-axis
        which='both',      # both major and minor ticks are affected
        bottom='off',      # ticks along the bottom edge are off
        top='off',         # ticks along the top edge are off
        labelbottom='off')
    ax.tick_params(\
        axis= 'y',         # changes apply to the y-axis
        which='both',      # both major and minor ticks are affected
        left='off',      # ticks along the bottom edge are off
        top='off',         # ticks along the top edge are off
        labelleft='off')
    
ax.legend(numpoints=1)  #show legend with only 1 point

#add label in x,y position with the label as the film title
for i in range(len(df)):
    ax.text(df.ix[i]['x'], df.ix[i]['y'], df.ix[i]['title'], size=8)  

    
    
plt.show() #show the plot

#uncomment the below to save the plot if need be
#plt.savefig('clusters_small_noaxes.png', dpi=200)