Clone the project repo containing the Twitter data

In [5]:
!git clone https://github.com/Data-Mining-2021/project.git

fatal: destination path 'project' already exists and is not an empty directory.


Get all imports at once

In [6]:
import pandas as pd
!pip install langdetect
from langdetect import detect
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn import metrics
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from joblib import dump, load
import nltk
nltk.download('punkt')
import re

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Filter Text

In [7]:
# https://stackoverflow.com/a/49146722/330558
def remove_emoji(string):
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002500-\U00002BEF"  # chinese char
                               u"\U00002702-\U000027B0"
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               u"\U0001f926-\U0001f937"
                               u"\U00010000-\U0010ffff"
                               u"\u2640-\u2642"
                               u"\u2600-\u2B55"
                               u"\u200d"
                               u"\u23cf"
                               u"\u23e9"
                               u"\u231a"
                               u"\ufe0f"  # dingbats
                               u"\u3030"
                               "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', string)

# filter out non-english tweets - also checks for NaN and tweets only containing emojis
def filter_tweets(tweets_df):
    for index in tweets_df.index:
        currText = tweets_df.loc[index, 'Text']
        if currText != currText or not remove_emoji(currText).strip() or detect(currText) != 'en':
            tweets_df = tweets_df.drop([index])
    return tweets_df

Load in the data and then filter the data

In [4]:
nontrump_df = pd.read_csv('/content/project/Results/all_tweets_notrump_blanksremoved.csv')
alltweets_df = pd.read_csv('/content/project/Results/all_tweets.csv')
trump_df = pd.read_csv('/content/project/Results/trump_tweets.csv')
#nontrump_df = pd.read_csv('/content/project/all_tweets_notrump_blanksremoved.csv') #use later when clone isn't being a bitch

#filter the dataframe for each data set
nontrump_filtered_df = filter_tweets(nontrump_df)
nontrump_text = nontrump_filtered_df['Text']

alltweets_filtered_df = filter_tweets(alltweets_df)
alltweets_text = alltweets_filtered_df['Text']

trump_filtered_df = filter_tweets(trump_df)
trump_text = trump_filtered_df['Text']


Generate kmeans cluster file from TFIDF vectorization of tweets

In [8]:
def tokenize_tweet(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
    tokens = [word.lower() for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token) and len(token) > 3:
            filtered_tokens.append(token)
    return filtered_tokens

# vectorize into tfidf format (1-grams and 2-grams)
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=200000, max_df=0.8, tokenizer=tokenize_tweet, use_idf=True, stop_words='english')


In [9]:
#perform Kmeans for the non-trump tweets
tfidf_matrix_nontrump = vectorizer.fit_transform(nontrump_text)

# cluster using k-means++
num_clusters_nontrump = 7
km = KMeans(n_clusters=num_clusters_nontrump)
km.fit(tfidf_matrix_nontrump)
clusters_nontrump = km.labels_.tolist()

# export cluster data
dump(km, 'nontrump_clusters_7.pkl')

['nontrump_clusters_7.pkl']

In [10]:
#perform Kmeans for the trump tweets with 7 and 10 clusters
tfidf_matrix_trump = vectorizer.fit_transform(trump_text)

# cluster using k-means++ with 7 clusters
num_clusters_trump1 = 7
km = KMeans(n_clusters=num_clusters_trump1)
km.fit(tfidf_matrix_trump)
clusters_trump1 = km.labels_.tolist()
# export cluster data
dump(km, 'trump_clusters_7.pkl')


# cluster using k-means++ with 10 clusters
num_clusters_trump2 = 10
km = KMeans(n_clusters=num_clusters_trump2)
km.fit(tfidf_matrix_trump)
clusters_trump2 = km.labels_.tolist()
# export cluster data
dump(km, 'trump_clusters_10.pkl')

['trump_clusters_10.pkl']

In [20]:
#perform Kmeans for the all tweets
tfidf_matrix_all = vectorizer.fit_transform(alltweets_text)
dump(tfidf_matrix_all, 'all_tweets_vectorized.pkl')

# cluster using k-means++
num_clusters_all = 7
km = KMeans(n_clusters=num_clusters_all)
km.fit(tfidf_matrix_all)
clusters_all = km.labels_.tolist()

# export cluster data
dump(km, 'alltweets_clusters_7.pkl')

['alltweets_clusters_7.pkl']

Organize TFIDF clusters into a dataframe w/ corresponding tweet data (username, country)

Also print the results for each data set

In [12]:
#show cluster occurrence by country for non trump tweets
# reload the cluster data
km_nontrump = load('nontrump_clusters_7.pkl')
clusters_nontrump = km_nontrump.labels_.tolist()

#define the vocabulary
nontrump_vocab = vectorizer.vocabulary_

# enter tfidf cluster data and corresponding twitter users into dataframe
tweets_nontrump = { 'cluster': clusters_nontrump, 'username': nontrump_filtered_df['Username'].tolist(), 'country': nontrump_filtered_df['Country'].tolist() }
cluster_nontrump_df = pd.DataFrame(tweets_nontrump, index = [clusters_nontrump], columns = ['cluster', 'username', 'country'])

print('Cluster occurrence based on country:')
grouped_nontrump = cluster_nontrump_df['cluster']
grouped_nontrump = cluster_nontrump_df['cluster'].groupby(cluster_nontrump_df['country'])
grouped_nontrump.value_counts()

Cluster occurrence based on country:


country       cluster
Australia     5          1223
              3             1
              4             1
Canada        5          5061
              4           987
              3            71
              2             2
Chile         5           350
India         5           543
Israel        5          3285
              4            56
              2            37
              3             1
Liberia       5            30
New Zealand   5           322
Nigeria       5          1036
              2             1
South Africa  5          6260
              6           357
              0           209
              3             4
              2             1
South Korea   5           199
UN            5          2711
              2             1
USA           5          7774
              2           918
              3           440
              1           285
Name: cluster, dtype: int64

In [13]:
#display the cluster data for nontrump tweets
num_words = 10
print(f'Top {num_words} terms per cluster:\n')

# sort cluster centers by proximity to centroid
order_centroids = km_nontrump.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()

for i in range(num_clusters_nontrump):
  print(f'Cluster {i} words:', end='')
  for index in order_centroids[i, :num_words]:
    print(f' {terms[index]},', end='')

  print(f'\nCluster {i} usernames:', end='')
  for u in cluster_nontrump_df.loc[i]['username'].unique():
    print(f' {u},', end='')

  print(f'\nCluster {i} countries:', end='')
  for c in cluster_nontrump_df.loc[i]['country'].unique():
    print(f' {c},', end='')

  print('\n')

Top 10 terms per cluster:

Cluster 0 words: fully power, ahmed saif, fully lived, aime hadrien, place china, country teams, success assured., reined away, veterans hispanic, reach learn,
Cluster 0 usernames: DrZweliMkhize,
Cluster 0 countries: South Africa,

Cluster 1 words: truly equal, yesterday meeting, time years, value women, quality time, treatment help, moment holiday, necessary says, services widening, performed immigrant,
Cluster 1 usernames: NYGovCuomo,
Cluster 1 countries: USA,

Cluster 2 words: underway currently, know thinking, know taking, reached critical, real worldenvironmentalhealthday, pandemic adopted, amazing news, wish grant, need advice, wish ignore,
Cluster 2 usernames: JoeBiden, SpeakerPelosi, LeaderMcConnell, NYGovCuomo, GovRonDeSantis, JustinTrudeau, fordnation, CyrilRamaphosa, MBuhari, PresidentRuvi, IsraeliPM, antonioguterres,
Cluster 2 countries: USA, Canada, South Africa, Nigeria, Israel, UN,

Cluster 3 words: week explosion, bipartisan natlgovsassoc, wee

In [14]:
#show cluster occurrence by country for trump tweets
# reload the cluster data
km_trump = load('trump_clusters_7.pkl')
clusters_trump = km_trump.labels_.tolist()

#define the vocabulary
trump_vocab = vectorizer.vocabulary_

# enter tfidf cluster data and corresponding twitter users into dataframe
tweets_trump = { 'cluster': clusters_trump, 'username': trump_filtered_df['Username'].tolist(), 'country': trump_filtered_df['Country'].tolist() }
cluster_trump_df = pd.DataFrame(tweets_trump, index = [clusters_trump], columns = ['cluster', 'username', 'country'])

print('Cluster occurrence based on country:')
grouped_trump = cluster_trump_df['cluster']
grouped_trump = cluster_trump_df['cluster'].groupby(cluster_trump_df['country'])
grouped_trump.value_counts()


Cluster occurrence based on country:


country  cluster
Trump    0          3850
         3           305
         5           257
         1           237
         2           200
         6            76
         4            31
Name: cluster, dtype: int64

In [15]:
#display the cluster data for trump tweets
num_words = 10
print(f'Top {num_words} terms per cluster:\n')

# sort cluster centers by proximity to centroid
order_centroids = km_trump.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()

for i in range(num_clusters_trump1):
  print(f'Cluster {i} words:', end='')
  for index in order_centroids[i, :num_words]:
    print(f' {terms[index]},', end='')

  print(f'\nCluster {i} usernames:', end='')
  for u in cluster_trump_df.loc[i]['username'].unique():
    print(f' {u},', end='')

  print(f'\nCluster {i} countries:', end='')
  for c in cluster_trump_df.loc[i]['country'].unique():
    print(f' {c},', end='')

  print('\n')

Top 10 terms per cluster:

Cluster 0 words: days suggestion, argentine president, history police, epidemic, mubarak president, ceremonies, including health, moon manifest, mn08, critical,
Cluster 0 usernames: theRealDonaldTrump,
Cluster 0 countries: Trump,

Cluster 1 words: campaign destroy, cooperating creates, campaign maximum, monitor symptoms, monitor manage, administered date, lincoln alexander, lincoln, moving devastating, message leaders,
Cluster 1 usernames: theRealDonaldTrump,
Cluster 1 countries: Trump,

Cluster 2 words: mn08, following states, mobilized politics, ahead sunday, israeli citizens, ahead meeting, israeli people, kelli, helping families, help thousands,
Cluster 2 usernames: theRealDonaldTrump,
Cluster 2 countries: Trump,

Cluster 3 words: correct listen, great yesterday, corps fellows, functions, green transportation, experts legal, epidemic, experts safe, borders work, days suggestion,
Cluster 3 usernames: theRealDonaldTrump,
Cluster 3 countries: Trump,

Cluster

In [16]:
#show cluster occurrence by country for all tweets
# reload the cluster data
km_alltweets = load('alltweets_clusters_7.pkl')
clusters_alltweets = km_alltweets.labels_.tolist()

#define the vocabulary
alltweets_vocab = vectorizer.vocabulary_

# enter tfidf cluster data and corresponding twitter users into dataframe
tweets_alltweets = { 'cluster': clusters_alltweets, 'username': alltweets_filtered_df['Username'].tolist(), 'country': alltweets_filtered_df['Country'].tolist() }
cluster_alltweets_df = pd.DataFrame(tweets_alltweets, index = [clusters_alltweets], columns = ['cluster', 'username', 'country'])

print('Cluster occurrence based on country:')
grouped_alltweets = cluster_alltweets_df['cluster']
grouped_alltweets = cluster_alltweets_df['cluster'].groupby(cluster_alltweets_df['country'])
grouped_alltweets.value_counts()

Cluster occurrence based on country:


country       cluster
Australia     4          1003
              1           221
              0             1
              6             1
Canada        4          4411
              0           990
              1           475
              3           174
              6            71
Chile         4           308
              1            42
India         4           530
              1             6
Israel        4          2986
              1           239
              0           150
Liberia       4            29
              1             1
New Zealand   4           308
              1            14
Nigeria       4           982
              1            55
South Africa  4          5549
              1           710
              5           367
              2           209
              6             4
South Korea   4           197
              1             2
Trump         4          4944
              1            17
              6             3
UN            4   

In [17]:
#display the cluster data for all tweets
num_words = 10
print(f'Top {num_words} terms per cluster:\n')

# sort cluster centers by proximity to centroid
order_centroids = km_alltweets.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()

for i in range(num_clusters_all):
  print(f'Cluster {i} words:', end='')
  for index in order_centroids[i, :num_words]:
    print(f' {terms[index]},', end='')

  print(f'\nCluster {i} usernames:', end='')
  for u in cluster_alltweets_df.loc[i]['username'].unique():
    print(f' {u},', end='')

  print(f'\nCluster {i} countries:', end='')
  for c in cluster_alltweets_df.loc[i]['country'].unique():
    print(f' {c},', end='')

  print('\n')

Top 10 terms per cluster:

Cluster 0 words: prime minister, prime, minister, minister justin, justin trudeau, trudeau, justin, watch, today prime, live prime,
Cluster 0 usernames: JustinTrudeau, CanadianPM, fordnation, IsraeliPM, ScottMorrisonMP,
Cluster 0 countries: Canada, Israel, Australia,

Cluster 1 words: health, care, health care, public, public health, workers, mental, mental health, minister, covid19,
Cluster 1 usernames: JoeBiden, SpeakerPelosi, LeaderMcConnell, NYGovCuomo, GovRonDeSantis, Canada, JustinTrudeau, CanadianPM, fordnation, CyrilRamaphosa, DrZweliMkhize, GeorgeWeahOff, MBuhari, femigbaja, PresidentRuvi, IsraeliPM, nsitharaman, TheBlueHouseENG, ScottMorrisonMP, GregHuntMP, AndrewLittleMP, mbachelet, antonioguterres, AminaJMohammed, volkan_bozkir, theRealDonaldTrump,
Cluster 1 countries: USA, Canada, South Africa, Liberia, Nigeria, Israel, India, South Korea, Australia, New Zealand, Chile, UN, Trump,

Cluster 2 words: covid alert, alert, covid, alert protect, ones c

In [None]:
# from sklearn.metrics.pairwise import cosine_similarity

# #Make Trump 1 clusters to measure similarity between trump and other clusters
# #vectorizer_trump = TfidfVectorizer(ngram_range=(1, 2), max_features=200000, max_df=0.8, tokenizer=tokenize_tweet, use_idf=True, stop_words='english')
# tfidf_matrix_trump = vectorizer.fit_transform(trump_text)
# svd = TruncatedSVD(100)
# normalizer = Normalizer(copy=False)
# lsa = make_pipeline(svd, normalizer)
# tfidf_matrix_trump = lsa.fit_transform(tfidf_matrix_trump)
# # export cluster data
# dump(tfidf_matrix_trump, 'trump_tweets_vectorized_with_dimension_reduction.pkl')

# num_clusters_trump1 = 1
# km_trump = KMeans(n_clusters=num_clusters_trump1).fit(tfidf_matrix_trump)
# dump(km_trump, 'trump_cluster_1.pkl')
# trump_centroid = km_trump.cluster_centers_

# num_clusters_nontrump = 7
# #when run for the first time, uncomment the below line
# # tfidf_matrix_nontrump = lsa.fit_transform(tfidf_matrix_nontrump)
# dump(tfidf_matrix_nontrump, 'non_trump_tweets_vectorized_with_dimension_reduction.pkl')
# km_nontrump = KMeans(n_clusters=num_clusters_nontrump)
# km_nontrump.fit(tfidf_matrix_nontrump)
# dump(km_nontrump, 'non_trump_cluster_dim_reduction_7.pkl')
# non_trump_centroids = km_nontrump.cluster_centers_

# for i in range(num_clusters_nontrump):
#   non_trump_centroid = non_trump_centroids[i].reshape(1,-1)
#   print(f'Cluster {i} similarity with trump tweets: ', end='')
#   print(cosine_similarity(trump_centroid, non_trump_centroid)[0])



In [24]:
km_trump = load('trump_cluster_1.pkl')
trump_centroid = km_trump.cluster_centers_

km_nontrump = load('non_trump_cluster_dim_reduction_7.pkl')
non_trump_centroids = km_nontrump.cluster_centers_

for i in range(num_clusters_nontrump):
  non_trump_centroid = non_trump_centroids[i].reshape(1,-1)
  print(f'Cluster {i} similarity with trump tweets: ', end='')
  print(cosine_similarity(trump_centroid, non_trump_centroid)[0])

Cluster 0 similarity with trump tweets: [0.45748614]
Cluster 1 similarity with trump tweets: [0.34126566]
Cluster 2 similarity with trump tweets: [0.23121215]
Cluster 3 similarity with trump tweets: [0.2667449]
Cluster 4 similarity with trump tweets: [0.26625446]
Cluster 5 similarity with trump tweets: [0.29974988]
Cluster 6 similarity with trump tweets: [0.0676599]
