In [1]:
import numpy as np
import pandas as pd
from collections import Counter
#C=Counter(test_label)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
from nltk.corpus import stopwords
from nltk.cluster import KMeansClusterer, cosine_distance, euclidean_distance
from sklearn import metrics
from matplotlib import pyplot as plt

In [3]:
train = pd.read_csv("cleaned_data.csv")
for i in range(2072):
    if train["tweet"].isnull()[i] == True:
        print(i)

In [4]:
# Clean data
train_text=train["tweet"]
train_text.dropna(axis=0, how='any', inplace=True)
train_text.reset_index(drop=True, inplace=True)

In [5]:
train_y=train['Polarity_cat'].drop(index=[1372, 1455])
train_y.reset_index(drop=True, inplace=True)

# Calculate tfidf

In [6]:
my_stopwords = set(stopwords.words('english'))
my_stopwords.update(['pfizer','pfizervaccine','vaccine'])
#tfidf_vect = TfidfVectorizer(stop_words=my_stopwords,min_df=1)
tfidf_vect = TfidfVectorizer(stop_words=my_stopwords,min_df=3,ngram_range=(1,3))
dtm= tfidf_vect.fit_transform(train_text)
print (dtm.shape)

(2072, 6306)


# 2 Cluster

In [8]:
#Cluster by 2 classes
num_clusters=2
clusterer_2 = KMeansClusterer(num_clusters, cosine_distance, repeats=50)
#clusterer_2 = KMeansClusterer(num_clusters, euclidean_distance, repeats=30)
clusters_2 = clusterer_2.cluster(dtm.toarray(), assign_clusters=True)

In [9]:
centroids=np.array(clusterer_2.means())
sorted_centroids = centroids.argsort()[:, ::-1]
voc_lookup= tfidf_vect.get_feature_names()
for i in range(num_clusters):
    
    # get words with top 20 tf-idf weight in the centroid
    top_words=[voc_lookup[word_index] \
               for word_index in sorted_centroids[i, :50]]
    print("Cluster %d:\n %s " % (i, "; ".join(top_words)))

Cluster 0:
 covid; dose; get; amp; first; got; vaccines; moderna; biontech; doses; one; vaccinated; people; second; vaccination; today; covidvaccine; astrazeneca; says; effective; new; jab; fda; getting; done; like; children; shot; week; first dose; received; news; india; az; day; eu; us; weeks; coronavirus; need; year; fully; mrna; second dose; health; data; study; know; real; take 
Cluster 1:
 old; yrs; yrs old; yrs old yrs; old yrs old; old yrs; died; trovan; august; year; second; year old; amp; kids; vaccines; drug; gravely; gravely injured died; gravely injured; injured died; injured; stage; pfecolleague; pfizerproud; oct; still; consent; claimed; suffered; consent children; consent children claimed; drugmaker tested; claimed trovan; claimed trovan kids; clinical stage; clinical stage development; drugmaker tested without; development drugmaker; development drugmaker tested; without parents consent; parents consent; children claimed trovan; trovan kids; children claimed; still cli

# 3 Cluster

In [11]:
#Cluster by 3 classes
num_clusters=3
clusterer_3 = KMeansClusterer(num_clusters, cosine_distance, repeats=50)
#clusterer_3 = KMeansClusterer(num_clusters, euclidean_distance, repeats=30)
clusters_3 = clusterer_3.cluster(dtm.toarray(), assign_clusters=True)

In [12]:
centroids=np.array(clusterer_3.means())
sorted_centroids = centroids.argsort()[:, ::-1]
voc_lookup= tfidf_vect.get_feature_names()
for i in range(num_clusters):
    
    # get words with top 20 tf-idf weight in the centroid
    top_words=[voc_lookup[word_index] \
               for word_index in sorted_centroids[i, :50]]
    print("Cluster %d:\n %s " % (i, "; ".join(top_words)))

Cluster 0:
 covid; dose; got; doses; first; get; biontech; second; fda; vaccinated; vaccination; today; says; coronavirus; eu; done; people; moderna; shot; effective; jab; week; second dose; covidvaccine; new; first dose; weeks; children; news; fully; study; one; side; received; getting; az; health; year; like; million; via; may; got first; could; effects; amp; uk; variant; day; side effects 
Cluster 1:
 vaccines; amp; india; moderna; astrazeneca; one; data; covid; pfizerproud; pfecolleague; modernavaccine; take; effective; transmission; people; best; made; trial; us; make; like; go; mrna; know; get; delta; covid vaccines; omicron; sputnik; preventing; pfizerproud pfecolleague; come; efficacy; astrazenecavaccine; company; scientists; program; good; use; covidvaccine; sinovac; going; also; delta alpha; alpha; new; booster; bhagat; npfizer; future 
Cluster 2:
 old; yrs old; yrs; old yrs; old yrs old; yrs old yrs; died; trovan; year; august; year old; second; day; gravely injured died; in

# 3 Cluster & Polarity score

In [13]:
df1=pd.concat([train_text, train_y], axis=1)
df1_train_0 = df1.sample(frac=0.8,random_state=0,axis=0)
df_train = df1[df1.index.isin(df1_train_0.index)]      #80% for training
df_test = df1[~df1.index.isin(df1_train_0.index)]  

In [14]:
train_text = df_train['tweet']
train_label = df_train['Polarity_cat']
test_text = df_test['tweet']
test_label = df_test['Polarity_cat']

In [15]:
my_stopwords = set(stopwords.words('english'))
my_stopwords.update(['pfizer','pfizervaccine','vaccine'])
#tfidf_vect = TfidfVectorizer(stop_words=my_stopwords,min_df=1)
tfidf_vect = TfidfVectorizer(stop_words=my_stopwords,min_df=3,ngram_range=(1,3))
dtm_train= tfidf_vect.fit_transform(train_text)
print (dtm_train.shape)

(1658, 5233)


In [16]:
num_clusters=3
clusterer = KMeansClusterer(num_clusters, cosine_distance, avoid_empty_clusters=True, repeats=30)
#clusterer = KMeansClusterer(num_clusters, euclidean_distance, repeats=30)
clusters = clusterer.cluster(dtm_train.toarray(), assign_clusters=True)

In [17]:
centroids=np.array(clusterer.means())
sorted_centroids = centroids.argsort()[:, ::-1]
voc_lookup= tfidf_vect.get_feature_names()
for i in range(num_clusters):
    
    # get words with top 20 tf-idf weight in the centroid
    top_words=[voc_lookup[word_index] \
               for word_index in sorted_centroids[i, :25]]
    print("Cluster %d:\n %s " % (i, "; ".join(top_words)))

Cluster 0:
 dose; got; get; first; second; one; moderna; today; vaccinated; first dose; mrna; people; second dose; fully; shot; done; like; go; everyone; az; vaccination; got first; finally; getting; immunity 
Cluster 1:
 covid; amp; biontech; doses; vaccines; effective; news; coronavirus; astrazeneca; study; eu; says; new; children; data; moderna; week; people; uk; omicron; delta; covidvaccine; vaccination; trial; via 
Cluster 2:
 old; yrs old; yrs; old yrs old; old yrs; yrs old yrs; died; trovan; august; year; year old; second; pfecolleague; pfizerproud; oct; day; claimed; get; drug; gravely; gravely injured died; gravely injured; injured died; injured; suffered 


In [18]:
test_dtm = tfidf_vect.transform(test_text)
predicted = [clusterer.classify(v) for v in test_dtm.toarray()]
predicted[0:10]

[1, 0, 1, 1, 2, 0, 0, 1, 0, 2]

In [19]:
confusion_df = pd.DataFrame(list(zip(test_label.values, predicted)),\
                            columns = ["label", "cluster"])
#confusion_df.head()
cluster_crosstab = pd.crosstab(index=confusion_df.cluster, columns=confusion_df.label)
cluster_crosstab

label,Negative,Neutral,Positive
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,16,38,64
1,27,61,86
2,28,35,59


In [20]:
cluster_crosstab.idxmax(axis = 1)[2]

'Positive'

In [21]:
cluster_dict={0:'Negative',\
              1:'Positive',\
              2:'Neutral'}
predicted_target=[cluster_dict[i] \
                  for i in predicted]
print(metrics.classification_report\
      (test_label, predicted_target))

              precision    recall  f1-score   support

    Negative       0.14      0.23      0.17        71
     Neutral       0.29      0.26      0.27       134
    Positive       0.49      0.41      0.45       209

    accuracy                           0.33       414
   macro avg       0.31      0.30      0.30       414
weighted avg       0.37      0.33      0.34       414



# 4 Cluster

In [24]:
num_clusters=4
clusterer_4 = KMeansClusterer(num_clusters, cosine_distance, avoid_empty_clusters=True, repeats=30)
#clusterer_4 = KMeansClusterer(num_clusters, euclidean_distance, repeats=30)
clusters_4 = clusterer_4.cluster(dtm.toarray(), assign_clusters=True)

KeyboardInterrupt: 

In [None]:
centroids=np.array(clusterer_4.means())
sorted_centroids = centroids.argsort()[:, ::-1]
voc_lookup= tfidf_vect.get_feature_names()
for i in range(num_clusters):
    
    # get words with top 20 tf-idf weight in the centroid
    top_words=[voc_lookup[word_index] \
               for word_index in sorted_centroids[i, :50]]
    print("Cluster %d:\n %s " % (i, "; ".join(top_words)))

# 7 Cluster

In [None]:
num_clusters=7
clusterer_7 = KMeansClusterer(num_clusters, cosine_distance, avoid_empty_clusters=True, repeats=25)
#clusterer_7 = KMeansClusterer(num_clusters, euclidean_distance, repeats=30)
clusters_7 = clusterer_7.cluster(dtm.toarray(), assign_clusters=True)

In [None]:
centroids=np.array(clusterer_7.means())
sorted_centroids = centroids.argsort()[:, ::-1]
voc_lookup= tfidf_vect.get_feature_names()
for i in range(num_clusters):
    
    # get words with top 20 tf-idf weight in the centroid
    top_words=[voc_lookup[word_index] \
               for word_index in sorted_centroids[i, :30]]
    print("Cluster %d:\n %s " % (i, "; ".join(top_words)))