# Generate top-ten tokens for each cluster (Twitter)

In [None]:
# Initialization work
import os

import pandas as pd

# Location of the labelled Twitter CSV. Set the TWITTER_DATA_CSV environment
# variable to point the notebook at another copy of the file; when it is unset
# the original hard-coded path is used, so existing runs behave as before.
filename = os.environ.get(
    "TWITTER_DATA_CSV",
    "/home/quinton/Documents/COMET_data/twitter_comet/Twts1722-Labeled-k5k10-Cos-Euc-METADATA.csv",
)
# Doubles as the name of the cluster-label column in the CSV and, through
# int(cluster_count), as the number of clusters iterated over in later cells.
cluster_count = "5"
twitter_data = pd.read_csv(filename)
twitter_data.head()

At this point, we need to determine which posts belong to which cluster (note that one post can belong to multiple clusters).

In [None]:
# Narrow the frame to the tweet id, the cleaned text, and the column that
# holds the cluster label selected by `cluster_count`.
reduced_data = twitter_data.loc[:, ["id", "clntxt", cluster_count]]

# Preview the first few rows of the reduced frame.
reduced_data.head()

In [None]:
# function to tokenize text
from pprint import pprint
import string
import nltk
nltk.download('stopwords')
import nltk.tokenize
from nltk.corpus import stopwords
from wordcloud import WordCloud
import matplotlib.pyplot as plt
%matplotlib inline

# Extra tokens to drop in addition to the NLTK English stop words.
custom_filter_words = [
    "nan",
    "hey",
    "baj",
    "wowway",
    "though",
    "even",
    "gaye",
    "u",
    "guys",
]
# English stop-word list from the NLTK corpus, held as a set for membership tests.
stop_words = set(stopwords.words("english"))

# Single tokenizer instance reused by every df_tokenize call.
token_maker = nltk.tokenize.TweetTokenizer()


# Built once rather than on every row: maps each ASCII punctuation character to
# None so that str.translate() deletes it.
_PUNCTUATION_TABLE = {ord(char): None for char in string.punctuation}


def df_tokenize(text):
    """Turn one row of tweet text into a list of lower-cased, filtered tokens.

    Parameters
    ----------
    text : pandas.Series
        A row as handed over by ``DataFrame.apply(..., axis=1)``. Every cell in
        the row is converted with ``str`` and the pieces are joined by spaces.

    Returns
    -------
    list of str
        Tokens produced by ``token_maker`` with the entries of ``stop_words``
        and ``custom_filter_words`` removed.
    """
    # Join the raw cell values instead of calling Series.to_string(): that
    # method formats values for display and shortens any string longer than
    # pandas' display.max_colwidth option (50 characters by default), which
    # cut the end off longer tweets before they were tokenized.
    raw_text = " ".join(str(value) for value in text)

    # Punctuation is stripped before tokenizing, so "#" and "@" are removed as
    # well and hashtags/mentions are counted as plain words.
    tweet_text = raw_text.translate(_PUNCTUATION_TABLE).lower()
    tokens = token_maker.tokenize(tweet_text)

    # The text is already lower-cased, so tokens can be compared directly.
    # Drop NLTK stop words and the extra filter words in a single pass.
    return [
        w
        for w in tokens
        if w not in stop_words and w not in custom_filter_words
    ]


# Per-cluster token frequency tables, keyed "cluster_0", "cluster_1", ...
sorted_tokens = {}

# Operate one cluster at a time. The loop variable is deliberately not named
# `iter`: at notebook level that would leave the builtin iter() shadowed by an
# int for the rest of the kernel session.
for cluster_id in range(int(cluster_count)):
    # Rows whose label in the `cluster_count` column equals this cluster id
    cluster_n = reduced_data[reduced_data[cluster_count] == cluster_id]
    sorted_tokens[f"cluster_{cluster_id}"] = (
        # df_tokenize works on a row, hence the one-column frame and axis=1
        cluster_n[["clntxt"]]
        .apply(df_tokenize, axis=1)
        # one token per row; rows that produced no tokens become NaN
        .explode()
        # counts depend only on the values, and NaN is dropped by default
        .value_counts()
        .rename_axis("token")
        .reset_index(name="frequency")
        .sort_values(["frequency"], ascending=False)
    )


In [None]:
# Visualizations: one word cloud per cluster, with word size driven by how
# often each token occurs in that cluster.
for cluster_idx, (key, token_table) in enumerate(sorted_tokens.items()):
    # To inspect the ten most frequent tokens as a table instead:
    #     pprint(token_table.head(10))

    # Feed the counts to WordCloud directly. Joining the unique tokens into a
    # text and calling generate() makes every token appear exactly once, so the
    # frequencies computed earlier would not influence the picture.
    frequencies = token_table.set_index("token")["frequency"].to_dict()
    if not frequencies:
        # WordCloud raises ValueError when given zero words; skip such clusters
        print(f"Cluster {cluster_idx}: no tokens to plot")
        continue

    wordcloud = WordCloud(
        width=800, height=800, background_color='white'
    ).generate_from_frequencies(frequencies)
    plt.figure(figsize=(8, 8), facecolor=None)
    plt.title(f"Cluster {cluster_idx}")
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.tight_layout(pad=0)

plt.show()