# Load and clean up data

In [1]:
import sanitization


# Read the CSV file and extract the text data
# Downloaded from https://ciphix.io/ai/data.csv
with open('ciphix NLP/untranslated_data.csv', 'r') as file:
    lines_ = file.readlines()

## temporarily use less than the full data set for faster computation
lines = lines_[0:len(lines_) // 10]
#lines = lines_
print(len(lines))

264435


In [2]:
tokens = sanitization.sanitize_tokenize(lines)

[nltk_data] Downloading package stopwords to /home/tk/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/tk/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     /home/tk/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


In [3]:
import pickle

with open('ciphix NLP/tokens.pickle', 'wb') as file:
    pickle.dump(tokens, file)

In [1]:
import pickle

with open('ciphix NLP/tokens.pickle', 'rb') as file:
    tokens = pickle.load(file)

# Specificity score

Make a score for each word based on how often it occurs in general speech, i.e. outside our current dataset

In [22]:
# Make a global corpus from Brown

import nltk
import gensim.models
from nltk.corpus import brown

try:
    nltk.data.find('brown')
except LookupError:
    nltk.download('brown')

global_tokens = sanitization.sanitize_tokenize(brown.words()) # Use brown corpus as single tweet

# Add random twitter data to the curpus

import csv

# Read the CSV file and extract the text data
lines = []
#  Downloaded from https://www.kaggle.com/datasets/kazanova/sentiment140/discussion/60512
with open('ciphix NLP/training.1600000.processed.noemoticon.csv', 'r', encoding='ISO-8859-1') as file:
    reader = csv.reader(file)
    for line in reader:
        lines.append(line[5])  # Column 5 is the actual tweet
        
rnd_twitter_tokens = sanitization.sanitize_tokenize(lines) 

global_tokens.extend(rnd_twitter_tokens)

# Add our tokens to the set so we don't get problems with unknown tokens
global_tokens.extend(tokens)

[nltk_data] Downloading package brown to /home/tk/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package stopwords to /home/tk/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/tk/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     /home/tk/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!
[nltk_data] Downloading package stopwords to /home/tk/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/tk/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     /home/tk/nltk_data...
[nltk_data]   Pack

In [23]:
import pickle

with open('ciphix NLP/global_tokens.pickle', 'wb') as file:
    pickle.dump(global_tokens, file)

In [2]:
import pickle

with open('ciphix NLP/global_tokens.pickle', 'rb') as file:
    global_tokens = pickle.load(file)

In [24]:
from nltk.probability import FreqDist

# Create a frequency distribution of all words from a different context then our current dataset
global_fdist = FreqDist()
for line in global_tokens:
    for word in line:
        global_fdist[word] += 1
        

In [None]:
import csv

assert(len(global_fdist))  # Only save when we have data

with open('ciphix NLP/freqs.csv', 'w') as file:
    writer = csv.writer(file)
    for word in global_fdist:
        writer.writerow([word, global_fdist.freq(word)])


In [4]:
import csv

# Create mapping from words to their frequency in 'normal language'
word2freq_dict = {}

with open('ciphix NLP/freqs.csv') as csvfile:
    reader = csv.reader(csvfile)
    for [word, freq] in reader:
        word2freq_dict[word] = float(freq)
        
assert(len(word2freq_dict))  # We should have loaded actual data
    

In [92]:

from nltk.probability import FreqDist

local_fdist = FreqDist()
for line in tokens:
    for word in line:
        local_fdist[word] += 1

In [5]:
from functools import reduce 

global_size = reduce(lambda count, l: count + len(l), global_tokens, 0)

local_size = reduce(lambda count, l: count + len(l), tokens, 0)

def word2freq(word):
    try: 
        return global_fdist[word]
        #return local_fdist[word]
    except KeyError:  # word didn't occur in our dataset
        return 1. / global_size

# Latent Dirichlet Allocation

In [6]:
from gensim import corpora, models

num_topics = 100
num_top_topics = 10
num_passes = 3

# Create a dictionary of the tokens
dictionary = corpora.Dictionary(tokens)

# Create a corpus of the tokens
corpus = [dictionary.doc2bow(line) for line in tokens]

# Train an LDA model on the corpus
lda_model = models.ldamodel.LdaModel(corpus=corpus,
                                     id2word=dictionary,
                                     num_topics=num_topics,
                                     passes=num_passes,
                                     alpha='auto',
                                     random_state=42
                                    )

# Print the most prevalent topics discussed in the text data
lda_model.print_topics(num_topics=num_top_topics, num_words=8)
    
# Findings:
#  Sometimes gives boorito, sometimes not, depending on random_state,`
#  even though I think that term is highly specific and would be a good candidate for a topic.

KeyboardInterrupt: 

In [35]:
from gensim.test.utils import datapath
from gensim import corpora
import os

# Save model to disk.
file = datapath(os.getcwd() + "/ciphix NLP/lda_model.lda")
lda_model.save(file)


file = datapath(os.getcwd() + "/ciphix NLP/dictionary.pickle")
dictionary.save(file)

In [5]:
from gensim.test.utils import datapath
from gensim import corpora
from gensim.models import LdaModel
import os

# Load model from disk.
file = datapath(os.getcwd() + "/ciphix NLP/lda_model.lda")
lda_model = LdaModel.load(file)


file = datapath(os.getcwd() + "/ciphix NLP/dictionary.pickle")
dictionary = corpora.Dictionary.load(file)

In [17]:
# Find topic best matching a tweet
def best_topic(line_tokens, lda_model):
    topic_probs = lda_model.get_document_topics(dictionary.doc2bow(line_tokens))
    return max(topic_probs, key=lambda pair: pair[1])

# Find tweet best matching a topic
def best_representatives(lda_model, num_tweets_to_check=100000):
    first_topic_probs = lda_model.get_document_topics(bow = dictionary.doc2bow(tokens[0]))
    best_topic_probs = { topic_id : (-1, 0) for topic_id in range(num_topics) }
    for i in range(num_tweets_to_check):
        line_tokens = tokens[i]
        bow = dictionary.doc2bow(line_tokens)
        topic_probs = lda_model.get_document_topics(bow)
        for topic_id, topic_prob in topic_probs:
            if topic_prob > best_topic_probs[topic_id][1]:
                best_topic_probs[topic_id] = (i, topic_prob)
    return best_topic_probs
    

representative_tweets = best_representatives(lda_model)

{0: (-1, 0),
 1: (1286, 0.35301313),
 2: (1940, 0.30972847),
 3: (-1, 0),
 4: (1692, 0.23932157),
 5: (644, 0.28705502),
 6: (-1, 0),
 7: (-1, 0),
 8: (-1, 0),
 9: (558, 0.36593404),
 10: (1694, 0.27029648),
 11: (-1, 0),
 12: (1904, 0.26082495),
 13: (1734, 0.3254792),
 14: (1866, 0.33220857),
 15: (1659, 0.28760242),
 16: (1641, 0.25386384),
 17: (-1, 0),
 18: (1765, 0.2545081),
 19: (17321, 0.1721748),
 20: (1688, 0.23882107),
 21: (234, 0.36048797),
 22: (641, 0.30806866),
 23: (86007, 0.22822103),
 24: (1398, 0.4296613),
 25: (1753, 0.34911054),
 26: (319, 0.38230428),
 27: (1695, 0.14855479),
 28: (-1, 0),
 29: (181, 0.21644239),
 30: (-1, 0),
 31: (1032, 0.44334382),
 32: (1650, 0.38540772),
 33: (626, 0.32067284),
 34: (1503, 0.39601025),
 35: (1776, 0.27324727),
 36: (-1, 0),
 37: (480, 0.27711806),
 38: (1047, 0.23139562),
 39: (-1, 0),
 40: (-1, 0),
 41: (1306, 0.3041109),
 42: (1660, 0.28152856),
 43: (120, 0.34525943),
 44: (-1, 0),
 45: (219, 0.23801398),
 46: (1165, 0.24

In [None]:
lda_model.get_document_topics(bow = dictionary.doc2bow(tokens[0]))

In [12]:
lda_model.show_topics()

[(338,
  '0.000*"radiate" + 0.000*"unitedstates" + 0.000*"eliza" + 0.000*"takeofftips" + 0.000*"kevingray" + 0.000*"uncarrierexperience" + 0.000*"fitnessmotivation" + 0.000*"behealthy" + 0.000*"fitfam" + 0.000*"outwards"'),
 (317,
  '0.000*"radiate" + 0.000*"unitedstates" + 0.000*"eliza" + 0.000*"takeofftips" + 0.000*"kevingray" + 0.000*"uncarrierexperience" + 0.000*"fitnessmotivation" + 0.000*"behealthy" + 0.000*"fitfam" + 0.000*"outwards"'),
 (302,
  '0.000*"radiate" + 0.000*"unitedstates" + 0.000*"eliza" + 0.000*"takeofftips" + 0.000*"kevingray" + 0.000*"uncarrierexperience" + 0.000*"fitnessmotivation" + 0.000*"behealthy" + 0.000*"fitfam" + 0.000*"outwards"'),
 (691,
  '0.000*"radiate" + 0.000*"unitedstates" + 0.000*"eliza" + 0.000*"takeofftips" + 0.000*"kevingray" + 0.000*"uncarrierexperience" + 0.000*"fitnessmotivation" + 0.000*"behealthy" + 0.000*"fitfam" + 0.000*"outwards"'),
 (539,
  '0.000*"radiate" + 0.000*"unitedstates" + 0.000*"eliza" + 0.000*"takeofftips" + 0.000*"kevingra

In [34]:
import math

num_topic_words = 10

def get_score(word, prob):
    freq = word2freq(word)
    if freq < 1. / local_size * 5:
        return 0
    return prob**2 / freq

def print_topic(topic_index, model):
    '''
    Print topics most specific keywords (i.e. P(word|model) / P(word_global) or other specificity score)
    '''
    topic = model.get_topics()[topic_index]
    weighted = []
    for i, prob in enumerate(topic):
        word = dictionary[i]
        #weighted.append((word, prob / word2freq(word), prob))
        #weighted.append((word, prob / (word2freq(word) - math.log2(word2freq(word))), prob))
        #weighted.append((word, prob**2 / word2freq(word), prob))
        weighted.append((word, get_score(word, prob), prob))
    weighted.sort(key=lambda pair: pair[1], reverse=True)
    print(f"Topic {topic_index}:")
    top_keywords = [f"{word} ({prob*100:.2f}%)" for word, weight, prob in weighted[0:num_topic_words]]
    print(', '.join(top_keywords))

# Get topics as list of tuples of word and probability
topics = lda_model.show_topics(num_top_topics + 4, num_words=1, formatted=False)

for topic_index, topic in topics:
    print("")
    print_topic(topic_index, lda_model)
    print(lda_model.print_topic(topic_index))
    best_line_idx, prob = representative_tweets[topic_index]
    if best_line_idx >= 0:
        print(lines[best_line_idx])
        assert(best_topic(tokens[best_line_idx], lda_model)[0] == topic_index)



Topic 3:
unawa (0.00%), speedwaybrick (0.00%), uttranscript (0.00%), bobus (0.00%), warde (0.00%), reallifestruggles (0.00%), inbond (0.00%), looreen (0.00%), aahhrrgh (0.00%), lyingdriver (0.00%)
0.000*"ledsham" + 0.000*"vilicent" + 0.000*"needtogethere" + 0.000*"deltacomfortplus" + 0.000*"nicefollowup" + 0.000*"roostiruez" + 0.000*"normie" + 0.000*"shitier" + 0.000*"carrioncrow" + 0.000*"freeandfabulous"

Topic 83:
unawa (0.00%), speedwaybrick (0.00%), uttranscript (0.00%), bobus (0.00%), warde (0.00%), reallifestruggles (0.00%), inbond (0.00%), looreen (0.00%), aahhrrgh (0.00%), lyingdriver (0.00%)
0.000*"ledsham" + 0.000*"vilicent" + 0.000*"needtogethere" + 0.000*"deltacomfortplus" + 0.000*"nicefollowup" + 0.000*"roostiruez" + 0.000*"normie" + 0.000*"shitier" + 0.000*"carrioncrow" + 0.000*"freeandfabulous"

Topic 77:
unawa (0.00%), speedwaybrick (0.00%), uttranscript (0.00%), bobus (0.00%), warde (0.00%), reallifestruggles (0.00%), inbond (0.00%), looreen (0.00%), aahhrrgh (0.00%)

In [None]:
# Update LDA with new documents

lda_model.update(corpus, chunksize=None, decay=None, offset=None, passes=None, update_every=None, eval_every=None, iterations=None, gamma_threshold=None, chunks_as_numpy=False)


In [75]:
# Classify new doc
from gensim.models.ldamodel import LdaModel
from gensim.corpora import Dictionary
import sanitization

line = lines[865]

# Preprocess the new document
[new_doc] = sanitization.sanitize_tokenize([line])

# Get the topic distribution for the new document
doc_topics = lda_model.get_document_topics(dictionary.doc2bow(new_doc))

# Sort the topic distribution in descending order of probability
doc_topics.sort(key=lambda x: x[1], reverse=True)

# Get the index of the topic with the highest probability
most_likely_topic = doc_topics[0][0]

print(line)
print_topic(most_likely_topic)
lda_model.print_topic(most_likely_topic)

they killed off a character on one of my favorite shows and now i'm upset 
Topic 19:
option (11.4%), charge (11.5%), report (8.6%), fee (4.3%), problem (13.3%), case (6.5%), gift (2.4%), great (14.2%), one (23.0%), bar (0.8%)


[nltk_data] Downloading package stopwords to /home/tk/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/tk/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     /home/tk/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


'0.230*"one" + 0.142*"great" + 0.133*"problem" + 0.115*"charge" + 0.114*"option" + 0.086*"report" + 0.065*"case" + 0.043*"fee" + 0.024*"gift" + 0.015*"job"'

### Findings

With too many topics, multiple topics collapse to the same one.
This is not overfitting, but a limitation of LDA's.

# Extract bigrams; word pairs which often go together


In [5]:

import itertools

from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder

words = list(itertools.chain.from_iterable(tokens))  # concatenate list of lists into single list

# Find the most common bigrams in the text
bigram_measures = BigramAssocMeasures()
finder = BigramCollocationFinder.from_words(words)
finder.apply_freq_filter(3)
bigrams = finder.nbest(bigram_measures.raw_freq, 40)

# Print the extracted terminology
print("Terminology:")
print(bigrams)


Terminology:
[('let', 'know'), ('email', 'address'), ('sorry', 'hear'), ('please', 'send'), ('customer', 'service'), ('take', 'look'), ('please', 'let'), ('direct', 'message'), ('help', 'send'), ('phone', 'number'), ('happy', 'help'), ('would', 'like'), ('help', 'please'), ('please', 'contact'), ('please', 'follow'), ('send', 'email'), ('send', 'note'), ('make', 'sure'), ('please', 'check'), ('look', 'like'), ('please', 'reach'), ('happy', 'halloween'), ('please', 'help'), ('need', 'help'), ('closer', 'look'), ('take', 'closer'), ('account', 'email'), ('anything', 'else'), ('could', 'help'), ('look', 'please'), ('need', 'assistance'), ('team', 'connect'), ('thanks', 'reaching'), ('contact', 'number'), ('hear', 'please'), ('please', 'provide'), ('confirmation', 'number'), ('could', 'please'), ('get', 'back'), ('help', 'hi')]


### Findings:
- Bigrams extract couplings of words which might be a better candidate for a topic than a single word
- However, it's implementation work for minimal improvement
- Leaving this for now.

# Test of TF-IDF

In [43]:


from sklearn.feature_extraction.text import TfidfVectorizer
import nltk.corpus

stop_words = set(nltk.corpus.stopwords.words('english'))

# Create the TF-IDF vectorizer
vectorizer = TfidfVectorizer(stop_words=list(stop_words))

# Fit the vectorizer to the documents and transform the documents into TF-IDF vectors
tfidf_vectors = vectorizer.fit_transform(lines)

import scipy 

inverse_voc = dict((v, k) for k, v in vectorizer.vocabulary_.items())

# Print the first 5 TF-IDF vectors
for i in range(5):
    print(f"Tweet {i+1}: \"{' '.join(tokens[i])}\" TF-IDF Vector:\n")
    cx = scipy.sparse.coo_matrix(tfidf_vectors[i])
    for i, j, v in zip(cx.row, cx.col, cx.data):
        print(inverse_voc[j], v)
    print('\n')

# tfidf_vectors[i] is a sparse matrix where only the words in one tweet get a score

Tweet 1: "understand would like assist would need get private secure link assist" TF-IDF Vector:

link 0.2163770131669878
secured 0.4076803360531025
private 0.3254862045213924
get 0.16785541829098044
need 0.19876920172398266
assist 0.42952594902547303
like 0.1856933517828438
would 0.4012626207478553
understand 0.25915693398943357
115712 0.4076803360531025


Tweet 2: "propose" TF-IDF Vector:

propose 0.8626924156014288
sprintcare 0.5057289749102494


Tweet 3: "send several private message one respond usual" TF-IDF Vector:

usual 0.435252488121658
responding 0.393306048272026
one 0.2386736509698984
messages 0.36480363903287266
several 0.3780932738873274
sent 0.2712929625413191
sprintcare 0.3249438025495913
private 0.37904610162742797


Tweet 4: "please send us private message far assist click message top profile" TF-IDF Vector:

profile 0.36789227341541386
top 0.3305398565133689
click 0.31304490062567364
message 0.47713555571699484
us 0.14152121450761193
send 0.19574471761291398
please 0

# Weighted K-means clustering on word2vecs weighted by occurrence/global frequency or TF-IDF score
Problems:
- Doesn't take word co-occurrence into account
- If we take the TF-IDF per tweet then longer tweets which often contain 'please help' will get lower scores,
because the term freqs are lower because tweets are longer and because those words occur often, even though they are significant
- Longer tweets get significantly lower scores. TF-IDF isn't good for separate tweets.
- We don't have a proper way to represent the cluster by a single topic or just a couple of terms.

In general we suffer from the problem that in order to distinguish current trends from usual we need data on what usual looks like. I've tried to fix this by comparing the tweets word frequency against a global word frequency obtained from the Brown corpus. However, that corpus is not really representative for twitter data.


In [None]:
from gensim.models import Word2Vec

# Define the Word2Vec model parameters
word2vec = Word2Vec(tokens, vector_size=100, window=15, min_count=1, workers=4)
word2vec.save("word2vec.model")

> Use the extended token set which includes a large corpus of Brown. Use the cell below instead!

In [None]:
from gensim.models import Word2Vec

word2vec = Word2Vec(global_tokens, vector_size=256, window=15, min_count=1, workers=4)
word2vec.save("word2vec.model")

In [None]:
from gensim.models import Word2Vec

word2vec = Word2Vec.load("word2vec.model")

In [27]:
def word2weight(word):
    return 1. / (word2freq(word) + 1e-10)

## Main part of the method using TF-IDF

In [3]:


from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

import itertools

from gensim.models import Word2Vec
import gensim.downloader

import numpy as np

words = list(itertools.chain.from_iterable(tokens))  # concatenate list of lists into single list

# Create the TF-IDF vectorizer and Fit it to the all tweets as one document
stop_words = set()  # Stop words have already been filtered out from tokens!
vectorizer = TfidfVectorizer(stop_words=list(stop_words))
tfidf_vectors = vectorizer.fit_transform([' '.join(words)])  # Join all tweets into single doc
#tfidf_vectors = vectorizer.fit_transform([' '.join(token) for token in tokens]) 

# Get the vocabulary and the TF-IDF abs
weights = np.squeeze(tfidf_vectors.toarray())

vocabulary = vectorizer.vocabulary_
word_vecs = [word2vec.wv[word] for word in vocabulary.keys()]

# Define the number of clusters
num_clusters = 10

# Create the KMeans object and fit it to the TF-IDF vectors
kmeans = KMeans(n_clusters=num_clusters)
kmeans.fit(word_vecs, sample_weight=weights)

# Get the cluster labels and the cluster centers
labels = kmeans.labels_
centers = kmeans.cluster_centers_

from sklearn.neighbors import NearestNeighbors
neigh = NearestNeighbors(n_neighbors=10)
neigh.fit(word_vecs)

# Print the top words of each cluster, weighted by their TF-IDF score
for i, center in enumerate(centers):
    print("Cluster", i+1, "Top Words:")
    [dists], [indices] = neigh.kneighbors([center])
    for word_index in indices:
        word_weighted_score = weights[word_index]
        word = word2vec.wv.index_to_key[word_index]
        print(f"\t{word} (score: {word_weighted_score:.2e})")
        

NameError: name 'word2vec' is not defined

## Main part of the method using specificity score based on occurrence/global_frequency


In [35]:

from sklearn.cluster import KMeans



from nltk.probability import FreqDist

twitter_fdist = FreqDist()
for line in tokens:
    for word in line:
        twitter_fdist[word] += 1

words = [word for word in twitter_fdist]

weights = [word2weight(word) * fdist[word] for word in words]

word_vecs = [word2vec.wv[word] for word in words]

# Define the number of clusters
num_clusters = 10

# Create the KMeans object and fit it to the TF-IDF vectors
kmeans = KMeans(n_clusters=num_clusters)
kmeans.fit(word_vecs, sample_weight=weights)

# Get the cluster labels and the cluster centers
labels = kmeans.labels_
centers = kmeans.cluster_centers_

from sklearn.neighbors import NearestNeighbors
neigh = NearestNeighbors(n_neighbors=50)
neigh.fit(word_vecs)

# Print the top words of each cluster, weighted by their specificity score
for i, center in enumerate(centers):
    print("Cluster", i+1, "Top Words:")
    [neighbor_indices] = neigh.kneighbors([center], return_distance=False)
    sorted_neighbors = sorted(neighbor_indices, key=lambda i: - weights[i])
    for word_index in sorted_neighbors[0:10]:  # get the 10 nearest neighbors with highest specificity scores
        word_weighted_score = weights[word_index]
        word = words[word_index]
        print(f"\t{word} (score: {word_weighted_score:.2e})")
        


# Findings:
- Resulting clusters seem kinda random.
- Perhaps the word2vec mapping is a highly discontinuous space, meaning that the clusters span over multiple subjects.
- Perhaps the clusters span a wide area of space and the words closest to the cluster center aren't good representatives of the set.
- Perhaps the word2vec mapping just isn't that great; the idea is that similar / co-occurring words are mapped closer together,
but it doesn't say anything about that unrelated words should be farther apart. There's a relevance cutoff beyond which 
the distance between mapped words doesn't mmean anything anymore.
- Problem: this technique doesn't take significance into account. Any random word may happen to be at the center of a cluster even if it's not used often.

# GAN with a Dirichlet latent space
- Create a generator which converts points sampled from a Dirichlet distribution into random tweets.
- Create a discriminator which predicts whether a given tweet is real.
- Provide discriminator with real samples using word2vec as input.
- Provide discriminator with fake samples from the generator.
- Train network, including parameters of the Dirichlet distribution.

### Feature extraction
- Evaluate corners of the latent space, and optimize input location for the realness score of the discriminator, together with a penalty for longer tweets.
- Compare all thusly obtained iconic tweets based on their realness score.
- Take the top 10 as most prevalent topics.

# Hierarchical Dirichlet Process 


In [None]:
from gensim.models import HdpModel

# Train HDP model
hdp_model = HdpModel(corpus, id2word=dictionary)

In [None]:
# Print topics
for topic in hdp_model.show_topics():
    print(topic)

In [107]:
# Get topics as list of tuples of word and probability
topics = hdp_model.show_topics(num_top_topics, num_words=0, formatted=False)

for topic_index, topic in topics:
    print_topic(topic_index, hdp_model)
    print(hdp_model.print_topic(topic_index))

Topic 0:
us (1.94%), please (1.83%), order (0.80%), amazon (0.44%), help (1.14%), service (0.71%), contact (0.53%), delivery (0.42%), send (0.82%), hi (0.86%)
0.019*"us" + 0.018*"please" + 0.012*"get" + 0.011*"help" + 0.009*"sorry" + 0.009*"hi" + 0.008*"send" + 0.008*"order" + 0.007*"know" + 0.007*"service"
Topic 1:
página (0.01%), happyoctober (0.01%), uncleanly (0.01%), sacramentoriver (0.01%), mentiona (0.01%), sportspak (0.01%), borjhar (0.01%), pancakesammmiright (0.01%), illogan (0.01%), alyhey (0.01%)
0.011*"us" + 0.010*"please" + 0.008*"get" + 0.007*"help" + 0.005*"hi" + 0.005*"thanks" + 0.005*"sorry" + 0.004*"send" + 0.004*"look" + 0.004*"know"
Topic 2:
justcomcastthings (0.01%), kanno (0.01%), toomanysecondchances (0.01%), uthiru (0.01%), goingbacktoandroid (0.01%), otaylakesrd (0.01%), wuste (0.01%), zealandallblacks (0.01%), nocompensation (0.01%), صار (0.01%)
0.010*"us" + 0.010*"please" + 0.008*"get" + 0.007*"help" + 0.005*"hi" + 0.005*"thanks" + 0.004*"sorry" + 0.004*"sen

# Findings

### How to display topics of LDA-type models?
LDA type models give $P(word|topic)$, which inherently favors the display of more common words.

We can use Bayes rule to get $P(topic|word)=P(word|topic)*P(topic)/P(word)$.
(Note that ordering words by $P(topic|word)=P(word|topic)/P(word)$ for each topic is equivalent)
This makes sense, because we would want to see the words which are most significant for that topic.
However, that inherently favors the display of super uncommon words which happen to be present more in those specific topics. 
In NLP overfitting is a bigger problem because a large amount of posts have unique tokens in them.

We could use some other formula to order to words for a topic, like 

$$\frac{P(word|topic)}{P(word)^2 - log(P(word))} $$

That would discourage very uncommon words from being displayed, while favoring less common words over more common words.
However, this seems rather contrived.
