# Load and clean up data

In [1]:
import sanitization


# Read the CSV file and extract the text data
with open('ciphix NLP/translated_data.csv', 'r') as file:
    lines_ = file.readlines()

## temporarily use less than the full data set for faster computation
# lines = lines_[0:len(lines_) // 10]
lines = lines_
print(len(lines))

tokens = sanitization.sanitize_tokenize(lines)

262466


[nltk_data] Downloading package stopwords to /home/tk/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/tk/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     /home/tk/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


# Specificity score

Make a score for each word based on how often it occurs in general speech, i.e. outside our current dataset

In [3]:
# Make a global corpus from Brown

import nltk
import gensim.models
from nltk.corpus import brown

try:
    nltk.data.find('brown')
except LookupError:
    nltk.download('brown')

global_tokens = sanitization.sanitize_tokenize(brown.words()) # Use brown corpus as single tweet

[nltk_data] Downloading package brown to /home/tk/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package stopwords to /home/tk/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/tk/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     /home/tk/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


In [4]:
# Add random twitter data to the curpus

import csv

# Read the CSV file and extract the text data
lines = []
#  Downloaded from https://www.kaggle.com/datasets/kazanova/sentiment140/discussion/60512
with open('ciphix NLP/training.1600000.processed.noemoticon.csv', 'r', encoding='ISO-8859-1') as file:
    reader = csv.reader(file)
    for line in reader:
        lines.append(line[5])  # Column 5 is the actual tweet
        
rnd_twitter_tokens = sanitization.sanitize_tokenize(lines) 

global_tokens.extend(rnd_twitter_tokens)

[nltk_data] Downloading package stopwords to /home/tk/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/tk/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     /home/tk/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


In [5]:
# Add our tokens to the set so we don't get problems with unknown tokens
global_tokens.extend(tokens)

In [7]:
from nltk.probability import FreqDist

# Create a frequency distribution of all words from a different context then our current dataset
global_fdist = FreqDist()
for line in global_tokens:
    for word in line:
        global_fdist[word] += 1
        

In [8]:
import csv

with open('ciphix NLP/freqs.csv', 'w') as file:
    writer = csv.writer(file)
    for word in global_fdist:
        writer.writerow([word, global_fdist.freq(word)])

import csv

In [32]:
# Create mapping from words to their frequency in 'normal language'
word2freq_dict = {}

with open('ciphix NLP/freqs.csv') as csvfile:
    reader = csv.reader(csvfile)
    for [word, freq] in reader:
        word2freq_dict[word] = float(freq)
    
def word2freq(word):
    try: 
        return word2freq_dict[word]
    except KeyError:  # word didn't occur in our dataset
        return 1. / len(word2freq_dict)

# Latent Dirichlet Allocation

In [2]:
from gensim import corpora, models

num_topics = 20
num_top_topics = 10
num_topic_words = 10

# Create a dictionary of the tokens
dictionary = corpora.Dictionary(tokens)

# Create a corpus of the tokens
corpus = [dictionary.doc2bow(line) for line in tokens]

# Train an LDA model on the corpus
lda_model = models.ldamodel.LdaModel(corpus=corpus,
                                     id2word=dictionary,
                                     num_topics=num_topics,
                                     passes=40,
                                     alpha='auto',
                                     random_state=42)

# Print the most prevalent topics discussed in the text data
lda_model.print_topics(num_topics=num_top_topics, num_words=8)
    
# Findings:
#  Sometimes gives boorito, sometimes not, depending on random_state,`
#  even though I think that term is highly specific and would be a good candidate for a topic.

[(17,
  '0.096*"delay" + 0.074*"feedback" + 0.062*"much" + 0.054*"appreciate" + 0.047*"pass" + 0.036*"bag" + 0.033*"airport" + 0.029*"travel"'),
 (3,
  '0.077*"card" + 0.071*"anything" + 0.058*"game" + 0.040*"ever" + 0.039*"play" + 0.039*"end" + 0.038*"else" + 0.038*"credit"'),
 (2,
  '0.202*"team" + 0.109*"support" + 0.076*"connect" + 0.054*"touch" + 0.053*"report" + 0.041*"case" + 0.040*"note" + 0.031*"fill"'),
 (5,
  '0.096*"ca" + 0.082*"change" + 0.074*"guy" + 0.058*"error" + 0.045*"code" + 0.036*"log" + 0.034*"battery" + 0.030*"expect"'),
 (11,
  '0.083*"way" + 0.055*"free" + 0.048*"think" + 0.047*"feel" + 0.043*"someone" + 0.039*"something" + 0.038*"many" + 0.029*"everything"'),
 (12,
  '0.157*"work" + 0.093*"app" + 0.055*"show" + 0.051*"device" + 0.036*"fly" + 0.032*"seem" + 0.029*"possible" + 0.024*"currently"'),
 (13,
  '0.162*"service" + 0.118*"customer" + 0.099*"call" + 0.039*"offer" + 0.036*"package" + 0.031*"care" + 0.029*"people" + 0.025*"wrong"'),
 (8,
  '0.132*"time" + 

In [31]:
# Get topics as list of tuples of word and probability
topics = lda_model.show_topics(num_top_topics, num_words=len(dictionary), formatted=False)

# Weight topic word probabilities by specificity
for i, (topic_index, topic) in enumerate(topics):
    weighted = [(word, prob / word2freq(word)) for word, prob in topic]
    weighted.sort(key=lambda pair: pair[1], reverse=True)
    print(f"Topic {i}:")
    top_keywords = [word for word, weight in weighted[0:num_topic_words]]
    print(', '.join(top_keywords))


Topic 0:
rebooking, guaranteed, downgrade, joanne, baggage, agent, feedback, delay, subscription, glitching
Topic 1:
dmed, tracking, autopay, spotify, faulty, verification, fortnite, rider, unlock, ensure
Topic 2:
godaddy, fraud, digit, hardwired, connect, chorizo, unhelpful, team, cancellation, needful
Topic 3:
divert, amzl, nrc, queso, instruction, flt, urgently, transaction, securely, hurricane
Topic 4:
gpu, hassle, gamertag, fraudulent, uninstalling, psn, reinstall, lhr, resolution, complaint
Topic 5:
fios, deactivate, cc, installation, deduction, device, patrickullmann, deactivation, app, refer
Topic 6:
usps, misleading, rep, airline, jfk, customer, preferred, package, service, fare
Topic 7:
deplane, rebook, lte, troubleshoot, notification, azhelp, doorbell, enable, october, constantly
Topic 8:
aateam, amazon, deliver, cox, delivery, uber, prime, driver, ubereats, incompetent
Topic 9:
glitchy, cust, kindly, detail, confirmation, provide, assistance, carrier, assist, reach


# Extract bigrams; word pairs which often go together


In [5]:

import itertools

from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder

words = list(itertools.chain.from_iterable(tokens))  # concatenate list of lists into single list

# Find the most common bigrams in the text
bigram_measures = BigramAssocMeasures()
finder = BigramCollocationFinder.from_words(words)
finder.apply_freq_filter(3)
bigrams = finder.nbest(bigram_measures.raw_freq, 40)

# Print the extracted terminology
print("Terminology:")
print(bigrams)


Terminology:
[('let', 'know'), ('email', 'address'), ('sorry', 'hear'), ('please', 'send'), ('customer', 'service'), ('take', 'look'), ('please', 'let'), ('direct', 'message'), ('help', 'send'), ('phone', 'number'), ('happy', 'help'), ('would', 'like'), ('help', 'please'), ('please', 'contact'), ('please', 'follow'), ('send', 'email'), ('send', 'note'), ('make', 'sure'), ('please', 'check'), ('look', 'like'), ('please', 'reach'), ('happy', 'halloween'), ('please', 'help'), ('need', 'help'), ('closer', 'look'), ('take', 'closer'), ('account', 'email'), ('anything', 'else'), ('could', 'help'), ('look', 'please'), ('need', 'assistance'), ('team', 'connect'), ('thanks', 'reaching'), ('contact', 'number'), ('hear', 'please'), ('please', 'provide'), ('confirmation', 'number'), ('could', 'please'), ('get', 'back'), ('help', 'hi')]


### Findings:
- Bigrams extract couplings of words which might be a better candidate for a topic than a single word
- However, it's implementation work for minimal improvement
- Leaving this for now.

# Test of TF-IDF

In [43]:


from sklearn.feature_extraction.text import TfidfVectorizer
import nltk.corpus

stop_words = set(nltk.corpus.stopwords.words('english'))

# Create the TF-IDF vectorizer
vectorizer = TfidfVectorizer(stop_words=list(stop_words))

# Fit the vectorizer to the documents and transform the documents into TF-IDF vectors
tfidf_vectors = vectorizer.fit_transform(lines)

import scipy 

inverse_voc = dict((v, k) for k, v in vectorizer.vocabulary_.items())

# Print the first 5 TF-IDF vectors
for i in range(5):
    print(f"Tweet {i+1}: \"{' '.join(tokens[i])}\" TF-IDF Vector:\n")
    cx = scipy.sparse.coo_matrix(tfidf_vectors[i])
    for i, j, v in zip(cx.row, cx.col, cx.data):
        print(inverse_voc[j], v)
    print('\n')

# tfidf_vectors[i] is a sparse matrix where only the words in one tweet get a score

Tweet 1: "understand would like assist would need get private secure link assist" TF-IDF Vector:

link 0.2163770131669878
secured 0.4076803360531025
private 0.3254862045213924
get 0.16785541829098044
need 0.19876920172398266
assist 0.42952594902547303
like 0.1856933517828438
would 0.4012626207478553
understand 0.25915693398943357
115712 0.4076803360531025


Tweet 2: "propose" TF-IDF Vector:

propose 0.8626924156014288
sprintcare 0.5057289749102494


Tweet 3: "send several private message one respond usual" TF-IDF Vector:

usual 0.435252488121658
responding 0.393306048272026
one 0.2386736509698984
messages 0.36480363903287266
several 0.3780932738873274
sent 0.2712929625413191
sprintcare 0.3249438025495913
private 0.37904610162742797


Tweet 4: "please send us private message far assist click message top profile" TF-IDF Vector:

profile 0.36789227341541386
top 0.3305398565133689
click 0.31304490062567364
message 0.47713555571699484
us 0.14152121450761193
send 0.19574471761291398
please 0

# Weighted K-means clustering on word2vecs weighted by occurrence/global frequency or TF-IDF score
Problems:
- Doesn't take word co-occurrence into account
- If we take the TF-IDF per tweet then longer tweets which often contain 'please help' will get lower scores,
because the term freqs are lower because tweets are longer and because those words occur often, even though they are significant
- Longer tweets get significantly lower scores. TF-IDF isn't good for separate tweets.
- We don't have a proper way to represent the cluster by a single topic or just a couple of terms.

In general we suffer from the problem that in order to distinguish current trends from usual we need data on what usual looks like. I've tried to fix this by comparing the tweets word frequency against a global word frequency obtained from the Brown corpus. However, that corpus is not really representative for twitter data.


In [None]:
from gensim.models import Word2Vec

# Define the Word2Vec model parameters
word2vec = Word2Vec(tokens, vector_size=100, window=15, min_count=1, workers=4)
word2vec.save("word2vec.model")

> Use the extended token set which includes a large corpus of Brown. Use the cell below instead!

In [None]:
from gensim.models import Word2Vec

word2vec = Word2Vec(global_tokens, vector_size=256, window=15, min_count=1, workers=4)
word2vec.save("word2vec.model")

In [None]:
from gensim.models import Word2Vec

word2vec = Word2Vec.load("word2vec.model")

In [27]:
def word2weight(word):
    return 1. / (global_fdist[word] + 1e-10)

## Main part of the method using TF-IDF

In [3]:


from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

import itertools

from gensim.models import Word2Vec
import gensim.downloader

import numpy as np

words = list(itertools.chain.from_iterable(tokens))  # concatenate list of lists into single list

# Create the TF-IDF vectorizer and Fit it to the all tweets as one document
stop_words = set()  # Stop words have already been filtered out from tokens!
vectorizer = TfidfVectorizer(stop_words=list(stop_words))
tfidf_vectors = vectorizer.fit_transform([' '.join(words)])  # Join all tweets into single doc
#tfidf_vectors = vectorizer.fit_transform([' '.join(token) for token in tokens]) 

# Get the vocabulary and the TF-IDF abs
weights = np.squeeze(tfidf_vectors.toarray())

vocabulary = vectorizer.vocabulary_
word_vecs = [word2vec.wv[word] for word in vocabulary.keys()]

# Define the number of clusters
num_clusters = 10

# Create the KMeans object and fit it to the TF-IDF vectors
kmeans = KMeans(n_clusters=num_clusters)
kmeans.fit(word_vecs, sample_weight=weights)

# Get the cluster labels and the cluster centers
labels = kmeans.labels_
centers = kmeans.cluster_centers_

from sklearn.neighbors import NearestNeighbors
neigh = NearestNeighbors(n_neighbors=10)
neigh.fit(word_vecs)

# Print the top words of each cluster, weighted by their TF-IDF score
for i, center in enumerate(centers):
    print("Cluster", i+1, "Top Words:")
    [dists], [indices] = neigh.kneighbors([center])
    for word_index in indices:
        word_weighted_score = weights[word_index]
        word = word2vec.wv.index_to_key[word_index]
        print(f"\t{word} (score: {word_weighted_score:.2e})")
        

NameError: name 'word2vec' is not defined

## Main part of the method using specificity score based on occurrence/global_frequency


In [35]:

from sklearn.cluster import KMeans



from nltk.probability import FreqDist

twitter_fdist = FreqDist()
for line in tokens:
    for word in line:
        twitter_fdist[word] += 1

words = [word for word in twitter_fdist]

weights = [word2weight(word) * fdist[word] for word in words]

word_vecs = [word2vec.wv[word] for word in words]

# Define the number of clusters
num_clusters = 10

# Create the KMeans object and fit it to the TF-IDF vectors
kmeans = KMeans(n_clusters=num_clusters)
kmeans.fit(word_vecs, sample_weight=weights)

# Get the cluster labels and the cluster centers
labels = kmeans.labels_
centers = kmeans.cluster_centers_

from sklearn.neighbors import NearestNeighbors
neigh = NearestNeighbors(n_neighbors=50)
neigh.fit(word_vecs)

# Print the top words of each cluster, weighted by their specificity score
for i, center in enumerate(centers):
    print("Cluster", i+1, "Top Words:")
    [neighbor_indices] = neigh.kneighbors([center], return_distance=False)
    sorted_neighbors = sorted(neighbor_indices, key=lambda i: - weights[i])
    for word_index in sorted_neighbors[0:10]:  # get the 10 nearest neighbors with highest specificity scores
        word_weighted_score = weights[word_index]
        word = words[word_index]
        print(f"\t{word} (score: {word_weighted_score:.2e})")
        


# Findings:
- Resulting clusters seem kinda random.
- Perhaps the word2vec mapping is a highly discontinuous space, meaning that the clusters span over multiple subjects.
- Perhaps the clusters span a wide area of space and the words closest to the cluster center aren't good representatives of the set.
- Perhaps the word2vec mapping just isn't that great; the idea is that similar / co-occurring words are mapped closer together,
but it doesn't say anything about that unrelated words should be farther apart. There's a relevance cutoff beyond which 
the distance between mapped words doesn't mmean anything anymore.
- Problem: this technique doesn't take significance into account. Any random word may happen to be at the center of a cluster even if it's not used often.