In [4]:
import sanitization


# Read the CSV file and extract the text data
with open('ciphix NLP/translated_data.csv', 'r') as file:
    text_ = file.readlines()

## temporarily use less than the full data set for faster computation
lines = text_[0:len(text_) // 10]
len(lines)

tokens = sanitization.sanitize_tokenize(lines)

[nltk_data] Downloading package stopwords to /home/tk/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
for word in ['hello', 'you', 'to', 'why', 'in', 'is', 'was', 'go', 'say', 'yes']:
    print(word, word in stop_words)

hello False
you True
to True
why True
in True
is True
was True
go False
say False
yes False


In [5]:
from gensim import corpora, models

# Create a dictionary of the tokens
dictionary = corpora.Dictionary(tokens)

# Create a corpus of the tokens
corpus = [dictionary.doc2bow(token) for token in tokens]

# Train an LDA model on the corpus
lda_model = models.ldamodel.LdaModel(corpus=corpus,
                                     id2word=dictionary,
                                     num_topics=20,
                                     passes=40,
                                     alpha='auto',
                                     random_state=42)

# Print the most prevalent topics discussed in the text data
topics = lda_model.print_topics(num_words=9)
for topic in topics:
    print(topic)
    
# Findings:
#  sometimes gives boorito, sometimes not, depending on random_state

(0, '0.074*"time" + 0.039*"delivery" + 0.029*"available" + 0.027*"pay" + 0.023*"show" + 0.019*"every" + 0.018*"first" + 0.017*"ty" + 0.016*"home"')
(1, '0.037*"store" + 0.037*"connect" + 0.031*"currently" + 0.026*"io" + 0.022*"connection" + 0.022*"set" + 0.017*"leave" + 0.016*"local" + 0.015*"im"')
(2, '0.138*"thanks" + 0.043*"info" + 0.030*"hi" + 0.029*"feedback" + 0.028*"update" + 0.023*"working" + 0.023*"appreciate" + 0.021*"sure" + 0.019*"soon"')
(3, '0.036*"offer" + 0.036*"point" + 0.029*"another" + 0.025*"nothing" + 0.023*"stopping" + 0.022*"told" + 0.020*"stop" + 0.020*"get" + 0.020*"cost"')
(4, '0.046*"issue" + 0.041*"halloween" + 0.037*"happy" + 0.028*"hello" + 0.027*"would" + 0.026*"assistance" + 0.024*"sorry" + 0.021*"inconvenience" + 0.021*"idea"')
(5, '0.087*"u" + 0.071*"help" + 0.058*"know" + 0.053*"let" + 0.041*"please" + 0.036*"sorry" + 0.030*"hear" + 0.028*"hey" + 0.018*"like"')
(6, '0.084*"please" + 0.076*"u" + 0.054*"look" + 0.035*"email" + 0.033*"address" + 0.030*"s

In [5]:
# Extract bigrams; word pairs which often go together

import itertools

from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder

words = list(itertools.chain.from_iterable(tokens))  # concatenate list of lists into single list

# Find the most common bigrams in the text
bigram_measures = BigramAssocMeasures()
finder = BigramCollocationFinder.from_words(words)
finder.apply_freq_filter(3)
bigrams = finder.nbest(bigram_measures.raw_freq, 40)

# Print the extracted terminology
print("Terminology:")
print(bigrams)

Terminology:
[('let', 'know'), ('email', 'address'), ('sorry', 'hear'), ('please', 'send'), ('customer', 'service'), ('take', 'look'), ('please', 'let'), ('direct', 'message'), ('help', 'send'), ('phone', 'number'), ('happy', 'help'), ('would', 'like'), ('help', 'please'), ('please', 'contact'), ('please', 'follow'), ('send', 'email'), ('send', 'note'), ('make', 'sure'), ('please', 'check'), ('look', 'like'), ('please', 'reach'), ('happy', 'halloween'), ('please', 'help'), ('need', 'help'), ('closer', 'look'), ('take', 'closer'), ('account', 'email'), ('anything', 'else'), ('could', 'help'), ('look', 'please'), ('need', 'assistance'), ('team', 'connect'), ('thanks', 'reaching'), ('contact', 'number'), ('hear', 'please'), ('please', 'provide'), ('confirmation', 'number'), ('could', 'please'), ('get', 'back'), ('help', 'hi')]


In [38]:
from sklearn.feature_extraction.text import TfidfVectorizer


# Create the TF-IDF vectorizer
vectorizer = TfidfVectorizer(stop_words=list(stop_words))

# Fit the vectorizer to the documents and transform the documents into TF-IDF vectors
tfidf_vectors = vectorizer.fit_transform(text)

import scipy 

inverse_voc = dict((v, k) for k, v in vectorizer.vocabulary_.items())

# Print the TF-IDF vectors
for i in range(5):
    print(f"Tweet {i+1}: \"{' '.join(tokens[i])}\" TF-IDF Vector:\n")
    cx = scipy.sparse.coo_matrix(tfidf_vectors[i])
    for i, j, v in zip(cx.row, cx.col, cx.data):
        print(inverse_voc[j], v)
    print('\n')

Tweet 1: "understand would like assist would need get private secured link assist" TF-IDF Vector:

link 0.23162194562021757
secured 0.3689624733283365
private 0.3118269598581018
get 0.18865171494823785
need 0.21390621720424144
assist 0.47064827907513135
like 0.19525074664453232
would 0.4258537746485734
understand 0.28632450217497574
115712 0.32965248487891047


Tweet 2: "propose" TF-IDF Vector:

propose 0.8149003051737236
sprintcare 0.5796011496087392


Tweet 3: "sent several private message one responding usual" TF-IDF Vector:

usual 0.4389380444777185
responding 0.391855012156571
one 0.25182262048069975
messages 0.37203513688161577
several 0.37203513688161577
sent 0.2793632468586064
sprintcare 0.3282217801891854
private 0.35745596027636406


Tweet 4: "please send private message assist click message top profile" TF-IDF Vector:

profile 0.35595395911300093
top 0.3498938268461945
click 0.3350921077350238
message 0.49954705339067396
us 0.15265592540593806
send 0.21662029096312777
please

In [None]:
from gensim.models import Word2Vec

# Get pre-trained Word2Vec transformation to the input words
word2vec = gensim.downloader.load('word2vec-google-news-300')

In [76]:
from gensim.models import Word2Vec

# Define the Word2Vec model parameters
word2vec = Word2Vec(tokens, min_count=1)

In [59]:
# Weighted K-means clustering on word2vecs weighted by TF-IDF score

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

import itertools

from gensim.models import Word2Vec
import gensim.downloader

import numpy as np

words = list(itertools.chain.from_iterable(tokens))  # concatenate list of lists into single list


# Create the TF-IDF vectorizer and Fit it to the all tweets as one document
vectorizer = TfidfVectorizer(stop_words=list(stop_words))
tfidf_vectors = vectorizer.fit_transform([' '.join(words)])

# Get the vocabulary and the TF-IDF weights
vocabulary = vectorizer.vocabulary_
weights = np.squeeze(tfidf_vectors.toarray())


word_vecs = [word2vec.wv[word] for word in vocabulary.keys()]

# Define the number of clusters
num_clusters = 10

# Create the KMeans object and fit it to the TF-IDF vectors
kmeans = KMeans(n_clusters=num_clusters)
kmeans.fit(word_vecs, sample_weight=weights)

# Get the cluster labels and the cluster centers
labels = kmeans.labels_
centers = kmeans.cluster_centers_

from sklearn.neighbors import NearestNeighbors
neigh = NearestNeighbors(n_neighbors=10)
neigh.fit(word_vecs)

# Print the top words of each cluster, weighted by their TF-IDF score
for i, center in enumerate(centers):
    print("Cluster", i+1, "Top Words:")
    [dists], [indices] = neigh.kneighbors([center])
    for word_index in indices:
        word_weighted_score = weights[word_index]
        word = word2vec.wv.index_to_key[word_index]
        print(f"\t{word} (score: {word_weighted_score:.2e})")

In [None]:
labels

In [None]:
for line in tokens[0:30]:
    print(' '.join(line))

In [None]:
for i, line in enumerate(tokens):
    if 'wa' in line:
        print(text[i])
        print(' '.join(line))

In [None]:
lemmatizer.lemmatize('was')

In [None]:
vectorizer.fit_transform(["you wasted wasted"]).toarray()