# Libraries

In [1]:
import tqdm
import spacy
import numpy as np
import regex as re
import pandas as pd
import seaborn as sns

from gsdmm import MovieGroupProcess

import gensim
# from gensim.models import wrappers
# from gensim.models.wrappers import LdaMallet
import gensim.corpora as corpora
from gensim.corpora import dictionary
from gensim.utils import simple_preprocess
from gensim.models.ldamulticore import LdaMulticore
from gensim.models.coherencemodel import CoherenceModel

import nltk
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 
warnings.filterwarnings("ignore", category=FutureWarning)


# Think these can be removed
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Importing -- Fixing columns

In [2]:
full = pd.read_csv('Data/Data-Cleaned/238k-Uncleaned')

In [3]:
# Renaming tweets column, dropping unnamed column, making tweets strings

full['tweets'] = full['0']
full.reset_index(inplace = True)
full = full.drop(columns = ['0', 'Unnamed: 0', 'index'])
# eh
full.drop_duplicates(inplace = True, ignore_index = True)

# Preprocessing ---- will need to test different preprocessing later

In [24]:
# Function for Gensim simple preprocessor --- 
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations
        
# STOPWORDS ==========================================
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words]for doc in texts]
# Stop Words list -- Can be Edited
stop_words = stopwords.words('english')
stop_words.extend(['tesla', 'c,', 'x', 't', 'p', 'amp', 'car',
                   'get', 'go', 'use', 'elon', 'musk', 'elon_musk'])
wnl = WordNetLemmatizer()

# BIGRAMS===============================================
def make_bigrams(texts):
    return bigram_mod[texts]

# TRIGRAMS =============================================
def make_trigrams(texts):
    return trigram_mod[bigram_mod[texts]]

# LEMMATIZING ==========================================
def pos_tagger(nltk_tag):
    if nltk_tag.startswith('J'):
        return wordnet.ADJ
    elif nltk_tag.startswith('V'):
        return wordnet.VERB
    elif nltk_tag.startswith('N'):
        return wordnet.NOUN
    elif nltk_tag.startswith('R'):
        return wordnet.ADV
    else:         
        return None    
def lemmatize_texts(tweet):
     for i in tweet:
        word = (map(lambda x: (x[0], pos_tagger(x[1])), pos_tag(tweet))) 
        word_lem = " ".join([wnl.lemmatize(x[0], x[1]) for x in word if x[1] is not None])
        return word_lem

In [25]:
# remove punctuation and lowercase all
full['preprep'] = full['tweets'].map(lambda x: re.sub('[,\.!?]', '', x.lower()))
#  Removing @ handles, links-- strip whitespace breaks and tabs
full['preprep'] = full['preprep'].map(lambda x: re.sub(r"@\w+|http\S+", "", x).strip()\
                                      .replace("\r", "").replace("\n", "").replace("\t", ""))

In [26]:
# Creates a list of all tweets in full.preprep
data = full.preprep.values.tolist()

# Uses gensims simple preprocessor on all tweets in list
# Ouputs list of lists of tokenized tweets
data_words = list(sent_to_words(data))

In [27]:
# Bigram ---
# higher hyperparameter values =  fewer phrases.
bigram = gensim.models.Phrases(data_words, min_count=2, threshold=20) # orig -- 3/ 75
bigram_mod = gensim.models.phrases.Phraser(bigram)

# TriGrams --
trigram = gensim.models.Phrases(bigram[data_words], threshold=60)
trigram_mod = gensim.models.phrases.Phraser(trigram)

In [28]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)
# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)
# lemmatize
data_lemmatized = [lemmatize_texts(tweet) for tweet in data_words_bigrams]

In [29]:
print(len(data_words), len(data_lemmatized))

126510 126510


In [30]:
# Getting indices of all nones
indices_of_nones = [(i) for i,x  in enumerate(data_lemmatized) if x == None]
len(indices_of_nones)

384

In [31]:
# Dropping None values using indices aquired above
lem_drop_None = [(i, x) for (i, x) in enumerate(data_lemmatized) if i not in indices_of_nones]
len(lem_drop_None)

126126

In [32]:
# Getting indices of all short tweets
indices_short_tweets = [i for (i,x) in lem_drop_None if (len(x.split(' ')) <= 5)]
len(indices_short_tweets)

38518

In [33]:
lem_noShort_noNone = [(i, x) for (i, x) in lem_drop_None if i not in indices_short_tweets]
len(lem_noShort_noNone)

87608

### The Rosetta Stone

In [52]:
print((lem_noNone[8654]), 
       '\n', (lem_noShort_noNone[8654]), 
      '\n',(full.preprep.iloc[13374]))

['count', 'stack', 'racial', 'lawsuit', 'fill', 'twitter', 'mostly', 'emotion', 'speculator'] 
 (13374, 'count stack racial lawsuit fill twitter mostly emotion speculator') 
 all you have to do is count the the stack of racial lawsuits filled against tesla  twitter is mostly emotion and speculators


In [35]:
lem_noNone = [x.split() for (i, x) in lem_noShort_noNone]

In [36]:
id2word = corpora.Dictionary(lem_noNone)
print(len(id2word)) 

62499


In [37]:
# This can be Tuned --- and probably should be
id2word.filter_extremes(no_below=6, no_above=.95)
print(len(id2word))

13597


In [38]:
corpus = [id2word.doc2bow(d) for d in lem_noNone]

# Modeling

#### ============================================================================

In [41]:
lem_tokens = pd.Series(lem_noNone)

In [43]:
gsdmm = MovieGroupProcess(K=5, alpha=0.01, beta=0.01, n_iters=15)
docs = lem_tokens.to_numpy()
vocab_length = len(id2word)
# fit GSDMM model
y = gsdmm.fit(docs, vocab_length)

In stage 0: transferred 61023 clusters with 5 clusters populated
In stage 1: transferred 37813 clusters with 5 clusters populated
In stage 2: transferred 28601 clusters with 5 clusters populated
In stage 3: transferred 21685 clusters with 5 clusters populated
In stage 4: transferred 16948 clusters with 5 clusters populated
In stage 5: transferred 14216 clusters with 5 clusters populated
In stage 6: transferred 12400 clusters with 5 clusters populated
In stage 7: transferred 11050 clusters with 5 clusters populated
In stage 8: transferred 10385 clusters with 5 clusters populated
In stage 9: transferred 9841 clusters with 5 clusters populated
In stage 10: transferred 9313 clusters with 5 clusters populated
In stage 11: transferred 9068 clusters with 5 clusters populated
In stage 12: transferred 8562 clusters with 5 clusters populated
In stage 13: transferred 8363 clusters with 5 clusters populated
In stage 14: transferred 7968 clusters with 5 clusters populated


In [44]:
doc_count = np.array(gsdmm.cluster_doc_count)
print('Number of documents per topic :', doc_count)

# Topics sorted by the number of document they are allocated to
top_index = doc_count.argsort()[-15:][::-1]
print('Most important clusters (by number of docs inside):', top_index)

# define function to get top words per topic
def top_words(cluster_word_distribution, top_cluster, values):
    for cluster in top_cluster:
        sort_dicts = sorted(cluster_word_distribution[cluster].items(), key=lambda k: k[1], reverse=True)[:values]
        print("\nCluster %s : %s"%(cluster, sort_dicts))

# get top words in topics
top_words(gsdmm.cluster_word_distribution, top_index, 7)

Number of documents per topic : [ 3381 32089  5774 27287 19077]
Most important clusters (by number of docs inside): [1 3 4 2 0]

Cluster 1 : [('stock', 11182), ('buy', 6305), ('twitter', 4953), ('make', 3362), ('company', 3346), ('think', 2970), ('go', 2929)]

Cluster 3 : [('car', 5450), ('make', 3848), ('electric_car', 3794), ('buy', 3651), ('ev', 3522), ('electric', 3362), ('battery', 2463)]

Cluster 4 : [('car', 2319), ('people', 1842), ('know', 1622), ('make', 1555), ('say', 1546), ('think', 1401), ('drive', 1344)]

Cluster 2 : [('stock', 858), ('tsla', 782), ('esg_index', 584), ('say', 462), ('twitter', 388), ('elonmusk', 381), ('news', 369)]

Cluster 0 : [('spot', 1203), ('enter', 1165), ('free', 900), ('find', 896), ('information_including', 892), ('c', 892), ('st_may', 891)]


In [23]:
# V1
# With elon-musk-elon_musk

doc_count = np.array(gsdmm.cluster_doc_count)
print('Number of documents per topic :', doc_count)

# Topics sorted by the number of document they are allocated to
top_index = doc_count.argsort()[-15:][::-1]
print('Most important clusters (by number of docs inside):', top_index)

# define function to get top words per topic
def top_words(cluster_word_distribution, top_cluster, values):
    for cluster in top_cluster:
        sort_dicts = sorted(cluster_word_distribution[cluster].items(), key=lambda k: k[1], reverse=True)[:values]
        print("\nCluster %s : %s"%(cluster, sort_dicts))

# get top words in topics
top_words(gsdmm.cluster_word_distribution, top_index, 7)

Number of documents per topic : [30002 16964  3551  4927 33469]
Most important clusters (by number of docs inside): [4 0 1 3 2]

Cluster 4 : [('buy', 6128), ('elon', 4229), ('make', 4058), ('people', 3930), ('company', 3542), ('musk', 3526), ('think', 3298)]

Cluster 0 : [('car', 6101), ('make', 3421), ('electric', 2929), ('ev', 2915), ('battery', 2528), ('electric_car', 2446), ('buy', 2367)]

Cluster 1 : [('stock', 9662), ('twitter', 3946), ('buy', 2873), ('musk', 2353), ('price', 1935), ('elon', 1919), ('sell', 1842)]

Cluster 3 : [('spot', 1243), ('elon_musk', 918), ('esg_index', 780), ('teslaradar', 619), ('motor', 595), ('model_dual', 592), ('unitedstates_model', 473)]

Cluster 2 : [('enter', 1193), ('free', 947), ('find', 935), ('c', 896), ('st_may', 893), ('information_including', 892), ('month_prize', 670)]


# =============================================================================