In [157]:
import pandas as pd

import regex as re

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

import gensim
from gensim.corpora import Dictionary
from gensim.models.ldamulticore import LdaMulticore
from gensim.models.coherencemodel import CoherenceModel

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD



from pprint import pprint

In [3]:
full = pd.read_csv('Data/Data-Cleaned/Tesla-Cleaned-113k')

In [14]:
# full

In [5]:
full['tweets'] = full['0']
full = full.drop(columns = '0')
full['tweets'] = full['tweets'].astype('string')
full.dropna(inplace = True)

In [6]:
sia = SentimentIntensityAnalyzer()

In [7]:
comp_dic = {}
for i, tweet in full['tweets'].iteritems():
    comp_dic[i] = sia.polarity_scores(tweet)['compound']

comp_scores = pd.Series(comp_dic, name = 'sentiment')
df_sent = full.merge(comp_scores, left_index = True, right_index = True)
df = df_sent.set_index('Unnamed: 0')

In [24]:
df['tokens'] = [i.split() for i in df['tweets']]

In [25]:
type(df['tokens'].iloc[0])

list

In [154]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 113409 entries, 1527025892458999814 to 1527731676310388737
Data columns (total 3 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   tweets     113409 non-null  string 
 1   sentiment  113409 non-null  float64
 2   tokens     113409 non-null  object 
dtypes: float64(1), object(1), string(1)
memory usage: 7.5+ MB


In [152]:
df.head()

Unnamed: 0_level_0,tweets,sentiment,tokens
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1527025892458999814,solar grid run computer radio fridge entire ya...,0.0,"[solar, grid, run, computer, radio, fridge, en..."
1527025892031086593,boycott tesla teslastock elonsproblems vote blue,-0.3182,"[boycott, tesla, teslastock, elonsproblems, vo..."
1527025884690993153,tesla never survive carbon offset implement de...,0.296,"[tesla, never, survive, carbon, offset, implem..."
1527025878974271489,funny fanbase take profit sell tesla,0.7003,"[funny, fanbase, take, profit, sell, tesla]"
1527025852008976384,right nonsense position find sort hard believe...,0.6124,"[right, nonsense, position, find, sort, hard, ..."


# Creating Baseline Below =============================================

### Create Gensim Dictionary object
#### Is a value count of all unique tokens

In [212]:
id2word = Dictionary(df['tokens'])
print(len(id2word)) 

41913


###  Filtering Extremes using Gensim 

    **CAN BE CHANGED/ IMPROVED**
    



In [213]:
# id2word.filter_extremes(no_below=4, no_above=.95) #Original -- 12656
id2word.filter_extremes(no_below=6, no_above=.90)
print(len(id2word))

9967


In [214]:
corpus = [id2word.doc2bow(d) for d in df['tokens']]

## LdaMulticore -------------------------

In [223]:
# Instantiating a Base LDA model
base_model = LdaMulticore(corpus=corpus, num_topics=5, id2word=id2word, workers=12, passes=5)

In [224]:
# Filtering for words 
words = [re.findall(r'"([^"]*)"',t[1]) for t in base_model.print_topics()]


In [225]:
# Create Topics
topics = [' '.join(t[0:10]) for t in words]

In [226]:
# Getting the topics
for id, t in enumerate(topics): 
    print(f"------ Topic {id + 1} ------")
    print(t, end="\n\n")

------ Topic 1 ------
tesla car musk elon people door know say make worker

------ Topic 2 ------
tesla musk elon amp esg p twitter index say bot

------ Topic 3 ------
tesla amp get model enter fire go year month worth

------ Topic 4 ------
tesla stock buy twitter price go elon get share sell

------ Topic 5 ------
car tesla electric buy make get ev go vehicle battery



In [227]:
# Compute Perplexity
# a measure of how good the model is. lower the better
base_perplexity = base_model.log_perplexity(corpus)
print('\nPerplexity: ', base_perplexity) 



Perplexity:  -7.204121021376377


In [228]:
# Compute Coherence Score
coherence_model = CoherenceModel(model=base_model, texts=df['tokens'], 
                                   dictionary=id2word, coherence='c_v')
coherence_lda_model_base = coherence_model.get_coherence()
print('\nCoherence Score: ', coherence_lda_model_base)


Coherence Score:  0.32503549774936147


In [229]:
#Creating Topic Distance Visualization 
pyLDAvis.enable_notebook()

In [230]:
gensimvis.prepare(base_model, corpus, id2word)

  by='saliency', ascending=False).head(R).drop('saliency', 1)


# Finish Baseline Model ==================================

# ====================================================================

#### Attempting a GridSearch on the LDA model

In [158]:
vectorizer = CountVectorizer()
data_vectorized = vectorizer.fit_transform(df['tweets'])

In [161]:
# CV Param
search_params = {'n_components': [5, 10, 15, 20, 25, 30], 'learning_decay': [.5, .7, .9], 'batch_size':[128, 256]}

# Init Model
lda = LatentDirichletAllocation()

# Init Grid Search Class
model = GridSearchCV(lda, cv = 5, param_grid=search_params, n_jobs = -1)

# Grid Search
model.fit(data_vectorized)

In [163]:
# Best Model
best_lda_model = model.best_estimator_

# Model Parameters
print("Best Model's Params: ", model.best_params_)

# Log Likelihood Score
print("Best Log Likelihood Score: ", model.best_score_)

# Perplexity
print("Model Perplexity: ", best_lda_model.perplexity(data_vectorized))

Best Model's Params:  {'batch_size': 128, 'learning_decay': 0.5, 'n_components': 5}
Best Log Likelihood Score:  -2296728.4523857282
Model Perplexity:  1781.5712854487754


In [None]:
best_lda_model

In [None]:
GridSearchCV(cv=None, error_score='raise',
             estimator=LatentDirichletAllocation(batch_size=128, 
                                                 doc_topic_prior=None,
                                                 evaluate_every=-1, 
                                                 learning_decay=0.7, 
                                                 learning_method=None,
                                                 learning_offset=10.0, 
                                                 max_doc_update_iter=100, 
                                                 max_iter=10,
                                                 mean_change_tol=0.001, 
                                                 n_components=10, 
                                                 n_jobs=1,
                                                 perp_tol=0.1, 
                                                 random_state=None,
                                                 topic_word_prior=None, 
                                                 total_samples=1000000.0, 
                                                 verbose=0),
             n_jobs=-2,
             param_grid={'n_topics': [10, 15, 20, 30], 
                         'learning_decay': [0.5, 0.7, 0.9]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
             scoring=None, verbose=0)