# Libraries

In [1]:
import tqdm
import spacy
import numpy as np
import regex as re
import pandas as pd
import seaborn as sns

import plotly.graph_objects as go
import plotly.offline as pyo



import pyLDAvis
import pyLDAvis.gensim_models as gensim_models
import pyLDAvis.gensim_models as gensimvis

import gensim
# from gensim.models import wrappers
# from gensim.models.wrappers import LdaMallet
import gensim.corpora as corpora
from gensim.corpora import dictionary
from gensim.utils import simple_preprocess
from gensim.models.ldamulticore import LdaMulticore
from gensim.models.coherencemodel import CoherenceModel

import nltk
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 
warnings.filterwarnings("ignore", category=FutureWarning)


# Think these can be removed
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD


Please use `triu` from the `scipy.linalg` namespace, the `scipy.linalg.special_matrices` namespace is deprecated.



# Importing -- Fixing columns

In [2]:
full = pd.read_csv('Data/Data-Cleaned/238k-Uncleaned')

In [3]:
# Renaming tweets column, dropping unnamed column, making tweets strings

full['tweets'] = full['0']
full.reset_index(inplace = True)
full = full.drop(columns = ['0', 'Unnamed: 0', 'index'])
# eh
full.drop_duplicates(inplace = True, ignore_index = True)

# Preprocessing ---- will need to test different preprocessing later

In [150]:
# Function for Gensim simple preprocessor --- 
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations
        
# STOPWORDS ==========================================
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words]for doc in texts]
# Stop Words list -- Can be Edited
stop_words = stopwords.words('english')
stop_words.extend(['tesla', 'c,', 'x', 't', 'p', 'amp', 'car', 'get', 'go', 'use'])
wnl = WordNetLemmatizer()

# BIGRAMS===============================================
def make_bigrams(texts):
    return bigram_mod[texts]

# TRIGRAMS =============================================
def make_trigrams(texts):
    return trigram_mod[bigram_mod[texts]]

# LEMMATIZING ==========================================
def pos_tagger(nltk_tag):
    if nltk_tag.startswith('J'):
        return wordnet.ADJ
    elif nltk_tag.startswith('V'):
        return wordnet.VERB
    elif nltk_tag.startswith('N'):
        return wordnet.NOUN
    elif nltk_tag.startswith('R'):
        return wordnet.ADV
    else:         
        return None    
def lemmatize_texts(tweet):
     for i in tweet:
        word = (map(lambda x: (x[0], pos_tagger(x[1])), pos_tag(tweet))) 
        word_lem = " ".join([wnl.lemmatize(x[0], x[1]) for x in word if x[1] is not None])
        return word_lem

In [179]:
# remove punctuation and lowercase all
full['preprep'] = full['tweets'].map(lambda x: re.sub('[,\.!?]', '', x.lower()))
#  Removing @ handles, links-- strip whitespace breaks and tabs
full['preprep'] = full['preprep'].map(lambda x: re.sub(r"@\w+|http\S+", "", x).strip()\
                                      .replace("\r", "").replace("\n", "").replace("\t", ""))

In [153]:
# Creates a list of all tweets in full.preprep
data = full.preprep.values.tolist()

# Uses gensims simple preprocessor on all tweets in list
# Ouputs list of lists of tokenized tweets
data_words = list(sent_to_words(data))

In [154]:
# Bigram ---
# higher hyperparameter values =  fewer phrases.
bigram = gensim.models.Phrases(data_words, min_count=2, threshold=20) # orig -- 3/ 75
bigram_mod = gensim.models.phrases.Phraser(bigram)

# TriGrams --
trigram = gensim.models.Phrases(bigram[data_words], threshold=60)
trigram_mod = gensim.models.phrases.Phraser(trigram)

In [155]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)
# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)
# lemmatize
data_lemmatized = [lemmatize_texts(tweet) for tweet in data_words_bigrams]

In [156]:
print(len(data_words), len(data_lemmatized))

126510 126510


In [157]:
# Getting indices of all nones
indices_of_nones = [(i) for i,x  in enumerate(data_lemmatized) if x == None]
len(indices_of_nones)

370

In [158]:
# Dropping None values using indices aquired above
lem_drop_None = [(i, x) for (i, x) in enumerate(data_lemmatized) if i not in indices_of_nones]
len(lem_drop_None)

126140

In [159]:
# Getting indices of all short tweets
indices_short_tweets = [i for (i,x) in lem_drop_None if (len(x.split(' ')) <= 5)]
len(indices_short_tweets)

37227

In [160]:
lem_noShort_noNone = [(i, x) for (i, x) in lem_drop_None if i not in indices_short_tweets]
len(lem_noShort_noNone)

88913

### The Rosetta Stone

In [180]:
# The Same! 
print((lem_noShort_noNone[8654]), '\n', (full.preprep.iloc[13142]))

(13142, 'wonder elon antic go affect bottom_line conservative prefer fossil_fuel vehicle lot liberal buyer doubt looking_elsewhere electric vehicle') 
 i’ve been wondering how elon’s antics are going to affect tesla’s bottom line conservatives prefer fossil fuel vehicles and lots of liberal buyers are no doubt looking elsewhere for an electric vehicle


In [162]:
lem_noNone = [x.split() for (i, x) in lem_noShort_noNone]

In [163]:
id2word = corpora.Dictionary(lem_noNone)
print(len(id2word)) 

62603


In [164]:
# This can be Tuned --- and probably should be
id2word.filter_extremes(no_below=6, no_above=.95)
print(len(id2word))

13660


In [165]:
corpus = [id2word.doc2bow(d) for d in lem_noNone]

In [184]:
print((lem_noNone[8654], '\n\n\n', 
       lem_noShort_noNone[8654]), '\n\n',
      (full.preprep.iloc[13142]))

(['wonder', 'elon', 'antic', 'go', 'affect', 'bottom_line', 'conservative', 'prefer', 'fossil_fuel', 'vehicle', 'lot', 'liberal', 'buyer', 'doubt', 'looking_elsewhere', 'electric', 'vehicle'], '\n\n\n', (13142, 'wonder elon antic go affect bottom_line conservative prefer fossil_fuel vehicle lot liberal buyer doubt looking_elsewhere electric vehicle')) 

 i’ve been wondering how elon’s antics are going to affect tesla’s bottom line conservatives prefer fossil fuel vehicles and lots of liberal buyers are no doubt looking elsewhere for an electric vehicle


# Modeling

In [118]:
# Instantiating a Base LDA model
base_model = LdaMulticore(corpus=corpus,
                          num_topics=5,
                          id2word=id2word,
                          workers=7,
                          passes=5,
                          chunksize=2000,
                          random_state = 42)
words = [re.findall(r'"([^"]*)"',t[1]) for t in base_model.print_topics()]
topics = [' '.join(t[0:10]) for t in words]

for id, t in enumerate(topics): 
    print(f"------ Topic {id + 1} ------")
    print(t, end="\n\n")

------ Topic 1 ------
ev car make company year good spot get market many

------ Topic 2 ------
buy go people think make get stock company electric_car elon

------ Topic 3 ------
use get say go model know driver see time musk

------ Topic 4 ------
stock twitter musk price elon_musk elon buy tsla share make

------ Topic 5 ------
get electric fire car drive vehicle new buy think truck



In [119]:
# a measure of how good the model is. lower the better
base_perplexity = base_model.log_perplexity(corpus)

# Compute Coherence Score
coherence_model = CoherenceModel(model=base_model, texts=lem_noNone, 
                                   dictionary=id2word, coherence='c_v')

coherence_lda_model_base = coherence_model.get_coherence()
print('\nCoherence Score: ', coherence_lda_model_base, '\n\nPerplexity: ', base_perplexity)


Coherence Score:  0.2828890631617448 

Perplexity:  -7.868100484595867


In [120]:
pyLDAvis.enable_notebook()
gensimvis.prepare(base_model, corpus, id2word)

  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload


In [121]:
def format_topics_sentences(ldamodel=None, corpus=corpus, texts=data):
    # Init output
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row_list in enumerate(ldamodel[corpus]):
        row = row_list[0] if ldamodel.per_word_topics else row_list            
        # print(row)
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)



In [122]:
# Every Tweet Classified by Dominant Topic
df_topic_sents_keywords = format_topics_sentences(ldamodel = base_model, corpus = corpus, texts = lem_noNone)

In [123]:
sent_topics_Sorteddf = pd.DataFrame()
sent_topics_Outdf = df_topic_sents_keywords.groupby('Dominant_Topic')

for i, group in sent_topics_Outdf:
    sent_topics_Sorteddf = pd.concat([sent_topics_Sorteddf, 
                                             group.sort_values(['Perc_Contribution'], ascending=False).head(1)], 
                                            axis=0)
# Reset Index    
sent_topics_Sorteddf.reset_index(drop=True, inplace=True)
# Format
sent_topics_Sorteddf.columns = ['Topic_Num', "Topic_Perc_Contrib", "Keywords", "Representative Text"]



In [36]:
sent_topics_Sorteddf

Unnamed: 0,Topic_Num,Topic_Perc_Contrib,Keywords,Representative Text
0,0,0.9744,"musk, elon, twitter, say, buy, stock, ev, car,...","[follow, money, build, obama, ev, tax, benefit..."
1,1,0.972,"get, people, elon, buy, think, company, know, ...","[know, billionaire, help, many, ppl, world, tr..."
2,2,0.9744,"car, ev, make, battery, year, go, model, elect...","[price, power, matter, ability, sell, evs, pro..."
3,3,0.973,"stock, musk, elon, twitter, buy, price, go, ts...","[high, volume, option, alert, day, come, follo..."
4,4,0.9741,"car, electric, get, buy, drive, people, make, ...","[government, also, fine, company, price, gouge..."


In [None]:
# Topic Distributions VISUALIZATION
topic_dist = pd.Series(df_topic_sents_keywords.Dominant_Topic, dtype = 'category')
sns.countplot(topic_dist)

In [None]:
def original_tweet_sentiment(cleaned_to_index, original, cleaned_to_topic):
    comp_dic = {}
    sia = SentimentIntensityAnalyzer()
    
    for i in range(len(cleaned_to_index)):
        ind = cleaned_to_index[i][0]
        tweet = original.iloc[ind]
        comp_dic[i] = tweet, sia.polarity_scores(tweet)['compound']
        
    comp_scores = pd.DataFrame(comp_dic).T
    comp_scores.columns = ['Original_tweet', 'Sentiment']
    
    df_sent = cleaned_to_topic.merge(comp_scores, left_index = True, right_index = True)
    
    return df_sent

In [None]:
df_sent = original_tweet_sentiment(lem_noShort_noNone, full['tweets'],df_topic_sents_keywords )
df_sent

In [None]:
grp_sent = df_sent.Sentiment.groupby(by = df_sent['Dominant_Topic']).mean()
grp_sent

#### ============================================================================

In [166]:
lem_tokens = pd.Series(lem_noNone)

In [167]:
lem_tokens

0        [solar, grid, run, computer, radio, fridge, en...
1        [never, survive, carbon_offsets, implement, de...
2        [right, nonsense, position, find, sort, hard, ...
3        [ford, pe, ration, pe_ratio, tsla, still, long...
4        [think, back, time, work, employee, brag, work...
                               ...                        
88908    [yes, beg, borrow, steal, invest, tsla, world,...
88909    [owh, rethink, thesis, always_thought, regulat...
88910    [overused, example, elon_musk, alienate, almos...
88911    [found, spacex, successful, also, arguably, in...
88912    [want, ev, table, musk, show, world, ten, year...
Length: 88913, dtype: object

In [168]:
from gsdmm import MovieGroupProcess

In [185]:
gsdmm = MovieGroupProcess(K=5, alpha=0.01, beta=0.01, n_iters=15)
docs = lem_tokens.to_numpy()
vocab_length = len(id2word)
# fit GSDMM model
y = gsdmm.fit(docs, vocab_length)

In stage 0: transferred 62187 clusters with 5 clusters populated
In stage 1: transferred 37740 clusters with 5 clusters populated
In stage 2: transferred 28365 clusters with 5 clusters populated
In stage 3: transferred 21427 clusters with 5 clusters populated
In stage 4: transferred 16268 clusters with 5 clusters populated
In stage 5: transferred 12192 clusters with 5 clusters populated
In stage 6: transferred 9160 clusters with 5 clusters populated
In stage 7: transferred 7594 clusters with 5 clusters populated
In stage 8: transferred 6777 clusters with 5 clusters populated
In stage 9: transferred 6315 clusters with 5 clusters populated
In stage 10: transferred 5920 clusters with 5 clusters populated
In stage 11: transferred 5741 clusters with 5 clusters populated
In stage 12: transferred 5580 clusters with 5 clusters populated
In stage 13: transferred 5521 clusters with 5 clusters populated
In stage 14: transferred 5530 clusters with 5 clusters populated


In [174]:
# V2 BETTER THAN V1
# K=6, alpha=0.3, beta=0.6, n_iters=15

doc_count = np.array(gsdmm.cluster_doc_count)
print('Number of documents per topic :', doc_count)

# Topics sorted by the number of document they are allocated to
top_index = doc_count.argsort()[-15:][::-1]
print('Most important clusters (by number of docs inside):', top_index)

# define function to get top words per topic
def top_words(cluster_word_distribution, top_cluster, values):
    for cluster in top_cluster:
        sort_dicts = sorted(cluster_word_distribution[cluster].items(), key=lambda k: k[1], reverse=True)[:values]
        print("\nCluster %s : %s"%(cluster, sort_dicts))

# get top words in topics
top_words(gsdmm.cluster_word_distribution, top_index, 7)

Number of documents per topic : [25366 16880  6978 17173 11327 11189]
Most important clusters (by number of docs inside): [0 3 1 4 5 2]

Cluster 0 : [('buy', 5430), ('elon', 3648), ('people', 3435), ('make', 3080), ('musk', 2979), ('think', 2699), ('electric_car', 2486)]

Cluster 3 : [('stock', 10970), ('twitter', 4269), ('buy', 3251), ('musk', 2530), ('sell', 2020), ('price', 2016), ('elon', 1877)]

Cluster 1 : [('car', 3582), ('make', 2571), ('ev', 2566), ('company', 2289), ('year', 1700), ('buy', 1686), ('market', 1497)]

Cluster 4 : [('find', 1362), ('car', 1280), ('enter', 1202), ('free', 1121), ('fire', 1099), ('door', 1002), ('c', 896)]

Cluster 5 : [('car', 2112), ('electric_car', 1963), ('electric', 1942), ('battery', 1528), ('charge', 1519), ('spot', 1199), ('ev', 1106)]

Cluster 2 : [('elon_musk', 1803), ('esg_index', 1045), ('esg', 777), ('musk', 699), ('elonmusk', 538), ('say', 525), ('crash', 493)]


In [178]:
# K=5, alpha=0.3, beta=0.6, n_iters=15

doc_count = np.array(gsdmm.cluster_doc_count)
print('Number of documents per topic :', doc_count)

# Topics sorted by the number of document they are allocated to
top_index = doc_count.argsort()[-15:][::-1]
print('Most important clusters (by number of docs inside):', top_index)

# define function to get top words per topic
def top_words(cluster_word_distribution, top_cluster, values):
    for cluster in top_cluster:
        sort_dicts = sorted(cluster_word_distribution[cluster].items(), key=lambda k: k[1], reverse=True)[:values]
        print("\nCluster %s : %s"%(cluster, sort_dicts))

# get top words in topics
top_words(gsdmm.cluster_word_distribution, top_index, 7)

Number of documents per topic : [18530 23719 32164  9585  4915]
Most important clusters (by number of docs inside): [2 1 0 3 4]

Cluster 2 : [('buy', 6341), ('car', 5764), ('make', 4906), ('electric_car', 4767), ('people', 3802), ('ev', 3540), ('company', 3323)]

Cluster 1 : [('stock', 11746), ('twitter', 4886), ('buy', 4327), ('elon', 3135), ('musk', 3080), ('go', 2519), ('price', 2345)]

Cluster 0 : [('car', 2643), ('drive', 1531), ('model', 1499), ('make', 1399), ('know', 1323), ('fire', 1269), ('people', 1258)]

Cluster 3 : [('elon_musk', 2239), ('musk', 1425), ('esg_index', 1174), ('say', 1163), ('esg', 1136), ('company', 899), ('elon', 746)]

Cluster 4 : [('enter', 1190), ('find', 1013), ('free', 1001), ('st_may', 895), ('c', 894), ('information_including', 892), ('t', 671)]


# Below is the grid search the chunky one

### Here
## We 
# Go

In [None]:
# supporting function
def compute_coherence_values(corpus, dictionary, k, a, b):
    
    lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                           id2word=dictionary,
                                           num_topics=k, 
                                           random_state=100,
                                           chunksize=1500,
                                           passes=8,
                                           alpha=a,
                                           eta=b, 
                                           workers = 7)
    
    coherence_model_lda = CoherenceModel(model=lda_model, texts=lem_noNone, dictionary=id2word, coherence='c_v')
    
    return coherence_model_lda.get_coherence()

In [None]:
grid = {}
grid['Validation_Set'] = {}
# Topics range
min_topics = 3
max_topics = 9
step_size = 1
topics_range = range(min_topics, max_topics, step_size)
# Alpha parameter
alpha = list(np.arange(0.01, 1, 0.3))
alpha.append('symmetric')
alpha.append('asymmetric')
# Beta parameter
beta = list(np.arange(0.01, 1, 0.3))
beta.append('symmetric')
# Validation sets
num_of_docs = len(corpus)
corpus_sets = [#gensim.utils.ClippedCorpus(corpus, num_of_docs*0.25), 
               gensim.utils.ClippedCorpus(corpus, int(num_of_docs*0.5)), 
               gensim.utils.ClippedCorpus(corpus, int(num_of_docs*0.75)), 
               corpus]

corpus_title = ['50% Corpus', '75% Corpus', '100% Corpus']

model_results = {'Validation_Set': [],
                 'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }
# Can take a long time to run
if 1 == 1:
    pbar = tqdm.tqdm(total=540)
    
    # iterate through validation corpuses
    for i in range(len(corpus_sets)):
        # iterate through number of topics
        for k in topics_range:
            # iterate through alpha values
            for a in alpha:
                # iterare through beta values
                for b in beta:
                    # get the coherence score for the given parameters
                    cv = compute_coherence_values(corpus=corpus_sets[i], dictionary=id2word, 
                                                  k=k, a=a, b=b)
                    # Save the model results
                    model_results['Validation_Set'].append(corpus_title[i])
                    model_results['Topics'].append(k)
                    model_results['Alpha'].append(a)
                    model_results['Beta'].append(b)
                    model_results['Coherence'].append(cv)
                    
                    pbar.update(1)
    results_df = pd.DataFrame(model_results)
    pbar.close()

In [33]:
results_df = pd.read_csv('Data/first-gs-results-24h')
results_df.drop(columns = 'Unnamed: 0', inplace = True)

In [34]:
results_df[(results_df['Coherence'] >= 0.38) & (results_df['Topics'] == 5 )]

Unnamed: 0,Validation_Set,Topics,Alpha,Beta,Coherence
240,75% Corpus,5,0.01,0.01,0.384454
245,75% Corpus,5,0.31,0.01,0.381647
260,75% Corpus,5,symmetric,0.01,0.38038
420,100% Corpus,5,0.01,0.01,0.38121
430,100% Corpus,5,0.61,0.01,0.384243
440,100% Corpus,5,symmetric,0.01,0.389232


# =============================================================================

# Below, will be trying random grid search results which had a high coherence to try and get coherent topics

In [176]:
newer_model = LdaMulticore(corpus=corpus,
                          num_topics=5,
                          id2word=id2word,
                          workers=4,
                          passes=10,
                          alpha = 0.61 , 
                          eta = 0.01,
                          chunksize=1200,
                          random_state = 100)
newer_words = [re.findall(r'"([^"]*)"',t[1]) for t in newer_model.print_topics()]
newer_topics = [' '.join(t[0:10]) for t in newer_words]

for id, t in enumerate(newer_topics): 
    print(f"------ Topic {id + 1} ------")
    print(t, end="\n\n")

------ Topic 1 ------
say elon_musk battery new day spacex musk leave ceo elonmusk

------ Topic 2 ------
stock buy twitter musk go sell elon price market back

------ Topic 3 ------
company world driver also issue crash news make self_driving autopilot

------ Topic 4 ------
car ev see model electric year look vehicle find first

------ Topic 5 ------
people know make think drive electric_car work need get want



In [177]:
pyLDAvis.enable_notebook()
gensimvis.prepare(newer_model, corpus, id2word)

  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload


# =============================================================================

# Below, will be trying to reduce size of corpus to increase model coherence

In [None]:
len(lem_noNone)

In [None]:
longer_tweets = [(tweet) for (i, tweet) in enumerate(lem_noNone) if i not in counter]
len(longer_tweets)

In [None]:
new_id2word = corpora.Dictionary(longer_tweets)
print(len(new_id2word)) 

In [None]:
new_id2word.filter_extremes(no_below=9, no_above=.90)
print(len(new_id2word))

In [None]:
new_corpus = [new_id2word.doc2bow(d) for d in longer_tweets]

In [None]:
# Instantiating a Base LDA model
newer_model = LdaMulticore(corpus=new_corpus,
                          num_topics=5,
                          id2word=new_id2word,
                          workers=6,
                          passes=8,
                          alpha = 0.91 , 
                          eta = 0.61,
                          chunksize=1500,
                          random_state = 100)
newer_words = [re.findall(r'"([^"]*)"',t[1]) for t in newer_model.print_topics()]
newer_topics = [' '.join(t[0:10]) for t in newer_words]

for id, t in enumerate(newer_topics): 
    print(f"------ Topic {id + 1} ------")
    print(t, end="\n\n")

In [None]:
newer_perplexity = newer_model.log_perplexity(new_corpus)

# Compute Coherence Score
newer_coherence_model = CoherenceModel(model= newer_model, texts=longer_tweets, 
                                   dictionary=new_id2word, coherence='c_v')

new_coherence_lda_model_base = newer_coherence_model.get_coherence()
print('\nCoherence Score: ', new_coherence_lda_model_base, '\nPerplexity: ', newer_perplexity)

In [None]:
pyLDAvis.enable_notebook()
gensimvis.prepare(newer_model, new_corpus, new_id2word)

# =============================================================================

# Below --- SENTIMENT Need to link tweets to topics, and calculate sentiment by topic

In [60]:
# first two indexed 0 - 92855
print(df_topic_sents_keywords[0].iloc[92855],
      '\n', lem_noShort_noNone[92855],
      '\n', full['preprep'][126509] )
# These are all the same tweet

['want', 'ev', 'table', 'musk', 'show', 'world', 'year', 'old'] 
 (126509, 'want ev table musk show world year old') 
 i want an ev but tesla is off the table for me  musk is showing the world that he's a ten year old inside


In [65]:
lem_noShort_noNone[52244][0]


67647

In [58]:
full['preprep'][126509]

"i want an ev but tesla is off the table for me  musk is showing the world that he's a ten year old inside"

In [None]:
"""
cleaned to index = lem_noSHort_noNone
original = full['preprep']
"""

In [102]:
df_sent.Original_tweet.iloc[0]

"@testcranker @ITGuy1959 My solar is 100% off grid. It runs the computers, radios and fridge. My entire yard isn't big enough to recharge a Tesla."

In [None]:
"""
Note above, 
the original dataframe is indexed from 0, as in it is in the same form as an enumerated list
we created a function earlier to find all of the missing index values somewhere around creating lem_noNone

this function can be used to set proper indexs on the new lists past the funnkiness somehow... easier said than done 
"""

# Below - Create count vectorizer word cloud for 'EDA' -- Code is in FSM

In [61]:
cv=CountVectorizer(analyzer='word')
data=cv.fit_transform(full['preprep'])
df_dtm = pd.DataFrame(data.toarray(), columns=cv.get_feature_names())
df_dtm.index=df_grouped.index
df_dtm.head(3)

NameError: name 'df_grouped' is not defined

# ========================================================================================================================================================================================================================================================================================================================================================================

# SCRAP

# ========================================================================================================================================================================================================================================================================================================================================================================

## Attempting to get the top documents per topic 

In [None]:
def format_topics_sentences(ldamodel=None, corpus=corpus, texts=data):
    # Init output
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row_list in enumerate(ldamodel[corpus]):
        row = row_list[0] if ldamodel.per_word_topics else row_list            
        # print(row)
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)


In [None]:
df_topic_sents_keywords = format_topics_sentences(ldamodel=newer_model, corpus=new_corpus, texts=longer_tweets)
df_topic_sents_keywords

In [None]:
df_topic_sents_keywords.Dominant_Topic.value_counts()

In [None]:
df_topic_sents_keywords.Topic_Keywords.value_counts()

In [None]:
sent_topics_sorteddf_mallet = pd.DataFrame()
sent_topics_outdf_grpd = df_topic_sents_keywords.groupby('Dominant_Topic')

for i, grp in sent_topics_outdf_grpd:
    sent_topics_sorteddf_mallet = pd.concat([sent_topics_sorteddf_mallet, 
                                             grp.sort_values(['Perc_Contribution'], ascending=False).head(1)], 
                                            axis=0)
# Reset Index    
sent_topics_sorteddf_mallet.reset_index(drop=True, inplace=True)
# Format
sent_topics_sorteddf_mallet.columns = ['Topic_Num', "Topic_Perc_Contrib", "Keywords", "Representative Text"]

sent_topics_sorteddf_mallet.head(10)

In [None]:
sent_topics_sorteddf_mallet.iloc[2]['Keywords']

In [None]:
# sent_topics_sorteddf_mallet.iloc[2]['Representative Text']

In [None]:
# longer_tweets[:10]

In [None]:
df_topic_sents_keywords

In [None]:
df_topic_sents_keywords[(df_topic_sents_keywords['Dominant_Topic'] == 4) & (df_topic_sents_keywords['Perc_Contribution'] >= 0.84)]

In [None]:
len(ind)

In [None]:
len(counter)

In [None]:
sum_dropped = counter + ind
len(sum_dropped)

In [None]:
# full_tester = full.drop(full.index[ind])
full_tester = full.drop(full.index[sum_dropped])
full_tester.reset_index(inplace = True)

In [None]:
full_tester

In [None]:
full_tester.tweets.iloc[104646]

In [None]:
len(longer_tweets)

In [None]:
len(new_corpus)

In [None]:
def find_common_strings(string_list, input_string):
    common_strings = []
    for s in string_list:
        if set(s).intersection(set(input_string)) == set(s):
            common_strings.append(s)
    return common_strings

In [None]:
def find_common_strings(string_list, input_string):
    common_strings = []
    for i, s in enumerate(string_list):
        if set(s).intersection(set(input_string)) == set(s):
            common_strings.append((i, s))
            
    pos_dic = {}
    for i, string in common_strings:
        string_split = string.split(' ')
        input_split = input_string.split(' ')
        common = []
        for word in input_split:
            if word in string_split:
                common.append(word)
        pos_dic[i] = common
        
    
    
    
    return common_strings, pos_dic

In [None]:
test_string = (' ').join(df_topic_sents_keywords[0].iloc[0])
test_string

In [None]:
big_list, ind_dict = find_common_strings(full['preprep'], test_string)

In [None]:
print(len(ind_dict), len(big_list))

In [None]:
potentials = []
for key, value in ind_dict.items():
    if value != []:
        potentials.append(key)

In [None]:
for i in potentials:
    print(full['preprep'].iloc[i])

In [None]:
test_string[::-1].find(' ')

In [None]:
for s in full['preprep']:
    

In [None]:
# string_test = find_common_strings(lem_noNone,test_string )
# string_test

# ========================================================================================================================================================================================================================================================================================================================================================================

# SCRAP

# ========================================================================================================================================================================================================================================================================================================================================================================

In [None]:
# Adding Sentiment
sia = SentimentIntensityAnalyzer()
comp_dic = {}
for i, tweet in df_topic_sents_keywords[0].iteritems():
    comp_dic[i] = sia.polarity_scores(tweet)['compound']

comp_scores = pd.Series(comp_dic, name = 'sentiments')
df_sent = df_topic_sents_keywords.merge(comp_scores, left_index = True, right_index = True)
# df_sent = df_sent.set_index('Unnamed: 0')



In [None]:
for i in range(len(doc_set)):
    print(ldamodel[corpus[i]])

In [None]:
# CODE WORKS --- JUST USE IT LATER 

# Adding Sentiment
sia = SentimentIntensityAnalyzer()
comp_dic = {}
for i, tweet in full['tweets'].iteritems():
    comp_dic[i] = sia.polarity_scores(tweet)['compound']

comp_scores = pd.Series(comp_dic, name = 'sentiment')
df_sent = full.merge(comp_scores, left_index = True, right_index = True)
df = df_sent.set_index('Unnamed: 0')

In [None]:
vectorizer = CountVectorizer()
data_vectorized = vectorizer.fit_transform(df.drop('tokens'))



# CV Param
search_params = {'n_components': [5, 10, 15, 20, 25, 30], 'learning_decay': [.5, .7, .9], 'batch_size':[128, 256]}

# Init Model
lda = LatentDirichletAllocation()

# Init Grid Search Class
model = GridSearchCV(lda, cv = 5, param_grid=search_params, n_jobs = -1)



# Grid Search
model.fit(data_vectorized)

# Best Model
best_lda_model = model.best_estimator_

# Model Parameters
print("Best Model's Params: ", model.best_params_)

# Log Likelihood Score
print("Best Log Likelihood Score: ", model.best_score_)

# Perplexity
print("Model Perplexity: ", best_lda_model.perplexity(data_vectorized))


best_lda_model

In [None]:
GridSearchCV(cv=None, error_score='raise',
             estimator=LatentDirichletAllocation(batch_size=128, 
                                                 doc_topic_prior=None,
                                                 evaluate_every=-1, 
                                                 learning_decay=0.7, 
                                                 learning_method=None,
                                                 learning_offset=10.0, 
                                                 max_doc_update_iter=100, 
                                                 max_iter=10,
                                                 mean_change_tol=0.001, 
                                                 n_components=10, 
                                                 n_jobs=1,
                                                 perp_tol=0.1, 
                                                 random_state=None,
                                                 topic_word_prior=None, 
                                                 total_samples=1000000.0, 
                                                 verbose=0),
             n_jobs=-2,
             param_grid={'n_topics': [10, 15, 20, 30], 
                         'learning_decay': [0.5, 0.7, 0.9]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
             scoring=None, verbose=0)