# Libraries

In [1]:
import pandas as pd

import regex as re

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

import pyLDAvis
import pyLDAvis.gensim_models as gensim_models
import pyLDAvis.gensim_models as gensimvis


import gensim
# from gensim.models import wrappers
# from gensim.models.wrappers import LdaMallet
from gensim.corpora import Dictionary
from gensim.models.ldamulticore import LdaMulticore
from gensim.models.coherencemodel import CoherenceModel

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD


import nltk
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer


from pprint import pprint

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 
warnings.filterwarnings("ignore", category=FutureWarning) 

  from scipy.linalg.special_matrices import triu


# Importing -- Fixing columns

In [2]:
full = pd.read_csv('Data/Data-Cleaned/238k-Uncleaned')

In [3]:
# Renaming tweets column, dropping unnamed column, making tweets strings

full['tweets'] = full['0']
full = full.drop(columns = '0')
# full['tweets'] = full['tweets'].astype('string')
# full.dropna(inplace = True)

# Preprocessing

In [6]:
def Series_Preprocessor(tweet):
    stopWords = stopwords.words('english')
    stopWords.extend(['tesla', 'c,', 'x', 't', 'p', 'amp'])
    ps = PorterStemmer()
    wnl = WordNetLemmatizer()

        # helper function to change nltk's part of speech tagging to a wordnet format.
    def pos_tagger(nltk_tag):
        if nltk_tag.startswith('J'):
            return wordnet.ADJ
        elif nltk_tag.startswith('V'):
            return wordnet.VERB
        elif nltk_tag.startswith('N'):
            return wordnet.NOUN
        elif nltk_tag.startswith('R'):
            return wordnet.ADV
        else:         
            return None 
    
#   Lowercase all 
    tweet = tweet.lower()
    
#   Removing @ handles, links-- strip whitespace breaks and tabs
    tweet = re.sub(r"@\w+|http\S+", "", tweet).strip().replace("\r", "").replace("\n", "").replace("\t", "")
    
#   Tokenize,make list of words, removing punctuation and stopwords
    tweet = [x for x in word_tokenize(tweet) if ((x.isalpha()) & (x not in stopWords)) ]
    
#   Map part of speech tags to words and use words/tags to lemmatize accuratley
    tweet = list(map(lambda x: (x[0], pos_tagger(x[1])), pos_tag(tweet))) 
    tweet = " ".join([wnl.lemmatize(x[0], x[1]) for x in tweet if x[1] is not None])

    return tweet 

In [7]:
df = full['tweets'].apply(Series_Preprocessor)

In [8]:
df

0         solar grid run computer radio fridge entire ya...
1                boycott teslastock elonsproblems vote blue
2         never survive carbon offset implement democrat...
3                            funny fanbase take profit sell
4         right nonsense position find sort hard believe...
                                ...                        
237654    yes beg borrow steal invest world valuable com...
237655    owh rethink thesis always think regulation bui...
237656    overused example elon musk alienate almost yea...
237657    found spacex successful also arguably innovati...
237658               want ev table musk show world year old
Name: tweets, Length: 237659, dtype: object

In [9]:
df['tokens'] = [i.split() for i in df]

In [10]:
# Dictionary is a gensim tool
id2word = Dictionary(df['tokens'])
print(len(id2word)) 

44577


In [11]:
# id2word.filter_extremes(no_below=4, no_above=.95) #Original -- 12656
id2word.filter_extremes(no_below=6, no_above=.90)
print(len(id2word))

14944


In [12]:
corpus = [id2word.doc2bow(d) for d in df['tokens']]

## LdaMulticore -------------------------

In [13]:
# Instantiating a Base LDA model
base_model = LdaMulticore(corpus=corpus, num_topics=5, id2word=id2word, workers=12, passes=5)

In [14]:
# Filtering for words 
words = [re.findall(r'"([^"]*)"',t[1]) for t in base_model.print_topics()]


In [15]:
# Create Topics
topics = [' '.join(t[0:10]) for t in words]

In [16]:
# Getting the topics
for id, t in enumerate(topics): 
    print(f"------ Topic {id + 1} ------")
    print(t, end="\n\n")

------ Topic 1 ------
stock buy go musk twitter elon price think market say

------ Topic 2 ------
musk tsla esg stock index elon get fsd new beta

------ Topic 3 ------
car make electric get ev battery drive year fire go

------ Topic 4 ------
musk elon world company want car get good free people

------ Topic 5 ------
car electric door get buy vehicle go crash open new



In [17]:
# a measure of how good the model is. lower the better
base_perplexity = base_model.log_perplexity(corpus)

# Compute Coherence Score
coherence_model = CoherenceModel(model=base_model, texts=df['tokens'], 
                                   dictionary=id2word, coherence='c_v')

coherence_lda_model_base = coherence_model.get_coherence()
print('\nCoherence Score: ', coherence_lda_model_base, '\n\nPerplexity: ', base_perplexity)


Coherence Score:  0.3247128095292947 

Perplexity:  -7.586604868453203


In [18]:
#Creating Topic Distance Visualization 
pyLDAvis.enable_notebook()

In [19]:
gensimvis.prepare(base_model, corpus, id2word)

  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload


In [26]:
base_model.get_topic_terms(0, topn= 20)

[(151, 0.030353038),
 (144, 0.022119746),
 (160, 0.011935298),
 (126, 0.009758756),
 (154, 0.009500694),
 (157, 0.008582936),
 (102, 0.0077510867),
 (73, 0.007669238),
 (125, 0.007654948),
 (83, 0.007473401),
 (40, 0.007462548),
 (292, 0.00735998),
 (31, 0.007244815),
 (48, 0.006824767),
 (74, 0.006505299),
 (190, 0.0065035443),
 (164, 0.0056932033),
 (45, 0.005619609),
 (64, 0.005194856),
 (22, 0.00517804)]

In [29]:
word = id2word[144]

'buy'

In [30]:
import matplotlib.pyplot as plt
from wordcloud import WordCloud

In [None]:
# CODE WORKS --- JUST USE IT LATER 

# Adding Sentiment
sia = SentimentIntensityAnalyzer()
comp_dic = {}
for i, tweet in full['tweets'].iteritems():
    comp_dic[i] = sia.polarity_scores(tweet)['compound']

comp_scores = pd.Series(comp_dic, name = 'sentiment')
df_sent = full.merge(comp_scores, left_index = True, right_index = True)
df = df_sent.set_index('Unnamed: 0')