In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Load the processed tweet export (path is relative to the working directory).
df = pd.read_csv(filepath_or_buffer="./clean_processed_tweet_data.csv")

In [None]:
# Narrow the frame to the two columns used below: the tweet text and its polarity.
Clean_Tweet = df.loc[:, ['original_text', 'polarity']]

In [None]:
def text_category(p):
    """Label a polarity value by its sign.

    Returns 'positive' when ``p`` is greater than zero, 'negative' when it is
    less than zero, and 'neutral' for everything else (including exactly zero
    and values such as NaN that compare false both ways).
    """
    if p > 0:
        return 'positive'
    return 'negative' if p < 0 else 'neutral'

In [None]:
# Label every tweet by the sign of its polarity. Mapping over the column keeps
# the frame's own index, and assign() replaces an existing 'score' column, so
# re-running this cell does not append a duplicate column.
score = Clean_Tweet['polarity'].map(text_category).rename("score")
Clean_Tweet = Clean_Tweet.assign(score=score)

labels = ['neutral', 'positive', 'negative']
# Count each class once; reindex() fixes the order to `labels` and reports 0
# for a class that never occurs.
score_counts = Clean_Tweet['score'].value_counts().reindex(labels, fill_value=0)
neutral = int(score_counts['neutral'])
positive = int(score_counts['positive'])
negative = int(score_counts['negative'])


fig, ax = plt.subplots(1, 1, figsize=(10, 4))


# Heights must follow the order of `labels` so that each bar sits over its own
# tick (neutral, positive, negative).
ax.bar(x=labels, height=[neutral, positive, negative], color='red')
ax.set_title('Barchart of score column')
# Rotate the tick labels the bar plot already created; no need to overwrite
# them with set_xticklabels().
ax.tick_params(axis='x', labelrotation=90)


fig.suptitle('Score column plots')


# plt.show() renders the figure under the notebook backend; Figure.show() is
# intended for GUI backends.
plt.show()

In [None]:
# Keep only tweets with a clear sentiment. .copy() yields an independent frame
# so the column assignment below does not write into a slice of the old one.
Clean_Tweet = Clean_Tweet[Clean_Tweet['score'] != 'neutral'].copy()

# Binary target: 1 for positive, 0 for negative. Deriving it from the filtered
# column keeps the filtered frame's index. A Series built from a plain list
# would get a fresh 0..n-1 index and be aligned by label on assignment, pairing
# values with the wrong rows and leaving NaN where labels are missing.
scoremap = (Clean_Tweet['score'] == 'positive').astype(int)
Clean_Tweet['scoremap'] = scoremap

In [None]:
# Histogram of the binary sentiment flag (scoremap column) of the remaining tweets.
fig, ax = plt.subplots()
ax.hist(Clean_Tweet['scoremap'], bins=5, color='red')
ax.set_title('Tweet distribution')
ax.set_xlabel('Sentiment')
ax.set_ylabel('Analysis')
plt.show()

In [None]:
import re
import string
import spacy
import gensim
from gensim import corpora
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Normalise the tweet text in a single pass: force every value to str,
# lower-case it, then delete every character listed in string.punctuation.
_punctuation_table = str.maketrans(' ', ' ', string.punctuation)
Clean_Tweet['original_text'] = (
    Clean_Tweet['original_text']
    .astype(str)
    .map(lambda text: text.lower().translate(_punctuation_table))
)

In [None]:
from nltk.corpus import stopwords
import nltk
import pyLDAvis.gensim_models as gensimvis
import pickle 
import pyLDAvis
from gensim.models.coherencemodel import CoherenceModel


In [None]:
# Download NLTK's 'stopwords' resource, then load its English stop words.
# remove_stopwords() reads this module-level `stop_words` when filtering tokens.
nltk.download('stopwords')
stop_words = stopwords.words('english')

In [None]:
def remove_stopwords(text):
    """Return ``text`` with every token found in ``stop_words`` removed.

    Tokens are the pieces obtained by splitting on a single space character;
    the surviving tokens are joined back together with single spaces.
    """
    kept_tokens = []
    for token in text.split(' '):
        if token in stop_words:
            continue
        kept_tokens.append(token)
    return " ".join(kept_tokens)

In [None]:
# Strip stop words from every tweet, overwriting the text column.
Clean_Tweet['original_text'] = Clean_Tweet['original_text'].map(remove_stopwords)

In [None]:
# Load spaCy's 'en_core_web_md' pipeline with the 'parser' and 'ner' components
# disabled; lemmatization() below only reads each token's lemma_ and pos_.
nlp = spacy.load('en_core_web_md', disable=['parser', 'ner'])

def lemmatization(texts, allowed_postags=('NOUN', 'ADJ')):
    """Lemmatise each text and keep only tokens with an allowed part of speech.

    Args:
        texts: Iterable of strings to run through the module-level ``nlp``
            pipeline.
        allowed_postags: Container of ``token.pos_`` values to keep. The
            default is an immutable tuple so that no list object is shared
            between calls.

    Returns:
        A list with one entry per input text, each entry being the list of
        ``token.lemma_`` values for the tokens whose ``pos_`` is allowed.
    """
    # nlp.pipe processes the texts as a batched stream instead of issuing one
    # full pipeline call per text.
    return [
        [token.lemma_ for token in doc if token.pos_ in allowed_postags]
        for doc in nlp.pipe(texts)
    ]

In [None]:
# Pull the cleaned tweets out as a plain Python list and lemmatise them,
# giving one list of lemmas per tweet.
text_list = Clean_Tweet['original_text'].to_list()
tokenized_reviews = lemmatization(texts=text_list)

In [None]:
# Build the token <-> id dictionary from the lemmatised tweets, then convert
# each tweet into its bag-of-words representation.
dictionary = corpora.Dictionary(tokenized_reviews)
doc_term_matrix = list(map(dictionary.doc2bow, tokenized_reviews))

In [None]:
# Train the LDA topic model on the bag-of-words corpus. Hyperparameters are
# gathered in one place so they are easy to spot and tune.
LDA = gensim.models.ldamodel.LdaModel
lda_params = dict(
    num_topics=5,
    random_state=100,
    chunksize=500,
    passes=50,
    iterations=100,
)
lda_model = LDA(corpus=doc_term_matrix, id2word=dictionary, **lda_params)

In [None]:
# Show the trained model's topics; as the last expression in the cell, the
# return value of print_topics() becomes the cell output.
lda_model.print_topics()

In [None]:
# Enable pyLDAvis notebook output, prepare the visualisation data from the
# model, the bag-of-words corpus and the dictionary, and display it as the
# cell's output (last expression).
pyLDAvis.enable_notebook()
LDAvis_prepared = gensimvis.prepare(lda_model, doc_term_matrix, dictionary)
LDAvis_prepared

In [None]:
# log_perplexity() returns the per-word likelihood bound (a log-scale value,
# not the perplexity itself). The whole corpus is evaluated here, so total_docs
# is left at its default (the size of the corpus passed in) instead of a
# hard-coded 10000 that is only right for a corpus of exactly that size.
print('\nPerplexity: ', lda_model.log_perplexity(doc_term_matrix))

# c_v coherence is computed from the tokenised texts and the shared dictionary.
coherence_model_lda = CoherenceModel(
    model=lda_model,
    texts=tokenized_reviews,
    dictionary=dictionary,
    coherence='c_v',
)
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)