## NLP: LDA and Sentiment analyses of text survey data

Helpful links: 
- https://towardsdatascience.com/a-complete-exploratory-data-analysis-and-visualization-for-text-data-29fb1b96fb6a
- https://www.analyticsvidhya.com/blog/2020/04/beginners-guide-exploratory-data-analysis-text-data/#4

In [None]:
import pandas as pd
import numpy as np

# Load the raw survey export from the working directory; later cells pick
# the free-text answer column out of this frame by position.
survey_data_raw = pd.read_excel("Working Remotely Survey.xlsx")

### Survey Question: Do you have any additional concerns about working remotely that you would like to share at this time?

##### 1. Prep data from raw data file

In [None]:
# Build the working frame in one non-mutating chain: take the free-text
# answer column (position 51), give it a stable name, drop unanswered rows
# and renumber. Avoiding `inplace=True` on a slice of the raw frame keeps
# the raw data untouched and the cell safe to re-run.
add_concerns = (
    survey_data_raw.iloc[:, 51:52]
    .rename(columns=lambda _: "Additional_Concerns")
    .dropna()
    .reset_index(drop=True)
)
add_concerns.head()

##### 2.  Preprocess Text Data

In [None]:
#!pip install textblob
from textblob import TextBlob

import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
import re
from nltk.stem import WordNetLemmatizer,PorterStemmer
from nltk.stem.snowball import SnowballStemmer

# Module-level WordNet lemmatizer and English Snowball stemmer instances.
lemmatizer = WordNetLemmatizer()
stemmer = SnowballStemmer("english")
# Alternative stemmer, kept for reference:
#stemmer = PorterStemmer() 

def preprocess(sentence):
    """Normalise one free-text survey answer into a cleaned, space-joined string.

    Steps, in order: cast to ``str`` and lowercase; strip HTML-like tags and
    ``http...`` URLs; spell-correct with TextBlob; drop digits; split into
    runs of word characters; keep tokens longer than two characters that are
    not NLTK English stop words.

    Parameters
    ----------
    sentence : any
        Raw answer. It is passed through ``str()`` first, so non-string
        values are accepted.

    Returns
    -------
    str
        The surviving tokens joined by single spaces (may be empty). Tokens
        are returned as surface forms; no stemming or lemmatisation is
        applied to the result.
    """
    text = str(sentence).lower()
    # Strip markup and URLs *before* spell correction so the corrector is not
    # run over tag names or URL fragments.
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'http\S+', '', text)
    text = str(TextBlob(text).correct())  # correct spelling
    text = re.sub(r'[0-9]+', '', text)
    tokens = RegexpTokenizer(r'\w+').tokenize(text)
    # Load the stop-word list once per call (as a set) instead of once per token.
    stop_words = set(stopwords.words('english'))
    return " ".join(w for w in tokens if len(w) > 2 and w not in stop_words)

add_concerns['cleanText'] = add_concerns['Additional_Concerns'].map(preprocess)

# Remove common words we don't want included in LDA topics, as needed (context specific).
# Matching is done on whole tokens: substring replacement would also damage
# longer words (e.g. "available" -> "avail", "today" -> "to", "network" -> "net").
LDA_EXCLUDED_WORDS = {
    'working', 'work', 'home', 'would', 'remotely', 'remote', 'concerns',
    'think', 'able', 'concern', 'office', 'day', 'feel', 'also',
}

# Tokenise each cleaned answer and keep only the tokens that are not excluded.
add_concerns['cleanText'] = add_concerns['cleanText'].apply(
    lambda text: [tok for tok in word_tokenize(text) if tok not in LDA_EXCLUDED_WORDS]
)

In [None]:
# Preview only the first rows instead of rendering the whole frame.
add_concerns.head()

##### 3. Topic Modelling

https://honingds.com/blog/topic-modeling-latent-dirichlet-allocation-lda/

In [None]:
#!pip install gensim
#!pip install pyLDAvis
import gensim
from gensim import corpora
import pyLDAvis.gensim

# Tokenised answers: one list of tokens per survey response.
cleaned = add_concerns['cleanText']
# Term dictionary of the corpus: every unique term is assigned an integer id.
dictionary = corpora.Dictionary(cleaned)
# Drop terms found in more than 80% of the answers; no_below=1 keeps every
# term that occurs in at least one answer.
dictionary.filter_extremes(no_above=0.8, no_below=1)
# Bag-of-words corpus: each answer converted with the dictionary's doc2bow.
corpus = list(map(dictionary.doc2bow, cleaned))
print(corpus[:1])

In [None]:
# Readable view of the first document: look each term id up in the dictionary.
[[(dictionary[term_id], count) for term_id, count in doc_bow] for doc_bow in corpus[:1]]

##### Build LDA

In [None]:
# Fit an 8-topic LDA model over the bag-of-words corpus. LDA training is
# stochastic, so a fixed random_state is passed to make the topics
# reproducible across re-runs of the notebook.
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus, num_topics=8, id2word=dictionary, passes=30, random_state=42
)
ldamodel.save('model_combined.gensim')
# Show the five highest-weighted words of each topic.
topics = ldamodel.print_topics(num_words=5)
for topic in topics:
    print(topic)

The topic-word matrix contains, for each topic, the probability distribution over the words that topic generates. Running the LDA algorithm on the data above produces the output shown above.

In [None]:
# Topic mixture of the first document (entry 0) in the bag-of-words corpus.
# NOTE: this variable reuses the name of the model method it calls.
get_document_topics = ldamodel.get_document_topics(corpus[0])
print(get_document_topics) # read as: entry 0 is x% related to topic n

In [None]:
def dominant_topic(ldamodel, corpus, texts):
    """Find the dominant topic of every document in ``corpus``.

    Parameters
    ----------
    ldamodel
        Fitted topic model. ``ldamodel[corpus]`` must yield, per document, a
        sequence of ``(topic_num, proportion)`` pairs, and
        ``ldamodel.show_topic(topic_num, topn=4)`` must return
        ``(word, weight)`` pairs.
    corpus
        Bag-of-words corpus handed to the model.
    texts
        Original texts, one per document and in the same order as ``corpus``.

    Returns
    -------
    pandas.DataFrame
        One row per document with the columns ``Dominant_Topic``,
        ``Perc_Contribution`` (rounded to 4 decimals) and ``Topic_Keywords``
        (the topic's top four words, comma separated), followed by a column
        holding ``texts``.
    """
    records = []
    for row in ldamodel[corpus]:
        if not row:
            # Keep a placeholder so the rows stay aligned with `texts`.
            records.append((None, None, ""))
            continue
        # The dominant topic is the pair with the largest proportion.
        topic_num, prop_topic = max(row, key=lambda pair: pair[1])
        wp = ldamodel.show_topic(topic_num, topn=4)
        topic_keywords = ", ".join(word for word, _ in wp)
        records.append((int(topic_num), round(prop_topic, 4), topic_keywords))

    # Build the frame once from the collected rows; this avoids the removed
    # DataFrame.append API and quadratic row-by-row growth.
    sent_topics_df = pd.DataFrame(
        records, columns=['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']
    )
    # Renumber the texts so the column-wise concat pairs rows by position.
    contents = pd.Series(texts).reset_index(drop=True)
    return pd.concat([sent_topics_df, contents], axis=1)

In [None]:
# Label every answer with its dominant topic, then preview the first rows.
df_dominant_topic = dominant_topic(ldamodel, corpus, add_concerns['Additional_Concerns'])
df_dominant_topic.head()

### Sentiment Analysis

In [None]:
# Fresh copy of the free-text answers for sentiment analysis, built in one
# non-mutating chain: take the answer column (position 51), name it, drop
# unanswered rows and renumber. No `inplace=True` on a slice of the raw frame.
add_concerns_2 = (
    survey_data_raw.iloc[:, 51:52]
    .rename(columns=lambda _: "Additional_Concerns")
    .dropna()
    .reset_index(drop=True)
)
add_concerns_2.head()

In [None]:
def preprocess(ReviewText):
    """Strip leftover HTML fragments from a Series of free-text answers.

    NOTE: this definition reuses the name ``preprocess`` of the single-string
    cleaner defined earlier in the notebook and replaces it once this cell
    has run.

    Parameters
    ----------
    ReviewText : pandas.Series
        String Series of raw answers.

    Returns
    -------
    pandas.Series
        The same answers with ``<br/>`` tags, ``<a ...>...</a>`` links and
        the entities ``&amp``/``&gt``/``&lt`` (with or without the trailing
        ``;``) removed, and non-breaking spaces turned into ordinary spaces.
    """
    # The patterns are regular expressions, so regex=True must be explicit:
    # newer pandas versions treat the pattern as a literal string by default,
    # which would leave the text untouched.
    replacements = [
        ("(<br/>)", ""),
        ('(<a).*(>).*(</a>)', ''),
        ('(&amp;?)', ''),
        ('(&gt;?)', ''),
        ('(&lt;?)', ''),
        ('(\xa0)', ' '),
    ]
    for pattern, replacement in replacements:
        ReviewText = ReviewText.str.replace(pattern, replacement, regex=True)
    return ReviewText

# Clean the answers first; every feature below is derived from the cleaned text.
add_concerns_2['Additional_Concerns'] = preprocess(add_concerns_2['Additional_Concerns'])

# Sentiment polarity lies in [-1, 1]: 1 means positive and -1 means negative sentiment.
add_concerns_2['polarity'] = add_concerns_2['Additional_Concerns'].apply(
    lambda answer: TextBlob(answer).sentiment.polarity
)
# Length of each answer in characters.
add_concerns_2['review_len'] = add_concerns_2['Additional_Concerns'].astype(str).str.len()
# Number of whitespace-separated words in each answer.
add_concerns_2['word_count'] = add_concerns_2['Additional_Concerns'].astype(str).str.split().str.len()

In [None]:
print('3 random reviews with the relatively high positive sentiment polarity: \n')
matches = add_concerns_2.loc[add_concerns_2.polarity >= 0.6, ['Additional_Concerns']]
# Sample at most as many rows as actually match; asking for 3 out of fewer
# rows would raise a ValueError.
cl = matches.sample(min(3, len(matches))).values
for c in cl:
    print(c[0])

In [None]:
print('3 random reviews with the most neutral sentiment(zero) polarity: \n')
matches = add_concerns_2.loc[add_concerns_2.polarity == 0.0, ['Additional_Concerns']]
# Sample at most as many rows as actually match; asking for 3 out of fewer
# rows would raise a ValueError.
cl = matches.sample(min(3, len(matches))).values
for c in cl:
    print(c[0])

In [None]:
print('3 reviews with the most negative polarity: \n')
# The threshold must be negative: `polarity <= 0.8` selects nearly every
# answer, not the strongly negative ones.
matches = add_concerns_2.loc[add_concerns_2.polarity <= -0.8, ['Additional_Concerns']]
# Sample at most as many rows as actually match; asking for 3 out of fewer
# rows would raise a ValueError.
cl = matches.sample(min(3, len(matches))).values
for c in cl:
    print(c[0])

In [None]:
#!pip install plotly==4.9.0
#!pip install cufflinks
from plotly.offline import init_notebook_mode, iplot
from plotly.graph_objs import *

import cufflinks as cf
# Render cufflinks/plotly charts inside the notebook.
cf.go_offline()
# Keep the stored cufflinks settings consistent with offline use and keep
# charts private: the previous values (offline=False, world_readable=True)
# contradicted go_offline() above.
# NOTE(review): set_config_file appears to persist these settings for later
# sessions - confirm against the cufflinks documentation.
cf.set_config_file(offline=True, world_readable=False)

# Histogram of the per-answer sentiment polarity.
add_concerns_2['polarity'].iplot(
    kind='hist',
    bins=50,
    xTitle='polarity',
    linecolor='black',
    yTitle='count',
    title='Sentiment Polarity Distribution')

### Top 20 Words after removing stop words

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

def get_top_n_words(corpus, n=None, ngram_range=(1, 1)):
    """Return the ``n`` most frequent terms in ``corpus``, English stop words excluded.

    Parameters
    ----------
    corpus : iterable of str
        The documents to count terms in.
    n : int or None
        How many terms to return; ``None`` returns all of them.
    ngram_range : tuple of (int, int)
        Passed to ``CountVectorizer``. The default ``(1, 1)`` counts single
        words as before; ``(2, 2)`` counts bigrams, ``(3, 3)`` trigrams.

    Returns
    -------
    list of (str, int)
        ``(term, total count)`` pairs sorted by descending count.
    """
    vec = CountVectorizer(stop_words='english', ngram_range=ngram_range)
    # Learn the vocabulary and count in a single pass over the corpus.
    bag_of_words = vec.fit_transform(corpus)
    # Column sums give the total count of every vocabulary term.
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq[:n]

# Twenty most frequent single words across all answers.
common_words = get_top_n_words(add_concerns_2['Additional_Concerns'], 20)

for word, freq in common_words:
    print(word, freq)

# Bar chart of the same counts, highest first.
df2 = pd.DataFrame(common_words, columns=['ReviewText', 'count'])
df2.groupby('ReviewText')['count'].sum().sort_values(ascending=False).iplot(
    kind='bar',
    yTitle='Count',
    linecolor='black',
    title='Top 20 words in review after removing stop words',
)

### Top BiGrams

In [None]:
def get_top_n_bigram(corpus, n=None):
    """Return the ``n`` most frequent two-word phrases in ``corpus``.

    English stop words are removed by the vectorizer. The result is a list of
    ``(bigram, total count)`` pairs in descending count order; ``n=None``
    returns every bigram.
    """
    vectorizer = CountVectorizer(ngram_range=(2, 2), stop_words='english')
    vectorizer.fit(corpus)
    # Column totals of the document-term matrix are the per-bigram counts.
    totals = vectorizer.transform(corpus).sum(axis=0)
    ranked = sorted(
        ((phrase, totals[0, col]) for phrase, col in vectorizer.vocabulary_.items()),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return ranked[:n]

# Twenty most frequent bigrams across all answers.
common_words = get_top_n_bigram(add_concerns_2['Additional_Concerns'], 20)

for word, freq in common_words:
    print(word, freq)

# Bar chart of the same counts, highest first.
df4 = pd.DataFrame(common_words, columns=['ReviewText', 'count'])
df4.groupby('ReviewText')['count'].sum().sort_values(ascending=False).iplot(
    kind='bar',
    yTitle='Count',
    linecolor='black',
    title='Top 20 bigrams in review after removing stop words',
)

### Top Trigrams

In [None]:
def get_top_n_trigram(corpus, n=None):
    """Return the ``n`` most frequent three-word phrases in ``corpus``.

    English stop words are removed by the vectorizer. The result is a list of
    ``(trigram, total count)`` pairs in descending count order; ``n=None``
    returns every trigram.
    """
    vectorizer = CountVectorizer(ngram_range=(3, 3), stop_words='english').fit(corpus)
    doc_term = vectorizer.transform(corpus)
    # Summing over documents leaves one total per vocabulary entry.
    totals = doc_term.sum(axis=0)
    pairs = []
    for phrase, col in vectorizer.vocabulary_.items():
        pairs.append((phrase, totals[0, col]))
    pairs.sort(key=lambda pair: pair[1], reverse=True)
    return pairs[:n]

# Twenty most frequent trigrams across all answers.
common_words = get_top_n_trigram(add_concerns_2['Additional_Concerns'], 20)

for word, freq in common_words:
    print(word, freq)

# Bar chart of the same counts, highest first.
df6 = pd.DataFrame(common_words, columns=['ReviewText', 'count'])
df6.groupby('ReviewText')['count'].sum().sort_values(ascending=False).iplot(
    kind='bar',
    yTitle='Count',
    linecolor='black',
    title='Top 20 trigrams in review after removing stop words',
)

### Additional storage code - as reference, if needed

In [None]:
#Tokenization
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize 

# This cell overwrites its own input column, so tokenise only values that are
# still raw strings; re-running the cell then leaves token lists untouched
# instead of failing on them.
add_concerns['Additional_Concerns'] = add_concerns['Additional_Concerns'].apply(
    lambda answer: word_tokenize(answer) if isinstance(answer, str) else answer
)
add_concerns.head()

In [None]:
#Stemming
import nltk
from nltk.stem.snowball import SnowballStemmer
# English Snowball stemmer, applied to every token of every answer.
stemmer = SnowballStemmer("english")

add_concerns['stemmed'] = add_concerns['Additional_Concerns'].apply(
    lambda tokens: list(map(stemmer.stem, tokens))
)

In [None]:
#Remove Stopwords
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
# The tokens being filtered are already stemmed, so compare against the
# stemmed form of every stop word as well; otherwise stop words whose stem
# differs from the word itself are never removed.
stemmed_stop_words = stop_words | {stemmer.stem(word) for word in stop_words}

add_concerns['processed'] = add_concerns['stemmed'].apply(
    lambda tokens: [item for item in tokens if item not in stemmed_stop_words]
)
# Preview only the first rows instead of rendering the whole frame.
add_concerns.head()

In [None]:
#Word cloud of the processed survey tokens
import matplotlib.pyplot as plt  # `plt` is used below but was never imported
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

# `processed` holds one list of tokens per answer; flatten all of them into a
# single space-separated string, which is what WordCloud.generate expects.
text = " ".join(token for tokens in add_concerns.processed for token in tokens)

# Show only the 15 most prominent words on a black background.
wordcloud = WordCloud(max_font_size=70, max_words=15, background_color="black").generate(text)
plt.figure()
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()