**Installing Requirements**

In [None]:
!pip install textacy==0.11.0

In [2]:
!pip3 install Xlsxwriter



In [None]:
!pip install vaderSentiment

In [None]:
!python -m spacy download en_core_web_sm

In [5]:
import spacy
import os
import pandas as pd
import textacy

In [6]:
import re
import html

In [None]:
'Mounting google drive'
# Colab-only step: mounts Google Drive at /content/gdrive, which is the root
# of the folders that `path` points to in the following cells.
from google.colab import drive 
drive.mount('/content/gdrive')

In [8]:
# Change path to data folder in zip provided
# NOTE(review): `path` is rebound to the results folder in the final export
# cell, so re-running earlier cells after that one resolves files against the
# results folder instead of this data folder.
path = r'/content/gdrive/MyDrive/3. IU Courses/Courses/2. Social Data Mining/Social Media Project/ILS-Z639 Final Project Deliverable - Abhinav Bajpai/data' 

**Text Cleaning Functions**

In [9]:
'Declaring Text Cleaning Functions'
# Characters that usually indicate markup / code residue in review text.
RE_SUSPICIOUS = re.compile(r'[&#<>{}\[\]\\]')


def impurity(text, min_len=10):
    """Return the share of suspicious characters in ``text``.

    Args:
        text: The string to inspect, or ``None``.
        min_len: Texts shorter than this are not scored and yield 0.

    Returns:
        0 when ``text`` is ``None``, empty, or shorter than ``min_len``;
        otherwise the number of ``RE_SUSPICIOUS`` matches divided by
        ``len(text)``.
    """
    # max(min_len, 1) also rejects the empty string when min_len <= 0, which
    # would otherwise divide by zero below.
    if text is None or len(text) < max(min_len, 1):
        return 0
    return len(RE_SUSPICIOUS.findall(text)) / len(text)

def clean(text):
    """Strip markup residue from a review and return single-spaced text.

    The substitutions run in a fixed order: HTML entities are unescaped first
    so that escaped tags are caught by the tag pattern, and whitespace is
    collapsed last because every earlier step replaces its match with a space.

    Args:
        text: The raw review text (must be a ``str``).

    Returns:
        The cleaned text with no leading/trailing whitespace and no runs of
        whitespace.
    """
    # convert html escapes like &amp; to characters.
    text = html.unescape(text)
    # tags like <tab>
    text = re.sub(r'<[^<>]*>', ' ', text)
    # markdown URLs like [Some text](https://....) -> keep only the link text
    text = re.sub(r'\[([^\[\]]*)\]\([^\(\)]*\)', r'\1', text)
    # text or code in brackets like [0]
    text = re.sub(r'\[[^\[\]]*\]', ' ', text)
    # standalone sequences of specials, matches &# but not #cool
    text = re.sub(r'(?:^|\s)[&#<>{}\[\]+|\\:-]{1,}(?:\s|$)', ' ', text)
    # standalone sequences of hyphens like --- or ==
    text = re.sub(r'(?:^|\s)[\-=\+]{2,}(?:\s|$)', ' ', text)
    # any run of non-space characters that starts with '>' (e.g. ">>quote")
    text = re.sub(r">\S+", ' ', text)
    # any run of non-space characters that starts with '#' (e.g. "#cool")
    text = re.sub(r"#\S+", ' ', text)
    # any run of non-space characters that starts with '<' (e.g. "<<quote")
    text = re.sub(r"<\S+", ' ', text)
    # collapse whitespace only now: the three removals above each leave a
    # space behind, so collapsing earlier would let double spaces survive
    text = re.sub(r'\s+', ' ', text)
    return text.strip()


**Reading Dataset and Additional Text Cleaning Steps**

In [10]:
# Load the sampled review frame from the data folder.
# NOTE: read_pickle unpickles arbitrary objects; only load files you trust.
Top5BrandSample = pd.read_pickle(os.path.join(path, "Top5BrandSample.pkl"))
# reset_index() (without drop=True) gives a fresh 0..n-1 index and keeps the
# previous index as a regular column; later cells address rows by these
# integer labels.
Top5BrandSample = Top5BrandSample.reset_index()

In [None]:
'Applying clean and impurity function'
# Lower-cases the raw review text in place; cleanReview (and everything
# derived from it, including the sentiment scores) is built from this
# lower-cased text.
Top5BrandSample['reviewText'] = Top5BrandSample['reviewText'].str.lower()
# str(x) lets clean() accept non-string cells.
# NOTE(review): a missing review would be turned into the literal text 'nan'
# here - confirm the column has no nulls.
Top5BrandSample['cleanReview'] = Top5BrandSample['reviewText'].apply(lambda x: clean(str(x)))
# Share of suspicious characters left after cleaning; texts shorter than 20
# characters score 0.
Top5BrandSample['impurity'] = Top5BrandSample['cleanReview'].apply(impurity, min_len=20)
# Display the five reviews that are still the most impure after cleaning.
Top5BrandSample[['cleanReview', 'impurity']].sort_values(by='impurity', ascending=False).head(5) 

In [12]:
'Additional Text cleaning Using Textacy'
import textacy.preprocessing as tprep
def normalize(text):
    '''Remove additional text impurities using built-in textacy functions.

    First normalises characters (hyphenation, quotation marks, unicode form,
    accents), then masks URLs, e-mail addresses, hashtags, phone numbers,
    numbers, user handles and emojis with textacy's default placeholders.

    Args:
        text: The text to normalise (``str``).

    Returns:
        The normalised text.
    '''
    # character-level normalisation
    text = tprep.normalize.hyphenated_words(text)
    text = tprep.normalize.quotation_marks(text)
    text = tprep.normalize.unicode(text)
    text = tprep.remove.accents(text)
    # masking: URLs and e-mails first so that their digits and '@' are not
    # picked up by the number / user-handle patterns
    text = tprep.replace.urls(text)
    text = tprep.replace.emails(text)
    text = tprep.replace.hashtags(text)
    # phone numbers must be masked before generic numbers; once the digits
    # have been replaced by the number placeholder the phone pattern can no
    # longer match
    text = tprep.replace.phone_numbers(text)
    text = tprep.replace.numbers(text)
    text = tprep.replace.user_handles(text)
    text = tprep.replace.emojis(text)
    return text

# Overwrites cleanReview with its normalised form; re-running this cell
# applies normalize() to already-normalised text.
Top5BrandSample['cleanReview'] = Top5BrandSample['cleanReview'].map(normalize)


**Lemmatization Using Spacy**




In [13]:
# Load the small English pipeline with the named-entity recogniser disabled;
# the extraction helpers in this notebook only read lemmas and POS tags.
nlp = spacy.load("en_core_web_sm", disable=["ner"])

In [14]:
def extract_lemmas(doc, **kwargs):
    '''Return the lemma of every token selected by ``textacy.extract.words``.

    All keyword arguments are forwarded unchanged to
    ``textacy.extract.words`` (e.g. ``exclude_pos``, ``filter_stops``).
    '''
    selected_tokens = textacy.extract.words(doc, **kwargs)
    return [token.lemma_ for token in selected_tokens]

def extract_noun_phrases(doc, preceding_pos=['NOUN'], sep='_'):
    '''Return lemmatised noun phrases found in ``doc``.

    For each tag in ``preceding_pos`` a token pattern "<tag> followed by one
    or more NOUN tokens" is matched; the lemmas of every matched span are
    joined with ``sep``.
    '''
    patterns = [f"POS:{pos} POS:NOUN:+" for pos in preceding_pos]
    matched_spans = textacy.extract.matches.token_matches(doc, patterns=patterns)
    return [sep.join(token.lemma_ for token in span) for span in matched_spans]

def extract_nlp(doc):
    '''Return the linguistic features extracted from ``doc`` keyed by column name.

    Currently a single entry, ``'lemmas'``: the lemmas of all tokens that are
    neither stop words nor tagged PART, PUNCT, DET, PRON, SYM or SPACE.
    '''
    excluded_pos = ['PART', 'PUNCT', 'DET', 'PRON', 'SYM', 'SPACE']
    lemmas = extract_lemmas(doc, exclude_pos=excluded_pos, filter_stops=True)
    return {'lemmas': lemmas}

# Run the extractor on an empty doc just to learn which columns it produces,
# then create those columns (filled with None) on the review frame.
empty_doc_features = extract_nlp(nlp.make_doc(''))
nlp_columns = list(empty_doc_features)

for col in nlp_columns:
    Top5BrandSample[col] = None

In [None]:
# Lemmatise the cleaned reviews in slices of `batch_size` rows and store the
# extracted features row by row.
batch_size = 50
review_texts = Top5BrandSample['cleanReview']
for start in range(0, len(Top5BrandSample), batch_size):
    docs = nlp.pipe(review_texts.iloc[start:start + batch_size])
    for offset, doc in enumerate(docs):
        # translate the position into the row's index label for .at
        row_label = Top5BrandSample.index[start + offset]
        for col, values in extract_nlp(doc).items():
            # single-step .at write instead of the chained
            # `frame[col].iloc[k] = ...`, which writes through an intermediate
            # Series (SettingWithCopyWarning) and is not guaranteed to reach
            # the frame
            Top5BrandSample.at[row_label, col] = values

In [16]:
# Writing intermediary file output
# Checkpoint of the frame after cleaning and lemmatisation; the n-gram section
# reloads this file from the same folder.
Top5BrandSample.to_pickle(os.path.join(path,'CleanText.pkl'))

**Creating N-grams**

---



In [17]:
# Reload the lemmatised checkpoint so this section can start without
# re-running the spaCy pipeline.
Top5BrandSample = pd.read_pickle(os.path.join(path,'CleanText.pkl'))

In [18]:
# One entry per review: the 'lemmas' column as an array, in row order.
sentences = Top5BrandSample.lemmas.values

In [19]:
from gensim.models import Phrases
from gensim.models.phrases import Phraser
# Learn bigram collocations over the lemma lists; detected pairs are joined
# with '_' (e.g. the seed terms 'customer_service' and 'return_policy' used in
# the topic section).
# NOTE(review): the bytes delimiter (b'_') looks like the pre-4.0 gensim API -
# confirm the installed gensim version; it is not pinned in this notebook.
bigram = Phrases(sentences, min_count=1, threshold=3, delimiter=b'_')  
bigram_phraser = Phraser(bigram)

In [20]:
# Apply the phraser to every review's lemma list. `sentences` is in row order
# and rows are addressed by their 0-based integer label.
Top5BrandSample['lemmasNgrams'] = None
for row_label, sent in enumerate(sentences):
    Top5BrandSample.at[row_label, 'lemmasNgrams'] = bigram_phraser[sent]

In [None]:
Top5BrandSample[['lemmas','lemmasNgrams']].head(5)

**Topic Model**


---



In [22]:
from gensim.models import word2vec

In [23]:
# Training corpus for the embedding model: one n-gram token list per review.
lemmaSenetence = Top5BrandSample.lemmasNgrams.values

In [24]:
# Train word embeddings on the n-gram token lists; tokens seen fewer than 3
# times are dropped from the vocabulary.
# NOTE(review): `size` / `iter` look like the pre-4.0 gensim keyword names -
# confirm the installed version. No seed is passed, so the vectors (and the
# topic word lists derived from them) can differ between runs.
model = word2vec.Word2Vec(lemmaSenetence, size=50, min_count=3, iter=20)

*Assigning Topics*


In [25]:
# Seed term for each topic, in the same order as the topic column names
# defined in the next cell.
topicSubject =['battery','screen','volume','memory','return_policy','customer_service','value','durable','network','freeze','camera','design','user_interface','security']
# For every seed term, fetch up to 2000 nearest neighbours as (word, score)
# pairs. The variable names below are looked up by name (via eval) in later
# cells, so they must match the topic column names exactly.
# NOTE(review): most_similar presumably raises KeyError when a seed term did
# not make it into the model vocabulary (min_count=3) - confirm.
topicBattery = model.wv.most_similar('battery', topn=2000)
topicScreen = model.wv.most_similar('screen', topn=2000)
topicVolume = model.wv.most_similar('volume', topn=2000)
topicMemory = model.wv.most_similar('memory', topn=2000)
topicReturnPolicy = model.wv.most_similar('return_policy', topn=2000)
topicCustomerService = model.wv.most_similar('customer_service', topn=2000)
topicValue = model.wv.most_similar('value', topn=2000) 
topicDurable = model.wv.most_similar('durable', topn=2000)
topicNetwork = model.wv.most_similar('network', topn=2000)
topicPhoneFreeze = model.wv.most_similar('freeze', topn=2000)
topicPhoneCamera = model.wv.most_similar('camera', topn=2000)
topicDesign = model.wv.most_similar('design', topn=2000)
topicInterface = model.wv.most_similar('user_interface', topn=2000)
topicSecurity = model.wv.most_similar('security', topn=2000)

In [26]:
# One column per topic (holds the matched words) ...
topicCols = [
    'topicBattery', 'topicScreen', 'topicVolume', 'topicMemory',
    'topicReturnPolicy', 'topicCustomerService', 'topicValue',
    'topicDurable', 'topicNetwork', 'topicPhoneFreeze', 'topicPhoneCamera',
    'topicDesign', 'topicInterface', 'topicSecurity',
]
# ... and one score column per topic, named like the topic column plus 'S'.
topicColsScore = [name + 'S' for name in topicCols]

# Create all topic columns first, then all score columns, filled with None.
for col in topicCols + topicColsScore:
    Top5BrandSample[col] = None

In [27]:
'Exporting top 10 words in each topic to a csv'
# Collect one record per (topic, neighbour) pair and build the frame once.
top_word_rows = []
for col in topicCols:
    # eval(col) resolves the module-level list that carries the topic's name;
    # its first ten entries are that topic's top (word, score) pairs
    for items in eval(col)[:10]:
        top_word_rows.append({'topic': col, 'words': items})
df = pd.DataFrame(top_word_rows, columns=['topic', 'words'])
df.to_csv(os.path.join(path,'topic_list.csv'))

In [28]:
def sim_word(topicName, min_score=.5):
    """Return the set of words whose similarity score exceeds ``min_score``.

    Args:
        topicName: Iterable of ``(word, score)`` pairs, e.g. the output of a
            word2vec ``most_similar`` call.
        min_score: Exclusive lower bound on the score; defaults to 0.5.

    Returns:
        A ``set`` of the words (first element of each pair) that pass the
        threshold.
    """
    return {pair[0] for pair in topicName if pair[1] > min_score}

def sim_score(topicName, min_score=.5):
    """Return the ``(word, score)`` pairs whose score exceeds ``min_score``.

    Args:
        topicName: Iterable of hashable ``(word, score)`` pairs, e.g. the
            output of a word2vec ``most_similar`` call.
        min_score: Exclusive lower bound on the score; defaults to 0.5.

    Returns:
        A ``set`` of the pairs that pass the threshold.
    """
    return {pair for pair in topicName if pair[1] > min_score}

In [29]:
"Putting all the sim words in one dictionary by topic"
# topicDict: topic column -> set of words belonging to the topic
# scoreDict: topic column -> set of (word, similarity score) pairs
topicDict = {}
scoreDict = {}
for col, subject in zip(topicCols, topicSubject):
    topicDict[col] = sim_word(eval(col))
    # the seed term itself always belongs to its topic ...
    topicDict[col].add(subject)
    scoreDict[col] = sim_score(eval(col))
    # ... and counts with a similarity of 1
    scoreDict[col].add((subject, 1))

In [None]:
'Sample checks'
# Spot-check the topic matching on a single review (row label n): print every
# topic that shares words with the review, the individual similarity scores
# of those words, and their average.
n = 2
v = Top5BrandSample['lemmasNgrams'][n]
#v= ['work', 'screen', 'dirty', 'look_like', 'phone', 'drop', 'coffee', 'stain', 'spot']
review_tokens = set(v)
for topic_, sim_ in topicDict.items():
    common = set(sim_).intersection(review_tokens)
    if not common:
        # No overlap with this topic. Skipping here keeps the average below
        # from being computed from a previous topic's `common` / `cosScore`
        # (or from failing with NameError when the first topic has no match).
        continue
    print(topic_, common)
    cosScore = 0
    for items in scoreDict[topic_]:
        # Avg. Similarity score
        if items[0] in common:
            print(items[1])
            cosScore = items[1] + cosScore
    cosScoreAvg = cosScore / len(common)
    assert cosScoreAvg <= 1
    print(cosScoreAvg)

print(Top5BrandSample['reviewText'][n])

In [31]:
'Finding common words between lemmas of review text and most similar words by each topic'
# For every review and every topic: store the words the review shares with the
# topic, prepend the topic name to the review's 'allTopics' string, and store
# the average similarity score of the shared words in the topic's 'S' column.
Top5BrandSample['allTopics']=''
for index, row in Top5BrandSample.iterrows():
  lemmaCheck = row['lemmasNgrams']
  for topic_, sim_ in topicDict.items(): 
    common = set(sim_).intersection(set(lemmaCheck))
    if len(common)>0:
      # the matched words are stored as a set in the topic's column
      Top5BrandSample.at[index, topic_] = common
      # each new topic is prepended, so 'allTopics' lists the matched topics in
      # reverse iteration order and always ends with ', '
      Top5BrandSample.loc[index, 'allTopics'] = topic_ + ', ' + Top5BrandSample.loc[index, 'allTopics']
      cosScore=0  # Cosine similarity score
      for items in scoreDict[topic_]:
        # Avg. Similarity score
        if items[0] in common:
          # print(items[1])
          cosScore = items[1] + cosScore
      cosScoreAvg =  cosScore/len(common) # Avg. cosine similarity score if there are multiple common words 
      # sanity check: scoreDict gives the seed term a score of 1, so the
      # average is expected to stay at or below 1
      assert cosScoreAvg <= 1
      Top5BrandSample.loc[index, (topic_+'S')] = cosScoreAvg
      

In [None]:
Top5BrandSample.groupby('allTopics')['asin'].count().sort_values(ascending=False).head(12)

**Sentiment Analysis**

In [33]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [34]:
# Score every cleaned review with VADER; 'sentiment' keeps the full score dict
# and the four columns after it unpack its individual entries.
analyzer = SentimentIntensityAnalyzer()
Top5BrandSample['sentiment'] = Top5BrandSample['cleanReview'].apply(lambda review: analyzer.polarity_scores(review))

# (output column, key in the VADER score dict)
score_columns = [
    ('compound', 'compound'),
    ('positive', 'pos'),
    ('negative', 'neg'),
    ('neutral', 'neu'),
]
for column, score_key in score_columns:
    # bind score_key as a default so each lambda reads its own key
    Top5BrandSample[column] = Top5BrandSample['sentiment'].apply(
        lambda scores, score_key=score_key: scores[score_key])

In [35]:
# Assigning labels (positive, neutral and negative sentiments)
# compound >  .5        -> POSITIVE
# -.5 <= compound <= .5 -> NEUTRAL
# compound < -.5        -> NEGATIVE
compound_score = Top5BrandSample['compound']
label_masks = {
    'POSITIVE': compound_score > .5,
    'NEUTRAL': (compound_score >= -.5) & (compound_score <= .5),
    'NEGATIVE': compound_score < -.5,
}
for label, mask in label_masks.items():
    Top5BrandSample.loc[mask, 'sentiment_type'] = label

In [None]:
Top5BrandSample.columns

In [None]:
Top5BrandSample.groupby(['allTopics','sentiment_type'])['asin'].count().sort_values(ascending=False).head(10)

In [38]:
# Columns for the final export, in output order: review metadata and text,
# matched words per topic, average score per topic, then sentiment.
review_cols = ['allTopics', 'overall', 'reviewText', 'reviewTime', 'brand',
               'category', 'date', 'lenReview', 'cleanReview',
               'lemmas', 'lemmasNgrams']
sentiment_cols = ['sentiment', 'compound', 'positive', 'negative', 'neutral',
                  'sentiment_type']
Subset = Top5BrandSample[review_cols + topicCols + topicColsScore + sentiment_cols]

In [39]:
# Change path to results folder in zip provided
# NOTE(review): this rebinds `path`, which the earlier cells use for the data
# folder; re-running those cells after this one would read from / write to the
# results folder.
path = r'/content/gdrive/MyDrive/3. IU Courses/Courses/2. Social Data Mining/Social Media Project/ILS-Z639 Final Project Deliverable - Abhinav Bajpai/results' 
# Write the selected columns to an Excel workbook (no index column) using the
# xlsxwriter engine.
Subset.to_excel(os.path.join(path,'final_output.xlsx'), index=False, engine= 'xlsxwriter')

**Reference:**
*Blueprints for Text Analytics Using Python* by Jens Albrecht, Sidharth Ramachandran, and Christian Winkler.
https://github.com/blueprints-for-text-analytics-python/blueprints-text#readme 