# Rule based and lexicon based methods

## Using VADER (Valence Aware Dictionary and sentiment Reasoner)

In [1]:
!pip install vaderSentiment

Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
Installing collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.2


***Make sure you are set to handle UTF-8 encoding in your terminal or IDE***

In [2]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [3]:
vader_obj = SentimentIntensityAnalyzer()

In [4]:
vader_obj.polarity_scores('I do not like Amazon')

{'neg': 0.547, 'neu': 0.453, 'pos': 0.0, 'compound': -0.3875}

[Read about scoring](https://github.com/cjhutto/vaderSentiment#python-demo-and-code-examples)


[How compound score is calculated](https://stackoverflow.com/questions/40325980/how-is-the-vader-compound-polarity-score-calculated-in-python-nltk)

In [5]:
vader_obj.polarity_scores('I am :)')

{'neg': 0.0, 'neu': 0.4, 'pos': 0.6, 'compound': 0.4588}

In [6]:
vader_obj.polarity_scores('Your service has never been good')

{'neg': 0.325, 'neu': 0.675, 'pos': 0.0, 'compound': -0.3412}

In [7]:
vader_obj.polarity_scores('I hate flowers')

{'neg': 0.649, 'neu': 0.351, 'pos': 0.0, 'compound': -0.5719}

In [8]:
vader_obj.polarity_scores('The product was not bad')

{'neg': 0.0, 'neu': 0.584, 'pos': 0.416, 'compound': 0.431}

In [9]:
sentences = ["VADER is smart, handsome, and funny.",  # positive sentence example
             "VADER is smart, handsome, and funny!",  # punctuation emphasis handled correctly (sentiment intensity adjusted)
             "VADER is very smart, handsome, and funny.", # booster words handled correctly (sentiment intensity adjusted)
             "VADER is VERY SMART, handsome, and FUNNY.",  # emphasis for ALLCAPS handled
             "VADER is VERY SMART, handsome, and FUNNY!!!", # combination of signals - VADER appropriately adjusts intensity
             "VADER is VERY SMART, uber handsome, and FRIGGIN FUNNY!!!", # booster words & punctuation make this close to ceiling for score
             "VADER is not smart, handsome, nor funny.",  # negation sentence example
             "The book was good.",  # positive sentence
             "At least it isn't a horrible book.",  # negated negative sentence with contraction
             "The book was only kind of good.", # qualified positive sentence is handled correctly (intensity adjusted)
             "The plot was good, but the characters are uncompelling and the dialog is not great.", # mixed negation sentence
             "Today SUX!",  # negative slang with capitalization emphasis
             "Today only kinda sux! But I'll get by, lol", # mixed sentiment example with slang and constrastive conjunction "but"
             "Make sure you :) or :D today!",  # emoticons handled
             "Catch utf-8 emoji such as such as 💘 and 💋 and 😁",  # emojis handled
             "Not bad at all"  # Capitalized negation
             ]

In [10]:
for sentence in sentences:
  vader_score = vader_obj.polarity_scores(sentence)
  print(f'{sentence} {"-"*20} {vader_score}')

VADER is smart, handsome, and funny. -------------------- {'neg': 0.0, 'neu': 0.254, 'pos': 0.746, 'compound': 0.8316}
VADER is smart, handsome, and funny! -------------------- {'neg': 0.0, 'neu': 0.248, 'pos': 0.752, 'compound': 0.8439}
VADER is very smart, handsome, and funny. -------------------- {'neg': 0.0, 'neu': 0.299, 'pos': 0.701, 'compound': 0.8545}
VADER is VERY SMART, handsome, and FUNNY. -------------------- {'neg': 0.0, 'neu': 0.246, 'pos': 0.754, 'compound': 0.9227}
VADER is VERY SMART, handsome, and FUNNY!!! -------------------- {'neg': 0.0, 'neu': 0.233, 'pos': 0.767, 'compound': 0.9342}
VADER is VERY SMART, uber handsome, and FRIGGIN FUNNY!!! -------------------- {'neg': 0.0, 'neu': 0.294, 'pos': 0.706, 'compound': 0.9469}
VADER is not smart, handsome, nor funny. -------------------- {'neg': 0.646, 'neu': 0.354, 'pos': 0.0, 'compound': -0.7424}
The book was good. -------------------- {'neg': 0.0, 'neu': 0.508, 'pos': 0.492, 'compound': 0.4404}
At least it isn't a horr

## Using Afinn

In [11]:
!pip install afinn

Collecting afinn
  Downloading afinn-0.1.tar.gz (52 kB)
Building wheels for collected packages: afinn
  Building wheel for afinn (setup.py): started
  Building wheel for afinn (setup.py): finished with status 'done'
  Created wheel for afinn: filename=afinn-0.1-py3-none-any.whl size=53455 sha256=cbf5ccce98f1640a7e4c1669754af572d2b1e14d9306dc556cad132659ce54c7
  Stored in directory: c:\users\anirban\appdata\local\pip\cache\wheels\9d\16\3a\9f0953027434eab5dadf3f33ab3298fa95afa8292fcf7aba75
Successfully built afinn
Installing collected packages: afinn
Successfully installed afinn-0.1


In [12]:
from afinn import Afinn
afn = Afinn()

In [13]:
afn_score = afn.score('I do not like Amazon')
afn_score

2.0

In [14]:
def afinn_sentiment_score(text_str):
  """
  Desc: Sentiment score based on afinn lib

  Arg:
    text_str (String) - single string value

  Return:
    (Sting) - "Positive" if score > 0,
              "Negative" if score < 0, 
              else "Neutral"
  """
  afn_score = afn.score(text_str)
  if (afn_score > 0):
    return 'Positive'
  elif (afn_score < 0):
    return 'Negative'
  else:
    return 'Neutral'

In [15]:
afinn_sentiment_score('I am :)')

'Neutral'

In [16]:
afinn_sentiment_score('Your service has never been good')

'Positive'

In [17]:
afinn_sentiment_score('Your service has never been better')

'Positive'

In [18]:
afinn_sentiment_score('I HATE FLOWERS')

'Negative'

In [19]:
afinn_sentiment_score('The product was not bad')

'Negative'

## Using text blob for sentiment analysis

In [20]:
!pip install -U textblob
!python -m textblob.download_corpora

Requirement already up-to-date: textblob in c:\anaconda\envs\pyakc\lib\site-packages (0.15.3)
Finished.


[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\anirban\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\brown.zip.
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\anirban\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\anirban\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\anirban\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package conll2000 to
[nltk_data]     C:\Users\anirban\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\conll2000.zip.
[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\anirban\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\movie_reviews.zip.


In [21]:
from textblob import TextBlob

To do any kind of text processing using TextBlob, we need to follow two steps listed below:
*   Convert any string to TextBlob object.  [TextBlobs Are Like Python Strings!](https://textblob.readthedocs.io/en/dev/quickstart.html#textblobs-are-like-python-strings)
*   Call functions of TextBlob to do a specific task

In [22]:
blob_obj = TextBlob('I do not like Amazon')
blob_score = blob_obj.sentiment
blob_score

Sentiment(polarity=0.0, subjectivity=0.0)

In [23]:
blob_obj = TextBlob('I am :)')
blob_score = blob_obj.sentiment
blob_score

Sentiment(polarity=0.5, subjectivity=1.0)

In [24]:
blob_obj = TextBlob('The product was not bad')
blob_score = blob_obj.sentiment
blob_score

Sentiment(polarity=0.3499999999999999, subjectivity=0.6666666666666666)

In [25]:
blob_obj = TextBlob('I hate flowers')
blob_score = blob_obj.sentiment
blob_score

Sentiment(polarity=-0.8, subjectivity=0.9)

In [26]:
blob_obj = TextBlob('Your service has never been good')
blob_score = blob_obj.sentiment
blob_score

Sentiment(polarity=0.7, subjectivity=0.6000000000000001)

In [27]:
# Only polarity score
polarity_score = TextBlob('Your service has never been good').polarity
polarity_score

0.7

In [28]:
# Only subjectivity score
subjectivity_score = TextBlob('Your service has never been good').subjectivity
subjectivity_score

0.6000000000000001

## NLTK Method (WordNet & SentiWordNet) based

In [29]:
!pip install -U nltk
import nltk
nltk.download('all') # Use this for everything. Then next 2 lines are not required
nltk.download('sentiwordnet')
nltk.download('wordnet')
from nltk.corpus import sentiwordnet as swn

Collecting nltk
  Downloading nltk-3.6.5-py3-none-any.whl (1.5 MB)
Installing collected packages: nltk
  Attempting uninstall: nltk
    Found existing installation: nltk 3.6.2
    Uninstalling nltk-3.6.2:
      Successfully uninstalled nltk-3.6.2
Successfully installed nltk-3.6.5


[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     C:\Users\anirban\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping corpora\abc.zip.
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     C:\Users\anirban\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping corpora\alpino.zip.
[nltk_data]    | Downloading package biocreative_ppi to
[nltk_data]    |     C:\Users\anirban\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping corpora\biocreative_ppi.zip.
[nltk_data]    | Downloading package brown to
[nltk_data]    |     C:\Users\anirban\AppData\Roaming\nltk_data...
[nltk_data]    |   Package brown is already up-to-date!
[nltk_data]    | Downloading package brown_tei to
[nltk_data]    |     C:\Users\anirban\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping corpora\brown_tei.zip.
[nltk_data]    | Downloading package cess_cat to
[nltk_data]    |     C:\Users\anirban\AppData\Roaming

[nltk_data]    |   Unzipping corpora\product_reviews_2.zip.
[nltk_data]    | Downloading package pros_cons to
[nltk_data]    |     C:\Users\anirban\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping corpora\pros_cons.zip.
[nltk_data]    | Downloading package qc to
[nltk_data]    |     C:\Users\anirban\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping corpora\qc.zip.
[nltk_data]    | Downloading package reuters to
[nltk_data]    |     C:\Users\anirban\AppData\Roaming\nltk_data...
[nltk_data]    |   Package reuters is already up-to-date!
[nltk_data]    | Downloading package rte to
[nltk_data]    |     C:\Users\anirban\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping corpora\rte.zip.
[nltk_data]    | Downloading package semcor to
[nltk_data]    |     C:\Users\anirban\AppData\Roaming\nltk_data...
[nltk_data]    | Downloading package senseval to
[nltk_data]    |     C:\Users\anirban\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping corpora\senseval.zip.
[nltk

[nltk_data]    |   Unzipping
[nltk_data]    |       taggers\averaged_perceptron_tagger_ru.zip.
[nltk_data]    | Downloading package perluniprops to
[nltk_data]    |     C:\Users\anirban\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping misc\perluniprops.zip.
[nltk_data]    | Downloading package nonbreaking_prefixes to
[nltk_data]    |     C:\Users\anirban\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping corpora\nonbreaking_prefixes.zip.
[nltk_data]    | Downloading package vader_lexicon to
[nltk_data]    |     C:\Users\anirban\AppData\Roaming\nltk_data...
[nltk_data]    | Downloading package porter_test to
[nltk_data]    |     C:\Users\anirban\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping stemmers\porter_test.zip.
[nltk_data]    | Downloading package wmt15_eval to
[nltk_data]    |     C:\Users\anirban\AppData\Roaming\nltk_data...
[nltk_data]    |   Unzipping models\wmt15_eval.zip.
[nltk_data]    | Downloading package mwa_ppdb to
[nltk_data]    |     C:\User

In [30]:
# Check words similar meaning words and different usage as per language 
list(swn.senti_synsets('hate'))

[SentiSynset('hate.n.01'), SentiSynset('hate.v.01')]

Here `hate` can be `Noun` -> **She looked at him with eyes full of hate** \
or `hate` cab be a `Verb` -> **I hate the smell of cigarettes**

In [31]:
from nltk.tag import pos_tag
token = nltk.word_tokenize('She looked at him with eyes full of hate')
after_tagging = nltk.pos_tag(token)
after_tagging

[('She', 'PRP'),
 ('looked', 'VBD'),
 ('at', 'IN'),
 ('him', 'PRP'),
 ('with', 'IN'),
 ('eyes', 'NNS'),
 ('full', 'JJ'),
 ('of', 'IN'),
 ('hate', 'NN')]

**She said its not good to `hate` some one** - This can be classified as 'neg'

In [32]:
token = nltk.word_tokenize('I hate the smell of cigarettes')
after_tagging = nltk.pos_tag(token)
after_tagging

[('I', 'PRP'),
 ('hate', 'VBP'),
 ('the', 'DT'),
 ('smell', 'NN'),
 ('of', 'IN'),
 ('cigarettes', 'NNS')]

In [33]:
polarity_words= swn.senti_synsets('hate')
words = list(polarity_words)

for word in words:
  print(f'Type: {word},\n\
  Postive score: {word.pos_score()},\n\
  Negative score: {word.neg_score()},\n\
  Objectivity score: {word.obj_score()}', '\n')

Type: <hate.n.01: PosScore=0.125 NegScore=0.375>,
  Postive score: 0.125,
  Negative score: 0.375,
  Objectivity score: 0.5 

Type: <hate.v.01: PosScore=0.0 NegScore=0.75>,
  Postive score: 0.0,
  Negative score: 0.75,
  Objectivity score: 0.25 



The Sentiment of the sentence as a whole is determined by obtaining the difference between the postive and negative scores of the contextual tokens/words that use SentiWordNet to calculate the polarity.

https://nlpforhackers.io/sentiment-analysis-intro/

# Machine learning based methods (Simple ML or DL models or pre-trained deep NN language models)

## Working with NLTK and Scikit learn

In [34]:
import pandas as pd
import os

In [35]:
os.chdir('/content/drive/MyDrive/Colab Notebooks/NUS Work/Sentiment Mining/Data Sets')

FileNotFoundError: [WinError 3] The system cannot find the path specified: '/content/drive/MyDrive/Colab Notebooks/NUS Work/Sentiment Mining/Data Sets'

In [None]:
tweets_raw = pd.read_csv('Tweets.csv.zip')
tweets_raw.head(2)

In [None]:
tweets_raw.info()

In [None]:
tweets_data = tweets_raw[['text', 'airline_sentiment']]
tweets_data.head(2)

In [None]:
tweets_raw['text'][100]

In [None]:
tweets_data.info()

In [None]:
tweets_data['airline_sentiment'].value_counts()

In [None]:
tweets_data = tweets_data[tweets_data['airline_sentiment'] != 'neutral']
tweets_data['airline_sentiment'].value_counts()

#### Before the Analysis data visualization

In [None]:
# Cufflinks is a very useful add-on to plot figures directly from the dataframe series object
!pip install cufflinks

import cufflinks as cf

cf.go_offline()

def configure_plotly_browser_state():
  import IPython
  display(IPython.core.display.HTML('''
        <script src="/static/components/requirejs/require.js"></script>
        <script>
          requirejs.config({
            paths: {
              base: '/static/base',
              plotly: 'https://cdn.plot.ly/plotly-1.5.1.min.js?noext',
            },
          });
        </script>
        '''))

In [None]:
configure_plotly_browser_state()
tweets_raw['airline_sentiment'].iplot(kind='hist', bins=20, xTitle='polarity', linecolor='black', yTitle='count', title='Sentiment Polarity Distribution')

In [None]:
# Nltk word frequency
all_words=[]        
for i in range(len(tweets_raw['text'])):
    all_words += tweets_raw['text'][i].split()

#Get word frequency        
nlp_words = nltk.FreqDist(all_words)
plot1 = nlp_words.plot(20, color='salmon', title='Word Frequency')

In [None]:
#Bigrams
bigrm = list(nltk.bigrams(all_words))
words_2 = nltk.FreqDist(bigrm)
words_2.plot(20, color='salmon', title='Bigram Frequency')

In [None]:
import nltk
from nltk.collocations import *
bigram= nltk.collocations.BigramAssocMeasures()
Collocation = BigramCollocationFinder.from_words(all_words)
Collocation.nbest(bigram.pmi, 10)

In [None]:
#Trigrams
trigrm = list(nltk.trigrams(all_words))
words_2 = nltk.FreqDist(trigrm)
words_2.plot(20, color='salmon', title='Trigram Frequency')

In [None]:
import nltk
from nltk.collocations import *
trigram= nltk.collocations.TrigramAssocMeasures()
finder = TrigramCollocationFinder.from_words(all_words)
finder.nbest(trigram.pmi, 10)

In [None]:
# Word cloud
!pip install wordcloud

from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt

In [None]:
wordcloud_before = WordCloud().generate(' '.join(tweets_raw['text']))

**Try passing stopwords ->** `WordCloud(stopwords=STOPWORDS)`

In [None]:
plt.figure(figsize=(15,15))
plt.imshow(wordcloud_before, interpolation='bilinear')

#### Data prep

In [None]:
################ Issues for students to solve #######################
tweets_data.head()

In [None]:
tweets_data.reset_index(inplace=True, drop=True)
tweets_data.head()

In [None]:
## Data cleaning starts here

# Lower case all
tweets_data['text_lower'] = tweets_data['text'].str.lower()
print(f"Before ---- {tweets_data['text'].iloc[0]}\nAfter ---- {tweets_data['text_lower'].iloc[0]}")

In [None]:
## Install emot
!pip install emot
import emot
from emot.emo_unicode import UNICODE_EMO, EMOTICONS

In [None]:
### Detect emoji ###
def detect_emoji(text):
  if emot.emoji(text)['flag']:
    return 1
  else:
    return 0

tweets_data["text_emoji_Y/N_before"] = tweets_data["text_lower"].apply(lambda text: detect_emoji(text))
print(f"No of tweets with emoji: {tweets_data['text_emoji_Y/N_before'].sum()}")

In [None]:
## Show top 10 text with emoji's
list(tweets_data[tweets_data["text_emoji_Y/N_before"] == 1]["text"])[:10]

In [None]:
UNICODE_EMO.items()

In [None]:
def convert_emojis(text):
    for emot in UNICODE_EMO:
        text = text.replace(emot, " "+UNICODE_EMO[emot]+" ")
    return text

tweets_data["text_NoEmoji"] = tweets_data["text_lower"].apply(lambda text: convert_emojis(text))

In [None]:
### Detect emoji ### Recheck 
def detect_emoji(text):
  if emot.emoji(text)['flag']:
    return 1
  else:
    return 0

tweets_data["text_emoji_Y/N_after"] = tweets_data["text_NoEmoji"].apply(lambda text: detect_emoji(text))
print(f"No of tweets with emoji: {tweets_data['text_emoji_Y/N_after'].sum()}")

In [None]:
## A quick check to text_NoEmoji where text_emoji_Y/N_before was TRUE
list(tweets_data[tweets_data["text_emoji_Y/N_before"] == 1]["text_NoEmoji"])[:]

In [None]:
## Just wanted to check one more lib to see if i get the same 364 tweets with the emoji's
!pip install emoji
import emoji

In [None]:
emoji.demojize("💕💕 adfksnfsf 🌞 hey oyou this's is 😎😎😁 :)) ")

In [None]:
emoji.demojize("UIss s ss ")

In [None]:
def emoji_lib_check(orignal_text):
  new_text = emoji.demojize(orignal_text)
  if new_text == orignal_text:
    return "No changes made"
  else:
    return "Found emoji"

tweets_data["emoji_lib_check"] = tweets_data["text"].apply(lambda text: emoji_lib_check(text))
tweets_data["emoji_lib_check"].value_counts()

In [None]:
## So our lib is ok.

In [None]:
### Detect emoticons ###
def detect_emoticons(text):
  try:
    if emot.emoticons(text)['flag']:
      return 1
    if not emot.emoticons(text)['flag']:
      return 0
  except:
    return -1

tweets_data["text_emoticons_Y/N_before"] = tweets_data["text_NoEmoji"].apply(lambda text: detect_emoticons(text))
tweets_data["text_emoticons_Y/N_before"].value_counts()

In [None]:
## 0 - Text without emoticons (7962)
## -1 - Text which is giving me error (2778)
## 1 - Valid text with emoticons (801)

## Need to check all of them.

In [None]:
# Check results where emot thinks emoticons are not present (Total 7962)
list(tweets_data[tweets_data["text_emoticons_Y/N_before"] == 0]['text_NoEmoji'])[:10]

In [None]:
## Check results where emot thinks emoticons are present (Total 801)
list(tweets_data[tweets_data["text_emoticons_Y/N_before"] == 1]['text_NoEmoji'])

In [None]:
## Everythinh looks good but
## emot is detecting ":/" in http:/ as an emoticon
emot.emoticons("@VirginAmerica come back to #PHL already. We need you to take us out of this horrible cold. #pleasecomeback http://t.co/gLXFwP6nQH")

In [None]:
## Maybe i need to remove these http things first.

In [None]:
# Remove URLS
import re

def remove_urls(text):
  url_pattern = re.compile(r'http\S+', flags=re.M)
  return url_pattern.sub("", text)

tweets_data["text_without_URLS"] = tweets_data["text_NoEmoji"].apply(lambda text: remove_urls(text))

In [None]:
### Detect emoticons ### Recheck after removal of URLs
def detect_emoticons(text):
  try:
    if emot.emoticons(text)['flag']:
      return 1
    if not emot.emoticons(text)['flag']:
      return 0
  except:
    return -1

tweets_data["text_emoticons_Y/N_before"] = tweets_data["text_without_URLS"].apply(lambda text: detect_emoticons(text))
tweets_data["text_emoticons_Y/N_before"].value_counts()

In [None]:
## 0 - Text without emoticons increased from (7962) -> (8449)
## -1 - Text with error decreased from (2865) -> (2750)
## 1 - Valid text with emoticons decreased from from (801) -> (342)

## Need to check all of them (Again)

In [None]:
# Check results where emot thinks emoticons are not present (Total 8449)
list(tweets_data[tweets_data["text_emoticons_Y/N_before"] == 0]['text_without_URLS'])[:10]

In [None]:
## Check results where emot thinks emoticons are present (Total 342)
list(tweets_data[tweets_data["text_emoticons_Y/N_before"] == 1]['text_without_URLS'])

In [None]:
## Check results where emot is giving error's (Total 2750)
list(tweets_data[tweets_data["text_emoticons_Y/N_before"] == -1]['text_without_URLS'])[:]

In [None]:
## After taking a closer look at text with error i cannot see anything specific i need to take care of
## so i assign it " 0 " in detect_emoticons function and do the same for convert_emoticons function
## So now i have to check only 316 count

In [None]:
def convert_emoticons(text):
    for emot in EMOTICONS:
        text = re.sub(u'('+emot+')', " "+("_".join(EMOTICONS[emot].replace(",","").split()))+" ", text)
    return text

tweets_data["text_NoEmoticons"] = tweets_data["text_without_URLS"].apply(lambda text: convert_emoticons(text))

In [None]:
### Detect emoticons ### Recheck after removal of URLs
def detect_emoticons(text):
  try:
    if emot.emoticons(text)['flag']:
      return 1
    if not emot.emoticons(text)['flag']:
      return 0
  except:
    return 0

tweets_data["text_emoticons_Y/N_after"] = tweets_data["text_NoEmoticons"].apply(lambda text: detect_emoticons(text))
tweets_data["text_emoticons_Y/N_after"].value_counts()

In [None]:
## Continue rest of the work

In [None]:
# Remove Punctuation
import string

PUNCT_TO_REMOVE = string.punctuation

def remove_punctuation(text):
    """custom function to remove the punctuation"""
    return text.translate(str.maketrans('', '', PUNCT_TO_REMOVE))

tweets_data["text_wo_punct"] = tweets_data["text_NoEmoticons"].apply(lambda text: remove_punctuation(text))

print(f"Before ---- {tweets_data['text_NoEmoticons'].iloc[0]}\nAfter ---- {tweets_data['text_wo_punct'].iloc[0]}")

In [None]:
PUNCT_TO_REMOVE

About [text.translate](https://www.programiz.com/python-programming/methods/string/translate) & [mkestrans](https://www.w3schools.com/python/ref_string_maketrans.asp)

In [None]:
# Remove stop words
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

STOPWORDS = set(stopwords.words('english'))

def remove_stopwords(text):
    """custom function to remove the stopwords"""
    return " ".join([word for word in str(text).split() if word not in STOPWORDS])

tweets_data["text_wo_stop"] = tweets_data["text_wo_punct"].apply(lambda text: remove_stopwords(text))

print(f"Before ---- {tweets_data['text_wo_punct'].iloc[0]}\nAfter ---- {tweets_data['text_wo_stop'].iloc[0]}")

In [None]:
## Remove numbers or any other no Alphabet
## my_string.isalpha(), my_string.isdigit(), my_string.isalnum()

def remove_digits(text):
    return " ".join([word for word in text.split() if not word.isdigit()])

tweets_data["text_wo_digits"] = tweets_data["text_wo_stop"].apply(lambda text: remove_digits(text))

print(f"Before ---- {tweets_data['text_wo_stop'].iloc[0]}\nAfter ---- {tweets_data['text_wo_digits'].iloc[0]}")

In [None]:
# Check most frequent words 
from collections import Counter

cnt = Counter()

for text in tweets_data["text_wo_digits"].values:
    for word in text.split():
        cnt[word] += 1
        
cnt.most_common(10)

In [None]:
# Remove most frequent words
FREQWORDS = set([w for (w, wc) in cnt.most_common(10)])

def remove_freqwords(text):
    """custom function to remove the frequent words"""
    return " ".join([word for word in str(text).split() if word not in FREQWORDS])

tweets_data["text_wo_stopfreq"] = tweets_data["text_wo_digits"].apply(lambda text: remove_freqwords(text))

print(f"Before ---- {tweets_data['text_wo_digits'].iloc[0]}\nAfter ---- {tweets_data['text_wo_stopfreq'].iloc[0]}")

In [None]:
# Check least frequent words 
from collections import Counter

cnt = Counter()

for text in tweets_data["text_wo_stopfreq"].values:
    for word in text.split():
        cnt[word] += 1
        
list(reversed(cnt.most_common(10)))

In [None]:
# Remove rare words
RAREWORDS = set([w for (w, wc) in reversed(cnt.most_common(10))])

def remove_rarewords(text):
    """custom function to remove the rare words"""
    return " ".join([word for word in str(text).split() if word not in RAREWORDS])

tweets_data["text_wo_stopfreqrare"] = tweets_data["text_wo_stopfreq"].apply(lambda text: remove_rarewords(text))

print(f"Before ---- {tweets_data['text_wo_stopfreq'].iloc[0]}\nAfter ---- {tweets_data['text_wo_stopfreqrare'].iloc[0]}")

In [None]:
# Stemming (Try using for on sentiment task else the work may not retain its base meaning)
from nltk.stem.porter import PorterStemmer
# from nltk.stem.snowball import SnowballStemmer

stemmer = PorterStemmer()

def stem_words(text):
    return " ".join([stemmer.stem(word) for word in text.split()])

tweets_data["text_stemmed"] = tweets_data["text_wo_stopfreqrare"].apply(lambda text: stem_words(text))

print(f"Before ---- {tweets_data['text_wo_stopfreqrare'].iloc[0]}\nAfter ---- {tweets_data['text_stemmed'].iloc[0]}")

In [None]:
# Lemmatizer
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

lemmatizer = WordNetLemmatizer()
wordnet_map = {"N":wordnet.NOUN, "V":wordnet.VERB, "J":wordnet.ADJ, "R":wordnet.ADV}

def lemmatize_words(text):
    pos_tagged_text = nltk.pos_tag(text.split())
    return " ".join([lemmatizer.lemmatize(word, wordnet_map.get(pos[0], wordnet.NOUN)) for word, pos in pos_tagged_text])

tweets_data["text_lemmatized"] = tweets_data["text_wo_stopfreqrare"].apply(lambda text: lemmatize_words(text))

print(f"Before ---- {tweets_data['text_wo_stopfreqrare'].iloc[0]}\nAfter ---- {tweets_data['text_lemmatized'].iloc[0]}")

In [None]:
tweets_cleaned = tweets_data[['text_lemmatized', 'airline_sentiment']]
tweets_cleaned.columns = ['text', 'sentiments']
tweets_cleaned.reset_index(inplace=True)

#### After the analysis data visualization

In [None]:
# Cufflinks is a very useful add-on to plot figures directly from the dataframe series object
# !pip install cufflinks
import cufflinks as cf

cf.go_offline()

def configure_plotly_browser_state():
  import IPython
  display(IPython.core.display.HTML('''
        <script src="/static/components/requirejs/require.js"></script>
        <script>
          requirejs.config({
            paths: {
              base: '/static/base',
              plotly: 'https://cdn.plot.ly/plotly-1.5.1.min.js?noext',
            },
          });
        </script>
        '''))

In [None]:
configure_plotly_browser_state()
tweets_cleaned['sentiments'].iplot(kind='hist', bins=20, xTitle='polarity', linecolor='black', yTitle='count', title='Sentiment Polarity Distribution')

In [None]:
# Nltk word frequency
all_words=[]        
for i in range(len(tweets_cleaned['text'])):
    all_words += tweets_cleaned['text'][i].split()

#Get word frequency        
nlp_words = nltk.FreqDist(all_words)
plot1 = nlp_words.plot(20, color='salmon', title='Word Frequency')

In [None]:
#Bigrams
bigrm = list(nltk.bigrams(all_words))
words_2 = nltk.FreqDist(bigrm)
words_2.plot(20, color='salmon', title='Bigram Frequency')

In [None]:
import nltk
from nltk.collocations import *
bigram= nltk.collocations.BigramAssocMeasures()
Collocation = BigramCollocationFinder.from_words(all_words)
Collocation.nbest(bigram.pmi, 10)

In [None]:
#Trigrams
trigrm = list(nltk.trigrams(all_words))
words_2 = nltk.FreqDist(trigrm)
words_2.plot(20, color='salmon', title='Trigram Frequency')

In [None]:
import nltk
from nltk.collocations import *
trigram= nltk.collocations.TrigramAssocMeasures()
finder = TrigramCollocationFinder.from_words(all_words)
finder.nbest(trigram.pmi, 10)

In [None]:
!pip install wordcloud

from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt

In [None]:
wordcloud_before = WordCloud().generate(' '.join(tweets_cleaned['text']))

In [None]:
plt.figure(figsize=(15,15))
plt.imshow(wordcloud_before, interpolation='bilinear')

- Try adding a extra step in the cleaning pipeline to get rid of unwanted numbers. Try uisng regex.
- Re-run the classifiers and see if there is any improvement in the results.

In [None]:
## Class work
## Detect Emoji's and emoticons and replace them with the words

## Remove numbers and unwanted no required chars.
## try using - my_string.isalpha(), my_string.isdigit(), my_string.isalnum()

In [None]:
tweets_data.head(2)

## Using TextBlob

The textblob.sentiments module contains two sentiment analysis implementations, `PatternAnalyzer` (based on the pattern library) and `NaiveBayesAnalyzer` (an NLTK classifier trained on a movie reviews corpus).

In [None]:
# Override default implementation by pattern lib
from textblob.sentiments import NaiveBayesAnalyzer

In [None]:
blob_obj = TextBlob('I do not like Amazon', analyzer=NaiveBayesAnalyzer())
blob_score = blob_obj.sentiment
blob_score

**Check the parameters you can pass to TextBlob** `(classifier)`

In [None]:
blob_obj = TextBlob('I am :)', analyzer=NaiveBayesAnalyzer())
blob_score = blob_obj.sentiment
blob_score

In [None]:
blob_obj = TextBlob('Your service has never been good', analyzer=NaiveBayesAnalyzer())
blob_score = blob_obj.sentiment
blob_score

In [None]:
blob_obj = TextBlob('I hate flowers', analyzer=NaiveBayesAnalyzer())
blob_score = blob_obj.sentiment
blob_score

In [None]:
blob_obj = TextBlob('The product was not bad', analyzer=NaiveBayesAnalyzer())
blob_score = blob_obj.sentiment
blob_score

## Introduction to Spacy

In [None]:
## Function to convert data into Spacy format
## Using the old tweets_cleaned if required to test this data set else use the IMDB once

data_spacy = []
for row in range(tweets_cleaned.shape[0]):
  cats = {'pos' : False, 'neg': False}
  if tweets_cleaned['sentiments'].loc[row] == 'positive':
    cats['pos'] = True
  else:
    cats['neg'] = True
  temp = {"cats" : cats}
  data_spacy.append((tweets_cleaned['text'].loc[row], temp))

data_spacy[:3]

In [None]:
!pip install spacy
!spacy download en_core_web_sm
!spacy download en_core_web_md
# !spacy download en_core_web_lg

In [None]:
import spacy
nlp = spacy.load("en_core_web_sm")
# Can directly load 'lg' & 'md' model like 'sm' models but some times it gives errors. Try this method
import en_core_web_md
nlp_md = en_core_web_md.load()
# import en_core_web_lg 
# nlp_lg = en_core_web_lg.load()

## Can also use
# from spacy.lang.en import English
# nlp = English()

In [None]:
# In Spacy world a doc object is the top most class object
# Under it lies the Span object - A part of doc ex: doc[1:5]
# Token is the word or punct etc..
# Lexicals are the smallest units which have a meaning in that particular lang

# How easy it is to tokenize text in Spacy
text = """
Dave watched as the forest burned up on the hill,
only a few miles from his house. The car had
been hastily packed and Marta was inside trying to round
up the last of the pets. "Where could she be?" he wondered
as he continued to wait for Marta to appear with the pets.
"""
doc = nlp(text)
token_list = [token for token in doc]
print(token_list)

In [None]:
# Second way
second_way = nlp('Hi how are you John ? @ www.abc.com mnk@abc.com')
for token in second_way:
  print(token.text)

In [None]:
# The key difference between token and token.text is that 
# token is still in Spacy world like a spacy object 
# but toeken.text is a pure string type which is ready to be used outside the Spacy world

In [None]:
# A part of the doc is called as span
span = second_way[:2]
print(f"Hi, I'm still too Spacy ---- {span}\nMy type ---- {type(span)}")

In [None]:
second_way[:4]

In [None]:
# A part of the doc is called as span
span = second_way[:2]
print(f"Hi, I'm not that atas ---- {span.text}\nMy type ---- {type(span.text)}")

In [None]:
# Check more about atas spacy

# Check my index:
print(*[f' Index: {token.i}' for token in second_way], sep=',')

# Check my text
print(*[f' Text: {token.text}' for token in second_way], sep=',')

# Check if i am alphabet or not
print(*[f' is_alpha: {token.is_alpha}' for token in second_way], sep=',')

# Check if i am a punctuation mark or not 
print(*[f' is_punct: {token.is_punct}' for token in second_way], sep=',')

# Am i a number ?
print(*[f' like_num: {token.like_num}' for token in second_way], sep=',')

# Do i have a address ?
print(*[f' like_url: {token.like_url}' for token in second_way], sep=',')

# Send me an email 
print(*[f' like_email: {token.like_email}' for token in second_way], sep=',')

In [None]:
# Understand the part of speech 
for token in second_way:
  print(f"Token: {token}    POS: {token.pos_}")

In [None]:
# Syntactic dependency
for token in second_way:
  print(f"Text: {token.text}, POS: {token.pos_}, Dep: {token.dep_}, Head: {token.head.text}")

In [None]:
# Know the Named Entities (Will be usefull at Sentence level and entity level sentiment mining)
# This is where the magic of corpus will come into picture.
# We imported a english model which might be trained over news or general english text. 
# So this model can find person names, city names etc...
# Try giving it come virus names or drug names it might not catch any.
# So you have to find any other pre-trained model from medical.

for ent in second_way.ents:
  print(f"Text: {ent.text} ---- Entity type: {ent.label_}")

In [None]:
# Can understand the tags 
spacy.explain("NNP")

In [None]:
# Check existing vocab in Spacy (Wrong way to check it. Dont use it ever)
vocab = [word.text for word in second_way.vocab]
vocab[:20]

**Understand in detail visit** -- [Spacy API webisite](https://spacy.io/api)

In [None]:
# Easily remove stop words (Works for different languages)
filtered_tokens = [token for token in doc if not token.is_stop]
print(filtered_tokens)

In [None]:
# Lemmatization
lemmas = [token.lemma_ for token in filtered_tokens]
print(lemmas, '\n')

# More easy to compare
token_lemmas = [f"Token: {token}, lemma: {token.lemma_}" for token in filtered_tokens]
print(token_lemmas)

In [None]:
# Check similarty of words (both conventinal models based and Glove or Language model based)
# Read the error carefully 

word_1 = nlp('red')
word_2 = nlp('blue')
word_3 = nlp('lion')

print(word_1.similarity(word_2), '\n')
print(word_2.similarity(word_3),)

In [None]:
print(f" No of dims: {filtered_tokens[1].vector.shape}", '\n')
filtered_tokens[1].vector

[Read more about models from Spacy website and notice the key variations in them.](https://spacy.io/usage/models)

[English model details](https://spacy.io/models/en)

In [None]:
# Using medium size model (This one is Glove based. Same like Word2Vec)

word_1 = nlp_md('red')
word_2 = nlp_md('blue')
word_3 = nlp_md('lion')

print(word_1.similarity(word_2), '\n')
print(word_2.similarity(word_3),)

In [None]:
"""word_1 = nlp_lg('red')
word_2 = nlp_lg('blue')
word_3 = nlp_lg('lion')

print(word_1.similarity(word_2), '\n')
print(word_2.similarity(word_3),)"""

In [None]:
# What we saw was more like Token similarity
# What about document level 
# Using small model

doc_1 = nlp("I like to play football")
doc_2 = nlp("I like to play soccer")
doc_3 = nlp("I like the ice cream")

print(doc_1.similarity(doc_2), '\n')
print(doc_2.similarity(doc_3), '\n')

In [None]:
# What we saw was more like Token similarity
# What about document level 
# Using medium model (Word2Vector model)

doc_1 = nlp_md("I like to play football")
doc_2 = nlp_md("I like to play soccer")
doc_3 = nlp_md("I like the ice cream")

print(doc_1.similarity(doc_2), '\n')
print(doc_2.similarity(doc_3), '\n')

In [None]:
# Can do the same for span similarty

In [None]:
# Can check the doc vector for medium model
doc_1.vector.shape

In [None]:
# This is very handy in recomendations engine and finding similar text. 
# If you know some negatuve comments then find some more with top confidence with this and use for models
# and repeat the loop and make more training data (Not the best idea but used a lot)

#### Data prep for training classifier in Spacy

In [None]:
import os
os.chdir('/content/drive/MyDrive/Colab Notebooks/NUS Work/Sentiment Mining/Data Sets/')

# unzip the data
# !sudo tar -xvf aclImdb_v1.tar.gz

The function writing style is called [Support for type hints](https://docs.python.org/3/library/typing.html) \
Nothing new just a little bit type hints are added. You can revert back without getting any error.

In [None]:
import os
import random

def load_training_data(data_directory: str = "aclImdb/train", split: float = 0.8, limit: int = 0) -> tuple:
    # Load from files
    reviews = []
    for label in ["pos", "neg"]:
        labeled_directory = f"{data_directory}/{label}"
        for review in os.listdir(labeled_directory):
            if review.endswith(".txt"):
                with open(f"{labeled_directory}/{review}") as f:
                    text = f.read()
                    text = text.replace("<br />", "\n\n")
                    if text.strip():
                        spacy_label = {
                            "cats": {
                                "pos": "pos" == label,
                                "neg": "neg" == label}
                        }
                        reviews.append((text, spacy_label))
    random.shuffle(reviews)

    if limit:
        reviews = reviews[:limit]
    split = int(len(reviews) * split)
    return reviews[:split], reviews[split:]

In [None]:
training_data, testing_data = load_training_data("aclImdb/train", 0.8, 2500)

In [None]:
type(training_data)

In [None]:
len(training_data)

In [None]:
training_data[:3]

In [None]:
len(testing_data)

In [None]:
testing_data[:3]

```
## Build pipeline

# Change this for loading medium or large models
nlp = spacy.load("en_core_web_sm") 

# Our outcome pipeline name is 'textcat'
# now we check if this already does not exist in internal pipeline

# If it does not exit make it with 'textcat' and in configuration tell the 
# classifier architecture you want to use like 'Simple_cnn' or 
# BOW (bag of words) or 'hybrid'. Check the documentation for more details. 

if "textcat" not in nlp.pipe_names:
    textcat = nlp.create_pipe("textcat", config={"architecture": "simple_cnn"})
    nlp.add_pipe(textcat, last=True)

else:
    textcat = nlp.get_pipe("textcat")

# Tell textcat (means text categorization) will have 2 lables to predict
textcat.add_label("pos")
textcat.add_label("neg")

# There might be inbuid or other other custom elements in the pipeline
# we fist  get the list of those items and put them all in exclusion while 
# model training. 

# Train only textcat
training_excluded_pipes = [pipe for pipe in nlp.pipe_names if pipe != "textcat"]

# here in first function we told it to exclude the list
# rest are the hyper-parameters we have set for the training
# in the end we print the results

with nlp.disable_pipes(training_excluded_pipes):
    optimizer = nlp.begin_training()
    # Training loop
    print("Beginning training")
    print("Loss\tPrecision\tRecall\tF-score")
    batch_sizes = compounding(
        4.0, 32.0, 1.001
    )  # A generator that yields infinite series of input numbers
    for i in range(iterations):
        print(f"Training iteration {i}")
        loss = {}
        random.shuffle(training_data)
        batches = minibatch(training_data, size=batch_sizes)
        for batch in batches:
            text, labels = zip(*batch)
            nlp.update(text, labels, drop=0.2, sgd=optimizer, losses=loss)
        with textcat.model.use_params(optimizer.averages):
            evaluation_results = evaluate_model(
                tokenizer=nlp.tokenizer,
                textcat=textcat,
                test_data=test_data
            )
            print(
                f"{loss['textcat']}\t{evaluation_results['precision']}"
                f"\t{evaluation_results['recall']}"
                f"\t{evaluation_results['f-score']}"
            )

# Save the artifcats of the model because we dont want to retrain the model

# Save model
with nlp.use_params(optimizer.averages):
    nlp.to_disk("model_artifacts")

```



In [None]:
import os
import random
import spacy
from spacy.util import minibatch, compounding

def train_model(
    training_data: list,
    test_data: list,
    iterations: int = 20
) -> None:
    # Build pipeline
    nlp = spacy.load("en_core_web_sm") # Change this for loading medium or large models
    if "textcat" not in nlp.pipe_names:
        textcat = nlp.create_pipe(
            "textcat", config={"architecture": "simple_cnn"}
        )
        nlp.add_pipe(textcat, last=True)
    else:
        textcat = nlp.get_pipe("textcat")

    textcat.add_label("pos")
    textcat.add_label("neg")

    # Train only textcat
    training_excluded_pipes = [
        pipe for pipe in nlp.pipe_names if pipe != "textcat"
    ]
    with nlp.disable_pipes(training_excluded_pipes):
        optimizer = nlp.begin_training()
        # Training loop
        print("Beginning training")
        print("Loss\tPrecision\tRecall\tF-score")
        batch_sizes = compounding(
            4.0, 32.0, 1.001
        )  # A generator that yields infinite series of input numbers
        for i in range(iterations):
            print(f"Training iteration {i}")
            loss = {}
            random.shuffle(training_data)
            batches = minibatch(training_data, size=batch_sizes)
            for batch in batches:
                text, labels = zip(*batch)
                nlp.update(text, labels, drop=0.2, sgd=optimizer, losses=loss)
            with textcat.model.use_params(optimizer.averages):
                evaluation_results = evaluate_model(
                    tokenizer=nlp.tokenizer,
                    textcat=textcat,
                    test_data=test_data
                )
                print(
                    f"{loss['textcat']}\t{evaluation_results['precision']}"
                    f"\t{evaluation_results['recall']}"
                    f"\t{evaluation_results['f-score']}"
                )

    # Save model
    with nlp.use_params(optimizer.averages):
        nlp.to_disk("model_artifacts")

In [None]:
def evaluate_model(
    tokenizer, textcat, test_data: list
) -> dict:
    reviews, labels = zip(*test_data)
    reviews = (tokenizer(review) for review in reviews)
    true_positives = 0
    false_positives = 1e-8  # Can't be 0 because of presence in denominator
    true_negatives = 0
    false_negatives = 1e-8
    for i, review in enumerate(textcat.pipe(reviews)):
        true_label = labels[i]['cats']
        for predicted_label, score in review.cats.items():
            # Every cats dictionary includes both labels. You can get all
            # the info you need with just the pos label.
            if (
                predicted_label == "neg"
            ):
                continue
            if score >= 0.5 and true_label["pos"]:
                true_positives += 1
            elif score >= 0.5 and true_label["neg"]:
                false_positives += 1
            elif score < 0.5 and true_label["neg"]:
                true_negatives += 1
            elif score < 0.5 and true_label["pos"]:
                false_negatives += 1
    precision = true_positives / (true_positives + false_positives)
    recall = true_positives / (true_positives + false_negatives)

    if precision + recall == 0:
        f_score = 0
    else:
        f_score = 2 * (precision * recall) / (precision + recall)
    return {"precision": precision, "recall": recall, "f-score": f_score}

In [None]:
train_model(training_data, testing_data, 12)

Read more about [training Spacy classifier](https://spacy.io/usage/training#textcat) and [textcategorizer API](https://spacy.io/api/textcategorizer)

In [None]:
# Load saved model
loaded_model = spacy.load("model_artifacts") 

In [None]:
# Test predictions for reviews
def test_model(input_data: str = TEST_REVIEW):
    #  Load saved trained model
    loaded_model = spacy.load("model_artifacts")
    # Generate prediction
    parsed_text = loaded_model(input_data)
    # Determine prediction to return
    if parsed_text.cats["pos"] > parsed_text.cats["neg"]:
        prediction = "Positive"
        score = parsed_text.cats["pos"]
    else:
        prediction = "Negative"
        score = parsed_text.cats["neg"]
    print(f"Review text: {input_data}\nPredicted sentiment: {prediction}"
        f"\tScore: {score}")

In [None]:
TEST_REVIEW = """
Transcendently beautiful in moments outside the office, it seems almost
sitcom-like in those scenes. When Toni Colette walks out and ponders
life silently, it's gorgeous.<br /><br />The movie doesn't seem to decide
whether it's slapstick, farce, magical realism, or drama, but the best of it
doesn't matter. (The worst is sort of tedious - like Office Space with less humor.)
"""

test_model(TEST_REVIEW)

## Using stanfordnlp or Stanza (old version is stanfordCoreNLP)
[weblink](https://stanfordnlp.github.io/stanza/)

In [None]:
!pip install stanza

In [None]:
import stanza
stanza.download('en') # loaded the 
nlp = stanza.Pipeline('en')

In [None]:
doc = nlp('I do not like Amazon. I am :). Your service has never been good. I hate flowers. The product was not bad')

In [None]:
doc

In [None]:
print(f'No of sentences: {len(doc.sentences)}', '\n')
print(f'No of tokens: {doc.num_tokens}', '\n')
print(f'No of words: {doc.num_words}', '\n')
print(f'No of entities: {len(doc.entities)}', '\n')
print(*[f'Sentence: {sentence}\n' for sentence in doc.sentences])

In [None]:
print(f'First sentence : {doc.sentences[0].text}', '\n')
print(f'No of tokens: {len(doc.sentences[0].tokens)}', '\n')
print(f'No of words: {len(doc.sentences[0].words)}', '\n')
print(f'No of entities: {len(doc.sentences[0].entities)}', '\n')

In [None]:
print(f'First sentence : {doc.sentences[0].tokens[4].text}', '\n')
print(f'Start: {doc.sentences[0].tokens[4].start_char}', '\n')
print(f'End: {doc.sentences[0].tokens[4].end_char}', '\n')

In [None]:
print(f'First sentence : {doc.sentences[0].words[3].text}', '\n')
print(f'Lemma: {doc.sentences[0].words[3].lemma}', '\n')
print(f'upos: {doc.sentences[0].words[3].upos}', '\n')
print(f'xpos: {doc.sentences[0].words[3].xpos}', '\n')

In [None]:
nlp = stanza.Pipeline(lang='en', processors='tokenize,sentiment')
doc = nlp('I do not like Amazon. I am :). Your service has never been good. I hate flowers. The product was not bad')
sentiment_dict = {0: "Negative",
                  1: "Neutral",
                  2: "Positive"}
for i, sentence in enumerate(doc.sentences):
    print(i, sentence.text, '-'*10 , sentiment_dict[sentence.sentiment])

# Emotion Detection

In [None]:
!pip install text2emotion

In [None]:
#Import the modules
import text2emotion as te

In [None]:
text1 = """ I was asked to sign a third party contract a week out from stay.
If it wasn't an 8 person group that took a lot of wrangling I would have cancelled the booking straight away. 
Bathrooms - there are no stand alone bathrooms. 
Please consider this - you have to clear out the main bedroom to use that bathroom. 
Other option is you walk through a different bedroom to get to its en-suite. 
Signs all over the apartment - there are signs everywhere - some helpful - some telling you rules. 
Perhaps some people like this but It negatively affected our enjoyment of the accommodation. 
Stairs - lots of them - some had slightly bending wood which caused a minor injury. """

In [None]:
#Call to the function
te.get_emotion(text1)

In [None]:
text2 = "Day was pretty amazing😃😃"
te.get_emotion(text2)

In [None]:
text3 = "Some of them are idiots"
te.get_emotion(text3)

In [None]:
# Make a list of text1, text2 and text3

text = [text1, text2, text3]
text

In [None]:
tweets = [
    "Watching the sopranos again from start to finish!",
    "Finding out i have to go to the  dentist tomorrow",
    "I want to go outside and chalk but I have no chalk",
    "I HATE PAPERS AH #AH #HATE",
    "My mom wasn't mad",
    "Do people have no Respect for themselves or you know others peoples homes",
]

In [None]:
for tweet in tweets:
  print(tweet, "\n")
  print(te.get_emotion(tweet), "\n")

# Others important things

### How to deal with negations
https://nlpforhackers.io/sentiment-analysis-intro/

### Dealing with inter annotation agreements for human classifications of documents
[Read more from here](https://corpuslinguisticmethods.wordpress.com/2014/01/15/what-is-inter-annotator-agreement/)

## Other useful tools:
- [Allen NLP Demo](https://demo.allennlp.org/sentiment-analysis)
- [HuggingFace](https://api-inference.huggingface.co/docs/)
- [HuggingFace Colab Example Notebooks](https://github.com/huggingface/transformers/tree/master/notebooks)
- [If you want to read about transformers](https://huggingface.co/transformers/)
- [Pattern NLP link 1](https://stackabuse.com/python-for-nlp-introduction-to-the-pattern-library/)
- [Pattern NLP link 2](https://analyticsindiamag.com/hands-on-guide-to-pattern-a-python-tool-for-effective-text-processing-and-data-mining/)
- [Polyglot Link 1](https://pypi.org/project/polyglot/)
- [Ployglot link 2](https://jcharistech.wordpress.com/2018/12/10/introduction-to-natural-language-processing-with-polyglot/)
- [Ployglot link 3](https://analyticsindiamag.com/hands-on-tutorial-on-polyglot-python-toolkit-for-multilingual-nlp-applications/)
- [Ployglot link 4](https://www.geeksforgeeks.org/natural-language-processing-using-polyglot-introduction/)
- [Flair](https://github.com/flairNLP/flair)
- [Monty Lingua by MIT - Nice one try it for NLG](https://alumni.media.mit.edu/~hugo/montylingua/)
- [textacy](https://pypi.org/project/textacy/)
- [Twitter Emotion Recognition - Must try it!!!](https://github.com/nikicc/twitter-emotion-recognition)


## Aditional cleaning functions
For more visit 
- [Kaggle cleaning functions list](https://www.kaggle.com/sudalairajkumar/getting-started-with-text-preprocessing)
- [Emojie's & emoticons list with unicodes](https://unicode.org/emoji/charts/full-emoji-list.html#1f607)
- [How to handle Negating concepts & trigger terms (**Be careful it is very domain specific**)](https://medium.com/@MansiKukreja/clinical-text-negation-handling-using-negspacy-and-scispacy-233ce69ab2ac)
- [How NLTK is handling your negations](https://stackoverflow.com/questions/28720174/negation-handling-in-nlp)


In [None]:
# clean or convert  emoticons and emoji's
# Another lib https://github.com/carpedm20/emoji

# this will work with emojis and emoticons
!pip install emot
import emot

In [None]:
# Working with emoticons
text = "I love python :)"
emot.emoticons(text)

In [None]:
# Working with emoji's
text = "I love python 👨"
emot.emoji(text)

In [None]:
# Get Unicodes for string search and replace below
UNICODE_EMO = emot.UNICODE_EMO
EMOTICONS = emot.EMOTICONS

In [None]:
EMOTICONS[':x']

In [None]:
UNICODE_EMO['🔥']

In [None]:
UNICODE_EMO.keys()

In [None]:
# Remove URLS
def remove_urls(text):
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub(r'', text)

text = "Driverless AI NLP blog post on https://www.h2o.ai/blog/detecting-sarcasm-is-difficult-but-ai-may-have-an-answer/"
remove_urls(text)

In [None]:
## Remove HTML
from bs4 import BeautifulSoup

def remove_html(text):
    return BeautifulSoup(text, "lxml").text

text = """<div>
<h1> H2O</h1>
<p> AutoML</p>
<a href="https://www.h2o.ai/products/h2o-driverless-ai/"> Driverless AI</a>
</div>
"""

print(remove_html(text))

In [None]:
# Use spellings correction from TextBlog (i guess its: edit distance based)

In [None]:
# Dictionary based abbrivations & slangs correction (Very specific as per use case)
# English Ex: ASAP -> As soon as possible
# Siglish Ex: No lah -> simply No or if its No lahhhhhh (with stress) then strongly disagree (Very subjective as per user interpretations) 

In [None]:
# You can use Google API for language detection and traslation
# TextBlob's wrapper around the google api (Easy to use)

In [None]:
!pip install langdetect
from langdetect import detect_langs

In [None]:
unknow_lang = 'Buenas tarde'
detect_langs(unknow_lang)