# Setting up the environment

## Connect your Colab notebook to your Drive

In [None]:
# !wget http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/Video_Games_5.json.gz -o /content/drive/MyDrive/wdir/growth-hacking-sentiment/data/Video_Games_5.json.gz
from google.colab import drive
drive.mount('/content/drive')

## Install requirements

In [5]:
!pip install --quiet -r /content/drive/MyDrive/wdir/growth-hacking-sentiment/requirements.txt

      Successfully uninstalled ipykernel-4.10.1
  Attempting uninstall: urllib3
    Found existing installation: urllib3 1.24.3
    Uninstalling urllib3-1.24.3:
      Successfully uninstalled urllib3-1.24.3
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstalling PyYAML-3.13:
      Successfully uninstalled PyYAML-3.13
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
jupyter-console 5.2.0 requires prompt-toolkit<2.0.0,>=1.0.0, but you have prompt-toolkit 3.0.29 which is incompatible.
google-colab 1.0.0 requires ipykernel~=4.10, but you have ipykernel 6.13.0 which is incompatible.
google-colab 1.0.0 requires ipython~=5.5.0, but you have ipython 7.33.0 which is incompatible.
google-colab 1.0.0 requires tornado~=5.1.0; python_version >= "3.0", but you have tornado 6.1 which is incompatible.
datascience 0.10.6 requires folium=

## Install apex

In [1]:
%%writefile setup.sh

export CUDA_HOME=/usr/local/cuda-10.1
git clone https://github.com/NVIDIA/apex
pip install -q -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./apex


Writing setup.sh


In [None]:
!sh setup.sh

# asd

In [7]:
import nltk
nltk.download('opinion_lexicon')
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [3]:
#@title Read Data
import pandas as pd
path="/content/drive/MyDrive/wdir/growth-hacking-sentiment/"
df = pd.read_csv(path+"data/raw/review_corpus.tsv", sep="\t")

ratings = list(df["rating"])
reviews = list(df["review"])

In [20]:
#@title Dictionary based sentiment analysis
from nltk.corpus import opinion_lexicon
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize

positive_wds = set(opinion_lexicon.positive())
negative_wds = set(opinion_lexicon.negative())
# lists are NOT lemmatized so we only have to tokenize the text and count
# positive and negative words


def score_sent(sent):
    """Returns a score btw -1 and 1"""
    sent = [e.lower() for e in sent if e.isalnum()]
    total = len(sent)
    pos = len([e for e in sent if e in positive_wds])
    neg = len([e for e in sent if e in negative_wds])
    if total > 0:
        return (pos - neg) / total
    else:
        return 0


def score_review(review):
    sentiment_scores = []
    sents = sent_tokenize(review)
    for sent in sents:
        wds = word_tokenize(sent)
        sent_scores = score_sent(wds)
        sentiment_scores.append(sent_scores)
    return sum(sentiment_scores) / len(sentiment_scores)


review_sentiments = [score_review(e) for e in reviews]

df = pd.DataFrame(
    {
        "rating": ratings,
        "review": reviews,
        "review dictionary based sentiment": review_sentiments,
    }
)
    
with open(path+"data/processed/dictionary_based_sentiment.tsv", "w") as outfile:
    outfile.write(df.to_csv(index=False, sep="\t"))

mkdir: cannot create directory ‘/content/drive/MyDrive/wdir/growth-hacking-sentiment/data/processed/’: File exists


In [21]:
#@title Dictionary based sentiment analysis

from nltk.corpus import opinion_lexicon
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize

positive_wds = set(opinion_lexicon.positive())
negative_wds = set(opinion_lexicon.negative())
# lists are NOT lemmatized so we only have to tokenize the text and count
# positive and negative words


def score_sent(sent):
    """Returns a score btw -1 and 1"""
    sent = [e.lower() for e in sent if e.isalnum()]
    total = len(sent)
    pos = len([e for e in sent if e in positive_wds])
    neg = len([e for e in sent if e in negative_wds])
    if total > 0:
        return (pos - neg) / total
    else:
        return 0


def score_review(review):
    sentiment_scores = []
    sents = sent_tokenize(review)
    for sent in sents:
        wds = word_tokenize(sent)
        sent_scores = score_sent(wds)
        sentiment_scores.append(sent_scores)
    return sum(sentiment_scores) / len(sentiment_scores)


review_sentiments = [score_review(e) for e in reviews]

df = pd.DataFrame(
    {
        "rating": ratings,
        "review": reviews,
        "review dictionary based sentiment": review_sentiments,
    }
)

with open(path+"data/processed/dictionary_based_sentiment.tsv", "w") as outfile:
    outfile.write(df.to_csv(index=False, sep="\t"))

In [22]:
#@title Correlation
# test correlation
from scipy.stats import pearsonr, spearmanr

corr1, _ = pearsonr(ratings, review_sentiments)
print(corr1)

# Spearman rank correlation says there's weak correlation btw review score
# and sentiment
scor1, _ = spearmanr(ratings, review_sentiments)

print(scor1)

0.4958754626551928
0.5565275331512324


ok, we plotted to see the distribution, but it's not normal, so it can be omitted on pearson because it assumes a normal distribution

Verbal negations have a big impact on the meaning of a world or a phrase. Let's mark them.

- no issues
- no complains
- Doesn't work.
- Didn't like it.

In [24]:
# Reference: https://statistics.laerd.com/statistical-guides/spearmans-rank-order-correlation-statistical-guide.php
#@markdown Let's see  the data

for i in range(len(reviews)):
    sc = ratings[i]
    rs = review_sentiments[i]
    # ss = summary_sentiments[i]
    t = reviews[i]
    if sc == 5 and rs < -0.2:
        print(t)
    if sc == 1 and rs > 0.3:
        print(t)



didn't work didn't work
Nice Love it
Excellent.  What I was expecting. The right item
Did not work upon arrival.. not good
The Villainous! Five Stars
Goon Five Stars
Addicting. Now I'm also addicted to Pokemon Shuffle. Halps.
No issues. No issues.


In [25]:
from nltk.sentiment.util import mark_negation


t = "I received these on time and no problems. No damages battlfield never fails"
print(mark_negation(t.split()))

['I', 'received', 'these', 'on', 'time', 'and', 'no', 'problems._NEG', 'No_NEG', 'damages_NEG', 'battlfield_NEG', 'never_NEG', 'fails_NEG']
