In [None]:
#@title Read Data
import pandas as pd
# Root folder that every data/plot path in this notebook is built from
# (looks like a Colab Google Drive mount -- adjust when running elsewhere).
path = "/content/drive/MyDrive/wdir/growth-hacking-sentiment/"

# The raw corpus is a tab-separated file; the "rating" and "review" columns
# are the only ones used below.
df = pd.read_csv(f"{path}data/raw/review_corpus.tsv", sep="\t")

# Plain Python lists are what the scoring and plotting cells iterate over.
ratings, reviews = list(df["rating"]), list(df["review"])

In [None]:
#@title Dictionary based sentiment analysis
from nltk.corpus import opinion_lexicon
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize

# Materialise the NLTK opinion lexicon word lists as sets so that the
# `word in positive_wds` / `word in negative_wds` checks in score_sent are
# hash lookups rather than scans.
positive_wds = set(opinion_lexicon.positive())
negative_wds = set(opinion_lexicon.negative())
# The lexicon lists are NOT lemmatized (they contain inflected forms), so we
# only have to tokenize the text and count positive and negative words --
# no lemmatization step is applied before the lookup.


def score_sent(sent):
    """Score one tokenized sentence against the opinion lexicon.

    Args:
        sent: iterable of token strings (one sentence).

    Returns:
        (positive hits - negative hits) / number of alphanumeric tokens,
        a value between -1 and 1. Returns 0 when no alphanumeric token
        is left after filtering.
    """
    # Punctuation and other non-alphanumeric tokens are ignored entirely;
    # the remaining tokens are lower-cased before the lexicon lookup.
    tokens = [tok.lower() for tok in sent if tok.isalnum()]
    if not tokens:
        return 0

    n_positive = sum(1 for tok in tokens if tok in positive_wds)
    n_negative = sum(1 for tok in tokens if tok in negative_wds)
    return (n_positive - n_negative) / len(tokens)


def score_review(review):
    """Average the lexicon score of every sentence in a review.

    The review is split into sentences with ``sent_tokenize``; each sentence
    is word-tokenized and scored with ``score_sent``.

    Args:
        review: the review text.

    Returns:
        The mean of the per-sentence scores, or 0.0 when the review contains
        no sentences (e.g. an empty string), which previously raised
        ZeroDivisionError.
    """
    sentence_scores = [
        score_sent(word_tokenize(sent)) for sent in sent_tokenize(review)
    ]
    # An empty review yields no sentences; treat it as neutral instead of
    # dividing by zero.
    if not sentence_scores:
        return 0.0
    return sum(sentence_scores) / len(sentence_scores)


# Score every review with the plain lexicon approach.
review_sentiments = [score_review(e) for e in reviews]

df = pd.DataFrame(
    {
        "rating": ratings,
        "review": reviews,
        "review dictionary based sentiment": review_sentiments,
    }
)

# Let pandas write the file directly instead of rendering the whole TSV to a
# string and pushing it through a text-mode file handle: the encoding is then
# explicit (not the platform default) and line endings are not translated a
# second time by the file object.
df.to_csv(
    path + "data/processed/dictionary_based_sentiment.tsv",
    sep="\t",
    index=False,
    encoding="utf-8",
)

In [None]:
#@title Exploratory Data Analysis
# plot score vs dict_sents
from collections import Counter

import altair as alt
import numpy as np
import pandas as pd

# let's see the distributions

# the distribution of review scores
rating_counts = Counter(ratings)

# One row per distinct rating; ratings are stringified so the x axis is
# treated as labels.
data1 = pd.DataFrame(
    {
        "ratings": [str(rating) for rating in rating_counts],
        "counts": [rating_counts[rating] for rating in rating_counts],
    }
)

chart1 = alt.Chart(data1).mark_bar().encode(x="ratings", y="counts")
chart1.save(f"{path}plots/01/rating_counts.html")
# we have a majority class !

# the distribution of sentiment scores
# NOTE: density=True makes np.histogram return a density per bin, not raw
# counts, even though the column below is named "counts".
hist, bin_edges = np.histogram(review_sentiments, density=True)

# One "<left edge> <right edge>" label per bin.
labels = [
    " ".join((str(left), str(right)))
    for left, right in zip(bin_edges[:-1], bin_edges[1:])
]

data2 = pd.DataFrame({"sentiment scores": labels, "counts": hist})

# sort=labels keeps the bars in bin order rather than sorting the label text.
chart2 = (
    alt.Chart(data2)
    .mark_bar()
    .encode(
        x=alt.X("sentiment scores", sort=labels),
        y="counts",
    )
)
chart2.save(f"{path}plots/01/review_sentiments.html")
# (0.0, 0.20000000000000018) -> neutral is the majority


# is there any relationship btw review scores and sentiments?
# Ratings are stringified so they are used as labels on the x axis and in
# the colour legend.
source = pd.DataFrame(
    {
        "ratings": list(map(str, ratings)),
        "sentiments": review_sentiments,
    }
)

chart4 = (
    alt.Chart(source)
    .mark_circle(size=60)
    .encode(
        x="ratings",
        y="sentiments",
        color="ratings",
        tooltip=["ratings", "sentiments"],
    )
    .interactive()
)

chart4.save(f"{path}plots/01/reviews_ratings_vs_sentiment.html")
# Bare expression so the notebook renders the chart as the cell output.
chart4

In [None]:
#@title Correlation
# test correlation
from scipy.stats import pearsonr, spearmanr

# Pearson correlation: element 0 of the result is the coefficient; the
# p-value is not used here.
corr1 = pearsonr(ratings, review_sentiments)[0]
print(corr1)

# Spearman rank correlation says there's weak correlation btw review score
# and sentiment
scor1 = spearmanr(ratings, review_sentiments)[0]

print(scor1)

OK, we plotted the sentiment scores to see their distribution, and it is not normal. Pearson's correlation assumes normally distributed data, so its result can be disregarded here; we rely on Spearman's rank correlation instead.

Verbal negations have a big impact on the meaning of a word or a phrase. Let's mark them.

- no issues
- no complaints
- Doesn't work.
- Didn't like it.

In [None]:
# Reference: https://statistics.laerd.com/statistical-guides/spearmans-rank-order-correlation-statistical-guide.php
#@markdown Let's see  the data

# Print the reviews where the star rating and the lexicon score disagree
# strongly: 5-star reviews scored below -0.2 and 1-star reviews scored
# above 0.3.
for rating, sentiment_score, text in zip(ratings, review_sentiments, reviews):
    five_star_but_negative = rating == 5 and sentiment_score < -0.2
    one_star_but_positive = rating == 1 and sentiment_score > 0.3
    if five_star_but_negative or one_star_but_positive:
        print(text)



In [None]:
from nltk.sentiment.util import mark_negation


# Quick look at what mark_negation returns for a whitespace-split sentence.
t = "I received these on time and no problems. No damages battlfield never fails"
demo_tokens = t.split()
print(mark_negation(demo_tokens))

In [None]:
#@markdown Let's handle negation
positive_wds = set(opinion_lexicon.positive())
negative_wds = set(opinion_lexicon.negative())

# A negated negative word ("<word>_NEG") counts as positive and a negated
# positive word counts as negative, so each lexicon is extended with the
# "_NEG"-suffixed entries of the opposite one.
positive_wds_with_negation = positive_wds | {f"{wd}_NEG" for wd in negative_wds}
negative_wds_with_negation = negative_wds | {f"{wd}_NEG" for wd in positive_wds}


def score_sent(sent, positive=None, negative=None):
    """Score one tokenized, negation-marked sentence against the lexicon.

    Tokens may carry the "_NEG" suffix. The suffix is set aside while the
    token is checked with ``isalnum()`` and lower-cased, then re-attached
    before the lexicon lookup. Without this, the underscore makes
    ``isalnum()`` reject every marked token and ``lower()`` would turn the
    suffix into "_neg", so negated words could never match the
    "<word>_NEG" lexicon entries.

    Args:
        sent: iterable of token strings (one sentence), optionally "_NEG"-marked.
        positive: set of positive entries; defaults to the module-level
            ``positive_wds_with_negation``.
        negative: set of negative entries; defaults to the module-level
            ``negative_wds_with_negation``.

    Returns:
        (positive hits - negative hits) / number of kept tokens, a value
        between -1 and 1, or 0.0 when no token is kept.
    """
    if positive is None:
        positive = positive_wds_with_negation
    if negative is None:
        negative = negative_wds_with_negation

    suffix = "_NEG"
    tokens = []
    for tok in sent:
        negated = tok.endswith(suffix)
        base = tok[: -len(suffix)] if negated else tok
        # Drop punctuation and other non-alphanumeric tokens (also when they
        # carry the negation suffix, e.g. ",_NEG").
        if not base.isalnum():
            continue
        base = base.lower()
        tokens.append(base + suffix if negated else base)

    if not tokens:
        return 0.0

    n_positive = sum(1 for tok in tokens if tok in positive)
    n_negative = sum(1 for tok in tokens if tok in negative)
    return (n_positive - n_negative) / len(tokens)


def score_review(review):
    """Average the negation-aware score of every sentence in a review.

    The review is split into sentences with ``sent_tokenize``; each sentence
    is word-tokenized, passed through ``mark_negation`` and scored with
    ``score_sent``.

    Args:
        review: the review text.

    Returns:
        The mean of the per-sentence scores, or 0.0 when the review contains
        no sentences (e.g. an empty string), which previously raised
        ZeroDivisionError.
    """
    sentence_scores = [
        score_sent(mark_negation(word_tokenize(sent)))
        for sent in sent_tokenize(review)
    ]
    # An empty review yields no sentences; treat it as neutral instead of
    # dividing by zero.
    if not sentence_scores:
        return 0.0
    return sum(sentence_scores) / len(sentence_scores)


# Re-score every review, this time with negation marking.
review_sentiments = [score_review(e) for e in reviews]


df = pd.DataFrame(
    {"rating": ratings, "review": reviews, "review sentiment": review_sentiments}
)

# Let pandas write the file directly instead of rendering the whole TSV to a
# string and pushing it through a text-mode file handle: the encoding is then
# explicit (not the platform default) and line endings are not translated a
# second time by the file object.
df.to_csv(
    f"{path}data/processed/rule_based_sentiment.tsv",
    sep="\t",
    index=False,
    encoding="utf-8",
)

# Spearman rank correlation between ratings and the negation-aware scores;
# the p-value is ignored.
scor1, _ = spearmanr(ratings, review_sentiments)
print(scor1)

In [None]:
#@markdown Let's see the distributions the distribution of sentiment scores
# NOTE: density=True makes np.histogram return a density per bin, not raw
# counts, even though the column below is named "counts".
hist, bin_edges = np.histogram(review_sentiments, density=True)

# One "<left edge> <right edge>" label per bin.
labels = [
    " ".join((str(left), str(right)))
    for left, right in zip(bin_edges[:-1], bin_edges[1:])
]

data2 = pd.DataFrame({"sentiment scores": labels, "counts": hist})

# sort=labels keeps the bars in bin order rather than sorting the label text.
chart2 = (
    alt.Chart(data2)
    .mark_bar()
    .encode(
        x=alt.X("sentiment scores", sort=labels),
        y="counts",
    )
)
chart2.save(f"{path}plots/02/review_sentiments.html")
# (0.0, 0.20000000000000018) -> neutral is the majority

In [None]:
#@markdown Is there any relationship btw review scores and sentiments?
# Ratings are stringified so they are used as labels on the x axis and in
# the colour legend.
source = pd.DataFrame(
    {"ratings": [str(e) for e in ratings], "sentiments": review_sentiments}
)

chart4 = (
    alt.Chart(source)
    .mark_circle(size=60)
    .encode(
        x="ratings", y="sentiments", color="ratings", tooltip=["ratings", "sentiments"]
    )
    .interactive()
)
# File name fixed from "reviews_raings_vs_sentiment.html" so it matches the
# plots/01 counterpart ("reviews_ratings_vs_sentiment.html").
chart4.save(f"{path}plots/02/reviews_ratings_vs_sentiment.html")

# Naive classification: Evaluation

## 0 Set up

In [None]:
import nltk
import pandas as pd
from sklearn.metrics import accuracy_score

In [None]:
#@title Corpus
df = pd.read_csv("data/processed/sentiment_with_lemmas.tsv", sep="\t")

ratings = list(df["rating"])

# str() is applied to every cell so that non-string values become text.
reviews = [str(review) for review in df["review"]]

# Each "lemmas" cell is split on whitespace and converted into bigram tokens
# of the form "first_second".
lemmatized = [
    ["_".join(pair) for pair in nltk.bigrams(str(lemmas).split())]
    for lemmas in df["lemmas"]
]

sentiment = list(df["sentiment"])

In [None]:
#@title Sentiment values instead of scores


def get_rating_class(rating):
    """Map a numeric rating to a sentiment class label.

    Ratings above 4 are "positive", ratings from 2 to 4 (inclusive) are
    "neutral", and everything else is "negative".
    """
    if rating > 4:
        return "positive"
    return "neutral" if 2 <= rating <= 4 else "negative"


def get_sentiment_value(sentiment):
    """Map a sentiment score to a sentiment class label.

    Scores above 0.2 are "positive", scores from -0.2 to 0.2 (inclusive)
    are "neutral", and everything else is "negative".
    """
    if sentiment > 0.2:
        return "positive"
    return "neutral" if -0.2 <= sentiment <= 0.2 else "negative"


def check_status(e):
    """Return "OK" when the first two items of ``e`` are equal, else "CHECK"."""
    return "OK" if e[0] == e[1] else "CHECK"


# Turn the numeric ratings and sentiment scores into class labels so the two
# can be compared directly.
rating_classes = list(map(get_rating_class, ratings))
sentiment_values = list(map(get_sentiment_value, sentiment))

## Evaluation

In [None]:
##@markdown  Evaluation

import altair as alt
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

# Single source of truth for the class labels, used for both the report and
# the confusion matrix. The confusion matrix was previously built with the
# misspelled label "netural", which matches no sample and therefore left the
# whole "neutral" row and column at zero.
target_names = ["negative", "neutral", "positive"]

acc = accuracy_score(rating_classes, sentiment_values)
print(acc)
# 0.4315555555555556

print(
    classification_report(
        rating_classes,
        sentiment_values,
        labels=target_names,
        target_names=target_names,
    )
)

#                 precision    recall  f1-score   support
#     negative       0.80      0.08      0.14      1500
#      neutral       0.36      0.93      0.52      1500
#     positive       0.82      0.29      0.43      1500
#     accuracy                           0.43      4500
#    macro avg       0.66      0.43      0.36      4500
# weighted avg       0.66      0.43      0.36      4500

# cm[i, j] is the number of samples whose true class is target_names[i] and
# whose predicted class is target_names[j].
cm = confusion_matrix(rating_classes, sentiment_values, labels=target_names)

# With the default meshgrid indexing, x varies along columns and y along
# rows, so after ravel() y lines up with the row (true) index of cm and x
# with the column (predicted) index. The two were previously swapped.
n_classes = len(target_names)
x, y = np.meshgrid(range(0, n_classes), range(0, n_classes))

source = pd.DataFrame({"true": y.ravel(), "predicted": x.ravel(), "number": cm.ravel()})

chart = (
    alt.Chart(source)
    .mark_rect()
    .encode(x="true:O", y="predicted:O", color="number:Q", tooltip=["number"])
    .interactive()
    .properties(width=800, height=500)
)
chart.save("plots/05/confusion_matrix.html")
