In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('data/small_corpus.csv')

In [3]:
df['reviews']= df['reviews'].astype(str)

In [4]:
ratings = list(df["ratings"])
reviews = list(df["reviews"])

# Dictionary based sentiment analysis 

In [5]:
from nltk.corpus import opinion_lexicon
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize

In [6]:
positive_wds = set(opinion_lexicon.positive())
negative_wds = set(opinion_lexicon.negative())

In [7]:
def score_sent(sent):
    """Returns a score btw -1 and 1"""
    sent = [e.lower() for e in sent if e.isalnum()]
    total = len(sent)
    pos = len([e for e in sent if e in positive_wds])
    neg = len([e for e in sent if e in negative_wds])
    if total > 0:
        return (pos - neg) / total
    else:
        return 0

In [8]:
def score_review(review):
    sentiment_scores = []
    sents = sent_tokenize(review)
    for sent in sents:
        wds = word_tokenize(sent)
        sent_scores = score_sent(wds)
        sentiment_scores.append(sent_scores)
    return sum(sentiment_scores) / len(sentiment_scores)

In [9]:
review_sentiments = [score_review(e) for e in reviews]

In [10]:
df = pd.DataFrame(
    {
        "rating": ratings,
        "review": reviews,
        "review dictionary based sentiment": review_sentiments,
    }
)

In [11]:
df.to_csv('data/dictionary_based_sentiment.csv')

# Exploratory Data Analysis   

In [12]:
from collections import Counter

import altair as alt
import numpy as np
import pandas as pd

In [13]:
rating_counts = Counter(ratings)
data1 = pd.DataFrame(
    {
        "ratings": [str(e) for e in list(rating_counts.keys())],
        "counts": list(rating_counts.values()),
    }
)

In [14]:
chart1 = alt.Chart(data1).mark_bar().encode(x="ratings", y="counts")
chart1.save("data/rating_counts.html")

In [15]:
hist, bin_edges = np.histogram(review_sentiments, density=True)
labels = list(zip(bin_edges, bin_edges[1:]))
labels = [(str(e[0]), str(e[1])) for e in labels]
labels = [" ".join(e) for e in labels]

In [16]:
data2 = pd.DataFrame({"sentiment scores": labels, "counts": hist})

In [17]:
alt.Chart(data2).mark_bar().encode(x=alt.X("sentiment scores", sort=labels), y="counts")

In [18]:
source = pd.DataFrame(
    {"ratings": [str(e) for e in ratings], "sentiments": review_sentiments}
)

In [19]:
alt.Chart(source).mark_circle(size=60).encode(x="ratings", y="sentiments", color="ratings", tooltip=["ratings", "sentiments"]).interactive()

# Correlation

In [35]:
from scipy.stats import pearsonr, spearmanr

In [36]:
corr1, _ = pearsonr(ratings, review_sentiments)
print(corr1)

0.4417709661787049


# Convert the Scores to Rating

In [40]:
def score_to_rating(value):
    if value > 0.2:
        return 'Postive'
    if value <= 0.2 and value >= -0.2:
        return 'Neutral'
    else:
        return 'Negative'

In [43]:
df = pd.read_csv('data/dictionary_based_sentiment.csv')

In [45]:
df['sent'] = df['review dictionary based sentiment'].apply(lambda x:score_to_rating(x))

In [46]:
df.head()

Unnamed: 0.1,Unnamed: 0,rating,review,review dictionary based sentiment,sent
0,0,1,Recently UBISOFT had to settle a huge class-ac...,-0.013158,Neutral
1,1,1,"code didn't work, got me a refund.",0.285714,Postive
2,2,1,"these do not work at all, all i get is static ...",0.0,Neutral
3,3,1,well let me start by saying that when i first ...,-0.020521,Neutral
4,4,1,"Dont waste your money, you will just end up us...",0.0,Neutral


In [47]:
df.rename(columns={"sent":"predicted"},inplace=True)

In [48]:
df.head()

Unnamed: 0.1,Unnamed: 0,rating,review,review dictionary based sentiment,predicted
0,0,1,Recently UBISOFT had to settle a huge class-ac...,-0.013158,Neutral
1,1,1,"code didn't work, got me a refund.",0.285714,Postive
2,2,1,"these do not work at all, all i get is static ...",0.0,Neutral
3,3,1,well let me start by saying that when i first ...,-0.020521,Neutral
4,4,1,"Dont waste your money, you will just end up us...",0.0,Neutral


In [49]:
def score_to_Target(value):
    if value >= 5:
        return 'Postive'
    if value <= 4 and value >= 2:
        return 'Neutral'
    else:
        return 'Negative'

In [50]:
df['target'] = df['rating'].apply(lambda x:score_to_Target(x))

In [51]:
df.head()

Unnamed: 0.1,Unnamed: 0,rating,review,review dictionary based sentiment,predicted,target
0,0,1,Recently UBISOFT had to settle a huge class-ac...,-0.013158,Neutral,Negative
1,1,1,"code didn't work, got me a refund.",0.285714,Postive,Negative
2,2,1,"these do not work at all, all i get is static ...",0.0,Neutral,Negative
3,3,1,well let me start by saying that when i first ...,-0.020521,Neutral,Negative
4,4,1,"Dont waste your money, you will just end up us...",0.0,Neutral,Negative
