In [1]:
import altair as alt
import matplotlib
import nltk
from nltk.corpus import opinion_lexicon
from nltk.corpus import stopwords
import nltk.sentiment.util
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
import numpy as np
import pandas as pd
import scipy.stats
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score


In [2]:
"""
1.0
Load in the small_corpus.csv file you created in the previous milestone.
"""

small_corpus = pd.read_csv("small_corpus.csv")
small_corpus = small_corpus.drop("Unnamed: 0", axis=1)


In [3]:
"""
2.0
Tokenize the sentences and words of the reviews with the tokenize module of NLTK.

Keep in mind that word_tokenize and sent_tokenize functions of the nltk.tokenize module should be used.
"""

def tokenizer(sentence):
    """Splits paragraphs to sentences, then sentences to bags of words.

        Function breaks longer reviews into sentences and then breaks,
        the sentences into individual words.
        
        Args:
            sentence: a single, or multiple, sentence as a str.
        
        Returns:
            List of lists of each sentence, broken into words.
        """
    tokenized_sentence = sent_tokenize(sentence)
    return_list = []
    for s in tokenized_sentence:
        words = word_tokenize(s)
        words = [word.lower() for word in words if word.isalnum()]
        if words != []:
            return_list.append(words)
        elif words == []:
            continue
    return return_list


In [4]:
"""
3.0
Download the opinion lexicon of NLTK by using the following command: nltk.download('opinion_lexicon'). Before you classify each word of the reviews, experiment with words and find out whether they are labeled as positive or negative.

Note that the dictionary contains various word-forms, not only stems.
"""

nltk.download("opinion_lexicon")
nltk.download("punkt")

[nltk_data] Downloading package opinion_lexicon to
[nltk_data]     C:\Users\chris\AppData\Roaming\nltk_data...
[nltk_data]   Package opinion_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\chris\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [5]:
def sentimentalyzer(review):
    """Calculates a numeric representation of sentence sentiment (-1 to +1).

        Function takes in tokenized reviews, calculates a sentiment for
        each sentence as a positive or negative percentage of the total
        words in a sentence. Then calculates a sentiment for the entire
        review as a sum of the sentence sentiments, capped at either -1
        or +1.
        
        Args:
            review: a tokenized sentence(s), as a list of lists
        
        Returns:
            A numeric sentiment, capped from -1 to +1.
        """

    tokenized = tokenizer(review)
    review_polarity = 0
    for sentence in tokenized:
        positive_word_count = 0
        negative_word_count = 0
        neutral_word_count = 0
        sentence_polarity = 0
        
        for word in sentence:
            if word in opinion_lexicon.positive():
                positive_word_count += 1
            elif word in opinion_lexicon.negative():
                negative_word_count += 1
            else:
                neutral_word_count += 1
        
        total_word_count = positive_word_count + negative_word_count + neutral_word_count
        if positive_word_count > negative_word_count:
            sentence_polarity = positive_word_count / total_word_count
        elif negative_word_count > positive_word_count:
            sentence_polarity = -(negative_word_count / total_word_count)
        else:
            sentence_polarity = 0
        review_polarity += sentence_polarity

    if review_polarity > 1:
        review_polarity = 1
    elif review_polarity < -1:
        review_polarity = -1
    return review_polarity


In [6]:
def sentimentalyzer_mark_neg(review):
    """Calculates a numeric representation of sentence sentiment (-1 to +1).

        Function takes in tokenized reviews, calculates a sentiment for
        each sentence as a positive or negative percentage of the total
        words in a sentence. Then calculates a sentiment for the entire
        review as a sum of the sentence sentiments, capped at either -1
        or +1.

        **Function is based on sentimentalyzer() but includes additional
        code to catch negation words.
        
        Args:
            review: a tokenized sentence(s), as a list of lists
        
        Returns:
            A numeric sentiment, capped from -1 to +1.
        """

    tokenized = tokenizer(review)
    review_polarity = 0
    for sentence in tokenized:
        sentence = nltk.sentiment.util.mark_negation(sentence)
        positive_word_count = 0
        negative_word_count = 0
        neutral_word_count = 0
        sentence_polarity = 0
        
        for word in sentence:
            if word in opinion_lexicon.positive():
                positive_word_count += 1
            elif word in opinion_lexicon.negative() or word.find("_NEG") != -1:
                negative_word_count += 1
            else:
                neutral_word_count += 1
        
        total_word_count = positive_word_count + negative_word_count + neutral_word_count
        if positive_word_count > negative_word_count:
            sentence_polarity = positive_word_count / total_word_count
        elif negative_word_count > positive_word_count:
            sentence_polarity = -(negative_word_count / total_word_count)
        else:
            sentence_polarity = 0
        review_polarity += sentence_polarity

    if review_polarity > 1:
        review_polarity = 1
    elif review_polarity < -1:
        review_polarity = -1
    return review_polarity

In [7]:
def sentimentalyzer_mark_neg_stop_words(review):
    """Calculates a numeric representation of sentence sentiment (-1 to +1).

        Function takes in tokenized reviews, calculates a sentiment for
        each sentence as a positive or negative percentage of the total
        words in a sentence. Then calculates a sentiment for the entire
        review as a sum of the sentence sentiments, capped at either -1
        or +1.

        **Function is based on sentimentalyzer() but includes additional
        code to catch negation words and stop words.
        
        Args:
            review: a tokenized sentence(s), as a list of lists
        
        Returns:
            A numeric sentiment, capped from -1 to +1.
        """

    tokenized = tokenizer(review)
    review_polarity = 0
    for sentence in tokenized:
        sentence = [word for word in sentence if not word in stopwords.words("english")]
        sentence = nltk.sentiment.util.mark_negation(sentence)
        positive_word_count = 0
        negative_word_count = 0
        neutral_word_count = 0
        sentence_polarity = 0
        
        for word in sentence:
            if word in opinion_lexicon.positive():
                positive_word_count += 1
            elif word in opinion_lexicon.negative() or word.find("_NEG") != -1:
                negative_word_count += 1
            else:
                neutral_word_count += 1
        
        total_word_count = positive_word_count + negative_word_count + neutral_word_count
        if positive_word_count > negative_word_count:
            sentence_polarity = positive_word_count / total_word_count
        elif negative_word_count > positive_word_count:
            sentence_polarity = -(negative_word_count / total_word_count)
        else:
            sentence_polarity = 0
        review_polarity += sentence_polarity

    if review_polarity > 1:
        review_polarity = 1
    elif review_polarity < -1:
        review_polarity = -1
    return review_polarity


In [8]:
def generate_sentiment_score(dataframe, new_column, target_column, function_to_apply):
    """Applies a custom function to a target column and adds results as a new column.

        Function takes in a dataframe and applies a user defined function
        against a user defined column and saves the results in a new column.
        
        Args:
            dataframe: any dataframe
            new_column: the name of the column that will hold the function results
            target_column: the column in the dataframe to apply the function to
            function_to_apply: the function to be applied to the target column
        
        Returns:
            The original dataframe with a new column.
        """
    
    dataframe[new_column] = dataframe[target_column].map(lambda s: function_to_apply(s))
    return dataframe


In [9]:
def save_to_csv(dataframe, file_name, cols, header_titles):
    """Saves the passed dataframe to a .csv"""
    dataframe.to_csv(file_name, columns=cols, header=header_titles)


In [10]:
def create_histogram(dataframe, x, y, file_name):
    """Creates a histogram and saves to file."""
    chart = (
    alt.Chart(dataframe)
    .mark_bar()
    .encode(
        x=x,
        y=y,
        )
    )
    alt.renderers.enable("altair_viewer")
    chart.save(file_name)


In [11]:
def create_scatter_plot(dataframe, x, y, file_name):
    """Creates a scatter plot and saves to file."""
    chart = (
    alt.Chart(dataframe)
    .mark_circle(size=60)
    .encode(
        x=x,
        y=y,
        )
    )
    alt.renderers.enable("altair_viewer")
    chart.save(file_name)


In [46]:
def scipy_correlation(x, y, method="pearson"):
    """Print the correlation between two pandas series."""
    try:
        if method=="pearson":
            r, p = scipy.stats.pearsonr(x, y)
        elif method=="spearman":
            r, p = scipy.stats.spearmanr(x, y)
        elif method=="kendalltau":
            r, p = scipy.stats.kendalltau(x, y)
        print(f"Using {method} the R-value is: {r}, and it has a p-value of: {p}")
    except:
        print("That is not one of the available correlation methods.")


In [25]:
def scipy_linreg(x, y):
    """Print the intercept and coefficient of a linear regression."""
    regr = scipy.stats.linregress(x, y)
    print(f"Intercept value is: {regr.intercept}.")
    print(f"Coefficient value is: {regr.slope}.")
    print(f"Coefficient value is: {regr.slope}.")
    print(f"p-value is: {regr.pvalue}.")
    print(f"R-value is: {regr.rvalue}.")


In [15]:
def bin_sentiment_scores(dataframe, column, breaks=[-1, -.75, -0.5, -0.25, 0, 0.25, 0.50, 0.75, 1, 1.25]):
    """Returns histogram values as a dataframe."""
    count, division = np.histogram(dataframe[column], breaks)
    sentiment_histogram = pd.DataFrame({"count": count, "division": division[:-1]})
    return sentiment_histogram


In [16]:
"""
4.0
Classify each review in a scale of –1 to +1. The higher the score is, the more positive the review is.

It is recommended to score the reviews in two steps. First score the sentences of the reviews from –1 to 1 based on the sum of the positive and negative words they include. Then count the sentiment score of the reviews, which you preliminary sliced into sentences.
Don’t forget that NLTK opinion lexicon neither contains uppercase words, nor punctuation marks.
"""

generate_sentiment_score(small_corpus, "sentiment_score", "reviews", function_to_apply=sentimentalyzer)
generate_sentiment_score(small_corpus, "sentiment_score_mark_neg", "reviews", function_to_apply=sentimentalyzer_mark_neg)
generate_sentiment_score(small_corpus, "sentiment_score_mark_neg_stop_words", "reviews", function_to_apply=sentimentalyzer_mark_neg_stop_words)


Unnamed: 0,ratings,reviews,sentiment_score,sentiment_score_mark_neg,sentiment_score_mark_neg_stop_words
0,1,THE DAY GAMING CRIED...,0.000000,0.000000,0.0
1,1,One Star,0.000000,0.000000,0.0
2,1,"these do not work at all, all i get ...",0.111111,-0.666667,0.5
3,1,last gen game,0.000000,0.000000,0.0
4,1,Waste,-1.000000,-1.000000,-1.0
...,...,...,...,...,...
4495,5,Five Stars,0.000000,0.000000,0.0
4496,5,Five Stars,0.000000,0.000000,0.0
4497,5,Five Stars,0.000000,0.000000,0.0
4498,5,Awesome!,1.000000,1.000000,1.0


In [17]:
save_to_csv(small_corpus, "small_corpus_sentiment.csv", cols=["ratings", "reviews", "sentiment_score", "sentiment_score_mark_neg", "sentiment_score_mark_neg_stop_words"], header_titles=["ratings", "reviews", "sentiment_score", "sentiment_score_mark_neg", "sentiment_score_mark_neg_stop_words"])

In [18]:
"""
5.0
Compare the scores of the product reviews with the product ratings using a plot. In this step, you need to accomplish three sub-tasks.

5.1 
Create a plot of the distribution of the ratings. Explore which is the most common rating.

You can use Altair to create the plot.
"""

create_histogram(small_corpus, "ratings", "count()", "small_corpus_ratings_histogram.html")


In [19]:
"""
5.2 
Create a plot of the distribution of the sentiment scores. Explore which is the most common.

Note that the scores are not discrete numbers.
It is recommended to use NumPy histogram to put the sentiment scores into bins.
"""

sentiment_histogram = bin_sentiment_scores(small_corpus, "sentiment_score")
create_histogram(sentiment_histogram, "division", "count", "small_corpus_sentiment_histogram.html")


In [20]:
"""
5.3 
Create a plot about the relation of the sentiment scores and product ratings. What is your impression? Do they correlate?
"""

create_scatter_plot(small_corpus, "ratings", "sentiment_score", "ratings_vs_sentiment_scatter.html")


In [47]:
"""
6.0 
Measure the correlation of the sentiment scores and product ratings. Try out more methods. Study the contradictions, namely those cases where the rating is high but the score is low, or the other way around.

Choose the most effective correlation measure.
"""
corr_methods = ["pearson", "spearman", "kendalltau"]
for m in corr_methods:
    scipy_correlation(small_corpus["ratings"], small_corpus["sentiment_score"], method=m)
    print("\n")
scipy_linreg(small_corpus["ratings"], small_corpus["sentiment_score"])

Using pearson the R-value is: 0.4203153729322831, and it has a p-value of: 3.829044443872697e-192


Using spearman the R-value is: 0.431086557814956, and it has a p-value of: 4.2847721094879914e-203


Using kendalltau the R-value is: 0.34825337832628694, and it has a p-value of: 7.62443898302215e-187


Intercept value is: -0.2071126957347555.
Coefficient value is: 0.0803880898755916.
Coefficient value is: 0.0803880898755916.
p-value is: 3.829044443866182e-192.
R-value is: 0.4203153729322852.


In [22]:
"""
7.0
Improve your sentiment analyzer in order to reduce contradictory cases. Handle negation, as most of those cases are contradictory when there is negation in the sentence (e.g., no problem).

It is recommended to use the mark_negation function of the NLTK sentiment.utils module.
Don’t forget to complete your vocabulary with negated words.
"""

sentiment_histogram = bin_sentiment_scores(small_corpus, "sentiment_score_mark_neg")
create_histogram(sentiment_histogram, "division", "count", "small_corpus_sentiment_mark_neg_histogram.html")


sentiment_histogram = bin_sentiment_scores(small_corpus, "sentiment_score_mark_neg_stop_words")
create_histogram(sentiment_histogram, "division", "count", "small_corpus_sentiment_mark_neg_stop_words_histogram.html")

In [23]:
create_scatter_plot(small_corpus, "ratings", "sentiment_score_mark_neg", "ratings_vs_sentiment_mark_neg_scatter.html")
create_scatter_plot(small_corpus, "ratings", "sentiment_score_mark_neg_stop_words", "ratings_vs_sentiment_mark_neg_stop_words_scatter.html")


In [52]:
for m in corr_methods:
    scipy_correlation(small_corpus["ratings"], small_corpus["sentiment_score_mark_neg"], method=m)
    print("\n")
scipy_linreg(small_corpus["ratings"], small_corpus["sentiment_score_mark_neg"])

print("\n")
for m in corr_methods:
    scipy_correlation(small_corpus["ratings"], small_corpus["sentiment_score_mark_neg_stop_words"], method=m)
    print("\n")
scipy_linreg(small_corpus["ratings"], small_corpus["sentiment_score_mark_neg_stop_words"])


Using pearson the R-value is: 0.4627921104296059, and it has a p-value of: 9.89857705849804e-238


Using spearman the R-value is: 0.4907435118544055, and it has a p-value of: 1.852594628034942e-271


Using kendalltau the R-value is: 0.3913151273993718, and it has a p-value of: 3.81903593259519e-239


Intercept value is: -0.34139602551167797.
Coefficient value is: 0.10194331057012819.
Coefficient value is: 0.10194331057012819.
p-value is: 9.89857705841941e-238.
R-value is: 0.46279211042961005.


Using pearson the R-value is: 0.3897947656165339, and it has a p-value of: 3.276856828537041e-163


Using spearman the R-value is: 0.4158543064820247, and it has a p-value of: 1.0066814665397989e-187


Using kendalltau the R-value is: 0.33656206645715886, and it has a p-value of: 1.1511098722279842e-172


Intercept value is: -0.23923025298538136.
Coefficient value is: 0.09705425045040435.
Coefficient value is: 0.09705425045040435.
p-value is: 3.2768568285364585e-163.
R-value is: 0.38979476561653