## Testing hypothesis 2
---

**_Hypothesis_**: Reviews with more positive sentiment words receive higher helpfulness ratings.

- **Metric**: Mean helpfulness rating as a function of the number of positive and negative words in a review.

- **Model**: Multinomial Naive Bayes.

- **Description**:

  - Use a Naive Bayes classifier (NBC) to predict the sentiment of a review.
  - Extract the most useful words from the classifier.
  - Compute the mean helpfulness ratings for the most useful words.  

**Missing Values**:

  - `review/score`: remove the entire sample
  - `review/text`: remove the entire sample
  - `review/helpfulness`: remove the entire sample

**Data Transformation**:

  - `review/score`: Assign class 1 to scores 4 and 5, and class 0 to scores 1 and 2. Samples with score 3 are discarded as neutral.
  - `review/text`: Create the bag-of-words (BoW) representation of the text. Fit a Multinomial Naive Bayes classifier (MNBC) and count the number of positive and negative words in each review. Plot the results.
  - `review/helpfulness`: $helpfulness = \frac{x}{y} \sqrt{y}$, where $x$ is `N_helpful` and $y$ is `Tot_votes`.

---

In [None]:
import nltk
import pymongo
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy
import seaborn as sns
import scipy.stats as stats

from scipy_analyze import *
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# Fetch the NLTK stopword corpus so that stopwords.words('english') can be
# used when cleaning the review text.
nltk.download('stopwords')

In [None]:
# Open a client on the local MongoDB server, then take handles on the
# `spark_db` database and its `book_reviews` collection.
client = pymongo.MongoClient('mongodb://localhost:27017/')
database = client.get_database('spark_db')
books = database.get_collection('book_reviews')

In [None]:
# Stage 1 ($match): keep only documents that have a review text, a score
# different from 3 (a score of 3 is treated as a neutral review) and a
# non-zero total number of votes, so the division in stage 2 is well defined.
pipeline_remove = {
    '$match': {
        'review/text': {'$exists': True},
        'review/score': {'$exists': True, '$ne': 3},
        'Tot_votes': {'$exists': True, '$ne': 0},
    }
}

# Building blocks of the projection, named for readability.
helpful_fraction = {'$divide': ['$N_helpful', '$Tot_votes']}
votes_sqrt = {'$sqrt': '$Tot_votes'}
is_positive = {'$in': ['$review/score', [4, 5]]}

# Stage 2 ($project): keep the text, derive the helpfulness rate
# (N_helpful / Tot_votes) * sqrt(Tot_votes), and add a `class` field that is
# 1 when the score is 4 or 5 and 0 otherwise.
pipeline_class = {
    '$project': {
        '_id': 0,
        'review/text': 1,
        'review/helpfulness_rate': {
            '$multiply': [helpful_fraction, votes_sqrt]
        },
        'class': {
            '$cond': {'if': is_positive, 'then': 1, 'else': 0}
        },
    }
}

# Run both stages on the collection and materialise the result as a DataFrame.
books_removed = books.aggregate([pipeline_remove, pipeline_class])
df = pd.DataFrame(list(books_removed))

In [None]:
# Normalise the review text: replace every character that is neither a word
# character nor whitespace with a space, then lowercase everything.
# The pattern is a raw string so that `\w` and `\s` reach the regex engine
# as-is instead of being parsed as (invalid) string escape sequences.
df['review/text'] = df['review/text'].replace(r'[^\w\s]', ' ', regex=True)
df['review/text'] = df['review/text'].str.lower()

# English stopwords. `stop` stays a list as before; the frozenset copy gives
# constant-time membership tests instead of a list scan for every token.
stop = stopwords.words('english')
stop_words = frozenset(stop)

# Drop the stopwords and every token of one or two characters (only tokens
# longer than 2 characters are kept) in a single pass over each review.
df['review/text'] = df['review/text'].apply(
    lambda text: ' '.join(
        word for word in text.split()
        if word not in stop_words and len(word) > 2))
df.head()

In [None]:
# Histogram (100 bins) of the helpfulness rate over all loaded reviews.
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(df['review/helpfulness_rate'], bins=100)
ax.set_xlabel('review/helpfulness_rate')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Bag-of-words representation of the cleaned text, with the vocabulary capped
# at 2000 terms; the target is the binary `class` column.
vectorizer = CountVectorizer(max_features=2000)
X = vectorizer.fit_transform(df['review/text'])
y = df['class']

# Hold out 20% of the samples (fixed seed) and fit a Multinomial Naive Bayes
# classifier on the remaining ones.
split_options = {'test_size': 0.2, 'random_state': 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

nb = MultinomialNB()
nb.fit(X_train, y_train)

In [None]:
# Vocabulary terms, in the same order as the classifier's feature columns.
feature_names = vectorizer.get_feature_names_out()

# Per-word difference between the log-probabilities stored in row 1 and row 0
# of the fitted model (assumes nb.classes_ is [0, 1], i.e. row 1 is the
# positive class — TODO confirm). Larger values lean towards class 1.
log_prob_row0 = nb.feature_log_prob_[0, :]
log_prob_row1 = nb.feature_log_prob_[1, :]
pos_neg_ratio = log_prob_row1 - log_prob_row0
pos_neg_ratio_sorted = np.argsort(pos_neg_ratio)

# Rank the words from the highest to the lowest ratio and keep both ends.
words = pd.DataFrame(
    {'Word': feature_names, 'Pos_neg_ratio': pos_neg_ratio}
).sort_values(by='Pos_neg_ratio', ascending=False)
plot_words = pd.concat([words.iloc[:10], words.iloc[-10:]])

# Bar chart of the 10 highest-ratio and the 10 lowest-ratio words.
plot_words.plot.bar(
    x='Word',
    y='Pos_neg_ratio',
    rot=45,
    figsize=(10, 6),
    grid=True,
    title='Most 20 impactful words',
    legend=False,
)


In [None]:
# Number of vocabulary entries, taken from the top of the positive-vs-negative
# log-probability ranking, that are treated as "positive" words.
n_positive_words = 800

# Build the lookup set once. Testing membership against the NumPy slice inside
# the lambda would re-create the slice and scan it linearly for every token.
positive_vocabulary = set(
    feature_names[pos_neg_ratio_sorted[-n_positive_words:]])

# The message reports the number of words actually used for the count.
print(f"Plot with {n_positive_words} most positive words:")

# New column: how many tokens of each review belong to the positive vocabulary
# (repeated tokens are counted every time they occur).
df['positive_words'] = df['review/text'].apply(
    lambda text: sum(word in positive_vocabulary for word in text.split()))

# Scatter plot of the helpfulness rate against the number of positive words.
plt.figure(figsize=(10, 6))
plt.scatter(df['positive_words'], df['review/helpfulness_rate'], alpha=0.2)
plt.xlabel('Number of positive words')
plt.ylabel('Helpful score')
plt.show()

In [None]:
# Run the `scipy_analize` helper (not defined in this notebook; presumably
# provided by the star import from `scipy_analyze`) on the positive-word
# counts and the helpfulness rate, passing each series with its name.
# NOTE(review): the meaning of the trailing [1, 2, 3, 4] argument is decided
# by the helper and cannot be told from here — confirm against its definition.
scipy_analize(df['positive_words'],'positive_words', df['review/helpfulness_rate'], 'review/helpfulness_rate', [1,2,3,4])


In [None]:
# Bin edges for the number of positive words. pd.cut builds the right-closed
# intervals (0, 10], (10, 20], ..., (175, 200] and labels each interval with
# its upper edge.
# NOTE(review): reviews with 0 or with more than 200 positive words fall
# outside every bin and are therefore absent from the plots below — confirm
# that this is intended.
groups = [0, 10, 20, 50, 75, 100, 125, 150, 175, 200]
df['length_bin'] = pd.cut(df['positive_words'], bins=groups, labels=groups[1:])

# Box plot of the helpfulness rate for each positive-word bin (outliers hidden).
plt.figure(figsize=(15, 10))
sns.boxplot(x='length_bin', y='review/helpfulness_rate',
            data=df, palette='rainbow', showfliers=False)
plt.title('Number of Positive Words vs Helpfulness Rate')
plt.xlabel('Number of positive words')
plt.ylabel('Helpfulness Rate')
plt.show()

# Within each bin: Kendall's tau between the positive-word count and the
# helpfulness rate, followed by a scatter plot of the bin's samples.
for el in groups[1:]:
    dataframe = df[df['length_bin'] == el]
    if dataframe.empty:
        # Nothing to correlate or plot for a bin without samples.
        print(f'Group number: {el}\nNo samples in this group\n')
        continue
    corr, pval = scipy.stats.kendalltau(
        dataframe['positive_words'], dataframe['review/helpfulness_rate'])
    print(
        f'Group number: {el}\nCorrelation Coefficient: {corr}\nP-value: {pval}\n')
    # Draw on an explicit Axes: DataFrame.plot opens its own figure when no
    # `ax` is given, so a preceding bare plt.figure() would stay empty.
    _, ax = plt.subplots(figsize=(15, 10))
    dataframe.plot(kind='scatter', x='positive_words', y='review/helpfulness_rate',
                   ax=ax,
                   title=f'Number of Positive Words vs Helpfulness Rate in Group {el}')
    plt.show()

In [None]:
# Close the MongoDB client created at the start of the notebook.
client.close()