# Naive Bayes classifier: Google Play Store reviews

## Notebook set-up

In [None]:

# Python standard library imports
import string
from pathlib import Path

# PyPI imports
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB

# Download the NLTK resources named 'punkt_tab' and 'wordnet' -- presumably
# the data needed by the nltk.word_tokenize and WordNetLemmatizer calls made
# during text preprocessing further down
nltk.download('punkt_tab')
nltk.download('wordnet')

## 1. Data loading

### 1.1. Load data from URL

In [None]:
# Read the reviews CSV directly from the URL given in the project tutorial
reviews_url = 'https://raw.githubusercontent.com/4GeeksAcademy/naive-bayes-project-tutorial/main/playstore_reviews.csv'
data_df = pd.read_csv(reviews_url)

### 1.2. Save a local copy

In [None]:
# Ensure the raw data directory exists (parents are created as needed and an
# already-existing directory is not an error)
raw_data_dir = Path('../data/raw')
raw_data_dir.mkdir(parents=True, exist_ok=True)

# Write the unmodified download to disk in parquet format
data_df.to_parquet('../data/raw/playstore_reviews.parquet')

### 1.3. Inspect

In [None]:
# Preview the first rows of the freshly loaded dataframe
data_df.head()

In [None]:
# Print the dataframe summary (columns, dtypes and non-null counts)
data_df.info()

## 2. Review length distribution

In [None]:
# Character count of every review, as a plain Python list
lengths = data_df['review'].str.len().tolist()

# Histogram of the character counts
plt.hist(lengths, bins=30, color='black')
plt.title('Review length distribution')
plt.xlabel('Characters')
plt.ylabel('Reviews')
plt.show()

# Summary statistics for the review lengths
print(f'Review length mean: {np.mean(lengths):.0f}')
print(f'Review length min: {min(lengths):.0f}')
print(f'Review length max: {max(lengths):.0f}')

### 2.1. Long reviews

In [None]:
# Keep each review's character count next to its text
data_df['review_length'] = lengths

# Select the reviews longer than 600 characters
is_long = data_df['review_length'] > 600
long_reviews = data_df[is_long]

# Show the first of them
print(long_reviews['review'].iloc[0])

In [None]:
# Show the second review from the long-review selection
print(long_reviews['review'].iloc[1])

In [None]:
# Show the third review from the long-review selection
print(long_reviews['review'].iloc[2])

### 2.2. Short reviews

In [None]:
# Character-count cutoff below which a review is treated as short
short_length = 30

# Select the reviews under the cutoff and preview them
is_short = data_df['review_length'] < short_length
short_reviews = data_df[is_short]
short_reviews.head()

### 2.3. Filter reviews by length

In [None]:
# Keep only reviews of 10 to 600 characters (both bounds inclusive), then
# print the summary of what remains
has_usable_length = data_df['review_length'].between(10, 600)
data_df = data_df[has_usable_length]
data_df.info()

## 3. Bag-of-words encoding

### 3.1. Text preprocessing

In [None]:
# Get just the review text column; the preprocessing cells that follow
# rebind this `reviews` series step by step
reviews = data_df['review']

In [None]:
# Lowercase the reviews so that words differing only by case end up as the
# same token
reviews = reviews.str.lower()

In [None]:
# Remove numbers: every run of one or more digits is replaced with the
# empty string (regex mode is enabled explicitly)
reviews = reviews.str.replace(r'\d+', '', regex=True)

In [None]:
# Remove punctuation
# Every character in string.punctuation is deleted via a translation table.
# This is used instead of str.replace with a '[...]' pattern because
# str.replace only interprets the pattern as a regular expression when
# regex=True is passed (the default is a literal match since pandas 2.0), in
# which case the character class would never match and nothing is removed.
punctuation_table = str.maketrans('', '', string.punctuation)
reviews = reviews.str.translate(punctuation_table)

In [None]:
# Lemmatize the text

def lemmatize_text(text):
    """Return `text` with every token replaced by its lemma.

    The text is split into tokens with nltk.word_tokenize, each token is run
    through a WordNetLemmatizer, and the results are joined back together
    with single spaces.
    """

    wordnet_lemmatizer = WordNetLemmatizer()
    tokens = nltk.word_tokenize(text)

    return ' '.join(wordnet_lemmatizer.lemmatize(token) for token in tokens)

# Run the lemmatizer over every review
reviews = reviews.apply(lemmatize_text)

## 4. Word count encoding

In [None]:
# Learn the vocabulary from the reviews (English stop words excluded) and
# encode each review as a row of per-word counts
vector_model = CountVectorizer(stop_words="english")
sparse_counts = vector_model.fit_transform(reviews)

# Expand the vectorizer output into a dense array
word_counts = sparse_counts.toarray()

print(f'Word count matrix has {word_counts.shape[0]} rows and {word_counts.shape[1]} columns')

In [None]:
# Learn the vocabulary from the reviews (English stop words excluded) and
# encode each review as a dense row of per-word counts
vector_model = CountVectorizer(stop_words="english")
count_matrix = vector_model.fit_transform(reviews)
word_counts = count_matrix.toarray()

print(f'Word count matrix has {word_counts.shape[0]} rows and {word_counts.shape[1]} columns')

# The vocabulary words, in the same order as the count columns
feature_names = vector_model.get_feature_names_out()

# Wrap the counts in a dataframe with one named column per word
word_count_df = pd.DataFrame(data=word_counts, columns=feature_names)

# Note: the polarity label is not attached to the dataframe at this point

# Preview the result
word_count_df.head()

### 4.1. Word count distribution

In [None]:
# Preview the first rows of the word count dataframe
word_count_df.head()

In [None]:
# Total occurrences of each word, i.e. the sum of every column; note that
# this rebinds `word_counts` from the count matrix to a list of totals
word_counts = word_count_df.sum().tolist()

# Histogram of the per-word totals, with a log-scaled y axis
plt.hist(word_counts, bins=30, color='black')
plt.yscale('log')
plt.title('Word count distribution')
plt.xlabel('Count')
plt.ylabel('Words')
plt.show()

### 4.2. Feature selection

In [None]:
# Pair every vocabulary word with its total count
total_word_count_df = pd.DataFrame({'word': word_count_df.columns, 'count': word_counts})

# Keep the words whose total count is strictly between 1 and 200
totals = total_word_count_df['count']
trimmed_word_count_df = total_word_count_df[(totals > 1) & (totals < 200)]

# Restrict the word count dataframe to the columns of the surviving words
kept_words = trimmed_word_count_df['word'].tolist()
word_count_df = word_count_df[kept_words]
word_count_df.info()

## 5. Data preparation

In [None]:
# Add the label back.
# word_count_df was built from a NumPy array and therefore carries a fresh
# 0..n-1 index, whereas data_df still has its original row labels (with gaps)
# after the length filter. Assigning the polarity Series directly would align
# on those labels, giving shifted and NaN targets, so the labels are attached
# by position instead. Rows are in the same order in both frames because the
# vectorizer was fitted on the review column of data_df.
# assign() returns a new dataframe, which also avoids writing into the
# column-sliced frame produced by feature selection.
polarity_labels = data_df['polarity'].to_numpy()
word_count_df = word_count_df.assign(polarity=polarity_labels)

# Train test split: 70% of the rows for training, 30% held out for testing
training_df, testing_df = train_test_split(word_count_df, test_size=0.3)

## 6. Model comparison

In [None]:
# Collector for the cross-validation results in long format: every fold of
# every model appends its name to 'Model' and its accuracy to 'Score'
cross_val_scores = {column: [] for column in ('Model', 'Score')}

### 6.1. Logistic regression

In [None]:
# Split the training data into the word count features and the polarity label
training_features = training_df.drop(columns='polarity')
training_labels = training_df['polarity']

# Cross-validate a default logistic regression with 7 folds, using all cores
scores = cross_val_score(
    LogisticRegression(),
    training_features,
    training_labels,
    cv=7,
    n_jobs=-1
)

# Record one (model name, accuracy in percent) entry per fold
cross_val_scores['Model'].extend('Logistic regression' for _ in scores)
cross_val_scores['Score'].extend(100*scores)

print(f'Cross-validation accuracy: {np.mean(scores)*100:.1f} +/- {np.std(scores)*100:.1f}%')

### 6.2. Multinomial Naive Bayes

In [None]:
# Split the training data into the word count features and the polarity label
training_features = training_df.drop(columns='polarity')
training_labels = training_df['polarity']

# Cross-validate a default multinomial naive Bayes model with 7 folds, using
# all cores
scores = cross_val_score(
    MultinomialNB(),
    training_features,
    training_labels,
    cv=7,
    n_jobs=-1
)

# Record one (model name, accuracy in percent) entry per fold
cross_val_scores['Model'].extend('Multinomial Naive Bayes' for _ in scores)
cross_val_scores['Score'].extend(100*scores)

print(f'Cross-validation accuracy: {np.mean(scores)*100:.1f} +/- {np.std(scores)*100:.1f}%')

### 6.3. Gaussian Naive Bayes

In [None]:
# Split the training data into the word count features and the polarity label
training_features = training_df.drop(columns='polarity')
training_labels = training_df['polarity']

# Cross-validate a default Gaussian naive Bayes model with 7 folds, using all
# cores
scores = cross_val_score(
    GaussianNB(),
    training_features,
    training_labels,
    cv=7,
    n_jobs=-1
)

# Record one (model name, accuracy in percent) entry per fold
cross_val_scores['Model'].extend('Gaussian Naive Bayes' for _ in scores)
cross_val_scores['Score'].extend(100*scores)

print(f'Cross-validation accuracy: {np.mean(scores)*100:.1f} +/- {np.std(scores)*100:.1f}%')

### 6.4. Bernoulli Naive Bayes

In [None]:
# Split the training data into the word count features and the polarity label
training_features = training_df.drop(columns='polarity')
training_labels = training_df['polarity']

# Cross-validate a default Bernoulli naive Bayes model with 7 folds, using
# all cores
scores = cross_val_score(
    BernoulliNB(),
    training_features,
    training_labels,
    cv=7,
    n_jobs=-1
)

# Record one (model name, accuracy in percent) entry per fold
cross_val_scores['Model'].extend('Bernoulli Naive Bayes' for _ in scores)
cross_val_scores['Score'].extend(100*scores)

print(f'Cross-validation accuracy: {np.mean(scores)*100:.1f} +/- {np.std(scores)*100:.1f}%')

### 6.5. Cross-validation performance

In [None]:
# Turn the collected per-fold results into a dataframe and draw one box per
# model
cv_results_df = pd.DataFrame(cross_val_scores)
sns.boxplot(data=cv_results_df, x='Model', y='Score')

plt.title('Model cross-validation performance comparison')
plt.ylabel('Accuracy (%)')
plt.xticks(rotation=45)
plt.show()

## 7. Final model evaluation

In [None]:
# Fit a multinomial naive Bayes model on the whole training split
model = MultinomialNB()
model.fit(training_df.drop('polarity', axis=1), training_df['polarity'])

# Predict the held-out test split and compute the overall accuracy in percent.
# scikit-learn metrics take their arguments as (y_true, y_pred); the same
# order is used for the confusion matrix below.
testing_predictions = model.predict(testing_df.drop('polarity', axis=1))
accuracy = accuracy_score(testing_df['polarity'], testing_predictions)*100

# Plot the confusion matrix, normalized over the true labels so that each
# row shows the fraction of that class assigned to each prediction
cm = confusion_matrix(testing_df['polarity'], testing_predictions, normalize='true')
cm_disp = ConfusionMatrixDisplay(confusion_matrix=cm)
_ = cm_disp.plot()

plt.title(f'Test set performance\noverall accuracy: {accuracy:.1f}%')
plt.xlabel('Predicted outcome')
plt.ylabel('True outcome')
plt.show()