# Natural Language Processing (NLP): Bag-of-words model

## Import statements

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import string

# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and
# later releases dropped the old aliases, so prefer the new name and only fall
# back to the legacy one on older installs.
pastel_style = 'seaborn-v0_8-pastel'
if pastel_style not in plt.style.available:
    pastel_style = 'seaborn-pastel'
plt.style.use(pastel_style)

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay

## Read in csv data
Steam is a video game digital distribution service with a vast community of gamers globally. A lot of gamers write reviews on the game page and have the option of choosing whether they would recommend this game to others or not.  
**review_id** --> Unique ID for each review  
**title** --> Title of the game  
**year** --> Year in which the review was posted  
**user_review** --> Full Text of the review posted by a user  
**user_suggestion** --> (Target) Whether the user marked the game as Recommended (1) or Not Recommended (0)

In [None]:
# Location of the Steam review dataset used throughout this notebook.
STEAM_CSV_URL = "https://raw.githubusercontent.com/BaroqueObama/hhs-ws-bag-of-words/main/steam.csv"

# ISO-8859-1 (Latin-1) assigns a character to every possible byte, so decoding
# never fails; presumably chosen because the file is not valid UTF-8.
steam = pd.read_csv(STEAM_CSV_URL, encoding='ISO-8859-1')
display(steam)

### Drop unneeded columns (review_id, title, year) and NaN values if any

In [None]:
# Drop the metadata columns by name instead of by position: a positional
# in-place drop removes three more columns every time the cell is re-run, which
# would delete the review text and the label. errors="ignore" makes a second
# run a no-op. Rows with missing values are then discarded.
steam = steam.drop(columns=["review_id", "title", "year"], errors="ignore").dropna(axis=0)
display(steam)

### Plot the distribution of Recommended and Not Recommended game reviews

In [None]:
# The label is looked up by position (second column of the current frame);
# presumably this is user_suggestion once the metadata columns are dropped.
label_column = steam.columns[1]
recommendation_counts = steam[label_column].value_counts()

fig, ax = plt.subplots(figsize=(10, 5))
recommendation_counts.plot(kind='bar', title='Count of Game Reviews by Recommendation', ax=ax)
ax.set_xlabel('Recommend = 1, Not Recommend = 0')
plt.show()

## Tokenize a string by splitting it into a list of words (split at each space)

In [None]:
# Sample sentence for the tokenization demos: it contains contractions, a
# hyphenated word, a number, a possessive apostrophe and sentence punctuation.
test_string = "I ain't got much sleeping last-night. Don't you think 3 hours' sleep is enough?"
print(test_string)

In [None]:
tokens = # Hint, use .split()
print(tokens)

### Solution Here
<details>
  <summary>Click to reveal answer. </summary>

```python
tokens = test_string.split()
print(tokens)
```
(You still have to copy paste this code and run it)
</details>

## Now tokenize all the reviews in the DataFrame

In [None]:
steam["basic_tokenize"] = steam["user_review"].apply(lambda review: review #Apply your tokenization to the reviews here)
display(steam)

### Solution Here
<details>
  <summary>Click to reveal answer. </summary>

```python
steam["basic_tokenize"] = steam["user_review"].apply(lambda review: review.split(" "))
display(steam)
```
(You still have to copy paste this code and run it)
</details>

## Let's plot what kind of tokens were made!

In [None]:
# One panel per subset of reviews: all, recommended only, not recommended only.
panels = [
    (steam, 'Total: Tokenized word frequency'),
    (steam[steam["user_suggestion"]==1], 'Recommended: Tokenized word frequency'),
    (steam[steam["user_suggestion"]==0], 'Not Recommended: Tokenized word frequency'),
]

fig, axes = plt.subplots(3, 1)
for ax, (subset, panel_title) in zip(axes, panels):
    # explode() yields one row per token so value_counts() can rank them.
    top_tokens = subset["basic_tokenize"].explode().value_counts().head(10)
    top_tokens.plot(kind='bar', title=panel_title, ax=ax)
plt.tight_layout()
plt.show()

## Let's now turn these tokens into something a model can comprehend (numbers)!

### X is the input (tokens), y is what we try to predict (1 = Recommend, 0 = Does not recommend)

In [None]:
# Each review becomes one row with a 0/1 entry per distinct token seen in the
# corpus (presence only, not counts).
basic_token_lists = steam["basic_tokenize"].tolist()
mlb = MultiLabelBinarizer()
X = mlb.fit_transform(basic_token_lists)
y = steam["user_suggestion"]
print(X.shape)

### Let's split our data into train and test sets and train our Naive Bayes Model!

In [None]:
# Hold out a quarter of the reviews for evaluation (0.25 is what
# train_test_split uses when no size is given); the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)
model = MultinomialNB()
model.fit(X_train, y_train)

## Evaluate the Model

In [None]:
# Score the held-out reviews, report accuracy and draw the confusion matrix.
y_pred = model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", test_accuracy)
ConfusionMatrixDisplay.from_predictions(y_test, y_pred)
plt.show()

## Test the model for yourself!

In [None]:
def try_basic_model(game_review):
    """Classify a single review with the model trained on whitespace tokens.

    Relies on the notebook-level ``mlb`` (fitted binarizer) and ``model``
    (fitted classifier) that are in scope when the function is called.

    Args:
        game_review (str): Free-text review to classify.

    Returns:
        str: A sentence stating the predicted label followed by the tokens
        that were extracted from the review.
    """
    # Split on single spaces; this has to match how the training tokens were
    # produced (the notebook's solution uses review.split(" ")).
    tokens = game_review.split(" ")

    # One row with a 0/1 flag per vocabulary entry; a set keeps each lookup
    # constant-time instead of rescanning the token list per entry.
    present = set(tokens)
    mat = [[int(word in present) for word in mlb.classes_]]

    # predict() returns one label per input row, so take the only element
    # (the returned array has no `.value` attribute).
    if model.predict(mat)[0] == 1:
        return f"Model predicts Recommend. Tokens: {tokens}"
    else:
        return f"Model predicts Does Not Recommend. Tokens: {tokens}"


### Type in any hypothetical game review

In [None]:
# Example call; replace the text with your own review.
try_basic_model("This game is amazing!")

# Improving the model: Tokenize better

In [None]:
# Same sample sentence as in the basic section, redefined so this part of the
# notebook can be followed without scrolling back.
test_string = "I ain't got much sleeping last-night. Don't you think 3 hours' sleep is enough?"
print(test_string)

### Split the text into more detail using nltk.word_tokenize

In [None]:
# Tokenize with NLTK instead of a plain whitespace split.
# NOTE(review): word_tokenize needs NLTK's tokenizer data to be installed and no
# nltk.download(...) call is visible in this notebook; confirm the environment
# provides it.
tokens = word_tokenize(test_string)
print(tokens)

### Stop words: Frequent words that don't carry much meaning (Using stopwords)
Try to remove these words

In [None]:
# Keep the English stop words in a set so membership tests are fast.
# NOTE(review): requires the NLTK "stopwords" corpus; no nltk.download(...) call
# is visible in this notebook; confirm the environment provides it.
stop_words = set(stopwords.words('english'))
print(stop_words)

In [None]:
# NLTK's English stop-word list is lowercase, so compare the lowercased token;
# a case-sensitive test would let capitalised stop words such as "I" or "Do"
# through. The token itself is kept with its original casing.
new_tokens = []
for token in tokens:
    if token.lower() not in stop_words:
        new_tokens.append(token)
print(new_tokens)

### Perform stemming to homogenize same words with different endings (using PorterStemmer())

In [None]:
# A single stemmer instance is reused for every token.
stemmer = PorterStemmer()
newer_tokens = [stemmer.stem(token) for token in new_tokens]
print(newer_tokens)

### Remove punctuation (using string.punctuation)

In [None]:
# Display the characters that Python's string module lists as punctuation.
string.punctuation

In [None]:
# `in` on a str is a substring test: a token is dropped when it occurs inside
# string.punctuation, which covers every single punctuation character.
final_tokens = [token for token in newer_tokens if token not in string.punctuation]
print(final_tokens)

## Now Apply these techniques on the entire DataFrame

In [None]:
stop_words = set(stopwords.words('english'))
steam["advanced_tokenize"] = steam["user_review"].apply(lambda review: review #You could apply your operations here, or in multiple rounds using for loops)
display(steam["advanced_tokenize"])

### Solution Here
<details>
  <summary>Click to reveal answer. </summary>

```python
stop_words = set(stopwords.words('english'))
steam["advanced_tokenize"] = steam["user_review"].apply(lambda review: list(filter(lambda token: token not in string.punctuation, [PorterStemmer().stem(word) for word in nltk.word_tokenize(review) if word.lower() not in stop_words])))
display(steam["advanced_tokenize"])
```
(You still have to copy paste this code and run it)
</details>

## Let's see what kinds of tokens we have

In [None]:
# One panel per subset of reviews: all, recommended only, not recommended only.
panels = [
    (steam, 'Total: Tokenized word frequency'),
    (steam[steam["user_suggestion"]==1], 'Recommended: Tokenized word frequency'),
    (steam[steam["user_suggestion"]==0], 'Not Recommended: Tokenized word frequency'),
]

fig, axes = plt.subplots(3, 1)
for ax, (subset, panel_title) in zip(axes, panels):
    # explode() yields one row per token so value_counts() can rank them.
    top_tokens = subset["advanced_tokenize"].explode().value_counts().head(10)
    top_tokens.plot(kind='bar', title=panel_title, ax=ax)
plt.tight_layout()
plt.show()

## Create X and y set

In [None]:
# Refit the binarizer on the cleaned tokens; this replaces the `mlb`, `X` and
# `y` created for the basic model.
advanced_token_lists = steam["advanced_tokenize"].tolist()
mlb = MultiLabelBinarizer()
X = mlb.fit_transform(advanced_token_lists)
y = steam["user_suggestion"]
print(X.shape)

## Split Data and Train

In [None]:
# Same split settings as for the basic model (0.25 is the default hold-out
# fraction, seed fixed at 42) so the two accuracies are comparable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)
model = MultinomialNB()
model.fit(X_train, y_train)

## Evaluate Model

In [None]:
# Score the held-out reviews, report accuracy and draw the confusion matrix.
y_pred = model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", test_accuracy)
ConfusionMatrixDisplay.from_predictions(y_test, y_pred)
plt.show()

## Try it out yourself with your own reviews!
(Make sure you tokenize the words the same way in this function as you did when you trained the model)

In [None]:
def try_advanced_model(game_review):
    """Classify a single review with the model trained on the cleaned tokens.

    The review goes through the same steps as the notebook's solution for the
    "advanced_tokenize" column: ``nltk.word_tokenize``, case-insensitive removal
    of English stop words, Porter stemming, then removal of tokens found in
    ``string.punctuation``.

    Relies on the notebook-level ``mlb`` (fitted binarizer) and ``model``
    (fitted classifier) that are in scope when the function is called.

    Args:
        game_review (str): Free-text review to classify.

    Returns:
        str: A sentence stating the predicted label followed by the tokens
        that were extracted from the review.
    """
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()

    # Tokenize the raw text. It is not passed through str.translate first:
    # string.punctuation is not a translation table, and using it as one would
    # turn tabs and newlines into '*' and '+', which the training data never saw.
    stemmed = [
        stemmer.stem(word)
        for word in nltk.word_tokenize(game_review)
        if word.lower() not in stop_words
    ]
    tokens = [token for token in stemmed if token not in string.punctuation]

    # One row with a 0/1 flag per vocabulary entry; a set keeps each lookup
    # constant-time instead of rescanning the token list per entry.
    present = set(tokens)
    mat = [[int(word in present) for word in mlb.classes_]]

    # predict() returns one label per input row, so take the only element
    # (the returned array has no `.value` attribute).
    if model.predict(mat)[0] == 1:
        return f"Model predicts Recommend. Tokens: {tokens}"
    else:
        return f"Model predicts Does Not Recommend. Tokens: {tokens}"

In [None]:
# Example call; replace the text with your own review.
try_advanced_model("This game is amazing!")