# Imports

In [47]:
import string
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from nltk.sentiment import SentimentIntensityAnalyzer
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, mean_squared_error, r2_score
from sklearn.feature_extraction.text import CountVectorizer
from textblob import Word
from tqdm.notebook import tqdm
import re
import xgboost as xgb

## Load dataset

In [86]:
# Load the first 500,000 reviews from the newline-delimited Yelp JSON dump.
df = pd.read_json(
    'yelp_academic_dataset_review.json',
    lines=True,
    nrows=500000,
)

## Setup Dataframe

In [87]:
# Keep only the review text, the star rating and the three vote counters.
df = df[
    [
        'text',
        'stars',
        'useful',
        'funny',
        'cool',
    ]
]

## Pre Processing

### Fill NA scores with 0

In [88]:
# Fill missing values column by column. Only the numeric score columns get 0;
# a missing review body becomes an empty string instead, because the text
# cleaning step calls `.lower()` on every value and would fail on the int 0.
df = df.fillna({'stars': 0, 'useful': 0, 'funny': 0, 'cool': 0, 'text': ''})

### Clean Text

In [89]:
# Punctuation that is turned into a space so the words around it stay separate.
# Raw strings keep the backslashes intact for the regex engine instead of
# relying on Python passing unknown escapes (`\[`, `\]`, `\|`) through.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
# Everything except digits, lowercase letters, space, '#', '+' and '_' is deleted.
BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')
# English stop words as a set for fast membership tests.
# NOTE: the NLTK stopwords corpus must already be available locally here;
# `nltk.download('stopwords')` is only called in a later cell.
STOPWORDS = set(stopwords.words('english'))
def clean_text(text):
    """Normalise one review string for downstream tokenisation.

    The text is lowercased, characters matched by REPLACE_BY_SPACE_RE become
    spaces, characters matched by BAD_SYMBOLS_RE are removed, and finally
    every whitespace-separated token found in STOPWORDS is dropped.

    Returns the surviving tokens joined by single spaces.
    """
    lowered = text.lower()
    spaced = REPLACE_BY_SPACE_RE.sub(' ', lowered)
    stripped = BAD_SYMBOLS_RE.sub('', spaced)
    kept_tokens = [token for token in stripped.split() if token not in STOPWORDS]
    return ' '.join(kept_tokens)

In [90]:
# Overwrite the raw review text with its cleaned form, one review at a time.
df['text'] = df['text'].map(clean_text)

### Add metadata

In [91]:
# Number of whitespace-separated tokens in each (cleaned) review.
df['word_count'] = df['text'].str.split().str.len()
# Number of characters in each (cleaned) review.
df['char_count'] = df['text'].str.len()

### Stop words
Stop words are words deemed to add little or no value to a review, e.g. 'and'.
Stop words are downloaded from the nltk package.

In [None]:
# Make sure the NLTK stop-word corpus is available, then load the English list.
nltk.download('stopwords')
stop_words = stopwords.words('english')
# Testing membership against the list would scan it for every token of every
# review; build a set once so each lookup is constant time.
stop_word_lookup = set(stop_words)
# NOTE(review): if the text-cleaning cell has already run, `df['text']` no
# longer contains stop words, so this count is expected to be 0 — confirm
# whether it should be computed on the raw text instead.
df['stopword_count'] = df['text'].apply(
    lambda x: sum(word.lower() in stop_word_lookup for word in x.split())
)
# Share of tokens that are stop words (NaN for reviews with no tokens).
df['stopword_rate'] = df['stopword_count'] / df['word_count']

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/davecameron/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Drop stop words from every review. The lookup goes through a set built once
# rather than scanning the `stop_words` list for each token.
stop_word_lookup = set(stop_words)
df['stopwords_removed'] = df['text'].apply(
    lambda x: " ".join(word for word in x.split() if word not in stop_word_lookup)
)

### Irrelevant words not in stop_words
Looking for remaining words that might be considered stop words.

In [None]:
# Pool the tokens of all reviews and show the 30 most frequent ones.
(
    pd.Series(" ".join(df['stopwords_removed']).split())
    .value_counts()
    .head(30)
)

In [None]:
# Extra filler words to strip in addition to the NLTK stop-word list.
other_stop_words = [
    'get',
    'would',
    'got',
    'us',
    'also',
    'even',
    'ive',
    'im',
]

In [None]:
# Remove the extra filler words from the already stop-word-filtered reviews.
df['clean_reviews'] = df['stopwords_removed'].map(
    lambda review: " ".join(
        [token for token in review.split() if token not in other_stop_words]
    )
)

### Lemmatization
Lemmatization reduces words to their root form, e.g. 'running' is reduced to 'run'.

In [None]:
# Lemmatize every token of every cleaned review with TextBlob's `Word`.
df['lemmatized'] = df['clean_reviews'].map(
    lambda review: " ".join(
        [Word(token).lemmatize() for token in review.split()]
    )
)

In [None]:
# Token count after lemmatization ...
df['clean_word_count'] = df['lemmatized'].str.split().str.len()
# ... and its ratio to the token count recorded earlier in `word_count`.
df['clean_rate'] = df['clean_word_count'].div(df['word_count'])

In [None]:
# Discard the intermediate text columns and the exploratory statistics.
df = df.drop(
    columns=[
        'word_count',
        'char_count',
        'stopword_count',
        'stopword_rate',
        'stopwords_removed',
        'clean_reviews',
        'clean_word_count',
        'clean_rate',
    ]
)

In [None]:
def tranform_score(score):
    """Return ``score`` shifted down by one.

    Presumably used so that star ratings start at 0 rather than 1 — confirm
    against the model's label requirements.
    """
    offset = 1
    return score - offset


# Shift every star rating down by one via `tranform_score`.
df['stars'] = df['stars'].map(tranform_score)

## Adjustment

In [None]:
# Score every lemmatized review with NLTK's SentimentIntensityAnalyzer and
# keep the 'neg' and 'compound' values, keyed by the DataFrame index label.
sia = SentimentIntensityAnalyzer()
compound_polarity = {}
negativity = {}
# Iterate over the single column that is needed instead of building a full row
# object per review, and analyse each review once rather than once per score.
for i, review in tqdm(df['lemmatized'].items(), total=len(df)):
    scores = sia.polarity_scores(review)
    negativity[i] = scores['neg']
    compound_polarity[i] = scores['compound']

In [None]:
# Attach both sentiment scores as columns; the Series built from each dict is
# aligned to the DataFrame on its index labels.
df = df.assign(
    compound_polarity=pd.Series(compound_polarity),
    negativity=pd.Series(negativity),
)

In [None]:
# Nudge each star label using the sentiment scores. The three rules run in
# order and each one sees the result of the previous one, exactly as a per-row
# loop would. They are applied through boolean masks and written back to `df`:
# assigning to the `row` yielded by `DataFrame.iterrows()` only changes a
# temporary copy, which leaves the `stars` column untouched.
stars = df['stars'].copy()

# Rule 1: clearly negative wording lowers the label (never below 0).
lower_for_negativity = (df['negativity'] > 0.1) & (stars > 0)
stars.loc[lower_for_negativity] -= 1

# Rule 2: a low compound polarity lowers the label again (never below 0).
lower_for_polarity = (df['compound_polarity'] < 0.25) & (stars > 0)
stars.loc[lower_for_polarity] -= 1

# Rule 3: almost no negative wording raises the label (never above 4).
raise_for_low_negativity = (df['negativity'] < 0.01) & (stars < 4.0)
stars.loc[raise_for_low_negativity] += 1

df['stars'] = stars

# Counters tally individual adjustments, so a review lowered by both rule 1 and
# rule 2 contributes two to `down`.
down = int(lower_for_negativity.sum()) + int(lower_for_polarity.sum())
up = int(raise_for_low_negativity.sum())

print(f'Altered {down} rows down and {up} rows up')

## Sentiment Analysis

In [None]:
# Target labels: the star column.
y = df['stars']
# Features: bag-of-words token counts learned from the `text` column.
# NOTE(review): the vocabulary is fitted on every row before the train/test
# split, and the `lemmatized` column is not used here — confirm both are
# intentional.
vectorizer = CountVectorizer()
x = vectorizer.fit_transform(df['text'])

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Train an XGBoost classifier, constructed with no explicit hyperparameters,
# on the training split. The trailing `fit` call is left as a bare expression
# so the notebook cell still displays its return value.
model = xgb.XGBClassifier()
model.fit(x_train, y_train)

In [None]:
# Predict a label for every row of the held-out test split.
y_pred = model.predict(x_test)

In [None]:
# Fraction of test rows whose predicted label equals the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy: ", accuracy)

In [None]:
# Per-class precision, recall and F1 on the test split.
print(classification_report(y_true=y_test, y_pred=y_pred))