In [1]:
import re
import yaml

import string
import itertools
from collections import Counter

import pandas as pd
import numpy as np

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer

import spacy

import string

from sklearn.metrics import roc_auc_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

from tqdm.auto import tqdm
tqdm.pandas()

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('data/Module_5_Lecture_1_Class_amazon_product_reviews.csv', index_col='Id')
df['sentiment'] = [1 if score in [4, 5] else 0 for score in df['Score']]
df = df.drop_duplicates().reset_index(drop=True)
df = df.drop_duplicates(subset={"UserId", "Time","Text"})
df = df.groupby('sentiment').sample(2500, random_state=42)

In [3]:
df.head(3)

Unnamed: 0,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,sentiment
556850,B001EO5YDY,A3KMNPL0AN0QB3,Chad Ware,1,1,3,1326240000,Flavors in product description are inaccurate,"According to the product description, I was su...",0
36204,B000FGXT2A,A35DW1GJBLNMZI,VicPaxGear,1,1,3,1202860800,Tasty but Crumbly,"This cereal tastes great. Unfortunately, it d...",0
506099,B001C15JCU,A26LT2ZMC3E0BK,C. Fairstone,10,11,1,1346198400,FDA Warns Chicken Jerky From China May Harm or...,"STOP USING THIS PRICEY JUNK NOW!! search ""chic...",0


In [None]:
with open('data/my_vocab.yaml', 'r') as file:
    vocab = yaml.safe_load(file)

contractions = vocab['contractions']
negations = vocab['negations']

('aren', 'may not')

In [5]:
print('negations', negations[:5])
print('contractions', [(k, contractions[k]) for k in list(contractions.keys())[:5]])

negations ['aren', "aren't", 'couldn', "couldn't", 'didn']
contractions [("ain't", 'am not'), ("aren't", 'are not'), ("can't", 'cannot'), ("can't've", 'cannot have'), ('cause', 'because')]


In [6]:
# including words to stop-words list
include_to_stopwords = set(['also', 'would', 'much', 'many'])
stop_words = set(stopwords.words('english'))
stop_words = stop_words.union(include_to_stopwords)

# removing words from the stop-words list
stop_words = stop_words.difference(negations)

In [7]:
# !python -m spacy download en_core_web_sm
nlp = spacy.load("en_core_web_sm", disable = ['parser','ner'])

In [8]:
def remove_stuff(text):
    text = re.sub("<[^>]*>", " ", text) # Remove html tags
    text = re.sub("\S*@\S*[\s]+", " ", text) # Remove emails
    text = re.sub("https?:\/\/.*?[\s]+", " ", text) # Remove links
    text = re.sub("[^a-zA-Z' ]", "", text) # Remove non-letters
    text = re.sub("[\s]+", " ", text) # Remove excesive whitespaces
    text = text.translate(str.maketrans('', '', string.punctuation)) # Remove punctuation
    
    return text

def remove_stopwords(text, stop_words: set):
    text = text.lower().split()
    text = [word for word in text if not word in stop_words]
    return " ".join(text)

def process_with_stemmer(text):
    stemmer = PorterStemmer()
    text = text.lower().split()
    text = [stemmer.stem(word) for word in text]
    return " ".join(text)

def process_with_lemmatizer(text):
    text = text.lower()
    doc = nlp(text)
    text = " ".join([token.lemma_ for token in doc if len(token.lemma_) > 1 ])

    return text

def replace_words(text, replace_on:dict):
    text = text.lower().split()
    text = [replace_on.get(word) if word in replace_on else word for word in text]
    return " ".join(text)


def normalize_text(text):
    text = remove_stuff(text)
    text = remove_stopwords(text, stop_words)
    text = replace_words(text, contractions)

    # test = process_with_stemmer(text)
    text = process_with_lemmatizer(text)
    
    return text

In [9]:
text = 'On a quest for the perfedc1112t,,, !!!! <br />%%2%% popcorn to compliment\
 the Whirley Pop.  Don\'t get older, I\'m beginning to appreciate the more "natural" \
popcorn varieties, and I suppose that\'s what attracted me to the Arrowhead Mills \
Organic Yellow Popcorn.<br /> <br />I\'m no "organic" food expert.  I just wanted \
some good tasting popcorn.  And, I feel like that\'s what I got.  Using the Whirley \
Pop, with a very small amount of oil, I\'ve had great results.'

print('Original text')
print(text)
print("#" * 50)
print('Normalized text')
print(normalize_text(text))

Original text
On a quest for the perfedc1112t,,, !!!! <br />%%2%% popcorn to compliment the Whirley Pop.  Don't get older, I'm beginning to appreciate the more "natural" popcorn varieties, and I suppose that's what attracted me to the Arrowhead Mills Organic Yellow Popcorn.<br /> <br />I'm no "organic" food expert.  I just wanted some good tasting popcorn.  And, I feel like that's what I got.  Using the Whirley Pop, with a very small amount of oil, I've had great results.
##################################################
Normalized text
quest perfedct popcorn compliment whirley pop do not get old begin appreciate natural popcorn variety suppose that attract arrowhead mill organic yellow popcorn organic food expert want good tasting popcorn feel like that get use whirley pop small amount oil ve great result


In [10]:
df['text_normalized'] = df['Text'].progress_apply(normalize_text)

  0%|          | 0/5000 [00:00<?, ?it/s]

In [11]:
X = df['text_normalized']
y = df['sentiment']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
vectorizer = CountVectorizer()

X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

print('len', len(vectorizer.vocabulary_))
print('features', vectorizer.get_feature_names_out()[:5])

len 12222
features ['aacurate' 'aai' 'aardvark' 'aback' 'abandon']


In [13]:
model = LogisticRegression(random_state=42)
model.fit(X_train, y_train)

In [14]:
predictions = model.predict(X_test)
print('AUC: ', roc_auc_score(y_test, predictions))

AUC:  0.779


In [15]:
vectorizer = TfidfVectorizer(ngram_range=(1,2))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

print('len', len(vectorizer.vocabulary_))
print('features', vectorizer.get_feature_names_out()[:5])

model = LogisticRegression(random_state=42)
model.fit(X_train, y_train)

predictions = model.predict(X_test)
print('AUC: ', roc_auc_score(y_test, predictions))

len 123723
features ['aacurate' 'aacurate happy' 'aai' 'aai raspberry' 'aardvark']
AUC:  0.797
