In [1]:
import re
import yaml

import string
import itertools
from collections import Counter

import pandas as pd
import numpy as np

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
nltk.download('punkt_tab')

import spacy

import string

from sklearn.metrics import roc_auc_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

from tqdm.auto import tqdm
tqdm.pandas()

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split

import gensim
from gensim.models import word2vec
from gensim.models import KeyedVectors #  implements word vectors
from gensim.test.utils import datapath, get_tmpfile
from gensim.scripts.glove2word2vec import glove2word2vec

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/aleksejkitajskij/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [2]:
df = pd.read_csv('data/Module_5_Lecture_1_Class_amazon_product_reviews.csv', index_col='Id')
df['sentiment'] = [1 if score in [4, 5] else 0 for score in df['Score']]
df = df.drop_duplicates().reset_index(drop=True)
df = df.drop_duplicates(subset={"UserId", "Time","Text"})
df = df.groupby('sentiment').sample(2500, random_state=42)

In [3]:
df.head(3)

Unnamed: 0,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,sentiment
556850,B001EO5YDY,A3KMNPL0AN0QB3,Chad Ware,1,1,3,1326240000,Flavors in product description are inaccurate,"According to the product description, I was su...",0
36204,B000FGXT2A,A35DW1GJBLNMZI,VicPaxGear,1,1,3,1202860800,Tasty but Crumbly,"This cereal tastes great. Unfortunately, it d...",0
506099,B001C15JCU,A26LT2ZMC3E0BK,C. Fairstone,10,11,1,1346198400,FDA Warns Chicken Jerky From China May Harm or...,"STOP USING THIS PRICEY JUNK NOW!! search ""chic...",0


In [4]:
with open('data/my_vocab.yaml', 'r') as file:
    vocab = yaml.safe_load(file)

contractions = vocab['contractions']
negations = vocab['negations']

In [5]:
print('negations', negations[:5])
print('contractions', [(k, contractions[k]) for k in list(contractions.keys())[:5]])

negations ['aren', "aren't", 'couldn', "couldn't", 'didn']
contractions [("ain't", 'am not'), ("aren't", 'are not'), ("can't", 'cannot'), ("can't've", 'cannot have'), ('cause', 'because')]


In [6]:
# including words to stop-words list
include_to_stopwords = set(['also', 'would', 'much', 'many'])
stop_words = set(stopwords.words('english'))
stop_words = stop_words.union(include_to_stopwords)

# removing words from the stop-words list
stop_words = stop_words.difference(negations)

In [7]:
# !python -m spacy download en_core_web_sm
nlp = spacy.load("en_core_web_sm", disable = ['parser','ner'])

In [8]:
def remove_stuff(text):
    text = re.sub("<[^>]*>", " ", text) # Remove html tags
    text = re.sub("\S*@\S*[\s]+", " ", text) # Remove emails
    text = re.sub("https?:\/\/.*?[\s]+", " ", text) # Remove links
    text = re.sub("[^a-zA-Z' ]", "", text) # Remove non-letters
    text = re.sub("[\s]+", " ", text) # Remove excesive whitespaces
    text = text.translate(str.maketrans('', '', string.punctuation)) # Remove punctuation
    
    return text

def remove_stopwords(text, stop_words: set):
    text = text.lower().split()
    text = [word for word in text if not word in stop_words]
    return " ".join(text)

def process_with_stemmer(text):
    stemmer = PorterStemmer()
    text = text.lower().split()
    text = [stemmer.stem(word) for word in text]
    return " ".join(text)

def process_with_lemmatizer(text):
    text = text.lower()
    doc = nlp(text)
    text = " ".join([token.lemma_ for token in doc if len(token.lemma_) > 1 ])

    return text

def replace_words(text, replace_on:dict):
    text = text.lower().split()
    text = [replace_on.get(word) if word in replace_on else word for word in text]
    return " ".join(text)


def normalize_text(text):
    text = remove_stuff(text)
    text = remove_stopwords(text, stop_words)
    text = replace_words(text, contractions)

    # test = process_with_stemmer(text)
    text = process_with_lemmatizer(text)
    
    return text

In [9]:
text = 'On a quest for the perfedc1112t,,, !!!! <br />%%2%% popcorn to compliment\
 the Whirley Pop.  Don\'t get older, I\'m beginning to appreciate the more "natural" \
popcorn varieties, and I suppose that\'s what attracted me to the Arrowhead Mills \
Organic Yellow Popcorn.<br /> <br />I\'m no "organic" food expert.  I just wanted \
some good tasting popcorn.  And, I feel like that\'s what I got.  Using the Whirley \
Pop, with a very small amount of oil, I\'ve had great results.'

print('Original text')
print(text)
print("#" * 50)
print('Normalized text')
print(normalize_text(text))

Original text
On a quest for the perfedc1112t,,, !!!! <br />%%2%% popcorn to compliment the Whirley Pop.  Don't get older, I'm beginning to appreciate the more "natural" popcorn varieties, and I suppose that's what attracted me to the Arrowhead Mills Organic Yellow Popcorn.<br /> <br />I'm no "organic" food expert.  I just wanted some good tasting popcorn.  And, I feel like that's what I got.  Using the Whirley Pop, with a very small amount of oil, I've had great results.
##################################################
Normalized text
quest perfedct popcorn compliment whirley pop do not get old begin appreciate natural popcorn variety suppose that attract arrowhead mill organic yellow popcorn organic food expert want good tasting popcorn feel like that get use whirley pop small amount oil ve great result


In [10]:
df['text_normalized'] = df['Text'].progress_apply(normalize_text)

  0%|          | 0/5000 [00:00<?, ?it/s]

### scratch

In [11]:
def build_corpus(data):
    "Creates a list of lists containing words from each sentence"
    corpus = []
    for sentence in data:
        word_list = sentence.split(" ")
        corpus.append(word_list)    
           
    return corpus

corpus = build_corpus(df['text_normalized'])
corpus[0][:10]

['accord',
 'product',
 'description',
 'suppose',
 'receive',
 'sampler',
 'box',
 'contain',
 'kcup',
 'follow']

In [None]:
model_emb_from_scratch = word2vec.Word2Vec(corpus, vector_size=100, window=5, min_count=50, workers=4)
model_emb_from_scratch.wv.save_word2vec_format('data/model_emb_from_scratch.bin', binary=True)

In [13]:
def tok2vec(tokens, model, avg = 'mean'):
    vects = []
    for token in tokens:
        try:
            vects.append(model[token])
        except TypeError:
            try:
                vects.append(model.wv[token])
            except KeyError:
                pass
        except KeyError:
            pass

    if not vects:
        return np.full((model.vector_size,), np.nan, dtype=np.float32)

    vects = np.array(vects)

    if avg == 'mean':
        return np.nanmean(vects, axis=0)
    elif avg == 'sum':
        return np.nansum(vects, axis=0)

In [14]:
text = "This is an example cat."
tokens = word_tokenize(text.lower())
print('tokens', tokens)

vector = tok2vec(tokens, model_emb_from_scratch, avg='mean')
print('vector', vector)

tokens ['this', 'is', 'an', 'example', 'cat', '.']
vector [ 0.59161085  0.35044494  0.8492645   0.46546075 -0.25025776 -0.6378975
  0.12813044  0.36764082  0.4150986   0.42065218 -0.17782676 -0.20056008
  0.6615182   0.8868185   0.14721219 -0.01239322 -0.3462136  -0.22809528
  0.14166239 -0.90407175  0.30769414  1.1139303  -0.5064335  -0.3028266
 -1.2784158   0.99183035 -0.45414108 -0.5351317  -0.24835297  0.4142786
  0.37300763 -0.00327196  0.60698575 -0.09981637 -0.57618225  0.23192622
 -0.31584084 -0.07097046 -0.47449717 -1.0083066   0.29220402 -0.4681635
 -0.8912367  -0.12310207 -0.25458217 -0.07787346 -0.09584736 -0.36006615
  0.43161273 -0.5658284  -0.32748434  0.32925162 -0.11703091  0.31460857
  0.40999267  0.88384813 -0.27076566 -0.39030018  0.48422053  0.11924732
  0.20880312 -0.7001594  -0.43048787 -0.3184659  -0.38061136  0.42503923
  0.5088034  -0.2238752  -0.45792094  0.30862778 -0.65808004 -0.15626599
 -0.6572367  -0.69766015 -0.12925617  1.4608196  -0.5171097   0.444204

### glove

In [15]:
# Convert GloVe format to Word2Vec format
glove2word2vec('data/glove.6B.50d.txt', 'data/glove.6B.50d.vec')

# Load the GloVe model
glove_model = KeyedVectors.load_word2vec_format('data/glove.6B.50d.vec')

# Example: Retrieve vector for a word
word_vector = glove_model['cat']
print("Vector for 'cat':", word_vector)

# Example: Find most similar words
similar_words = glove_model.most_similar('cat', topn=5)
print("Most similar words to 'cat':", similar_words)

  glove2word2vec('data/glove.6B.50d.txt', 'data/glove.6B.50d.vec')


Vector for 'cat': [ 0.45281  -0.50108  -0.53714  -0.015697  0.22191   0.54602  -0.67301
 -0.6891    0.63493  -0.19726   0.33685   0.7735    0.90094   0.38488
  0.38367   0.2657   -0.08057   0.61089  -1.2894   -0.22313  -0.61578
  0.21697   0.35614   0.44499   0.60885  -1.1633   -1.1579    0.36118
  0.10466  -0.78325   1.4352    0.18629  -0.26112   0.83275  -0.23123
  0.32481   0.14485  -0.44552   0.33497  -0.95946  -0.097479  0.48138
 -0.43352   0.69455   0.91043  -0.28173   0.41637  -1.2609    0.71278
  0.23782 ]
Most similar words to 'cat': [('dog', 0.9218006134033203), ('rabbit', 0.8487821817398071), ('monkey', 0.8041081428527832), ('rat', 0.7891963720321655), ('cats', 0.7865270376205444)]


### train

In [16]:
X = df['text_normalized']
y = df['sentiment']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [17]:
print(X_train.iloc[0])

one big mouser love troutyet another high quality cat food fancy feast appreciate amazon offer variety autodelivery discount think cat like troutit not smell stinky seem type gravy kitty nice bath eat appearing satifie what not like happy cat happy human


In [18]:
X_train = X_train.apply(word_tokenize)
print(X_train.iloc[0])

['one', 'big', 'mouser', 'love', 'troutyet', 'another', 'high', 'quality', 'cat', 'food', 'fancy', 'feast', 'appreciate', 'amazon', 'offer', 'variety', 'autodelivery', 'discount', 'think', 'cat', 'like', 'troutit', 'not', 'smell', 'stinky', 'seem', 'type', 'gravy', 'kitty', 'nice', 'bath', 'eat', 'appearing', 'satifie', 'what', 'not', 'like', 'happy', 'cat', 'happy', 'human']


In [19]:
X_train = X_train.apply(lambda x: tok2vec(x, glove_model, 'mean'))
print(X_train.iloc[0])

[ 0.23401967  0.11642583 -0.39039063 -0.19259414  0.44021174  0.08487431
 -0.44889867 -0.25670952  0.08128236  0.2058957  -0.04263912  0.38225102
  0.18541351  0.02980959  0.47350112  0.2496316   0.12010796  0.1731719
 -0.1827333  -0.62209094 -0.13758871  0.17337418  0.26171416  0.22553168
  0.24107078 -1.0848439  -0.7436808   0.31928018  0.57311535 -0.4045977
  2.3049736   0.39121112 -0.0859626   0.0529886   0.04741706  0.05078922
 -0.08203696  0.08052175  0.11566141 -0.36979553 -0.04784045  0.19128942
 -0.02155017  0.31916544  0.3836128   0.17888433 -0.02071665 -0.21769795
  0.09551313  0.33448845]


In [20]:
X_train = X_train.to_numpy()
print(X_train[0])

[ 0.23401967  0.11642583 -0.39039063 -0.19259414  0.44021174  0.08487431
 -0.44889867 -0.25670952  0.08128236  0.2058957  -0.04263912  0.38225102
  0.18541351  0.02980959  0.47350112  0.2496316   0.12010796  0.1731719
 -0.1827333  -0.62209094 -0.13758871  0.17337418  0.26171416  0.22553168
  0.24107078 -1.0848439  -0.7436808   0.31928018  0.57311535 -0.4045977
  2.3049736   0.39121112 -0.0859626   0.0529886   0.04741706  0.05078922
 -0.08203696  0.08052175  0.11566141 -0.36979553 -0.04784045  0.19128942
 -0.02155017  0.31916544  0.3836128   0.17888433 -0.02071665 -0.21769795
  0.09551313  0.33448845]


In [21]:
X_test = X_test.apply(word_tokenize)
X_test = X_test.apply(lambda x: tok2vec(x, glove_model, 'mean'))
X_test = X_test.to_numpy()

In [22]:
y_train = y_train.to_numpy()
y_test = y_test.to_numpy()

X_train = np.stack(X_train, axis=0)
X_test = np.stack(X_test, axis=0)

In [23]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((4000, 50), (1000, 50), (4000,), (1000,))

In [24]:
model = LogisticRegression(random_state=42)
model.fit(X_train, y_train)

predictions = model.predict(X_test)
print('AUC: ', roc_auc_score(y_test, predictions))

AUC:  0.727
