In [1]:
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import model_selection, naive_bayes
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import re
import nltk

In [2]:
data = pd.read_csv('amazon-reviews.csv', encoding="latin-1")

In [3]:
data.head()

Unnamed: 0,text,label
0,Stuning even for the non-gamer: This sound tr...,__label__2
1,The best soundtrack ever to anything.: I'm re...,__label__2
2,Amazing!: This soundtrack is my favorite musi...,__label__2
3,Excellent Soundtrack: I truly like this sound...,__label__2
4,"Remember, Pull Your Jaw Off The Floor After H...",__label__2


In [4]:
contractions = { 
"ain't": "am not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": "he will",
"he's": "he is",
"how'd": "how did",
"how'll": "how will",
"how's": "how is",
"i'd": "i would",
"i'll": "i will",
"i'm": "i am",
"i've": "i have",
"isn't": "is not",
"it'd": "it would",
"it'll": "it will",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"must've": "must have",
"mustn't": "must not",
"needn't": "need not",
"oughtn't": "ought not",
"shan't": "shall not",
"sha'n't": "shall not",
"she'd": "she would",
"she'll": "she will",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"that'd": "that would",
"that's": "that is",
"there'd": "there had",
"there's": "there is",
"they'd": "they would",
"they'll": "they will",
"they're": "they are",
"they've": "they have",
"wasn't": "was not",
"we'd": "we would",
"we'll": "we will",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what will",
"what're": "what are",
"what's": "what is",
"what've": "what have",
"where'd": "where did",
"where's": "where is",
"who'll": "who will",
"who's": "who is",
"won't": "will not",
"wouldn't": "would not",
"you'd": "you would",
"you'll": "you will",
"you're": "you are"
}

In [5]:


def clean_text(text, remove_stopwords = True):
    '''Remove unwanted characters, stopwords, and format the text to create fewer nulls word embeddings'''
    
    # Convert words to lower case
    text = text.lower()
    
    # Replace contractions with their longer forms 
    if True:
        text = text.split()
        new_text = []
        for word in text:
            if word in contractions:
                new_text.append(contractions[word])
            else:
                new_text.append(word)
        text = " ".join(new_text)
    
    # Format words and remove unwanted characters
    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    text = re.sub(r'\<a href', ' ', text)
    text = re.sub(r'&amp;', '', text) 
    text = re.sub(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]', ' ', text)
    text = re.sub(r'<br />', ' ', text)
    text = re.sub(r'\'', ' ', text)
    
    # Optionally, remove stop words
    if remove_stopwords:
        text = text.split()
        stops = set(stopwords.words("english"))
        text = [w for w in text if not w in stops]
        text = " ".join(text)

    return text



In [6]:
clean_texts = []
i = 0
for text in data.text:
    clean_texts.append(clean_text(text))
    data['text'][i] = clean_texts[i]
    i+=1
print("Texts are complete.")

Texts are complete.


In [7]:
data.head(5)

Unnamed: 0,text,label
0,stuning even non gamer sound track beautiful p...,__label__2
1,best soundtrack ever anything reading lot revi...,__label__2
2,amazing soundtrack favorite music time hands i...,__label__2
3,excellent soundtrack truly like soundtrack enj...,__label__2
4,remember pull jaw floor hearing played game kn...,__label__2


In [8]:

# split dataset
from sklearn import model_selection
from sklearn import preprocessing

# train-test split
x_train, x_test, y_train, y_test = model_selection.train_test_split(data.text, data.label) 

# label encode the target 
encoder = preprocessing.LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_test = encoder.fit_transform(y_test)


In [9]:
# count vector
from sklearn.feature_extraction.text import CountVectorizer

count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}') 
count_vect.fit(data.text) # regexp selects tokens of 1 or more alphanumeric characters

xall_count = count_vect.transform(data.text)
xtrain_count = count_vect.transform(x_train)
xtest_count = count_vect.transform(x_test)

In [10]:
# tf-idf
from sklearn.feature_extraction.text import TfidfVectorizer

# word-level tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
tfidf_vect.fit(data.text)
xtrain_tfidf = tfidf_vect.transform(x_train)
xtest_tfidf = tfidf_vect.transform(x_test)

# ngram-level tf-idf 
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2, 3), max_features=5000)
tfidf_vect_ngram.fit(data.text) # measures bi-grams and tri-grams
xtrain_tfidf_ngram = tfidf_vect_ngram.transform(x_train)
xtest_tfidf_ngram = tfidf_vect_ngram.transform(x_test)

In [12]:
# Latent Dirichlet Allocation model (with online variational Bayes algorithm)
from sklearn import decomposition
import numpy as np

lda_model = decomposition.LatentDirichletAllocation(n_components=10, learning_method='online', max_iter=100)
lda_fit = lda_model.fit_transform(xall_count)
topics = lda_model.components_ 
vocab = count_vect.get_feature_names()

# top keywords for each topic
n_words = 10
vocab = count_vect.get_feature_names()
keywords = np.array(vocab)
topic_keywords = []
for topic_weights in topics:
    top_keyword_locs = (-topic_weights).argsort()[:n_words]
    topic_keywords.append(keywords.take(top_keyword_locs))
df_topic_kw = pd.DataFrame(topic_keywords)
df_topic_kw.columns = ['Word '+str(i) for i in range(df_topic_kw.shape[1])]
df_topic_kw.index = ['Topic '+str(i) for i in range(df_topic_kw.shape[0])]
df_topic_kw

Unnamed: 0,Word 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9
Topic 0,battery,camera,power,version,tape,charger,charge,apple,adapter,plug
Topic 1,cd,music,album,songs,song,tracks,sound,blue,rice,cable
Topic 2,boot,metal,ugly,bar,sheets,results,email,contact,cooker,coyote
Topic 3,air,small,apart,stopped,keeps,pump,shipped,local,fast,drive
Topic 4,junk,la,jack,de,windows,built,mac,card,scanner,twist
Topic 5,american,printer,brother,hopkins,haunting,dummy,america,print,economics,fats
Topic 6,book,read,one,would,books,like,reading,story,people,could
Topic 7,movie,one,film,like,great,good,dvd,album,music,best
Topic 8,would,one,get,product,buy,great,good,bought,money,like
Topic 9,book,great,good,one,love,story,really,like,better,time


In [13]:
# dominant topic for each matrix
topic_names = ['Topic ' + str(i) for i in range(lda_model.n_components)]
df_doctop = pd.DataFrame(np.round(lda_fit, 2), columns=topic_names, index=data.index)
dominant_topic = np.argmax(df_doctop.values, axis=1)
df_doctop['dominant_topic'] = dominant_topic 
df_doctop.head(10)

Unnamed: 0,Topic 0,Topic 1,Topic 2,Topic 3,Topic 4,Topic 5,Topic 6,Topic 7,Topic 8,Topic 9,dominant_topic
0,0.0,0.04,0.0,0.0,0.0,0.0,0.6,0.35,0.0,0.0,6
1,0.0,0.0,0.0,0.0,0.0,0.0,0.27,0.52,0.19,0.0,7
2,0.0,0.0,0.0,0.0,0.0,0.02,0.09,0.57,0.31,0.0,7
3,0.0,0.0,0.0,0.0,0.28,0.0,0.0,0.63,0.08,0.0,7
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.95,0.0,0.04,7
5,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.33,0.0,0.41,9
6,0.0,0.0,0.0,0.0,0.0,0.01,0.84,0.0,0.14,0.0,6
7,0.0,0.0,0.0,0.0,0.03,0.0,0.17,0.0,0.0,0.78,9
8,0.0,0.0,0.15,0.0,0.02,0.09,0.38,0.0,0.0,0.34,6
9,0.0,0.0,0.0,0.0,0.07,0.0,0.46,0.0,0.0,0.45,6


In [14]:
data2 = data
# coll2apse texts into a set
data = data2.text.map(word_tokenize).values
total_vocabulary = set(word for line in data for word in line)
print('Unique tokens in texts: {}'.format(len(total_vocabulary)))

Unique tokens in texts: 31545


In [18]:
# word embedding
# ! pip3 install gensim
import gensim
from gensim.models import Word2Vec

w2v_model = Word2Vec(data, window=5, min_count=1, workers=4)
w2v_model.train(data, total_examples=w2v_model.corpus_count, epochs=10)
word_vectors = w2v_model.wv

In [21]:
print('Words Most Similar to Amazon:')
display(word_vectors.most_similar('amazon'))

Words Most Similar to Amazon:


[('com', 0.8505464196205139),
 ('sale', 0.8436679244041443),
 ('purchase', 0.8381311893463135),
 ('refund', 0.836225688457489),
 ('vendor', 0.8354429602622986),
 ('return', 0.8338057398796082),
 ('seller', 0.8285685777664185),
 ('mail', 0.8234552145004272),
 ('offered', 0.8066745400428772),
 ('company', 0.8025359511375427)]

In [22]:
# model wrapper function
from sklearn import metrics

def train_model(classifier, train_features, label, test_features):
    # fit the training data on classifier
    classifier.fit(train_features, label)
    
    # predict testing data labels
    predictions = classifier.predict(test_features)
    
    return metrics.accuracy_score(predictions, y_test)

In [23]:
# Naive Bayes
from sklearn import naive_bayes

# Count Vectors
nb_cv = train_model(naive_bayes.MultinomialNB(), xtrain_count, y_train, xtest_count)
print("[Naive Bayes] Count Vectors Accuracy:", round(nb_cv, 3))

# Word-Level TF-IDF Vectors
nb_wl = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf, y_train, xtest_tfidf)
print("[Naive Bayes] Word-Level TF-IDF Accuracy:", round(nb_wl, 3))

[Naive Bayes] Count Vectors Accuracy: 0.838
[Naive Bayes] Word-Level TF-IDF Accuracy: 0.845


In [24]:
# Logistic Regression
from sklearn import linear_model

# Count Vectors
lr_cv = train_model(linear_model.LogisticRegression(), xtrain_count, y_train, xtest_count)
print("[Logistic Regression] Count Vectors Accuracy:", round(lr_cv, 3))

# Word-Level TF-IDF Vectors
lr_wl = train_model(linear_model.LogisticRegression(), xtrain_tfidf, y_train, xtest_tfidf)
print("[Logistic Regression] Word-Level TF-IDF Accuracy:", round(lr_wl, 3))


[Logistic Regression] Count Vectors Accuracy: 0.848
[Logistic Regression] Word-Level TF-IDF Accuracy: 0.853


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
