In [1]:
#the module 'sys' allows istalling module from inside Jupyter
import sys

!{sys.executable} -m pip install numpy
import numpy as np

!{sys.executable} -m pip install pandas
import pandas as pd

#Natrual Language ToolKit (NLTK)
!{sys.executable} -m pip install nltk
import nltk

!{sys.executable} -m pip install sklearn
from sklearn import metrics
#from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import  CountVectorizer #bag-of-words vectorizer 
from sklearn.decomposition import LatentDirichletAllocation #package for LDA

# Plotting tools

from pprint import pprint
!{sys.executable} -m pip install pyLDAvis #visualizing LDA
import pyLDAvis
import pyLDAvis.sklearn

import matplotlib.pyplot as plt
%matplotlib inline

#ignore warnings about future changes in functions as they take too much space
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)



In [2]:
def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic %d:" % (topic_idx))
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))
        
def get_topic_words(vectorizer, lda_model, n_words):
    keywords = np.array(vectorizer.get_feature_names())
    topic_words = []
    for topic_weights in lda_model.components_:
        top_word_locs = (-topic_weights).argsort()[:n_words]
        topic_words.append(keywords.take(top_word_locs).tolist())
    return topic_words

In [3]:
# Load data from file
df_fashion = pd.read_json('data/AMAZON_FASHION_5.json.gz', lines = True, compression='gzip')
df_beauty = pd.read_json('data/All_Beauty_5.json.gz', lines = True, compression='gzip')

In [4]:
# Remove duplicate and missing data
df_fashion = df_fashion.drop_duplicates(subset=['reviewerID', 'asin']).dropna(subset=['reviewText'])
df_beauty = df_beauty.drop_duplicates(subset=['reviewerID', 'asin']).dropna(subset=['reviewText'])

In [5]:
print("There are", len(df_fashion), "reviews in Amazon Fashion")
print("There are", len(df_beauty), "reviews in Amazon Beauty")

There are 3026 reviews in Amazon Fashion
There are 4088 reviews in Amazon Beauty


In [6]:
%run ./Text_Normalization_Function.ipynb #defining text normalization function

Collecting html.parser
Installing collected packages: html.parser
Successfully installed html.parser


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/fortunagd/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/fortunagd/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/fortunagd/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/fortunagd/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Original:   <p>The circus dog in a plissé skirt jumped over Python who wasn't that large, just 3 feet long.</p>
Processed:  ['<', 'p', '>', 'The', 'circus', 'dog', 'in', 'a', 'plissé', 'skirt', 'jumped', 'over', 'Python', 'who', 'was', "n't", 'that', 'large', ',', 'just', '3', 'feet', 'long.', '<', '/p', '>']
Original:   <p>The circus dog in a plissé skirt jumped over Python who wasn't that large, just 3 feet long.</p>
Processed:  <p>The circus dog in a plissé skirt jumped over Python who was not that large, just 3 feet long.</p>
Original:   <p>The circus dog in a plissé skirt jumped over Python who wasn't that large, just 3 feet long.</p>
Processed:  [('<', 'a'), ('p', 'n'), ('>', 'v'), ('the', None), ('circus', 'n'), ('dog', 'n'), ('in', None), ('a', None), ('plissé', 'n'), ('skirt', 'n'), ('jumped', 'v'), ('over', None), ('python', 'n'), ('who', None), ('was', 'v'), ("n't", 'r'), ('that', None), ('large', 'a'), (',', None), ('just', 'r'), ('3', None), ('feet', 'n'), ('long.', 'a'), 

In [7]:
# Preprocess review data
df_fashion_reviews = normalize_corpus(df_fashion['reviewText'])
df_beauty_reviews = normalize_corpus(df_beauty['reviewText'])

In [8]:
# Select the most important k features, The classes in the sklearn.feature_selection module can be used for feature selection
# /dimensionality reduction on sample sets, either to improve estimators’ accuracy scores or to boost their performance on 
# very high-dimensional datasets.
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif


def find_k_best_features(corpus, k):
    vectorizer = CountVectorizer()
    corpus_train = vectorizer.fit_transform(corpus)
    corpus_train_table = pd.DataFrame(data = corpus_train.todense(), columns = vectorizer.get_feature_names())
    chi2_kbest = SelectKBest(score_func = chi2, k = k)
    NORM_corpus_train_chi2_BEST = chi2_kbest.fit_transform(corpus_train, corpus)
    chi2_best_features_ind = chi2_kbest.get_support(indices=True)
    chi2_best_features_names = np.array(vectorizer.get_feature_names())[chi2_best_features_ind]
    X_train_bow_chi2_BEST_table = pd.DataFrame(data = NORM_corpus_train_chi2_BEST.todense(), columns = chi2_best_features_names)
    return X_train_bow_chi2_BEST_table

In [9]:
find_k_best_features(df_fashion_reviews, 20)

Unnamed: 0,1st,capri,day,deep,fault,finish,fruit,glass,gray,grommet,hip,loom,male,offer,pain,ring,show,teal,thigh,wear
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
3,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3021,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3022,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3023,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3024,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1


In [10]:
find_k_best_features(df_beauty_reviews, 20)

Unnamed: 0,aqua,calibra,certified,exfoliating,exfoliator,eye,gum,hibiscus,mouthwash,odor,patchouli,polish,pump,shave,sonicare,toothbrush,toothpaste,tweezer,vanilla,velva
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
3,5,0,0,0,0,0,0,0,0,1,0,0,0,5,0,0,0,0,0,5
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4083,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4084,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4085,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4086,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [11]:
corpus = df_fashion_reviews
#define a Bag-of-Words vecgtorizer
bow_vectorizer_news = CountVectorizer(max_features=1000)

#vectorize data
bow_news_corpus = bow_vectorizer_news.fit_transform(corpus)
    
lda_news = LatentDirichletAllocation(n_components=2, max_iter=100,
                                     doc_topic_prior = 0.25,
                                     topic_word_prior = 0.25).fit(bow_news_corpus)
no_top_words_news = 10
display_topics(lda_news, bow_vectorizer_news.get_feature_names(), no_top_words_news)
#prepare to display result in the Jupyter notebook
pyLDAvis.enable_notebook()

#run the visualization [mds is a function to use for visualizing the "distance" between topics]
pyLDAvis.sklearn.prepare(lda_news, bow_news_corpus, bow_vectorizer_news, mds='tsne')

Topic 0:
shoe fit comfortable size great love like light foot perfect
Topic 1:
shoe love wear foot comfortable day good use look nike


In [12]:
corpus = df_beauty_reviews
#define a Bag-of-Words vecgtorizer
bow_vectorizer_news = CountVectorizer(max_features=1000)

#vectorize data
bow_news_corpus = bow_vectorizer_news.fit_transform(corpus)
    
lda_news = LatentDirichletAllocation(n_components=2, max_iter=100,
                                     doc_topic_prior = 0.25,
                                     topic_word_prior = 0.25).fit(bow_news_corpus)
no_top_words_news = 10
display_topics(lda_news, bow_vectorizer_news.get_feature_names(), no_top_words_news)
#prepare to display result in the Jupyter notebook
pyLDAvis.enable_notebook()

#run the visualization [mds is a function to use for visualizing the "distance" between topics]
pyLDAvis.sklearn.prepare(lda_news, bow_news_corpus, bow_vectorizer_news, mds='tsne')

Topic 0:
love smell product great scent body skin like buy soap
Topic 1:
use hair product shampoo good like well work great really
