# Preprocessing

In [2]:
import numpy as np
import pandas as pd

In [3]:
import string
import nltk
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords
from gensim.models import Word2Vec
import warnings
warnings.filterwarnings('ignore') 
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Embedding, Conv1D, MaxPooling1D
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [4]:
# Read and Peak at Data
df = pd.read_csv("./Sentiment_analysis/Womens Clothing E-Commerce Reviews.csv",index_col=0)

# Delete missing observations for following variables
for x in ["Division Name","Department Name","Class Name","Review Text"]:
    df = df[df[x].notnull()]

# Extracting Missing Count and Unique Count by Column
unique_count = []
for x in df.columns:
    unique_count.append([x,len(df[x].unique()),df[x].isnull().sum()])

# Missing Values
print("Missing Values: {}".format(df.isnull().sum().sum()))

# Data Dimensions
print("Dataframe Dimension: {} Rows, {} Columns".format(*df.shape))

# Create New Variables: 
# Word Length
df["Word Count"] = df['Review Text'].str.split().apply(len)
# Character Length
df["Character Count"] = df['Review Text'].apply(len)
# Boolean for Positive and Negative Reviews
df["Label"] = 0
df.loc[df.Rating >= 3,["Label"]] = 1

Missing Values: 2966
Dataframe Dimension: 22628 Rows, 10 Columns


In [5]:
#tokenization and stemming
stop_words = stopwords.words('english')
stemmer = SnowballStemmer('english')

def clean_doc(doc):
    # split the doc into word tokens
    tokens = doc.split()
    # remove punctuation from each token
    tokens = [w.lower() for w in tokens if not w in string.punctuation]
    # remove remaining tokens that are not alphabetic
    tokens = [w for w in tokens if w.isalpha()]
    # stemming
    tokens = [stemmer.stem(w) for w in tokens]
    # filter out stop words
    tokens = [w for w in tokens if not w in stop_words]
    return tokens
df['cleaned_review'] = [clean_doc(rev) for rev in df["Review Text"]]

In [6]:
df.head()

Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name,Word Count,Character Count,Label,cleaned_review
0,767,33,,Absolutely wonderful - silky and sexy and comf...,4,1,0,Initmates,Intimate,Intimates,8,53,1,"[absolut, wonder, silki, sexi, comfort]"
1,1080,34,,Love this dress! it's sooo pretty. i happene...,5,1,4,General,Dresses,Dresses,62,303,1,"[love, sooo, happen, find, glad, bc, never, wo..."
2,1077,60,Some major design flaws,I had such high hopes for this dress and reall...,3,0,0,General,Dresses,Dresses,98,500,1,"[high, hope, dress, realli, want, work, initi,..."
3,1049,50,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5,1,0,General Petite,Bottoms,Pants,22,124,1,"[love, everi, time, wear, get, noth, great]"
4,847,47,Flattering shirt,This shirt is very flattering to all due to th...,5,1,6,General,Tops,Blouses,36,192,1,"[shirt, veri, flatter, due, adjust, front, per..."


## Count vectorize

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

df_train, df_test = train_test_split(df, test_size=0.2)

# merged words list
X_train =[' '.join(review) for review in df_train['cleaned_review']]
X_test=[' '.join(review) for review in df_test['cleaned_review']]

# count
count_vectorizer = CountVectorizer()
X_train_counts = count_vectorizer.fit_transform(X_train).toarray()
X_test_counts = count_vectorizer.transform(X_test).toarray()

## n-gram vectorize

In [8]:
ngram_vectorizer = CountVectorizer(binary=True, ngram_range=(1, 2))
X_train_ng = ngram_vectorizer.fit_transform(X_train).toarray()
X_test_ng = ngram_vectorizer.transform(X_test).toarray()

## tf-idf

In [9]:
from sklearn.feature_extraction.text import TfidfTransformer

transformer = TfidfTransformer(smooth_idf=False)
X_train_tfidf = transformer.fit_transform(X_train_counts)
X_test_tfidf = transformer.transform(X_test_counts)

## word2Vec

In [10]:
# word2Vec
model_w2v = Word2Vec(df["cleaned_review"],vector_size=128)
# summarize the loaded model
print(model_w2v)

Word2Vec<vocab=2964, vector_size=128, alpha=0.025>


In [11]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df_train['cleaned_review'])
encoded_train_docs = tokenizer.texts_to_sequences(df_train['cleaned_review'])
encoded_test_docs = tokenizer.texts_to_sequences(df_test['cleaned_review'])
max_length = max([len(s) for s in df_train['cleaned_review']])
X_train = pad_sequences(encoded_train_docs, 
                        maxlen=max_length, 
                        padding='post')
y_train = pd.get_dummies(df_train['Class Name'])
X_test = pad_sequences(encoded_test_docs, 
                        maxlen=max_length, 
                        padding='post')
y_test = pd.get_dummies(df_test['Class Name'])

# Topic Detection
## LDA model

In [12]:
from gensim.corpora.dictionary import Dictionary
from gensim.models.ldamodel import LdaModel

In [13]:
# generate dictionary (bag of words)
docs_dictionary = Dictionary(df['cleaned_review'])
# transfer each document into (word_index, frequency)
docs_corpus = [docs_dictionary.doc2bow(review) for review in df['cleaned_review']]

In [14]:
# train the model on the corpus.
lda_model = LdaModel(corpus=docs_corpus, 
               id2word=docs_dictionary,
               num_topics=10,
               chunksize=100,
               passes=10,
               alpha='auto')
lda_model.print_topics(num_topics=20,num_words=15)

[(0,
  '0.049*"abov" + 0.046*"overal" + 0.045*"shorter" + 0.043*"end" + 0.042*"jacket" + 0.034*"know" + 0.034*"denim" + 0.033*"kind" + 0.031*"tie" + 0.031*"flow" + 0.025*"open" + 0.022*"heavi" + 0.021*"appropri" + 0.018*"snag" + 0.018*"tunic"'),
 (1,
  '0.079*"skirt" + 0.054*"pair" + 0.052*"tight" + 0.042*"white" + 0.039*"side" + 0.030*"someth" + 0.028*"alway" + 0.027*"recommend" + 0.025*"weight" + 0.021*"slip" + 0.020*"fun" + 0.019*"tank" + 0.019*"thick" + 0.016*"lbs" + 0.016*"navi"'),
 (2,
  '0.065*"look" + 0.058*"like" + 0.041*"would" + 0.040*"top" + 0.031*"fabric" + 0.028*"tri" + 0.025*"realli" + 0.020*"becaus" + 0.019*"much" + 0.019*"materi" + 0.018*"think" + 0.017*"feel" + 0.016*"also" + 0.015*"want" + 0.015*"cute"'),
 (3,
  '0.060*"fit" + 0.058*"size" + 0.055*"veri" + 0.047*"wear" + 0.031*"order" + 0.024*"littl" + 0.022*"small" + 0.020*"perfect" + 0.019*"run" + 0.019*"bit" + 0.017*"usual" + 0.015*"beauti" + 0.015*"got" + 0.015*"flatter" + 0.015*"go"'),
 (4,
  '0.068*"mayb" + 0.0

## Probabilistic Latent Semantic Analysis

In [15]:
import scipy as sp
import sklearn
import sys
from gensim.models import ldamodel
import gensim.corpora
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.decomposition import NMF
from sklearn.preprocessing import normalize
import pickle

In [16]:
Reviews=[' '.join(text) for text in df['cleaned_review']]
len(Reviews)

22628

In [17]:
vectorizer = CountVectorizer(analyzer='word', max_features=5000)
x_counts = vectorizer.fit_transform(Reviews)
transformer = TfidfTransformer(smooth_idf=False)
x_tfidf = transformer.fit_transform(x_counts)
xtfidf_norm = normalize(x_tfidf, norm='l1', axis=1)
#number of topics
num_topics=10
#obtain a NMF model.
model = NMF(n_components=num_topics, init='nndsvd');
#fit the model
model.fit(xtfidf_norm)
def get_nmf_topics(model, n_top_words):
    
    #the word ids obtained need to be reverse-mapped to the words so we can print the topic names.
    feat_names = vectorizer.get_feature_names()
    
    word_dict = {};
    for i in range(num_topics):
        
        #for each topic, obtain the largest values, and add the words they map to into the dictionary.
        words_ids = model.components_[i].argsort()[:-n_top_words - 1:-1]
        words = [feat_names[key] for key in words_ids]
        word_dict['Topic # ' + '{:02d}'.format(i+1)] = words;
    
    return pd.DataFrame(word_dict)


print(get_nmf_topics(model, num_topics))

  Topic # 01 Topic # 02 Topic # 03 Topic # 04 Topic # 05 Topic # 06  \
0       look       love      great      dress        fit       veri   
1       like    absolut       look       easi       true    flatter   
2        top      color       jean    flatter       well       soft   
3     realli    everyth      style      could     materi       well   
4       nice    feminin    qualiti     casual      color     materi   
5     fabric       wish       work     easili        lbs      happi   
6      would      simpl      color       made       good       nice   
7      color     detail     summer       slip       loos       true   
8      shirt     materi     skinni     pretti    flatter     fabric   
9       much       soft     bought     summer         xs     pretti   

   Topic # 07 Topic # 08 Topic # 09  Topic # 10  
0      beauti      super       size     perfect  
1       color       cute      order        wear  
2     sweater      comfi        run     comfort  
3        high     

## CNN

In [18]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df_train['Review Text'])
encoded_train_docs = tokenizer.texts_to_sequences(df_train['Review Text'])
encoded_test_docs = tokenizer.texts_to_sequences(df_test['Review Text'])
max_length = max([len(s) for s in df_train['Review Text']])
X_train = pad_sequences(encoded_train_docs, 
                        maxlen=max_length, 
                        padding='post')
y_train = pd.get_dummies(df_train['Class Name'])
X_test = pad_sequences(encoded_test_docs, 
                        maxlen=max_length, 
                        padding='post')
y_test = pd.get_dummies(df_test['Class Name'])
diff_idx = y_train.columns.difference(y_test.columns)
for idx in diff_idx:
    y_test[idx] = [0]*len(y_test)

In [20]:
def get_word2vec_embed_layer(max_length, tokenizer, wv):
  word_index = tokenizer.word_index
  embedding_mat = np.zeros((len(word_index)+1, 100))
  for word, i in word_index.items():
      try:
          vector = wv[word]
          embedding_mat[i] = vector
      except:
          continue
  word2vec_embedding_layer = Embedding(input_dim=embedding_mat.shape[0],
                                      output_dim=embedding_mat.shape[1], 
                                      weights=[embedding_mat],
                                      input_length=max_length, 
                                      trainable=False)
  return word2vec_embedding_layer

# model_word2vec = Sequential()
# model_word2vec.add(get_word2vec_embed_layer(max_length, tokenizer, model_w2v.wv))
# model_word2vec.add(Conv1D(filters=128, kernel_size=2, activation='relu'))
# model_word2vec.add(MaxPooling1D(pool_size=2))
# model_word2vec.add(Flatten())
# model_word2vec.add(Dense(20, activation='softmax'))
# print(model_word2vec.summary())
# model_word2vec.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# model_word2vec.fit(X_train, y_train, epochs=10)
# model_word2vec.evaluate(X_test, y_test)

In [27]:
df_train, df_test = train_test_split(df, test_size=0.2)

# merged words list
X_train =[' '.join(review) for review in df_train['cleaned_review']]
X_test=[' '.join(review) for review in df_test['cleaned_review']]

# count
count_vectorizer = CountVectorizer()
X_train_counts = count_vectorizer.fit_transform(X_train).toarray()
X_test_counts = count_vectorizer.transform(X_test).toarray()

  and should_run_async(code)


In [28]:
# pretrained bert
from tensorflow.keras.layers import Input, Dropout
from tensorflow.keras import Model
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text # just needed tensorflow_text

bert_encoder_dir = 'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-128_A-2/1'
bert_preprocess_dir = 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3'

bert_preprocess_layer = hub.KerasLayer(bert_preprocess_dir)
bert_encode_model = hub.KerasLayer(bert_encoder_dir, trainable=True)

from tensorflow.keras.layers import Input, Dropout
from tensorflow.keras import Model
import tensorflow as tf
import tensorflow_hub as hub

text_input = Input(shape=(), dtype=tf.string)
bert_inputs = bert_preprocess_layer(text_input)
outputs = bert_encode_model(bert_inputs)
net = outputs['pooled_output']
net = Dropout(0.1)(net)
net = Dense(20, activation='softmax')(net)
bert_model = Model(text_input, net)
bert_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
bert_model.fit(tf.constant(X_train), y_train, epochs=10)


  and should_run_async(code)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x15056086a7c0>