## 데이터 전처리

In [29]:
import pandas as pd
import numpy as np
import string
import re

## NLTK stopwords
import nltk

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Shared WordNet lemmatizer instance; preprocess_text() uses it when get_lemma is set.
lemmatizer = WordNetLemmatizer()

In [30]:
# Stop-word list shared by the text-cleaning steps in this notebook.
def set_stop_words():
    """Return NLTK's English stop words plus notebook-specific extras.

    Returns:
        list[str]: The NLTK English stop words followed by the extra words
        below, with repeated entries removed (first occurrence wins, order
        otherwise preserved).
    """
    extra_words = [
        'from', 'subject', 're', 'edu',
        'use', 'not', 'would', 'say', 'could', '_', 'be', 'know',
        'good', 'go', 'get', 'do', 'done', 'try', 'many', 'some',
        'nice', 'thank', 'think', 'see', 'rather', 'easy', 'easily',
        'lot', 'lack', 'make', 'want', 'seem', 'run', 'need',
        'even', 'right', 'line', 'also', 'may', 'take', 'come',
        'app', 'non',
    ]
    # dict.fromkeys drops any entry that is repeated in the extras or already
    # present in the NLTK list, while keeping first-seen order.
    return list(dict.fromkeys(stopwords.words('english') + extra_words))


stop_words = set_stop_words()

In [31]:
def preprocess_text(text_data, get_lemma=True, tags=(), stop_words=()):
    """Clean one raw text string and return it as a space-separated string.

    Steps, in order: lower-case, remove URLs, remove HTML tags, remove
    numbers, remove ASCII punctuation, drop stop words, optionally
    lemmatize, optionally keep only words with selected POS tags.

    Args:
        text_data (str): Raw text to clean.
        get_lemma (bool): When truthy, each word is passed through the
            module-level ``lemmatizer``.
        tags (str | iterable of str): POS-tag prefix(es) to keep, matched
            with ``str.startswith`` against ``nltk.pos_tag`` output. An empty
            value disables POS filtering.
        stop_words (iterable of str): Words to drop. Comparison happens after
            lower-casing and punctuation removal.

    Returns:
        str: The cleaned words joined by single spaces.
    """
    # str.startswith accepts a str or a tuple but not a list, so normalise once.
    if isinstance(tags, str):
        tag_prefixes = (tags,) if tags else ()
    else:
        tag_prefixes = tuple(tags)
    # A set keeps the per-word stop-word check constant-time.
    if isinstance(stop_words, (set, frozenset)):
        stop_set = stop_words
    else:
        stop_set = set(stop_words)

    # Lower-case
    text_data = text_data.lower()
    # Remove URLs. The dot after "www" is escaped so that only a literal
    # "www." prefix matches (an unescaped dot also swallowed text like "awwww so").
    text_data = re.sub(r'((www\.\S+)|(https?://\S+))', r"", text_data)
    # Remove HTML tags
    text_data = re.sub(r'<[^>]+>', r'', text_data)
    # Remove numbers: from a digit to the end of its whitespace-delimited
    # token, including lone digits ("4", "5'4", "2nd").
    text_data = re.sub(r'[0-9]\S*', r'', text_data)
    # Remove punctuation
    text_data = text_data.translate(str.maketrans('', '', string.punctuation))
    # Remove stop words (text is already lower-case at this point)
    words = [word for word in text_data.split() if word not in stop_set]
    # Lemmatize
    if get_lemma:
        words = [lemmatizer.lemmatize(word) for word in words]
    text_data = " ".join(words)
    # POS filter
    if tag_prefixes:
        tagged = nltk.pos_tag(nltk.word_tokenize(text_data))
        text_data = " ".join(word for word, pos in tagged if pos.startswith(tag_prefixes))

    return text_data

In [32]:
# Load the review dataset from CSV (bare file name, so it is resolved against the working directory)
file_name = "Womens Clothing E-Commerce Reviews.csv"
df = pd.read_csv(file_name)
# Preview the first rows
df.head()

Unnamed: 0.1,Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name
0,0,767,33,,Absolutely wonderful - silky and sexy and comf...,4,1,0,Initmates,Intimate,Intimates
1,1,1080,34,,Love this dress! it's sooo pretty. i happene...,5,1,4,General,Dresses,Dresses
2,2,1077,60,Some major design flaws,I had such high hopes for this dress and reall...,3,0,0,General,Dresses,Dresses
3,3,1049,50,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5,1,0,General Petite,Bottoms,Pants
4,4,847,47,Flattering shirt,This shirt is very flattering to all due to th...,5,1,6,General,Tops,Blouses


In [33]:
# (rows, columns) of the raw frame
df.shape

(23486, 11)

In [34]:
# Drop rows with a missing 'Review Text'; preprocess_text() calls .lower() on each value and needs a string.
df2 = df.dropna(subset=['Review Text'])
# (rows, columns) after dropping
df2.shape

(22641, 11)

In [35]:
# Work on an independent copy so the column added later does not touch df2.
data = df2.copy()

In [36]:
# POS-tag prefixes kept by preprocess_text(); in the Penn Treebank tagset these
# cover adjectives (JJ*), nouns (NN*), adverbs (RB*) and verbs (VB*).
TAGS = ('JJ', 'NN', 'RB', 'VB')

In [37]:
# Clean every review: lemmatize, drop stop_words, and keep only words whose POS tag starts with one of TAGS.
data['Review Text2'] = data['Review Text'].apply(lambda x: preprocess_text(x, get_lemma=True, tags=TAGS, stop_words=stop_words))

In [38]:
# Display the cleaned text column
data['Review Text2']

0              absolutely wonderful silky sexy comfortable
1        love dress sooo pretty happened find store im ...
2        high hope dress really wanted work initially o...
3        love love love jumpsuit fun flirty fabulous ti...
4        shirt flattering due adjustable front tie perf...
                               ...                        
23481    happy snag dress great price slip flattering c...
23482    reminds maternity clothes soft stretchy shiny ...
23483    fit well top never worked im glad able store d...
23484    bought dress wedding summer cute unfortunately...
23485    dress lovely platinum feminine fit perfectly w...
Name: Review Text2, Length: 22641, dtype: object

In [39]:
import gensim

def sent_to_words(sentences):
    """Lazily tokenise each item of ``sentences``.

    Every item is coerced to ``str`` and passed to
    ``gensim.utils.simple_preprocess`` with ``deacc=True`` (accent marks are
    removed from tokens). One token list is yielded per input item, in input
    order.
    """
    yield from (
        gensim.utils.simple_preprocess(str(sentence), deacc=True)
        for sentence in sentences
    )

In [40]:
# Convert the cleaned column to a plain Python list of strings
dataset = data['Review Text2'].values.tolist()
# Tokenise every cleaned review; sent_to_words is a generator, so materialise it
data_words = list(sent_to_words(dataset))
# Show the token list of the first review
print(data_words[:1])

[['absolutely', 'wonderful', 'silky', 'sexy', 'comfortable']]


## LDA Modeling

In [41]:
# import packages for LDA
import gensim
import logging
import warnings
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# Library for visualization
import matplotlib.pyplot as plt

In [42]:
# Build the bigram and trigram models
# min_count=5 ignores words/pairs counted fewer than 5 times; threshold is the
# phrase score cut-off.
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) 
# A higher threshold yields fewer phrases.
# Phraser wraps the trained model in a frozen form used only for applying it.
bigram_mod = gensim.models.phrases.Phraser(bigram)

# The trigram model is trained on the corpus after one bigram pass, so it can
# join an already-merged bigram with a neighbouring token (min_count is left at its default).
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)
trigram_mod = gensim.models.phrases.Phraser(trigram)

In [43]:
# Inspect the phrases the bigram model detected, with their scores
bigram.export_phrases()

{'tracy_reese': 6149.393939393939,
 'foot_tall': 108.95927420737912,
 'lower_half': 120.09152830991911,
 'hourglass_figure': 133.73052959501558,
 'average_height': 138.34357841967056,
 'took_chance': 285.04582651391166,
 'form_fitting': 406.9405864197531,
 'others_mentioned': 138.37484170536092,
 'dry_cleaned': 447.99107142857144,
 'month_ago': 216.65124555160142,
 'added_bonus': 335.25468587296075,
 'pilcro_stet': 199.45663397005677,
 'reasonably_priced': 1006.11328125,
 'visual_interest': 1396.436746987952,
 'cowl_neck': 206.72577288941736,
 'elastic_waistband': 149.03132075471697,
 'cant_wait': 164.7727443778788,
 'wish_list': 122.38103070175438,
 'couldve_gotten': 120.80909943714822,
 'football_player': 5365.9375,
 'cami_underneath': 109.89607460685114,
 'baby_doll': 1015.101156069364,
 'body_type': 118.72181548602201,
 'last_week': 106.23759468039897,
 'cold_water': 614.7641975308642,
 'burnt_orange': 358.84096024006,
 'real_life': 838.5837209302326,
 'caught_eye': 714.77515516829

In [44]:
# Inspect the phrases the trigram model detected, with their scores
trigram.export_phrases()

{'tracy_reese': 1695460.0,
 'foot_tall': 165.49909971705392,
 'lower_half': 159.45036544850498,
 'hourglass_figure': 221.1289297068174,
 'average_height': 176.34915865384616,
 'took_chance': 784.8968001695274,
 'form_fitting': 2052.981086028066,
 'others_mentioned': 405.96160972356034,
 'dry_cleaned': 3518.525179856115,
 'month_ago': 4176.009852216749,
 'someone_else': 129.30074046534685,
 'added_bonus': 547.6763717805152,
 'pilcro_stet': 395.2121212121212,
 'reasonably_priced': 5016.153846153847,
 'lot_compliment': 109.60603721807735,
 'visual_interest': 10296.315789473683,
 'cowl_neck': 549.4367295990014,
 'warmer_climate': 111.32735808792147,
 'elastic_waistband': 206.28980125163332,
 'cant_wait': 694.2230015342706,
 'wish_list': 316.0812826249068,
 'couldve_gotten': 163.84422110552765,
 'football_player': 104336.0,
 'cami_underneath': 207.6807172487622,
 'baby_doll': 16578.813559322032,
 'body_type': 340.59444725545563,
 'last_week': 243.56876061120542,
 'cold_water': 2271.57957957

In [45]:
def process_words(texts):
    """Tokenise documents, drop stop words, and merge detected phrases.

    Args:
        texts: Iterable of documents. Each document is converted with
            ``str()`` before tokenisation.

    Returns:
        list: One token sequence per document, after the module-level
        ``bigram_mod`` and then ``trigram_mod`` have been applied.
    """
    # Build the lookup set once; the module-level stop_words is a list.
    stop_set = set(stop_words)
    # NOTE(review): this notebook passes token lists here, so str(doc) tokenises
    # the list's repr. Kept as-is to preserve the existing tokenisation.
    texts = [
        [word for word in simple_preprocess(str(doc)) if word not in stop_set]
        for doc in texts
    ]
    # Apply the bigram model exactly once before the trigram model. That matches
    # how the trigram model was trained (on bigram[data_words]); the previous
    # code ran bigram_mod a second time on already-merged tokens.
    texts = [trigram_mod[bigram_mod[doc]] for doc in texts]

    return texts

In [46]:
# Prepare documents for Gensim modeling: stop-word removal plus bigram/trigram merging
documents = process_words(data_words)

In [53]:
# Spot-check one processed document
documents[6]

['aded',
 'basket',
 'hte',
 'last',
 'mintue',
 'look',
 'person',
 'store',
 'pick',
 'went',
 'teh',
 'darkler',
 'color',
 'pale',
 'hte',
 'color',
 'really',
 'gorgeous',
 'turn',
 'mathced',
 'everythiing',
 'trying',
 'prefectly',
 'little',
 'baggy',
 'hte',
 'hte',
 'msallet',
 'size',
 'petite',
 'decided',
 'jkeep',
 'said',
 'matvehd',
 'everything',
 'ejans',
 'pant',
 'skirt',
 'waas',
 'trying',
 'kept',
 'oops']

In [54]:
# Map every distinct token in the processed documents to an integer id
dictionary = corpora.Dictionary(documents)
# Peek at the tokens behind the ten lowest ids
for token_id in range(10):
    print(dictionary[token_id])

absolutely
comfortable
sexy
silky
wonderful
bc
bought
definitely
dress
find


In [57]:
# Apply the format string with %; passing the length as a second print()
# argument printed a literal "%d".
print('dictionary size: %d' % len(dictionary))

dictionary size: %d 16503


In [58]:
from collections import Counter

# Drop rare tokens: anything counted fewer than min_count times across all documents.
min_count = 20
word_counter = Counter(word for words in documents for word in words)
# Words already removed by an earlier run of this cell are no longer in
# token2id; skipping them lets the cell be re-run without a KeyError.
removal_word_idxs = {
    dictionary.token2id[word]
    for word, count in word_counter.items()
    if count < min_count and word in dictionary.token2id
}

dictionary.filter_tokens(removal_word_idxs)
# Reassign ids so they are contiguous after the removal
dictionary.compactify()
# Apply the format string with %; passing the length as a second print()
# argument printed a literal "%d".
print('dictionary size: %d' % len(dictionary))

dictionary size: %d 2086
