# Airline Review Topic Modeling

# 1. 라이브러브러리 가져오기 

In [1]:
# 라이브러리
import pandas as pd
import numpy as np
import string
import re
# NLTK 금지어 가져오기
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

import gensim

In [2]:
# 변수 초기화
lemmatizer = WordNetLemmatizer()

In [3]:
# 함수

# 금지어 확장 함수
def set_stop_words(extended_list):
    '''
    
    '''
    stop_words = stopwords.words('english')
    stop_words.extend(extended_list)
    return stop_words

extended_list = ['from', 'subject', 're', 'edu', 
    'use', 'not', 'would', 'say', 'could', '_', 'be', 'know', 
    'good', 'go', 'get', 'do', 'done', 'try', 'many', 'some', 
    'nice', 'thank', 'think', 'see', 'rather', 'easy', 'easily', 
    'lot', 'lack', 'make', 'want', 'seem', 'run', 'need', 
    'even', 'right', 'line', 'even', 'also', 'may', 'take', 'come', 'app', 'non']




In [4]:
# 텍스트 전처리 함수
def preprocess_text(text_data,get_lemma=True, tags=(), stop_words=[]):
    # 소문자 변경
    text_data = text_data.lower()
    # URL 제거
    text_data = re.sub(r'((www.\S+)|(https?://\S+))', r"", text_data)
    # HTML 태그 제거
    text_data = re.sub(r'<[^>]+>', r'', text_data)
    # 숫자 제거
    text_data = re.sub(r'[0-9]\S+', r'', text_data)
    # 문장부호 제거
    text_data = [char for char in text_data if char not in string.punctuation]
    text_data = "".join(text_data)
    # 금지어 제거
    text_data = [word for word in text_data.split() if word.lower() not in stop_words]
    text_data = " ".join(text_data)
    # 래마타이즈
    if get_lemma==True:
        text_data = [lemmatizer.lemmatize(word) for word in text_data.split()]
        text_data = " ".join(text_data)
    # POS 필터 
    if len(tags)>0:
        text_data = [word for word, pos in nltk.pos_tag(nltk.word_tokenize(text_data)) if pos.startswith(tags)]
        text_data = " ".join(text_data)

    return text_data



In [5]:
def sent_to_words(sentences):
    for sent in sentences:
        sent = gensim.utils.simple_preprocess(str(sent), deacc=True) 
        yield(sent) 
        
sent_to_words(["sentences"])

<generator object sent_to_words at 0x0000014E0F481000>

# 2. 데이터로딩

In [6]:
# 데이터 가져오기
file_name = "./dataset/Womens Clothing E-Commerce Reviews.csv"
df = pd.read_csv(file_name )
df.head()

Unnamed: 0.1,Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name
0,0,767,33,,Absolutely wonderful - silky and sexy and comf...,4,1,0,Initmates,Intimate,Intimates
1,1,1080,34,,Love this dress! it's sooo pretty. i happene...,5,1,4,General,Dresses,Dresses
2,2,1077,60,Some major design flaws,I had such high hopes for this dress and reall...,3,0,0,General,Dresses,Dresses
3,3,1049,50,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5,1,0,General Petite,Bottoms,Pants
4,4,847,47,Flattering shirt,This shirt is very flattering to all due to th...,5,1,6,General,Tops,Blouses


In [7]:
df.shape

(23486, 11)

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23486 entries, 0 to 23485
Data columns (total 11 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   Unnamed: 0               23486 non-null  int64 
 1   Clothing ID              23486 non-null  int64 
 2   Age                      23486 non-null  int64 
 3   Title                    19676 non-null  object
 4   Review Text              22641 non-null  object
 5   Rating                   23486 non-null  int64 
 6   Recommended IND          23486 non-null  int64 
 7   Positive Feedback Count  23486 non-null  int64 
 8   Division Name            23472 non-null  object
 9   Department Name          23472 non-null  object
 10  Class Name               23472 non-null  object
dtypes: int64(6), object(5)
memory usage: 2.0+ MB


In [9]:
# 결측 값 제거
data = df.dropna(subset=["Review Text"]).copy()

In [10]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 22641 entries, 0 to 23485
Data columns (total 11 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   Unnamed: 0               22641 non-null  int64 
 1   Clothing ID              22641 non-null  int64 
 2   Age                      22641 non-null  int64 
 3   Title                    19675 non-null  object
 4   Review Text              22641 non-null  object
 5   Rating                   22641 non-null  int64 
 6   Recommended IND          22641 non-null  int64 
 7   Positive Feedback Count  22641 non-null  int64 
 8   Division Name            22628 non-null  object
 9   Department Name          22628 non-null  object
 10  Class Name               22628 non-null  object
dtypes: int64(6), object(5)
memory usage: 2.1+ MB


# 3. 텍스트 전처리

In [11]:
TAGS = ("JJ", "NN", "RB", "VB")
stop_words= set_stop_words(extended_list)

In [12]:
data['Review Text2'] = data.apply(lambda x: 
                                  preprocess_text(x['Review Text'], 
                                                  get_lemma=True, 
                                                  tags=TAGS,
                                                  stop_words= stop_words), 
                                                  axis=1)


In [13]:
data['Review Text2'].head()

0          absolutely wonderful silky sexy comfortable
1    love dress sooo pretty happened find store im ...
2    high hope dress really wanted work initially o...
3    love love love jumpsuit fun flirty fabulous ti...
4    shirt flattering due adjustable front tie perf...
Name: Review Text2, dtype: object

# 4. LDA 모델링

In [14]:
# Convert to list
dataset = data['Review Text2'].values.tolist()
data_words = list(sent_to_words(dataset))
print(data_words[:1])


[['absolutely', 'wonderful', 'silky', 'sexy', 'comfortable']]


In [15]:
dataset = data['Review Text2'].values.tolist()

In [16]:
dataset[:1]

['absolutely wonderful silky sexy comfortable']

In [17]:
data_words = list(sent_to_words(dataset))

In [18]:
data_words[:1]

[['absolutely', 'wonderful', 'silky', 'sexy', 'comfortable']]

In [19]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
bigram_mod = gensim.models.phrases.Phraser(bigram)


In [31]:
bigram.export_phrases()

{'tracy_reese': 6151.303948576676,
 'foot_tall': 108.99311708951214,
 'lower_half': 120.12882888361816,
 'hourglass_figure': 133.77206645898235,
 'average_height': 138.38654810217236,
 'took_chance': 285.1343620918089,
 'form_fitting': 407.06698265726044,
 'others_mentioned': 138.417821098277,
 'dry_cleaned': 448.1302179962894,
 'month_ago': 216.71853769006793,
 'added_bonus': 335.35881638320024,
 'pilcro_stet': 199.5185854414042,
 'reasonably_priced': 1006.42578125,
 'visual_interest': 1396.870481927711,
 'cowl_neck': 206.78998216409036,
 'elastic_waistband': 149.0776100628931,
 'cant_wait': 164.8239229912394,
 'wish_list': 122.41904239766082,
 'couldve_gotten': 120.84662288930582,
 'football_player': 5367.604166666666,
 'cami_underneath': 109.93020846031939,
 'baby_doll': 1015.4164477141355,
 'body_type': 118.75869062526408,
 'last_week': 106.27059220558459,
 'cold_water': 614.9551440329218,
 'burnt_orange': 358.9524166755975,
 'real_life': 838.8441860465117,
 'caught_eye': 714.99716

In [20]:
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)
trigram_mod = gensim.models.phrases.Phraser(trigram)

In [33]:
trigram.export_phrases()

{'tracy_reese': 1695973.5,
 'foot_tall': 165.5492240418417,
 'lower_half': 159.49865780730894,
 'hourglass_figure': 221.19590250794772,
 'average_height': 176.4025691105769,
 'took_chance': 785.1345200254291,
 'form_fitting': 2053.6028676021965,
 'others_mentioned': 406.0845623656711,
 'dry_cleaned': 3519.590827338129,
 'month_ago': 4177.274630541872,
 'someone_else': 129.3399014778325,
 'added_bonus': 547.8422452407615,
 'pilcro_stet': 395.33181818181816,
 'reasonably_priced': 5017.673076923077,
 'lot_compliment': 109.63923334190892,
 'visual_interest': 10299.434210526315,
 'cowl_neck': 549.6031362146981,
 'warmer_climate': 111.36107554417414,
 'elastic_waistband': 206.3522797606767,
 'cant_wait': 694.4332592291073,
 'wish_list': 316.1770134228188,
 'couldve_gotten': 163.89384422110552,
 'football_player': 104367.6,
 'cami_underneath': 207.74361702127658,
 'baby_doll': 16583.83474576271,
 'body_type': 340.69760229813767,
 'last_week': 243.6425297113752,
 'cold_water': 2272.26756756756

In [21]:
# !python3 -m spacy download en  # run in terminal once
from gensim.utils import simple_preprocess

def process_words(texts):
    texts = [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]
    texts = [bigram_mod[doc] for doc in texts]
    texts = [trigram_mod[bigram_mod[doc]] for doc in texts]
    return texts


In [22]:
documents = process_words(data_words) 

In [23]:
documents[5]

['love',
 'tracy_reese',
 'dress',
 'petite',
 'foot_tall',
 'usually',
 'wear',
 'brand',
 'dress',
 'pretty',
 'package',
 'dress',
 'skirt',
 'long',
 'full',
 'overwhelmed',
 'small',
 'frame',
 'stranger',
 'alteration',
 'shortening',
 'narrowing',
 'skirt',
 'away',
 'embellishment',
 'garment',
 'love',
 'color',
 'idea',
 'style',
 'work',
 'returned',
 'dress']

In [24]:
#  Gensim 모델링을 위한 Dictionary 세트 준비 
import gensim.corpora as corpora
dictionary = corpora.Dictionary(documents)

In [25]:
for i in range(0, 10):
    print(dictionary.get(i))

absolutely
comfortable
sexy
silky
wonderful
bc
bought
definitely
dress
find


In [26]:
print('dictionary size : %d' % len(dictionary))


dictionary size : 16508


In [27]:
# 필터링
from collections import Counter
min_count = 20
word_counter = Counter((word for words in documents for word in words))
removal_word_idxs = {dictionary.token2id[word] for word, count in word_counter.items() if count < min_count}
dictionary.filter_tokens(removal_word_idxs)
dictionary.compactify()
print('dictionary size : %d' % len(dictionary))


dictionary size : 2087


In [28]:
# 말뭉치 생성

# Create Corpus: Term Document Frequency
corpus = [dictionary.doc2bow(text) for text in documents]
# View
print(corpus[:1][0][:30])


[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1)]


In [34]:
# Build Base LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                            id2word=dictionary,
                                            num_topics=4,
                                            random_state=1969,
                                            per_word_topics=True)

In [33]:
# 토픽 인쇄
from pprint import pprint
pprint(lda_model.print_topics())


[(0,
  '0.033*"love" + 0.026*"great" + 0.022*"dress" + 0.019*"wear" + 0.019*"color" '
  '+ 0.016*"fit" + 0.016*"perfect" + 0.015*"comfortable" + 0.013*"look" + '
  '0.012*"bought"'),
 (1,
  '0.045*"size" + 0.025*"fit" + 0.021*"small" + 0.017*"ordered" + 0.017*"im" + '
  '0.016*"dress" + 0.013*"petite" + 0.011*"love" + 0.011*"wear" + '
  '0.009*"large"'),
 (2,
  '0.036*"dress" + 0.020*"fit" + 0.016*"look" + 0.015*"fabric" + 0.015*"color" '
  '+ 0.015*"size" + 0.014*"top" + 0.011*"im" + 0.010*"love" + 0.009*"little"'),
 (3,
  '0.043*"top" + 0.017*"look" + 0.016*"im" + 0.016*"back" + 0.013*"really" + '
  '0.012*"cute" + 0.011*"love" + 0.010*"small" + 0.010*"fit" + 0.009*"fabric"')]


In [35]:
from pprint import pprint
pprint(lda_model.print_topics())

[(0,
  '0.033*"love" + 0.026*"great" + 0.022*"dress" + 0.019*"wear" + 0.019*"color" '
  '+ 0.016*"fit" + 0.016*"perfect" + 0.015*"comfortable" + 0.013*"look" + '
  '0.012*"bought"'),
 (1,
  '0.045*"size" + 0.025*"fit" + 0.021*"small" + 0.017*"ordered" + 0.017*"im" + '
  '0.016*"dress" + 0.013*"petite" + 0.011*"love" + 0.011*"wear" + '
  '0.009*"large"'),
 (2,
  '0.036*"dress" + 0.020*"fit" + 0.016*"look" + 0.015*"fabric" + 0.015*"color" '
  '+ 0.015*"size" + 0.014*"top" + 0.011*"im" + 0.010*"love" + 0.009*"little"'),
 (3,
  '0.043*"top" + 0.017*"look" + 0.016*"im" + 0.016*"back" + 0.013*"really" + '
  '0.012*"cute" + 0.011*"love" + 0.010*"small" + 0.010*"fit" + 0.009*"fabric"')]


In [52]:
def format_topics_sentences(lda_model=None, corpus=corpus, documents=documents):
    sent_topics_df = pd.DataFrame()
    for i, row_list in enumerate(lda_model[corpus]):
        # pprint(row_list[0])
        row = row_list[0] if lda_model.per_word_topics else row_list 
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # pprint(row)
        # print(i)
        # pprint(row_list)
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0: # => dominant topic
                wp = lda_model.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                new_row = pd.DataFrame([int(topic_num),round(prop_topic,4), topic_keywords]).T
                sent_topics_df = pd.concat([new_row, sent_topics_df.loc[:]]).reset_index(drop=True)
            else:
                break

        #break;
    # Add original text to the end of the output
    contents = pd.Series(documents)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    sent_topics_df.columns = ['Dominant_Topic','Perc_Contribution','Topic_Keywords','Text']
    return sent_topics_df


In [53]:
sent_topics_df = format_topics_sentences(lda_model, corpus, documents)

In [54]:
sent_topics_df.shape

(22641, 4)

In [55]:
sent_topics_df.head()

Unnamed: 0,Dominant_Topic,Perc_Contribution,Topic_Keywords,Text
0,0,0.9133,"love, great, dress, wear, color, fit, perfect,...","[absolutely, wonderful, silky, sexy, comfortable]"
1,2,0.7646,"dress, fit, look, fabric, color, size, top, im...","[love, dress, sooo, pretty, happened, find, st..."
2,1,0.9494,"size, fit, small, ordered, im, dress, petite, ...","[high, hope, dress, really, wanted, work, init..."
3,3,0.5461,"top, look, im, back, really, cute, love, small...","[love, love, love, jumpsuit, fun, flirty, fabu..."
4,0,0.5436,"love, great, dress, wear, color, fit, perfect,...","[shirt, flattering, due, adjustable, front, ti..."


In [56]:
df_dominant_topic = sent_topics_df.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']
df_dominant_topic.head(5)


Unnamed: 0,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text
0,0,0,0.9133,"love, great, dress, wear, color, fit, perfect,...","[absolutely, wonderful, silky, sexy, comfortable]"
1,1,2,0.7646,"dress, fit, look, fabric, color, size, top, im...","[love, dress, sooo, pretty, happened, find, st..."
2,2,1,0.9494,"size, fit, small, ordered, im, dress, petite, ...","[high, hope, dress, really, wanted, work, init..."
3,3,3,0.5461,"top, look, im, back, really, cute, love, small...","[love, love, love, jumpsuit, fun, flirty, fabu..."
4,4,0,0.5436,"love, great, dress, wear, color, fit, perfect,...","[shirt, flattering, due, adjustable, front, ti..."


In [60]:
# The Most Representative Sentence for Each Topic
pd.options.display.max_colwidth = 100

sent_topics_sorteddf_mallet = pd.DataFrame()
sent_topics_outdf_grpd = sent_topics_df.groupby('Dominant_Topic')
print(sent_topics_outdf_grpd)

for i, grp in sent_topics_outdf_grpd:
    sent_topics_sorteddf_mallet = pd.concat([sent_topics_sorteddf_mallet, 
                                             grp.sort_values(['Perc_Contribution'], 
                                                             ascending=False).head(1)], 
                                                             axis=0)
print(sent_topics_sorteddf_mallet.head())
#Reset Index    
sent_topics_sorteddf_mallet.reset_index(drop=True, inplace=True)
# Format
sent_topics_sorteddf_mallet.columns = ['Topic_Num', "Topic_Perc_Contrib", "Keywords", "Representative Text"]
# Show
print(sent_topics_sorteddf_mallet.head())


<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000014E1D2EA600>
      Dominant_Topic Perc_Contribution  \
14692              0            0.9828   
13030              1            0.9809   
12860              2            0.9839   
17219              3            0.9811   

                                                                 Topic_Keywords  \
14692  love, great, dress, wear, color, fit, perfect, comfortable, look, bought   
13030           size, fit, small, ordered, im, dress, petite, love, wear, large   
12860              dress, fit, look, fabric, color, size, top, im, love, little   
17219               top, look, im, back, really, cute, love, small, fit, fabric   

                                                                                                      Text  
14692  [living, casual, life, style, key, largo, fl, love, fitted, tshirt, instead, boxy, shape, im, ye...  
13030  [ordered, online, excited, unfortunately, didnt, work, maybe, style, iss

In [61]:
def get_most_contribute_document(sent_topics_df):
    # The Most Representative Sentence for Each Topic
    pd.options.display.max_colwidth = 100

    sent_topics_sorteddf_mallet = pd.DataFrame()
    sent_topics_outdf_grpd = sent_topics_df.groupby('Dominant_Topic')
    # print(sent_topics_outdf_grpd)

    for i, grp in sent_topics_outdf_grpd:
        sent_topics_sorteddf_mallet = pd.concat([sent_topics_sorteddf_mallet, 
                                                grp.sort_values(['Perc_Contribution'], 
                                                                ascending=False).head(1)], 
                                                                axis=0)
    # print(sent_topics_sorteddf_mallet.head())
    #Reset Index    
    sent_topics_sorteddf_mallet.reset_index(drop=True, inplace=True)
    # Format
    sent_topics_sorteddf_mallet.columns = ['Topic_Num', "Topic_Perc_Contrib", "Keywords", "Representative Text"]
    # Show
    # print(sent_topics_sorteddf_mallet.head())
    return sent_topics_sorteddf_mallet

In [62]:
sent_topics_sorteddf_mallet = get_most_contribute_document(sent_topics_df)

In [63]:
sent_topics_sorteddf_mallet.head()

Unnamed: 0,Topic_Num,Topic_Perc_Contrib,Keywords,Representative Text
0,0,0.9828,"love, great, dress, wear, color, fit, perfect, comfortable, look, bought","[living, casual, life, style, key, largo, fl, love, fitted, tshirt, instead, boxy, shape, im, ye..."
1,1,0.9809,"size, fit, small, ordered, im, dress, petite, love, wear, large","[ordered, online, excited, unfortunately, didnt, work, maybe, style, issue, fit, maybe, im, long..."
2,2,0.9839,"dress, fit, look, fabric, color, size, top, im, love, little","[found, jean, fitted, slouchy, reference, im, im, heavier, thru, hip, thigh, ordered, petite, us..."
3,3,0.9811,"top, look, im, back, really, cute, love, small, fit, fabric","[love, print, design, photo, justice, fell, love, store, find, size, ordered, size, online, stil..."
