## 아마존 제품 리뷰 문서 클러스터링

### 0. 아마존 제품 리뷰 TDM 분석 결과 및 원본 파일 불러오기

In [1]:
import pandas as pd
import joblib

In [2]:
# Load the saved analysis bundle from disk.
# NOTE(review): joblib.load unpickles arbitrary objects -- only open
# 'amazon.pkl' if it comes from a trusted source.
with open('amazon.pkl', 'rb') as f:
    data = joblib.load(f)
# Inject every entry of the bundle into the current namespace as a variable.
# NOTE(review): later cells use names that are never assigned explicitly
# (e.g. `vectorizer`); presumably they come from here -- confirm the keys.
locals().update(data)

In [3]:
# Read the raw labelled review file: tab-separated with no header row.
df = pd.read_csv(
    'amazon_cells_labelled.txt',
    header=None,
    sep='\t',
)

In [4]:
# Pull the term-document matrix out of the loaded bundle by its 'tdm' key.
tdm = data['tdm']

#### 클러스터별 상위 10개 사용 단어 확인 함수

In [5]:
def top10(words, labels, tdm, n_clusters=4, n_top=10):
    """Return the highest-weighted words of each cluster as a DataFrame.

    Args:
        words: Sequence of vocabulary terms, one per column of ``tdm``.
        labels: Array of cluster ids, one per row of ``tdm``; it is compared
            against each cluster id to build a boolean row mask.
        tdm: Term-document matrix (rows are documents, columns are terms).
        n_clusters: Number of cluster ids (``0 .. n_clusters - 1``) to report.
            Defaults to 4, the value that used to be hard-coded.
        n_top: Number of words to keep per cluster. Defaults to 10.

    Returns:
        pandas.DataFrame with one row per cluster id and up to ``n_top``
        columns, ordered from the highest to the lowest column total.
    """
    from operator import itemgetter

    by_weight = itemgetter(1)
    freq_words = []
    for cluster_id in range(n_clusters):
        # Column-wise total over the documents assigned to this cluster.
        count = tdm[labels == cluster_id, :].sum(axis=0)
        # sorted() is stable, so equally weighted words keep vocabulary order.
        ranked = sorted(zip(words, count.flat), key=by_weight, reverse=True)
        freq_words.append([w for w, _ in ranked[:n_top]])
    return pd.DataFrame(freq_words)

### 1. 유클리드거리 기준으로 4개의 클러스터로 묶기

In [6]:
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.cluster import SpectralClustering

In [7]:
# Spectral clustering into 4 groups on the first 100 documents only, using the
# estimator's default affinity; the seed is fixed for reproducibility.
cl = SpectralClustering(random_state=1234, n_clusters=4)
cl.fit(tdm[:100])
labels = cl.labels_



In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [9]:
# Vocabulary terms of the fitted vectorizer.
# NOTE(review): `vectorizer` is not assigned in the visible cells; presumably
# it is injected from the loaded bundle -- confirm.
# get_feature_names() was removed in scikit-learn 1.2, so prefer its
# replacement and fall back only on older releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    words = list(vectorizer.get_feature_names_out())
else:
    words = vectorizer.get_feature_names()



In [10]:
# Display the cluster id assigned to each of the 100 clustered documents.
labels

array([0, 0, 2, 0, 2, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0,
       0, 0, 0, 2, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 2, 3, 2, 3, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 3, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 2, 2, 0, 0, 2, 0, 0, 3, 0, 2, 3, 0, 0, 0, 0, 0, 2, 0, 2, 0,
       3, 0, 0, 0, 2, 0, 0, 0, 0, 3, 0, 1])

In [11]:
# `labels` was fitted on tdm[:100], so pass the same 100 rows; indexing the
# full matrix with a 100-element boolean mask is a shape mismatch.
top10(words, labels, tdm[:100])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,phone,battery,good,ve,time,right,money,doesn,use,headsets
1,disappointed,decision,battery,10,100,11,12,13,15,15g
2,great,works,worked,item,phone,choice,jawbone,mic,quality,situations
3,product,love,worthless,impressed,thing,waaay,sensitive,big,handy,cheaper


### 2. 코사인 유사도 기준으로 4개의 클러스터로 묶기

In [12]:
from sklearn.metrics.pairwise import cosine_similarity

In [13]:
# Spectral clustering on the first 100 documents with cosine similarity as the
# affinity. The built-in 'cosine' kernel builds the affinity matrix in one
# vectorised call, whereas a callable affinity is evaluated pair by pair.
cl_cos = SpectralClustering(n_clusters=4, affinity='cosine', random_state=1234)
labels_cos = cl_cos.fit_predict(tdm[:100])



In [14]:
# `labels_cos` was fitted on tdm[:100], so pass the same 100 rows; indexing the
# full matrix with a 100-element boolean mask is a shape mismatch.
top10(words, labels_cos, tdm[:100])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,product,love,money,features,waste,thing,worthless,impressed,waaay,want
1,battery,disappointed,days,decision,runs,right,problem,bought,quickly,buy
2,way,ear,wear,comfortable,holds,unless,clip,wonder,plug,music
3,great,phone,works,good,quality,use,case,ve,mic,doesn


### 3. K-Means를 활용한 클러스터링

#### 정규화 전

In [15]:
from sklearn.cluster import KMeans

In [16]:
# K-Means with 4 clusters over every document of the un-normalised matrix;
# the seed is fixed for reproducibility.
km = KMeans(random_state=1234, n_clusters=4)
labels_km = km.fit(tdm).labels_

In [17]:
# Most frequent words per K-Means cluster (labels cover every row of tdm).
top10(words=words, labels=labels_km, tdm=tdm)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,phone,good,product,quality,headset,use,recommend,excellent,work,works
1,don,money,waste,buy,time,product,dont,wasted,make,mistake
2,great,phone,works,price,product,deal,item,working,device,worked
3,battery,life,long,original,phone,good,great,works,dying,buy


#### 정규화 진행 후 클러스터링 변화 확인

In [18]:
from sklearn.preprocessing import Normalizer

In [19]:
# Scale each document row to unit norm. copy=True keeps `tdm` untouched:
# with copy=False the input may be normalised in place, so re-running the
# "before normalization" cells would silently use normalised data.
nom = Normalizer(copy=True)
pos = nom.fit_transform(tdm)

In [20]:
# Refit the same K-Means estimator on the normalised matrix and list the top
# words of each resulting cluster.
labels_km2 = km.fit(pos).labels_
top10(words=words, labels=labels_km2, tdm=pos)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,phone,good,product,quality,headset,use,recommend,excellent,work,works
1,don,money,waste,buy,time,product,dont,wasted,make,mistake
2,great,phone,works,price,product,deal,item,working,device,worked
3,battery,life,long,original,phone,good,great,works,dying,buy
