In [110]:
# Suppress all warning messages so they are not printed in the notebook output.
import warnings
warnings.filterwarnings('ignore')

아래와 같이 텍스트 파일을 읽어 옵니다.

In [9]:
import numpy as np
from os import listdir
from os.path import isfile, join

mypath = './example_En_docs/'
# Keep only regular files (no sub-directories), ordered by file name so that
# document indices are stable between runs.
onlyfiles = sorted(name for name in listdir(mypath) if isfile(join(mypath, name)))

# Read every file as UTF-8 text; total_docs[i] holds the content of onlyfiles[i].
total_docs = []
for name in onlyfiles:
    with open(mypath + name, 'r', encoding='utf8') as handle:
        total_docs.append(handle.read())

In [11]:
# Number of documents that were loaded
len(total_docs)

15

In [13]:
# Inspect the raw text of the first document
total_docs[0]

'Ryen Aleman was engrossed in the virtual world, focused on the video game in front of him, when he realized the gunshots he was hearing were real. He ducked and bolted for a restroom to take cover.\nAnother mass shooting was unfolding in Florida, this time at a tournament for competitive players of the football video game, Madden, in Jacksonville. The winners would go on to a higher level tournament in Las Vegas in October, where large cash prizes could be won.\nParticipants had gathered at the Jacksonville Landing Complex, an open-air marketplace with stores, bars and restaurants along the St. Johns River.\n\nDavid Katz, a 24-year-old gamer from Baltimore, Maryland, was in Jacksonville for the tournament at GLHF Game Bar, in the back of a pizza restaurant. On Sunday, he brought a gun into the venue and opened fire, killing two people. Then he turned the gun on himself, Jacksonville Sheriff Mike Williams said. Police have not released a possible motive.\nKatz used at least one handgun

In [15]:
# 전처리 수행하기
import re
import nltk
from nltk.corpus import stopwords

def do_En_preprocessing(text, customized_stopwords=None):
    """Extract lemmatized, stopword-free nouns from an English text.

    Pipeline: remove symbols -> lowercase -> tokenize -> POS-tag ->
    keep noun-tagged tokens -> lemmatize -> drop stopwords.

    Parameters
    ----------
    text : str
        Raw document text.
    customized_stopwords : iterable of str, optional
        Extra words to drop in addition to NLTK's English stopword list.
        ``None`` (the default) means no extra words.

    Returns
    -------
    list of str
        Lemmatized nouns in their original order; repeated words are kept.
    """
    # Symbols are deleted (not replaced by a space), so "what'd" becomes
    # "whatd" and "I'm" becomes "im" before tokenization.
    cleaned_content = re.sub(r'[^\w\d\s]', '', text)
    cleaned_content = cleaned_content.lower()  # case conversion, upper -> lower
    word_tokens = nltk.word_tokenize(cleaned_content)  # tokenization
    tokens_pos = nltk.pos_tag(word_tokens)  # POS tagging

    # Keep every token whose tag contains 'NN' (the noun tags).
    noun_words = [word for word, pos in tokens_pos if 'NN' in pos]

    wlem = nltk.WordNetLemmatizer()  # lemmatization
    lemmatized_words = [wlem.lemmatize(word) for word in noun_words]

    # Stopword removal: merge the built-in and custom stopwords into one set and
    # filter in a single pass. The result is the same as removing each stopword
    # occurrence one by one, without the repeated list scans of list.remove().
    excluded_words = set(stopwords.words('english'))
    excluded_words.update(customized_stopwords or ())

    return [word for word in lemmatized_words if word not in excluded_words]

In [17]:
customized_stopwords = ['today', 'yesterday', 'tomorrow'] # Build the custom stopword list

In [19]:
# Run the preprocessing on every document; each entry is that document's list of nouns
docs_nouns = [do_En_preprocessing(doc, customized_stopwords) for doc in total_docs]

In [21]:
print(docs_nouns[0]) # Preprocessed contents of the first document

['aleman', 'world', 'video', 'game', 'front', 'gunshot', 'restroom', 'cover', 'mass', 'shooting', 'florida', 'time', 'tournament', 'player', 'football', 'video', 'game', 'madden', 'jacksonville', 'winner', 'level', 'tournament', 'vega', 'october', 'cash', 'prize', 'participant', 'jacksonville', 'marketplace', 'store', 'bar', 'restaurant', 'st', 'john', 'river', 'david', 'gamer', 'baltimore', 'maryland', 'jacksonville', 'tournament', 'game', 'bar', 'back', 'pizza', 'restaurant', 'sunday', 'gun', 'venue', 'fire', 'people', 'gun', 'mike', 'williams', 'police', 'motive', 'katz', 'handgun', 'sheriff', 'people', 'wound', 'people', 'area', 'williams', 'victim', 'condition', 'hospital', 'sheriff', 'horror', 'stream', 'event', 'game', 'shot', 'people', 'person', 'f', 'whatd', 'call', 'williams', 'officer', 'scene', 'minute', 'bathroom', 'minute', 'ran', 'scene', 'im', 'member', 'jacksonville', 'fire', 'station', 'rock', 'training', 'parking', 'garage', 'street', 'landing', 'dozen', 'people', 'p

In [27]:
documents_filtered = [' '.join(doc) for doc in docs_nouns]
# After removing unneeded words, convert back to a list of strings so it can be turned into a DTM

In [54]:
# Inspect the first document after noun filtering and re-joining
documents_filtered[0]

'aleman world video game front gunshot restroom cover mass shooting florida time tournament player football video game madden jacksonville winner level tournament vega october cash prize participant jacksonville marketplace store bar restaurant st john river david gamer baltimore maryland jacksonville tournament game bar back pizza restaurant sunday gun venue fire people gun mike williams police motive katz handgun sheriff people wound people area williams victim condition hospital sheriff horror stream event game shot people person f whatd call williams officer scene minute bathroom minute ran scene im member jacksonville fire station rock training parking garage street landing dozen people president jacksonville association fire fighter person shirt chest wyse person wound firefighter aid others landing wyse officer victim body katzs body hour authority agent bureau alcohol tobacco firearm explosive fbi police family home baltimore katz gaming bread tournament multiple gamers cnn buf

CountVectorizer 사용해 보기

In [62]:
from sklearn.feature_extraction.text import CountVectorizer # frequency based DTM
# Build a frequency-based document-term matrix (DTM) with default settings.
tf_vectorizer = CountVectorizer()
DTM_tf = tf_vectorizer.fit_transform(documents_filtered)
# Expand the sparse result into a dense NumPy array (zeros are stored explicitly).
DTM_TF = DTM_tf.toarray()

In [114]:
# Shape of the dense term-frequency DTM (one row per input document)
DTM_TF.shape

(15, 1166)

In [116]:
# Term-count vector of the first document
DTM_TF[0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [118]:
# Length of one document vector
len(DTM_TF[0])

1166

유클리디안 거리와 코사인 유사도 계산해 보기

In [121]:
# Euclidean distance from the first document (row 0) to rows 1, 3 and 5
first_doc_tf = DTM_TF[0]
d1d2_tf, d1d4_tf, d1d6_tf = (
    np.linalg.norm(first_doc_tf - DTM_TF[row]) for row in (1, 3, 5)
)
print(d1d2_tf, d1d4_tf, d1d6_tf)

25.11971337416094 31.04834939252005 35.94440151122286


In [123]:
# Cosine similarity between the first document (row 0) and rows 1, 3 and 5
def _cosine_similarity(u, v):
    """Return the dot product of u and v divided by the product of their L2 norms."""
    return np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v))

d1d2_cos_tf = _cosine_similarity(DTM_TF[0], DTM_TF[1])
d1d4_cos_tf = _cosine_similarity(DTM_TF[0], DTM_TF[3])
d1d6_cos_tf = _cosine_similarity(DTM_TF[0], DTM_TF[5])
print(d1d2_cos_tf, d1d4_cos_tf, d1d6_cos_tf)

0.48098450817589 0.0890419846686842 0.03920003859698009


## KMeans 적용하기

In [126]:
from sklearn.cluster import KMeans    

In [128]:
# Cluster the documents using the term-frequency (TF) vectors.
# random_state is fixed so the cluster assignment is reproducible across runs
# (same seed as the silhouette-score loop in this notebook).
kmeans = KMeans(n_clusters=5, random_state=42)
clusters_TF = kmeans.fit_predict(DTM_TF) # Fit K-means and get each document's cluster label

'clusters_TF = kmeans.fit_predict(DTM_TF)'를 실행하는데 있어 AttributeError: 'NoneType' object has no attribute 'split' 에러가 발생하는 경우, 아래 코드를 실행해서 threadpoolctl 모듈의 버전을 최신 버전으로 업데이트 해주세요.<br>
!pip install -U threadpoolctl

In [130]:
clusters_TF  ## Note: the 4th and 5th documents cover a similar topic but fall into different clusters -> because this is a frequency-based analysis
## A TF-IDF based analysis is worth trying

array([2, 2, 2, 2, 4, 2, 2, 2, 0, 2, 2, 2, 1, 3, 1])

TfidfVectorizer 사용해보기

In [134]:
from sklearn.feature_extraction.text import TfidfVectorizer # tf-idf based DTM
# Build a TF-IDF weighted document-term matrix with default settings.
tfidf_vectorizer = TfidfVectorizer()
DTM_tfidf = tfidf_vectorizer.fit_transform(documents_filtered)
# Expand the sparse result into a dense NumPy array.
DTM_TFIDF = DTM_tfidf.toarray()

In [136]:
# Cluster the documents using the TF-IDF vectors.
# random_state is fixed so the cluster assignment is reproducible across runs
# (same seed as the silhouette-score loop in this notebook).
kmeans = KMeans(n_clusters=5, random_state=42)
clusters_TFIDF = kmeans.fit_predict(DTM_TFIDF)

In [137]:
# Cluster label assigned to each document when using TF-IDF features
clusters_TFIDF

array([1, 1, 0, 1, 1, 3, 4, 3, 2, 2, 2, 2, 0, 0, 0])

# 실루엣 스코어 계산해 보기

In [141]:
from sklearn.metrics import silhouette_score
# Fit K-means for k = 2..10 on the TF-IDF vectors and print the silhouette
# score of each clustering, one "k score" pair per line.
for k in range(2, 11):
    kmeans = KMeans(random_state=42, n_clusters=k)
    cluster = kmeans.fit_predict(DTM_TFIDF)
    score = silhouette_score(DTM_TFIDF, cluster)
    print(k, score)

2 0.07188291038986777
3 0.09617966109994258
4 0.19872347429292278
5 0.16898785419367665
6 0.14053530435223188
7 0.18969621768111508
8 0.14301650998211804
9 0.1345721911057098
10 0.13133890768345496


고차원의 벡터는 시각화가 안됨
> 2차원상에 표현할 필요가 있음 (차원축소)