In [1]:
from os import listdir
from os.path import isfile, join

# Folder that holds the documents to analyse
mypath = './docs/'
# Keep regular files only (anything that is not a file is skipped), in sorted name order
onlyfiles = sorted(name for name in listdir(mypath) if isfile(join(mypath, name)))

In [2]:
# Show the sorted file names collected from mypath
onlyfiles

['apple_iphone1.txt',
 'apple_iphone2.txt',
 'kevin_durant1.txt',
 'serena_williams1.txt']

In [3]:
# Read the contents of each file into total_docs (same order as onlyfiles)
total_docs = []
for file in onlyfiles:
    # Build the path with join(), exactly as the directory listing does, so
    # this keeps working even if mypath is changed to a value without a
    # trailing separator.
    file_path = join(mypath, file)
    with open(file_path, 'r', encoding='utf8') as f:
        content = f.read()
    total_docs.append(content)

In [4]:
# Number of documents that were read
len(total_docs)

4

In [5]:
# Raw text of the first document, before any preprocessing
total_docs[0]

'Smartphones are by our sides, on our desks, in our pockets and bags all day long. We use them for hours every day (as the latest software on iPhone and Android phones reveals in all-too-clear detail).\nSo, it’s no surprise that battery anxiety, that great malady of the twenty-first century, is more perniciously present than ever.\n\nThe latest iPhones, I’d venture, have great battery life. I favor the iPhone XS Max and, as I’ve mentioned before, I’ve never had an issue with the battery running out, nor with the iPhone XR which, after all, has the largest battery of any iPhone. But how about the iPhone XS, the smallest-screened of the 2018 iPhones? Again, I haven’t had an issue except on those extra-long days, like flying west from London to California, for instance. On those 32-hour days caused by the time difference, the iPhone XS has struggled.\nSo, that was the Smart Battery Case I plumped for, to see if it could banish battery anxiety altogether. The essentials of this review appl

### 전처리 수행하기

In [6]:
import re
import nltk
from nltk.corpus import stopwords

# User-defined function for preprocessing
def do_En_preprocessing(text, customized_stopwords):
    """Extract lemmatized, stopword-filtered nouns from an English text.

    Pipeline: strip symbols -> lowercase -> tokenize -> POS-tag -> keep the
    tokens whose tag contains 'NN' -> lemmatize -> drop stopwords.

    Parameters
    ----------
    text : str
        Raw English document.
    customized_stopwords : iterable of str or None
        Extra words to drop in addition to NLTK's English stopword list.
        They are compared against lowercased, lemmatized tokens.
        ``None`` is treated as "no extra stopwords".

    Returns
    -------
    list of str
        The remaining nouns in their original order; duplicates are kept so
        that term frequencies can still be counted downstream.

    Notes
    -----
    The NLTK tokenizer, tagger, WordNet and stopword resources must be
    available (presumably fetched beforehand with ``nltk.download`` - that
    step is not shown here).
    """
    # Remove every character that is neither a word character nor whitespace.
    # Gotcha: apostrophes are removed too, so contractions collapse into a
    # single token ("haven't" -> "havent") and can be tagged as nouns.
    cleaned_content = re.sub(r'[^\w\d\s]', '', text).lower()

    word_tokens = nltk.word_tokenize(cleaned_content)  # Tokenization
    tokens_pos = nltk.pos_tag(word_tokens)             # POS tagging

    # Substring test keeps every tag that contains 'NN' (e.g. NN, NNS).
    NN_words = [word for word, pos in tokens_pos if 'NN' in pos]

    wlem = nltk.WordNetLemmatizer()  # Lemmatization
    lemmatized_words = [wlem.lemmatize(word) for word in NN_words]

    # Stopword removal: merge both stopword sources into one set so every
    # token is checked once, instead of repeatedly scanning and shrinking
    # the list with list.remove().
    excluded_words = set(stopwords.words('english'))
    if customized_stopwords is not None:
        excluded_words.update(customized_stopwords)

    return [word for word in lemmatized_words if word not in excluded_words]

In [7]:
customized_stopwords = ['today', 'yesterday', 'new', 'york', 'time'] # Build the custom stopword list

In [8]:
# Preprocess every document: one list of filtered noun tokens per document
docs_nouns = [do_En_preprocessing(doc, customized_stopwords) for doc in total_docs]

In [9]:
# One token list per input document
len(docs_nouns)

4

In [10]:
# Noun tokens extracted from the first document
print(docs_nouns[0])

['smartphones', 'side', 'desk', 'pocket', 'bag', 'day', 'hour', 'day', 'software', 'iphone', 'phone', 'reveals', 'detail', 'surprise', 'battery', 'anxiety', 'malady', 'century', 'iphones', 'venture', 'battery', 'life', 'iphone', 'x', 'max', 'issue', 'battery', 'iphone', 'xr', 'battery', 'iphone', 'iphone', 'iphones', 'havent', 'issue', 'day', 'london', 'instance', 'day', 'difference', 'iphone', 'x', 'battery', 'case', 'battery', 'anxiety', 'essential', 'review', 'iphone', 'x', 'max', 'iphone', 'xr', 'iphone', 'x', 'moment', 'packaging', 'plate', 'case', 'place', 'product', 'attention', 'everything', 'material', 'iphone', 'x', 'part', 'case', 'degree', 'iphone', 'top', 'case', 'microfibre', 'lightning', 'connector', 'base', 'case', 'lightweight', 'phone', 'thicker', 'trade', 'peace', 'mind', 'phone', 'wont', 'juice', 'design', 'apple', 'battery', 'case', 'battery', 'bulge', 'middle', 'back', 'back', 'camera', 'unit', 'bottom', 'lot', 'battery', 'board', 'hand', 'trouser', 'pocket', 'ins

In [11]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [12]:
# After removing the unwanted words, convert each token list back into a
# single space-separated string: the result is a list of strings (one per
# document), which is the form used to build the DTM.
documents_filtered = [' '.join(noun_tokens) for noun_tokens in docs_nouns]

In [13]:
# First document as a single space-separated string of nouns
documents_filtered[0]

'smartphones side desk pocket bag day hour day software iphone phone reveals detail surprise battery anxiety malady century iphones venture battery life iphone x max issue battery iphone xr battery iphone iphone iphones havent issue day london instance day difference iphone x battery case battery anxiety essential review iphone x max iphone xr iphone x moment packaging plate case place product attention everything material iphone x part case degree iphone top case microfibre lightning connector base case lightweight phone thicker trade peace mind phone wont juice design apple battery case battery bulge middle back back camera unit bottom lot battery board hand trouser pocket instance silicone feel touch smartphones surface matter back surprise fit button pressure phone switch phone ringtone fingernail hole bottom edge course match grille base iphone'

In [14]:
# Build the term-frequency document-term matrix (DTM) from the noun strings.
# max_df=0.8 drops terms that occur in more than 80% of the documents;
# min_df=1 keeps terms that occur in at least one document, and
# ngram_range=(1,1) restricts the vocabulary to single words (unigrams).
tf_vectorizer = CountVectorizer(min_df=1, max_df=0.8, ngram_range=(1,1))
DTM_tf = tf_vectorizer.fit_transform(documents_filtered)

In [15]:
# Densify the sparse DTM. toarray() returns a plain 2-D ndarray directly,
# avoiding the intermediate np.matrix that todense() creates.
DTM_TF = DTM_tf.toarray()

In [16]:
# Dense term-frequency matrix: one row per document, one column per term
DTM_TF

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 2, 0, 1],
       [0, 2, 1, ..., 1, 1, 0],
       [1, 0, 0, ..., 1, 0, 0]], dtype=int64)

In [17]:
# (number of documents, number of terms in the vocabulary)
DTM_TF.shape

(4, 285)

In [18]:
# Term-count vector of the first document
DTM_TF[0]

array([ 0,  0,  0,  0,  0,  0,  0,  2,  0,  0,  1,  0,  0,  1,  0,  3,  1,
        2,  0,  0,  0,  9,  0,  0,  1,  0,  2,  0,  1,  1,  0,  0,  1,  0,
        0,  0,  0,  6,  0,  0,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  1,  1,  0,  0,  0,  0,  4,  0,  1,  0,  1,  1,  1,
        0,  1,  0,  0,  1,  0,  1,  1,  0,  0,  0,  1,  0,  1,  1,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  1,  0,  0,  1,  0,  1,  0,  0,  1,  0,
        1,  0,  0,  0,  2,  0, 12,  2,  2,  0,  1,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  1,  1,  1,  0,  1,  0,  0,  0,  1,  0,  1,  0,
        0,  0,  1,  1,  1,  0,  2,  0,  0,  1,  1,  0,  1,  0,  1,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  0,  1,  0,  0,
        1,  0,  0,  0,  0,  0,  5,  0,  0,  1,  1,  0,  2,  0,  0,  0,  0,
        0,  0,  1,  0,  0,  0,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        1,  0,  1,  0,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  1,  0

In [18]:
# Euclidean (L2) distance between the term-count vectors of documents 1 and 0
np.linalg.norm(DTM_TF[1]-DTM_TF[0])

23.895606290697042

In [19]:
# Euclidean (L2) distance between the term-count vectors of documents 2 and 0
np.linalg.norm(DTM_TF[2]-DTM_TF[0]) 

24.919871588754223

In [20]:
# Euclidean (L2) distance between the term-count vectors of documents 3 and 0
np.linalg.norm(DTM_TF[3]-DTM_TF[0])

25.826343140289914

In [21]:
# Cosine similarity of documents 0 and 1: dot product divided by the product of the two L2 norms
print(np.dot(DTM_TF[0],DTM_TF[1])/(np.linalg.norm(DTM_TF[0])*np.linalg.norm(DTM_TF[1])))

0.5959464513198515


In [22]:
# Cosine similarity of documents 0 and 2: dot product divided by the product of the two L2 norms
print(np.dot(DTM_TF[0],DTM_TF[2])/(np.linalg.norm(DTM_TF[0])*np.linalg.norm(DTM_TF[2])))

0.010149858103156184


In [23]:
# Cosine similarity of documents 0 and 3: dot product divided by the product of the two L2 norms
print(np.dot(DTM_TF[0],DTM_TF[3])/(np.linalg.norm(DTM_TF[0])*np.linalg.norm(DTM_TF[3])))

0.021112128709387182


In [16]:
# Build the TF-IDF weighted document-term matrix from the same noun strings.
# max_df=0.8 drops terms that occur in more than 80% of the documents;
# min_df=1 keeps terms that occur in at least one document, and
# ngram_range=(1,1) restricts the vocabulary to single words (unigrams).
tfidf_vectorizer = TfidfVectorizer(min_df=1, max_df=0.8, ngram_range=(1,1))
DTM_tfidf = tfidf_vectorizer.fit_transform(documents_filtered)

In [17]:
# Densify the sparse TF-IDF matrix. toarray() returns a plain 2-D ndarray
# directly, avoiding the intermediate np.matrix that todense() creates.
DTM_TFIDF = DTM_tfidf.toarray()

In [18]:
# Euclidean distance from document 0 to each of documents 1-3, using TF-IDF weights
for other_idx in (1, 2, 3):
    print(np.linalg.norm(DTM_TFIDF[other_idx]-DTM_TFIDF[0]))

0.9538556951497862
1.409374356422095
1.4028109418615884


In [19]:
# Cosine similarity between document 0 and each of documents 1-3, using TF-IDF weights:
# dot product divided by the product of the two L2 norms.
base_vec = DTM_TFIDF[0]
base_norm = np.linalg.norm(base_vec)
for other_idx in (1, 2, 3):
    other_vec = DTM_TFIDF[other_idx]
    print(np.dot(base_vec, other_vec)/(base_norm*np.linalg.norm(other_vec)))

0.5450796564151593
0.006831961729902576
0.01606073069670202
