In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
import json
import pandas as pd
import re
import numpy as np

In [2]:
with open("./dataset/ioh1500_arti.json") as f:
    docs = json.load(f)

In [3]:
def clean_data(doc):
    res = doc.replace("/", " ").replace("（", "").replace("／", "").replace("）", "").replace("」", "").replace("「", "")
    res = re.sub(r'[0-9]', '', res)
    return res

In [4]:
docs = docs[:1500]

In [5]:
from pprint import pprint

In [6]:
documents = []

In [7]:
for doc in docs:
    try:
        sents = doc['arti']['result_segmentation']
#     pprint(sents)
        sents = clean_data(sents)
#     print(sents)
        documents.append(sents)
    except Exception as e:
        pass

In [8]:
vectorizer = TfidfVectorizer()

In [9]:
X = vectorizer.fit_transform(documents)

In [10]:
X = X.T.toarray()

In [11]:
df = pd.DataFrame(X, index=vectorizer.get_feature_names())

In [12]:
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
aaalac,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0
ahmc,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0
ai,0.0,0.0,0.0,0.0,0.053031,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.030997,0.021501,0.0,0.0,0.0
aiot,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.063896,0.0,0.0,0.0
aka,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.031948,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
龍德,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0
龍頭,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0
龐大,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0
龐雜,0.0,0.0,0.0,0.0,0.024102,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.031081,0.000000,0.000000,0.0,0.0,0.0


In [13]:
print(df.shape)

(13663, 100)


In [14]:
def get_similar_articles(q, df):
    print("query:", q)
    print("查詢結果: ")
    # Convert the query become a vector
    q = [q]
    q_vec = vectorizer.transform(q).toarray().reshape(df.shape[0],)
    sim = {}
    # Calculate the similarity
    lens = df.shape[1]
    for i in range(lens):
        sim[i] = np.dot(df.loc[:, i].values, q_vec) / np.linalg.norm(df.loc[:, i]) * np.linalg.norm(q_vec)
    
    # Sort the values 
    sim_sorted = sorted(sim.items(), key=lambda x: x[1], reverse=True)
    # Print the articles and their similarity values
    print(sim_sorted)
    result = []
    for k, v in sim_sorted:
        if v != 0.0:
            print("相似度:", v)
            content = documents[k].replace(" ", "")
#             print(content)
#             print()
            result.append(content)
    
    return result

In [15]:
# documents

In [20]:
q1 = "運動"

query_result = get_similar_articles(q1, df)

query: 運動
查詢結果: 
[(8, 0.7842696033546982), (16, 0.07125056394751413), (51, 0.044646512536320036), (83, 0.0422434547832524), (42, 0.03274369696684138), (78, 0.02551971838568282), (11, 0.023932342771924926), (3, 0.021060133985196905), (19, 0.018812515211298717), (71, 0.0166019880941577), (9, 0.015264412656051267), (1, 0.01494805468621339), (36, 0.013271333135497068), (39, 0.012671987426087605), (46, 0.009392080601724482), (0, 0.0), (2, 0.0), (4, 0.0), (5, 0.0), (6, 0.0), (7, 0.0), (10, 0.0), (12, 0.0), (13, 0.0), (14, 0.0), (15, 0.0), (17, 0.0), (18, 0.0), (20, 0.0), (21, 0.0), (22, 0.0), (23, 0.0), (24, 0.0), (25, 0.0), (26, 0.0), (27, 0.0), (28, 0.0), (29, 0.0), (30, 0.0), (31, 0.0), (32, 0.0), (33, 0.0), (34, 0.0), (35, 0.0), (37, 0.0), (38, 0.0), (40, 0.0), (41, 0.0), (43, 0.0), (44, 0.0), (45, 0.0), (47, 0.0), (48, 0.0), (49, 0.0), (50, 0.0), (52, 0.0), (53, 0.0), (54, 0.0), (55, 0.0), (56, 0.0), (57, 0.0), (58, 0.0), (59, 0.0), (60, 0.0), (61, 0.0), (62, 0.0), (63, 0.0), (64, 0.0),

In [21]:
for idx, res in enumerate(query_result):
    print(res)
    print()
    if idx > 3:
        break

跳至Q.當初升大學選擇校系時，是否有過糾結的時刻？我的父親是體育老師，我從小一開始就在體操隊，三年級進游泳隊，也在田徑隊學習，一直都在體育班生活，所以對運動非常有興趣。不過我並不想當體育老師，我覺得父親在大太陽底下教課滿辛苦的。在思考大學校系時，正好接觸到物理治療的研究，當中有提到運動醫學。我本身對生物非常有興趣，特別是人體的部分，加上喜歡運動，就覺得或許這個科系適合我。做決定之前，曾異想天開，想念很多不同科系，包括建築系、外文系，不過，高中老師提醒我要去看這些科系的課程內容，發現真的不是我想要的，比如：外文系要學文學，但我連中文的文言文都不太有興趣了，所以很快就排除這些想法。後來又發現自己的專長和興趣都在物理治療，所以就選擇了這個科系。跳至Q.是否建議同學念體育班？建議同學如何規劃體育生涯？現在對於菁英體育選手的訓練，不建議在太小的時候進行專項訓練，基本上建議在青少年到歲之前多方學習各式運動。但仍要依項目而定，例如體操就要更早開始訓練。因此，在青少年時期之前要不要念體育班，見仁見智。不過，以我自己曾是選手的經驗來說，要當一個選手非常不容易，並不是願意努力訓練、刻苦耐勞就會有好成績，先天條件其實很重要。假使你有興趣當選手，要先跟教練好好了解、評估自己的基本條件。以我自己的觀點來說，我認為高中時再加入體育班會比較適合。無論是不是體育班，很重要的是學科方面不可以完全放棄。訓練的辛苦我也經歷過，很難好好專心念書，但如果自己不積極維持基本程度，當選手之路走到一個階段，不適合再走下去時，會發現自己沒有其它路可以轉，會有很多限制、很可惜。許多優秀選手很聰明，只是欠缺學習學科的時間，程度才會跟不上，但我認為只要努力，都可以跟得上。跳至Q.教授後來選擇到美國俄亥俄州立大學研究運動科學，原因為何？我本來沒有明確知道要不要出國留學。我大四整年都在醫院實習，到醫院實習才發現自己不太適合這個環境。我們要接觸的病人比較憂鬱，不管是中風、脊椎損傷的病人或是小兒科腦性麻痺的小孩，想到他們的未來很辛苦，讓我在實習期間很憂鬱，覺得自己沒有辦法待下去。但這個科系畢業就是要當物理治療師，如果不當怎麼辦？當時發現自己對念書有興趣，還想再學更多運動醫學、科學相關的知識和技術，在物理治療系學的有限，所以就和父母親討論，希望有機會到國外留學。會選擇國外學校，是因為當時我跟系上老師討論過未來該往哪個方向、哪些國家

In [17]:
lost = [1, 2, 3]

In [58]:
documents[lost]

TypeError: list indices must be integers or slices, not list