## 📝 安裝套件

In [None]:
!pip install jieba -U
!pip install httpx -U
!pip install sklearn -U
!pip install ckiptagger[tfgpu,gdown] -U
!pip install BeautifulSoup4 -U
'''
1. 抓資料
2. 資料清洗
3. 長詞優先斷詞+jieba斷詞+ckiptagger斷詞
4. tf-idf取出關鍵字
5. 計算文件相似度
'''

## 📝 針對標點符號斷行

In [42]:
import re
def splitSentense(text, delimiter):
    """Split ``text`` wherever the regular expression ``delimiter`` matches.

    Args:
        text: The string to split.
        delimiter: A regular-expression pattern marking the split points.

    Returns:
        The list of fragments between matches; empty fragments are kept.
    """
    pattern = re.compile(delimiter)
    fragments = pattern.split(text)
    return fragments
    
# Alternation of the full-width punctuation marks that end a fragment.
delimiter = "，|。|、|（|）|／|《|》|】|【|「|」|；|："
# Sample corpus: one news paragraph.
allNews = ['虛擬貨幣「比特幣期貨」要來了！美國監管機構美國商品期貨交易委員會（CFTC）一日宣布將放行比特幣期貨，允許芝加哥商品交易所（CME）和芝加哥選擇權交易所（CBOE）推出相關期貨合約，因為兩交易所已證明擬推出的合約和交易安全符合必要的監管規定；這成為推動主流投資人買賣此價格高度波動的數位貨幣的重大一步。']
# Flatten the fragments of every article into a single list.
sentenceAry = []
for rec in allNews:
    sentenceAry.extend(splitSentense(rec, delimiter))
sentenceAry

['虛擬貨幣',
 '比特幣期貨',
 '要來了！美國監管機構美國商品期貨交易委員會',
 'CFTC',
 '一日宣布將放行比特幣期貨',
 '允許芝加哥商品交易所',
 'CME',
 '和芝加哥選擇權交易所',
 'CBOE',
 '推出相關期貨合約',
 '因為兩交易所已證明擬推出的合約和交易安全符合必要的監管規定',
 '這成為推動主流投資人買賣此價格高度波動的數位貨幣的重大一步',
 '']

## 📝 ngram + 長詞優先斷詞 => 取出特殊關鍵詞

In [48]:
import operator
def removeKey(text, keyword):
    """Return ``text`` with every occurrence of each string in ``keyword`` deleted.

    Args:
        text: The string to strip.
        keyword: Iterable of substrings to delete; they are applied in order,
            each one to the result left by the previous one.

    Returns:
        The stripped string.
    """
    stripped = text
    for key in keyword:
        # An empty key has nothing to delete (and str.split('') would raise ValueError).
        if key:
            stripped = stripped.replace(key, '')
    return stripped
def ngram(input_sentence, n = 2):
    """Count every contiguous length-``n`` slice of ``input_sentence``.

    Args:
        input_sentence: The sequence (normally a string) to scan.
        n: Length of each slice.

    Returns:
        A dict mapping each slice to the number of times it occurs, in order
        of first appearance. Empty when the input is shorter than ``n``.
    """
    counts = {}
    last_start = len(input_sentence) - n
    for start in range(last_start + 1):
        gram = input_sentence[start:start + n]
        counts[gram] = counts.get(gram, 0) + 1
    return counts
# Seed terms that are deleted from every sentence before n-grams are counted.
# Terms discovered below are appended, so longer terms found first are removed
# before shorter n-grams are counted (long-term-first).
keywords=['期貨','核准']
ret_terms={}
words_freq    = []  # not used in this cell
# An n-gram must occur at least this many times across all sentences to be kept.
MIN_TERM_COUNT = 5
for term_length in range(4,1,-1):
    word_dic = {}
    for sentence in sentenceAry:
        text_list = removeKey(sentence,keywords)
        ngram_words = ngram(text_list,term_length)
        for word, count in ngram_words.items():
            # Add this sentence's full count; the first sighting used to be
            # recorded as 1 even when the n-gram occurred several times in it.
            word_dic[word] = word_dic.get(word, 0) + count
    for word, count in word_dic.items():
        if count >= MIN_TERM_COUNT:
            keywords.append(word)
            ret_terms[word] = count
# Most frequent terms first.
sorted_terms = sorted(ret_terms.items(),key=operator.itemgetter(1),reverse=True)
sorted_terms


[('交易', 5)]

## 📝 tf-idf

In [49]:
import jieba.analyse
# Sample paragraph for the keyword-extraction demo.
sentence = '''有外貌美麗如女星范冰冰的馮姓女子2008年與陳姓富二代訂婚後無故悔婚，但馮女不告而別2年後，忽然回頭請陳男金援，癡情的陳男不但不計前嫌，對馮女數十次索討生日、情人節等禮物及資助學費等一概答應，竟連她結婚生子後要求生活費，也大方匯給10萬元，直到陳男也結婚，妻發現他竟贈百萬名畫給馮女，陳男才在妻逼迫下告馮女詐欺、侵占，並請求返還借款533萬餘元，但檢方認定陳男自願贈與，將馮女不起訴，台北地院也僅認定2013年間馮女謊稱買房討「房貸頭期款」部分陳男求償有理，判馮女須賠60萬元。'''
# Keep the three highest-weighted terms, returned as (term, weight) pairs.
tags = jieba.analyse.extract_tags(
    sentence,
    topK=3,
    withWeight=True,
)
tags


Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\smart\AppData\Local\Temp\jieba.cache
Loading model cost 0.557 seconds.
Prefix dict has been built successfully.


[('馮女', 0.6210268832675324),
 ('萬元', 0.3105134416337662),
 ('認定', 0.3105134416337662)]

## 📝 查詢相似文件

In [None]:
import httpx
from bs4 import BeautifulSoup
import time
import asyncio
import csv
import os
import re

# Request headers sent when fetching the listing pages.
headers = {'User-Agent': 'GoogleBot'}
# First and last listing page numbers to crawl (both inclusive, see main()).
start = 1
end = 50
# Site root; article hrefs found on the listing pages are appended to it.
root_url = 'https://www.ithome.com.tw'
# Listing URL that is paged through with ?page=N.
url = f'{root_url}/devops'
# Rows collected by get_news(), one list per article; written to output.csv afterwards.
fetched_news_data=[]

async def get_news(news_url, page):
    """Fetch one listing page and append a row per article to ``fetched_news_data``.

    Requests go through ``httpx.AsyncClient`` and are awaited, so the
    coroutines gathered in ``main()`` overlap their network waits instead of
    running one after another (a blocking ``httpx.get`` inside a coroutine
    never yields to the event loop). Because pages now progress concurrently,
    rows from different pages may be interleaved in ``fetched_news_data``;
    every row carries its page number.

    Args:
        news_url: Listing URL; ``?page=<page>`` is appended to it.
        page: Page number to request.

    Returns:
        None. Each appended row is
        ``[id, page, post_date, title, summary, content, href, categories]``.
    """
    async with httpx.AsyncClient() as client:
        listing_res = await client.get(f'{news_url}?page={page}', headers=headers)
        root = BeautifulSoup(listing_res.text, 'lxml')
        # A listing page without any channel item means we ran past the last page.
        if len(root.find_all('div', attrs={'class': 'span4 channel-item'})) == 0:
            print('--------------沒有任何新聞內容了~~~~~-----------')
            return
        news_list = root.find_all(
            'span', attrs={'class': 'views-field views-field-created'})
        for news in news_list:
            news_div = news.span.div
            category_links = news_div.find(
                'p', attrs={'class': 'category'}).find_all('a')
            categories = ','.join(link.text for link in category_links)
            print(categories)
            title_div = news_div.find('p', attrs={'class': 'title'})
            title = title_div.a.text.strip()
            href = f"{root_url}{title_div.a['href']}"
            content_res = await client.get(href)
            content_root = BeautifulSoup(content_res.text, 'lxml')
            try:
                content = content_root.find(
                    'div', attrs={'class': 'field-type-text-with-summary'}).div.div.text
            except AttributeError:
                # The expected body container is missing; skip this article
                # rather than abort the whole crawl.
                print(f'skipped {href}: article body not found')
                continue
            print(content)
            # Named news_id so the builtin id() is not shadowed.
            news_id = title_div.a['href'].split('/')[2]
            print(title)
            print(href)
            print(news_id)
            summary = news_div.find('div', attrs={'class': 'summary'}).text.strip()
            print(summary)
            post_date = news_div.find('p', attrs={'class': 'post-at'}).text.strip()
            print(post_date)
            fetched_news_data.append(
                [news_id, page, post_date, title, summary, content, href, categories])
    print(f'--------------{page}-----------')


async def main():
    """Run ``get_news`` for every page from ``start`` to ``end`` (inclusive) and wait for all of them."""
    pending = [get_news(url, page_no) for page_no in range(start, end + 1)]
    await asyncio.gather(*pending)

s = time.perf_counter()
await main()
elapsed = time.perf_counter() - s

if os.path.exists("output.csv"):
  os.remove("output.csv")
else:
  print("The file does not exist")

with open('output.csv', 'w', newline='', encoding='UTF-8') as csvfile:
  writer = csv.writer(csvfile)
  writer.writerow(['ID','page', 'post_date', 'title','summary','content','href','categories'])
  for data in fetched_news_data:
    writer.writerow(data)
print(f"Script executed in {elapsed:0.2f} seconds.")
    



In [4]:
import pandas
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
import jieba
# Load the articles exported by the crawler.
news = pandas.read_csv('output.csv')

corpus = []
titles = []
for article in news.iterrows():
    _, row = article
    titles.append(row['title'])
    try:
        # Join the jieba tokens with spaces so CountVectorizer can tokenise them.
        tokens = jieba.cut(row['content'])
        corpus.append(' '.join(tokens))
    except Exception:
        # Show the offending row before propagating the error.
        print(article)
        raise
# Term counts per article, then re-weighted to tf-idf.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
transformer = TfidfTransformer()
tfidf = transformer.fit_transform(X)

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\smart\AppData\Local\Temp\jieba.cache
Loading model cost 0.558 seconds.
Prefix dict has been built successfully.


In [16]:
def get_similarity_sentence(articleid, threshold=0.3):
    """Print the title of article ``articleid`` and the titles of articles similar to it.

    Similarity is the cosine similarity between row ``articleid`` of ``tfidf``
    and every row of ``tfidf``. Matches are printed from most to least similar
    together with their score.

    Args:
        articleid: Non-negative row index of the query article in ``titles`` / ``tfidf``.
        threshold: Only articles whose similarity is strictly greater than this are printed.
    """
    print('[查詢文章]:{}'.format(titles[articleid]))
    cosine_similarities = cosine_similarity(tfidf[articleid], tfidf).flatten()
    # Walk the articles from most to least similar.
    for idx in cosine_similarities.argsort()[::-1]:
        # Exclude the query by index: it is not guaranteed to sort first when
        # another article ties with it (duplicate text) or its vector is all zeros.
        if idx == articleid:
            continue
        # Scores only decrease from here on, so nothing further can qualify.
        if cosine_similarities[idx] <= threshold:
            break
        print('[相關文章]:{} {}'.format(titles[idx], cosine_similarities[idx]))
get_similarity_sentence(100)

[查詢文章]:GCP用戶現可在GKE上執行Cloud Dataproc的Spark工作負載
[相關文章]:Google雲端Spark叢集服務Dataproc現可運用GPU加速運算 0.5262928890852002
