In [1]:
import requests
import json
import jieba
import networkx as nx
import re
from bs4 import BeautifulSoup

# 15064
MOVIE_TOTAL_NUM = 15064
LINK = "https://movies.yahoo.com.tw/movieinfo_main/"


In [2]:
class MovieInfo:
    def __init__(self, doc_id=0, cname='unknown', ename='unknown', pagerank=0, label=['unknown'], intro='unknown', released_date='unknown', 
                links=[]) -> None:
        self.doc_id = doc_id
        self.cname = cname
        self.ename = ename
        self.pagerank = pagerank
        self.label = label
        self.intro = intro
        self.released_date = released_date
        self.links = list(set([14653, 13733, 14652, 14690, 14848, 14301, 14558, 14822, 14850, 14557, 14208, 14245, 14299, 14849, 14824, 14618, 14944, 13749] + links))
    
    def generate_dict(self) -> dict:
        dict = {}
        dict['doc_id'] = self.doc_id
        dict['cname'] = self.cname
        dict['ename'] = self.ename
        dict['pagerank'] = self.pagerank
        dict['label'] = self.label
        dict['intro'] = self.intro
        dict['released_date'] = self.released_date
        dict['links'] = self.links

        return dict


In [27]:
class SearchEngine:
    def __init__(self, inverted_index, path) -> None:
        self.inverted_index = inverted_index
        self.movie_data = []
        self._get_movie_data(path)
    
    def query(self, query) -> None:
        seg = jieba.cut_for_search(query)
        words = {word for word in seg}
        match_resaults = set(self.inverted_index[words.pop()])
        while words:
            match_resaults.intersection_update(self.inverted_index[words.pop()])

        match_resaults = self._sort_by_rankpage(match_resaults)
        
        if match_resaults == set():
            print("None")
        self._print_query_resault(match_resaults, query)

    def _get_movie_data(self, path) -> None:
        with open(path) as f:
            self.movie_data = json.load(f)

    def _print_query_resault(self, id_list, query) -> None:
        print(f"您的搜尋結果:")
        print(f"共{len(id_list)}筆, 符合\"{query}\"")

        regex = re.compile(rf'(.*){query}')

        relate = 0
        for movie in self.movie_data:
            if regex.match(movie['cname']) or regex.match(movie['ename']) or regex.match(movie['intro']):
                relate += 1

        count = 0
        for id in id_list:
            for movie in self.movie_data:
                if movie['doc_id'] == id:
                    if regex.match(movie['cname']) or regex.match(movie['ename']) or regex.match(movie['intro']):
                        count += 1
                    print(f"\n{id} ({movie['pagerank']}):")
                    print(f"Chinese Name: {movie['cname']}")
                    print(f"English Name: {movie['ename']}")
                    print(f"Intro: {movie['intro']}")
        print(f"\nPrecision = {count} / {len(id_list)} = {count / len(id_list) * 100}%")
        print(f"Recall = {count} / {relate} = {count / relate * 100}%")

    def _sort_by_rankpage(self, matches) -> list:
        dict = {}
        for id in matches:
            for movie in self.movie_data:
                if movie['doc_id'] == id:
                    dict[id] = movie['pagerank']
                    break
        
        return [key for key, pagerank in sorted(dict.items(), key=lambda x: x[1], reverse=True)]


In [4]:
response = requests.get("https://movies.yahoo.com.tw/chart.html")
soup = BeautifulSoup(response.text, "lxml")

resault = soup.find('div', 'rank_list table rankstyle1').find('dl', 'rank_list_box').find('h2').text.strip()
lt1 = [resault]
resault = soup.find('div', 'rank_list table rankstyle1').find_all('div', 'rank_txt')[:9]
for m in resault:
    lt1.append(m.text.strip())

response = requests.get("https://movies.yahoo.com.tw/chart.html?cate=us")
soup = BeautifulSoup(response.text, "lxml")

resault = soup.find('div', 'rank_list table rankstyle1').find('dl', 'rank_list_box').find('h2').text.strip()
lt2 = [resault]
resault = soup.find('div', 'rank_list table rankstyle1').find_all('div', 'rank_txt')[:9]
for m in resault:
    lt2.append(m.text.strip())

response = requests.get("https://movies.yahoo.com.tw/chart.html?cate=trailer")
soup = BeautifulSoup(response.text, "lxml")

resault = soup.find('div', 'rank_list table rankstyle3').find('dl', 'rank_list_box').find('h2').text.strip()
lt3 = [resault]
resault = soup.find('div', 'rank_list table rankstyle3').find_all('div', 'rank_txt')[:9]
for m in resault:
    lt3.append(m.text.strip())

ranking = set(lt1 + lt2 + lt3)
ranking.remove('做工的人電影版')

print(ranking)

{'做工的人 電影版', '疫起', '龍與地下城：盜賊榮耀', '靈魂伴侶', '金牌拳手3', '驚聲尖叫6', '魔女宅急便', '捍衛任務4', '蟻人與黃蜂女：量子狂熱', '超級瑪利歐兄弟電影版', '鏡之孤城', '65：恐怖行星', '鈴芽之旅', '鬼玩人：復活', '公主與野獸', 'AIR', 'The Journey: A Music Special from Andrea Bocelli', '關於我和鬼變成家人的那件事', '名偵探柯南 灰原哀物語〜黑鐵的神祕列車〜', 'A Thousand and One', '沙贊！眾神之怒', 'His Only Son'}


In [5]:
movie_infos = []
n = 0

for i in range(MOVIE_TOTAL_NUM):
    link = LINK + str(i + 1)
    links = []
    labels = []

    response = requests.get(link)
    soup = BeautifulSoup(response.text, "lxml")

    try:
        resault = soup.find('div', 'movie_intro_info_r')
        cname = resault.find('h1').text.strip()
        ename = resault.find('h3').text.strip()
        released_date = resault.find('span', class_=None).text[5:]

        resault = soup.find('div', 'level_name_box').find_all('div', 'level_name')
        for label in resault:
            labels.append(label.text.strip())

        resault = soup.find('div', 'gray_infobox_inner')
        intro = str(resault.select_one('span').text).strip().replace('\n', '').replace('\r', '').replace('　', ' ')

        try:
            resault = soup.find('ul', 'maylike_list starlist').find_all('li', 'gabtn')
            regex = re.compile(r'[0-9]+$')
            for movie in resault:
                links.append(int(regex.findall(movie.find('a')['href'])[0]))
        except:
            pass
    except:
        continue

    movie_info = MovieInfo(doc_id=i+1, cname=cname, ename=ename, label=labels, intro=intro, released_date=released_date, links=links)
    movie_infos.append(movie_info)
    n += 1

print(n)

11450


In [36]:
G = nx.DiGraph()
for movie_info in movie_infos:
    for link_target in movie_info.links:
        G.add_edge(movie_info.doc_id, link_target)
pagerank_list = nx.pagerank(G, alpha=1)
sorted_pagerank_list = sorted(pagerank_list.items(), reverse=True)
for movie_info in movie_infos:
    item = sorted_pagerank_list.pop()
    movie_info.pagerank = item[1]

movie = [movie_info.generate_dict() for movie_info in movie_infos]

with open("hw2.json", "w") as f:
    json.dump(movie, f, indent=4)


In [28]:
inverted_index = {}
for movie in movie_infos:
    seg = jieba.cut_for_search(movie.cname)
    for word in seg:
        if word in inverted_index.keys():
            if movie.doc_id in inverted_index[word]:
                continue
            inverted_index[word].append(movie.doc_id)
        else:
            inverted_index[word] = [movie.doc_id]

    seg = jieba.cut_for_search(movie.intro)
    for word in seg:
        if word in inverted_index.keys():
            if movie.doc_id in inverted_index[word]:
                continue
            inverted_index[word].append(movie.doc_id)
        else:
            inverted_index[word] = [movie.doc_id]

with open("inverted.json", "w") as f:
    json.dump(inverted_index, f, indent=4)


In [40]:
with open("inverted.json") as f:
    inverted_index = json.load(f)

search_engine = SearchEngine(inverted_index, "hw2.json")
search_engine.query("捍衛任務")


您的搜尋結果:
共41筆, 符合"捍衛任務"

13573 (1.5808705135843418e-06):
Chinese Name: 殺戮基地
English Name: Black Site
Intro: ★ 《怒火邊界》《捍衛任務》製片打造黑暗系火爆動作鉅片★ 《不可能的任務系列》動作女星蜜雪兒摩納漢挑大樑主演★ 《魔鬼終結者：創世契機》男星傑森克拉克演出兇殘反派★ 揭開正義的表象，裡面往往比你想的更黑暗...中情局特務艾比（蜜雪兒摩納漢 飾） 在一次恐怖攻擊中失去丈夫和兒子，她自願調往一個專門收留恐怖份子的秘密基地，以調查事故背後的真相。這天，基地裡來了一個新的嫌疑犯哈契特（傑森克拉克 飾），哈契特在內應的協助下成功脫逃，並血洗整個基地，艾比則被迫與哈契特展開一場貓捉老鼠般的致命遊戲……。

14653 (1.0647066239985156e-06):
Chinese Name: 捍衛任務4
English Name: John Wick: Chapter 4
Intro: ★ 台灣搶先全球上映，IMAX、Dolby Cinema版本同步上映★ 系列全球賣座近6億《捍衛任務系列》原班人馬打造最新史詩篇章★ 「葉問」對決「殺神」！香港動作男星甄子丹正式參戰，與基努李維展開大銀幕對決★ 《牠》人氣男星「小丑潘尼懷斯」比爾史柯斯嘉加盟演出★ 日本動作男星真田廣之參演，帶來嶄新日式武打場面★ 橫跨歐美大陸實地拍攝，耗資破億美金成本打造「捍衛任務」宇宙約翰維克（基努李維 飾）經歷上集的事件後，繼續與驅逐他的殺手組織「高桌會」展開對抗，隨著懸賞金額的不斷飆升，約翰維克將迎來他殺手生涯最強大的對手……。

12776 (8.687221522247683e-07):
Chinese Name: 救命緝約
English Name: The Contractor
Intro: ★《捍衛任務》《天劫倒數》製作團隊最爽動作鉅獻★《神力女超人》克里斯潘恩 《地獄》班佛斯特 《恐懼大街》吉莉安雅各布斯 大銀幕超激火拼★ 執行任務竟成獵殺標靶？當簽署的契約變調 「違約」成為救命關鍵★ 神秘契約背後隱藏的危機 原來是一場精心策畫的致命陰謀★ 身處異國孤軍奮戰 該如何殺出重圍 活著回家？詹姆士(克里斯潘恩 飾)自特種部隊退伍後，為了養家糊口加入秘密軍事組織，簽下神秘契約後奉命至東歐