# Import Module

In [1]:
import requests
import re
from bs4 import BeautifulSoup
from time import sleep
from random import random

# 整合

In [15]:
# YahooMovies 邏輯 : 使用者輸入 > 搜尋電影 > 確認哪一部(沒有就重找or沒有這部) > 找這部資訊 > 返回搜尋結果
# IMDbMovies 邏輯 : 從YahooMovies拿完整確定查找的電影名稱(有不同或多選擇需要使用者去Y/N決定) > 搜尋該電影評論 > 返回資訊
class YahooMovies():
    """Interactive scraper for Yahoo Movies (movies.yahoo.com.tw).

    Flow: prompt the user for a title -> search Yahoo Movies -> let the user
    confirm which hit is meant (searching again when none is accepted) ->
    scrape that movie's page -> return the collected information.
    """

    # Seconds to wait for a single HTTP response; without a timeout a stalled
    # server would block the program indefinitely.
    REQUEST_TIMEOUT = 10
    # Upper bound on the number of user comments collected for one movie.
    MAX_REVIEWS = 5
    # Matches runs of two or more whitespace characters and any run of
    # newlines / carriage returns / tabs; matched text is removed entirely.
    _WHITESPACE_RE = re.compile(r'[\s]{2,}|[\n\r\t]+')
    # Comments containing any of these keywords are skipped
    # (presumably advertising spam -- confirm with the author).
    _SPAM_RE = re.compile(r'(微信|line|外約)', re.IGNORECASE)

    def __init__(self):
        self.movie_link = None

    @classmethod
    def _squash_whitespace(cls, text):
        """Return *text* with everything matched by ``_WHITESPACE_RE`` removed."""
        return cls._WHITESPACE_RE.sub('', text)

    @staticmethod
    def _value_after_colon(text):
        """Return the segment following the first full-width colon, or None.

        Returning None instead of indexing blindly keeps a label without a
        value from raising IndexError.
        """
        parts = text.split('：')
        return parts[1] if len(parts) > 1 else None

    def user_input(self):
        """Prompt until the user enters a non-blank movie title.

        Returns:
            The text typed by the user (possibly only part of a title), or
            None when reading from the prompt fails.
        """
        while True:
            try:
                movie_name = input("請輸入欲搜尋的電影名稱：")
            except (Exception, KeyboardInterrupt):
                # Covers a closed stdin, a front-end without stdin support and
                # Ctrl-C, while still letting SystemExit propagate (the
                # previous bare ``except`` swallowed that as well).
                print('\nTry Again')
                return None
            if movie_name.strip():
                return movie_name

    def search_movie(self, userquery):
        """Search Yahoo Movies for *userquery*.

        Args:
            userquery: Title text as typed by the user.

        Returns:
            A tuple ``(movie_titles, userquery)`` where ``movie_titles`` is
            the list of ``div.release_movie_name`` elements found on the
            result page.
        """
        encoded_query = requests.utils.quote(userquery)
        url = f"https://movies.yahoo.com.tw/moviesearch_result.html?keyword={encoded_query}"
        response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        soup = BeautifulSoup(response.text, "html.parser")
        movie_titles = soup.find_all("div", class_="release_movie_name")
        return movie_titles, userquery

    def search_check(self, movie_titles, userquery):
        """Resolve a search result list to a single movie.

        With several hits the user is asked about each one in turn (Y to
        accept, N to move on). When every hit is rejected a new title is
        requested and searched, repeating until a movie is accepted or a
        search returns zero or one hit.

        Args:
            movie_titles: Result elements as returned by ``search_movie``.
            userquery: The query that produced ``movie_titles``.

        Returns:
            ``(full_title, page_link)`` for the chosen movie, or
            ``(userquery, None)`` when the search produced no hits.
        """
        # A loop replaces the former self-recursion so that a long series of
        # rejected searches cannot exhaust the call stack.
        while True:
            res_count = len(movie_titles)
            if res_count == 0:
                return userquery, None
            if res_count == 1:
                only_hit = movie_titles[0]
                return only_hit.a.text, only_hit.a['href']

            for m in movie_titles:
                ans = None
                while ans not in ('Y', 'N'):
                    ans = input(f'找到{res_count}部電影，返回第一個搜尋{m.a.text}(Y)或是往下搜索(N):')
                if ans == 'Y':
                    return m.a.text, m.a['href']

            # Every hit was rejected: pause briefly, then search again.
            sleep(1 + random())
            movie_titles, userquery = self.search_movie(self.user_input())

    def _latest_reviews(self, review_link):
        """Collect up to ``MAX_REVIEWS`` user comments from a review page.

        Args:
            review_link: URL of the comment page, or None.

        Returns:
            None when *review_link* is None, a list of cleaned comment
            strings, or the placeholder string when no comment qualifies.
        """
        if review_link is None:
            return None

        response = requests.get(review_link, timeout=self.REQUEST_TIMEOUT)
        soup = BeautifulSoup(response.text, "html.parser")

        res = []
        for comment in soup.find_all('div', class_='usercom_inner _c'):
            if len(res) == self.MAX_REVIEWS:
                break
            spans = comment.find_all('span')
            if not spans:
                # No <span> means there is no comment text to read.
                continue
            # The comment body is taken from the last <span> of the block.
            content = self._squash_whitespace(spans[-1].text)
            if self._SPAM_RE.search(content):
                continue
            res.append(content)
        return res if res else '該電影目前無評論'

    def specific_movie_info(self, movie_link):
        """Scrape the details of one movie page.

        Args:
            movie_link: URL of the movie page, or a falsy value when the
                search found nothing.

        Returns:
            A dict with the Chinese/English titles, release date, Yahoo
            rating, IMDb score and latest comments; or None when
            *movie_link* is falsy or the page lacks the movie info section.
        """
        if not movie_link:
            return None

        response = requests.get(movie_link, timeout=self.REQUEST_TIMEOUT)
        soup = BeautifulSoup(response.text, "html.parser")
        movie = soup.find('div', class_='movie_intro_info_r')
        if movie is None:
            # Page does not have the expected layout; report "no info"
            # rather than failing with AttributeError below.
            return None

        review = soup.find('div', class_='btn_plus_more usercom_more gabtn')
        review_anchor = review.a if review is not None else None
        review_link = review_anchor.get('href') if review_anchor is not None else None
        starscore = soup.find('div', class_='score_num count')
        starbox = soup.find('div', class_='starbox2')

        h1, h3 = movie.h1, movie.h3
        chname = self._squash_whitespace(h1.text) if h1 is not None else None
        engname = self._squash_whitespace(h3.text) if h3 is not None and h3.text else None

        if starscore is not None and starbox is not None:
            starscore = f'{starscore.text} / 5 {starbox.text.strip()}'
        else:
            starscore = None

        reviews = self._latest_reviews(review_link)

        release_date, IMDB = None, None
        for span in movie.find_all('span'):
            text = span.text.strip()
            if '上映日期' in text:
                release_date = self._value_after_colon(text)
            elif 'IMDb分數' in text:
                score = self._value_after_colon(text)
                IMDB = f"{score} / 10" if score is not None else None

        return {
            '中文名稱': chname,
            '英文名稱': engname,
            '上映日期': release_date,
            '滿意度': starscore,
            'IMDb': IMDB,
            'Yahoo最新5則評論': reviews,
        }

    def search_and_get_movie_info(self):
        """Run the whole interactive flow.

        Returns:
            ``(movie_name, movie_info)``; ``movie_info`` is None when no
            movie page could be resolved.
        """
        userquery = self.user_input()
        movie_titles, checkquery = self.search_movie(userquery)
        movie_name, movie_link = self.search_check(movie_titles, checkquery)
        movie_info = self.specific_movie_info(movie_link)
        return movie_name, movie_info

class IMDbMovies(YahooMovies):
    """Looks up a movie on IMDb and returns the titles of its user reviews.

    The title to search for is the full movie name resolved through Yahoo
    Movies; when IMDb lists several candidates the user picks one with Y/N.
    """

    # Seconds to wait for a single HTTP response before giving up.
    REQUEST_TIMEOUT = 10
    # Selector for the entries of the IMDb search result list.
    # NOTE(review): it contains generated-looking class names
    # (sc-5352556-0, cAzlUg, ...) that presumably change between IMDb
    # deployments -- verify it still matches before relying on it.
    _RESULT_SELECTOR = '#__next > main > div.ipc-page-content-container.ipc-page-content-container--full.sc-5352556-0.cAzlUg > div.ipc-page-content-container.ipc-page-content-container--center > section > div > div.ipc-page-grid__item.ipc-page-grid__item--span-2 > section:nth-child(3) > div.sc-17bafbdb-2.ffAEHI > ul > li'

    def __init__(self, name):
        # Initialise the parent so attributes it defines exist here too.
        super().__init__()
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
        }
        # Full movie title found through Yahoo (not the raw user input).
        self.yahoosearchname = name

    @staticmethod
    def _candidate(result):
        """Turn one search result element into ``(title, review_link)``.

        Returns None when the element has no usable link, so malformed
        entries are skipped instead of raising AttributeError/IndexError.
        """
        anchor = result.select_one('div > div > a')
        if anchor is None:
            return None
        # The title identifier is the third '/'-separated segment of href.
        parts = (anchor.get('href') or '').split('/')
        if len(parts) < 3:
            return None
        return anchor.text, f'https://www.imdb.com/title/{parts[2]}/reviews?ref_=tt_urv'

    def specific_movie_reviews(self, movie_link):
        """Fetch a review page and return up to five review titles.

        Args:
            movie_link: URL of the IMDb review page.

        Returns:
            A list of review title strings, or the placeholder string when
            none are found.
        """
        response = requests.get(movie_link, headers=self.headers, timeout=self.REQUEST_TIMEOUT)
        soup = BeautifulSoup(response.text, "html.parser")

        movie_reviews = []
        for review in soup.find_all('div', class_='review-container')[:5]:
            title = review.find('a', class_='title')
            if title is None:
                # Review block without a title link: nothing to record.
                continue
            movie_reviews.append(title.text.strip())
        return movie_reviews if movie_reviews else '該電影目前無評論'

    def search_movie(self, query):
        """Search IMDb for *query* and return the chosen movie's reviews.

        An exact title match on the first hit is used directly. Otherwise,
        with several hits the user confirms one with Y/N; when all are
        rejected a new title is requested and the search repeats. A single
        non-matching hit is used as is.

        Args:
            query: Movie title to look up.

        Returns:
            The result of ``specific_movie_reviews`` for the chosen movie,
            or a "not found" message string when the search has no hits.
        """
        # A loop replaces the former self-recursion for repeated searches.
        while True:
            encoded_query = requests.utils.quote(query)
            url = f"https://www.imdb.com/find/?q={encoded_query}&ref_=nv_sr_sm"
            response = requests.get(url, headers=self.headers, timeout=self.REQUEST_TIMEOUT)
            soup = BeautifulSoup(response.text, "html.parser")

            found = (self._candidate(item) for item in soup.select(self._RESULT_SELECTOR))
            candidates = [entry for entry in found if entry is not None]
            if not candidates:
                return f'沒有找到{query}'

            search_count = len(candidates)
            first_title, first_link = candidates[0]
            # Exact match on the top hit, or nothing else to choose from.
            if query == first_title or search_count == 1:
                return self.specific_movie_reviews(first_link)

            for title, review_link in candidates:
                ans = None
                while ans not in ('Y', 'N'):
                    ans = input(f'找到{search_count}部電影，返回第一個搜尋{title} (Y)或往下搜索(N):')
                if ans == 'Y':
                    return self.specific_movie_reviews(review_link)

            # Every hit was rejected: pause briefly, then ask for a new title.
            sleep(1 + random())
            query = self.user_input()

In [17]:
def main():
    """Look up a movie on Yahoo Movies, then add IMDb review titles.

    Returns:
        The movie information dict extended with the IMDb reviews, or a
        "not found" message string when no movie could be resolved.
    """
    not_found = '找不到就是找不到'
    try:
        YahooMovies_info = YahooMovies()
        movie_name, movie_info = YahooMovies_info.search_and_get_movie_info()

        # No movie page was resolved: stop here instead of running the IMDb
        # search (network calls and prompts) only to fail afterwards when
        # assigning into None.
        if movie_info is None:
            return not_found

        IMDbMovies_info = IMDbMovies(movie_name)
        reviews = IMDbMovies_info.search_movie(IMDbMovies_info.yahoosearchname)
        movie_info['IMDb最新5則評論'] = reviews
        return movie_info
    except TypeError:
        # Still needed for a failed prompt: user_input() then returns None,
        # which presumably makes the URL-quoting step raise TypeError.
        return not_found

main()

{'中文名稱': '玩命關頭8',
 '英文名稱': 'Fast & Furious 8',
 '上映日期': '2017-04-12',
 '滿意度': '4.3 / 5 (共925人投票)',
 'IMDb': '6.9 / 10',
 'Yahoo最新5則評論': ['玩命關頭8它是屬於美國一種動作片滿好看',
  '這個系列好看好看就是好看不用多收看就對了每一集出來都比看',
  '傑森史塔森變好人也變得太快了吧',
  '中文翻譯的名字毀了影片，大陸翻譯《激情與速度》還好點。本很感人，分享一個下載的網站，niigu.com,電影/電視劇/音樂都有資源',
  '第八集裏頭Diesel想讓核心人物唐老大一個較合適且灰暗的轉變，Diesel說：「我知道必須走向黑暗，即便有些參與電影的人質疑是否太過黑暗。」劇本還在早期發展階段時，曾和女友討論到：「她說了一些深刻有力的話—全世界都不想看到唐老大在下一集裡高興的樣子，不會在第七集之後，不會在電影史上最大結局之後。快樂是世界最不想看到的東西。」 https://goo.gl/vmh7vk'],
 'IMDb最新5則評論': ['Cringefest - Stupidity level 11',
  'The Fast & The Furious:Part 8.',
  'The worst movie of the series',
  'Charlize Theron\'s Master Criminal Cipher " 💎 Sparkles " . . . . . With { Panache 👌🔥 } .',
  'A problematic re-introduction to the franchise']}