# 猫眼电影排名
为简化问题，使用镜像网站进行爬取[猫眼电影镜像](https://ssr1.scrape.center/)

# bs库

In [None]:
import json
import requests
from requests.exceptions import RequestException
import re
import time
from bs4 import BeautifulSoup

def get_one_page(url):
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36'
        }
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None

def parse_one_page(html):
    soup= BeautifulSoup(html,'xml')
    rows=soup.select(".el-card__body > .el-row")
    for row in rows :
        yield {
            'index': int(row.select_one(".name").get("href").split('/')[-1]),
            'image': row.select_one("img").get("src"),
            'title': row.select_one(".name").text.strip(),
            'categories': [span.text for span in row.select(".categories span") ],
            'time': [info.text for info in row.select(".info span")][-1],
            'score': float(row.select_one(".score").text.strip())
        }

def write_to_file(content):
    with open('result.txt', 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')

In [None]:
def main(page):
    url = f"https://ssr1.scrape.center/page/{str(page)}"
    html = get_one_page(url)
    for item in parse_one_page(html):
        print(item)
        write_to_file(item)

In [None]:
for i in range(1,11):
    main(i)
    time.sleep(1)


# 正则表达式

In [None]:
import requests
import logging
import re
from urllib.parse import urljoin

logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(levelname)s: %(message)s')

BASE_URL = 'https://ssr1.scrape.center'
TOTAL_PAGE = 10


def scrape_page(url):
    logging.info('scraping %s...', url)
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        logging.error('get invalid status code %s while scraping %s', response.status_code, url)
    except requests.RequestException:
        logging.error('error occurred while scraping %s', url, exc_info=True)

def parse_detail(html):
    
    cover_pattern = re.compile('class="item.*?<img.*?src="(.*?)".*?class="cover">', re.S)
    name_pattern = re.compile('<h2.*?>(.*?)</h2>')
    categories_pattern = re.compile('<button.*?category.*?<span>(.*?)</span>.*?</button>', re.S)
    published_at_pattern = re.compile('(\d{4}-\d{2}-\d{2})\s?上映')
    drama_pattern = re.compile('<div.*?drama.*?>.*?<p.*?>(.*?)</p>', re.S)
    score_pattern = re.compile('<p.*?score.*?>(.*?)</p>', re.S)
    
    cover = re.search(cover_pattern, html).group(1).strip() if re.search(cover_pattern, html) else None
    name = re.search(name_pattern, html).group(1).strip() if re.search(name_pattern, html) else None
    categories = re.findall(categories_pattern, html) if re.findall(categories_pattern, html) else []
    published_at = re.search(published_at_pattern, html).group(1) if re.search(published_at_pattern, html) else None
    drama = re.search(drama_pattern, html).group(1).strip() if re.search(drama_pattern, html) else None
    score = float(re.search(score_pattern, html).group(1).strip()) if re.search(score_pattern, html) else None
    return {
        'cover': cover,
        'name': name,
        'categories': categories,
        'published_at': published_at,
        'drama': drama,
        'score': score
    }

根据列表页获取所有详情页链接

In [None]:
pattern = re.compile('<a.*?href="(.*?)".*?class="name">')
url_list =[]
for i in range(1,11):
    url_list += re.findall(pattern,scrape_page(f"{BASE_URL}/page/{i}"))
url_list =[f"{BASE_URL}{url}" for url in url_list]


In [None]:
ret=[]
for url in url_list:
    ret.append(parse_detail(scrape_page(url)))
ret 
    

In [12]:
import pandas as pd 
df=pd.DataFrame(ret)
df.to_json("result.json",force_ascii=False)

In [13]:
pd.read_json("result.json")

Unnamed: 0,cover,name,categories,published_at,drama,score
0,https://p0.meituan.net/movie/ce4da3e03e655b5b8...,霸王别姬 - Farewell My Concubine,"[剧情, 爱情]",1993-07-26,影片借一出《霸王别姬》的京戏，牵扯出三个人之间一段随时代风云变幻的爱恨情仇。段小楼（张丰毅 ...,9.5
1,https://p1.meituan.net/movie/6bea9af4524dfbd0b...,这个杀手不太冷 - Léon,"[剧情, 动作, 犯罪]",1994-09-14,里昂（让·雷诺 饰）是名孤独的职业杀手，受人雇佣。一天，邻居家小姑娘马蒂尔德（纳塔丽·波特曼...,9.5
2,https://p0.meituan.net/movie/283292171619cdfd5...,肖申克的救赎 - The Shawshank Redemption,"[剧情, 犯罪]",1994-09-10,20世纪40年代末，小有成就的青年银行家安迪（蒂姆·罗宾斯 饰）因涉嫌杀害妻子及她的情人而锒...,9.5
3,https://p1.meituan.net/movie/b607fba7513e7f15e...,泰坦尼克号 - Titanic,"[剧情, 爱情, 灾难]",1998-04-03,1912年4月15日，载着1316号乘客和891名船员的豪华巨轮“泰坦尼克号”与冰山相撞而沉...,9.5
4,https://p0.meituan.net/movie/289f98ceaa8a0ae73...,罗马假日 - Roman Holiday,"[剧情, 喜剧, 爱情]",1953-08-20,欧洲某国的安妮公主（奥黛丽·赫本 饰）到访罗马，国务烦身，但她又厌倦繁文缛节。一天晚上，身心...,9.5
...,...,...,...,...,...,...
95,https://p1.meituan.net/movie/14a7b337e8063e3ce...,大闹天宫 - The Monkey King,"[动画, 奇幻]",1965-12-31,话说在东土傲来国有一座花果山，山上有一尊石猴吸收日精月华化身为一只神猴（邱岳峰 配音），统领...,9.0
96,https://p1.meituan.net/movie/ba1ed511668402605...,天空之城 - 天空の城ラピュタ,"[动画, 奇幻, 冒险]",1992-05-01,小姑娘希达 （SHEETA）是传说中“天空之城拉普达 （Laputa ）”王族的后裔，那曾是...,9.0
97,https://p0.meituan.net/movie/ef6d7e040278f3d72...,音乐之声 - The Sound of Music,"[剧情, 爱情, 歌舞, 传记]",1965-03-02,玛利亚（朱丽·安德鲁斯 饰）是一个年轻活泼的修女，喜欢在大自然下高声歌唱，所以她常常忘记了修...,9.0
98,https://p0.meituan.net/movie/b0d986a8bf89278af...,辛德勒的名单 - Schindler&#x27;s List,"[剧情, 历史, 战争]",1993-11-30,1939年，波兰在纳粹德国的统治下，党卫军对犹太人进行了隔离统治。德国商人奥斯卡·辛德勒（连...,9.5
