In [32]:
import urllib
import urllib.parse
import urllib.request
from pathlib import Path

import pandas as pd
from bs4 import BeautifulSoup
from tqdm import tqdm


# Output locations used by the crawler: CSV results are written under ./data
# and downloaded posters under ./images. Both directories are created up
# front; exist_ok=True keeps this cell safe to re-run.
data_dir = Path('./data')
image_output_dir = Path("./images")
data_dir.mkdir(parents=True, exist_ok=True)
image_output_dir.mkdir(parents=True, exist_ok=True)


def get_soup(url, timeout=30):
    """Download ``url`` and parse the response body as HTML.

    Args:
        url: Address to fetch; passed straight to ``urllib.request.urlopen``.
        timeout: Seconds to wait on blocking network operations before
            giving up. Without a timeout a single stalled connection would
            block the whole crawl indefinitely.

    Returns:
        A ``BeautifulSoup`` document built with the ``'html.parser'`` backend.

    Raises:
        urllib.error.URLError: If the request fails or times out (raised by
            ``urlopen``).
    """
    # The context manager closes the connection even if read() fails.
    with urllib.request.urlopen(url, timeout=timeout) as response:
        html = response.read()
    return BeautifulSoup(html, 'html.parser')


def get_info(soup):
    """Collect the "label：value" fields from a page's details list.

    Looks up the element with id ``v-details-list`` and reads every ``<p>``
    inside it. Each paragraph's text is split at the first full-width colon
    (``：``); the part before it becomes the key and the part after it the
    value, both stripped of surrounding whitespace.

    Paragraphs that contain no full-width colon are skipped, so one
    malformed paragraph no longer aborts the whole page.

    Args:
        soup: Parsed page exposing ``soup.body.find_next(...)``.

    Returns:
        dict mapping field label to field value (both ``str``).
    """
    res = {}
    for paragraph in soup.body.find_next(id="v-details-list").find_all('p'):
        # partition() splits at the first separator only, so a value may
        # itself contain further colons.
        name, separator, value = paragraph.text.partition('：')
        if not separator:
            continue
        res[name.strip()] = value.strip()
    return res


def get_poster(soup, image_output_dir):
    """Locate the poster image of a page and decide where to store it.

    Finds the element with id ``v-poster``, takes the next ``<img>`` and
    reads its ``src`` attribute.

    Args:
        soup: Parsed page exposing ``soup.body.find_next(...)``.
        image_output_dir: Directory (``Path``) the local file name is
            joined onto.

    Returns:
        ``(link, image_path)``. ``link`` is the absolute image URL and
        ``image_path`` is ``image_output_dir / <file name from the URL>``.
        When the page has no poster container, no ``<img>`` or an empty
        ``src``, both elements are the empty string ``""``.
    """
    container = soup.body.find_next(id="v-poster")
    poster = container.find_next('img') if container is not None else None
    src = poster.get('src', '') if poster is not None else ''
    src = (src or '').strip()
    if not src:
        return "", ""

    # urljoin turns protocol-relative ("//host/x.jpg") and root-relative
    # ("/x.jpg") sources into absolute https URLs and leaves already
    # absolute ones untouched, instead of blindly prefixing "https:".
    link = urllib.parse.urljoin("https://www.ijq.tv/", src)
    # Name the local file after the URL path only, so a query string
    # ("?v=2") does not end up in the file name.
    file_name = Path(urllib.parse.urlsplit(link).path).name
    return link, image_output_dir.joinpath(file_name)


def save_poster(link, image_path):
    """Download the image at ``link`` to ``image_path``.

    An empty ``link`` means there is nothing to download, in which case the
    function returns without touching the network or the file system.
    """
    if link == "":
        return
    urllib.request.urlretrieve(link, image_path)


def get_star_list(url):
    """Read one listing page and return its entries.

    Fetches ``url`` with ``get_soup``, finds the element with id
    ``list_stars`` and, for every ``<li>`` in it, takes the next ``<a>``
    and the ``<img>`` inside that link.

    Returns:
        A list of dicts with the keys ``name`` (the image's ``alt`` text),
        ``page_url`` (the link's ``href`` prefixed with the site origin) and
        ``avatar_url`` (the image's ``src`` prefixed with ``https:``).
    """
    listing = get_soup(url).body.find_next(id="list_stars")
    entries = []
    for item in listing.find_all("li"):
        anchor = item.find_next('a')
        detail_page = "https://www.ijq.tv" + anchor.get('href', '')
        avatar = "https:" + anchor.img.get('src', '')
        label = anchor.img.get("alt")
        entries.append(
            {'name': label, 'page_url': detail_page, 'avatar_url': avatar}
        )
    return entries


def crawl_page(url, save_img=False):
    """Crawl one detail page and return its fields as a dict.

    Args:
        url: Address of the detail page. An empty string yields ``{}``
            without any network access.
        save_img: When true, the poster image is also downloaded via
            ``save_poster``.

    Returns:
        The dict produced by ``get_info`` plus two extra keys:
        ``image_url`` (poster URL, ``""`` if the page has none) and
        ``image_path`` (POSIX-style local path for the poster under the
        module-level ``image_output_dir``, ``""`` if the page has none).
    """
    if len(url) == 0:
        return {}

    soup = get_soup(url)
    info = get_info(soup)
    poster, image_local_path = get_poster(soup, image_output_dir)
    info['image_url'] = poster
    # get_poster signals "no poster" with an empty *string* path, which has
    # no as_posix(); handle that case instead of raising AttributeError and
    # losing the rest of the page's data.
    if image_local_path == "":
        info['image_path'] = ""
    else:
        info['image_path'] = Path(image_local_path).as_posix()

    if save_img:
        save_poster(poster, image_local_path)
    return info


def crawl_list(url):
    """Crawl every entry of one listing page and save the result as CSV.

    Each entry returned by ``get_star_list(url)`` is crawled with
    ``crawl_page``; the page's fields are merged with the entry's own
    ``name`` / ``page_url`` / ``avatar_url``. A page that fails to crawl is
    reported on stdout and contributes only the listing fields.

    The rows are written to ``data_dir`` in a CSV named after the last
    component of ``url`` with its extension replaced by ``.csv``.
    Non-breaking spaces (``\\xa0``) and ideographic spaces (``\\u3000``) are
    removed from the column names first.

    Args:
        url: Address of the listing page.

    Returns:
        The ``pandas.DataFrame`` that was written to disk.
    """
    page_list = get_star_list(url)
    res = []
    for p in tqdm(page_list):
        page_url = p.get('page_url', '')
        try:
            info = crawl_page(page_url)
        except Exception as exc:
            # Catch Exception rather than everything, so that interrupting
            # the kernel (KeyboardInterrupt) actually stops the crawl
            # instead of being logged as one failed page.
            print(f'error: {page_url} ({exc!r})')
            info = {}
        info.update(p)
        res.append(info)

    # save result
    # with_suffix only swaps the extension; a plain str.replace('html', 'csv')
    # would also rewrite "html" occurring elsewhere in the file name.
    list_name = Path(Path(url).name).with_suffix('.csv').name
    res = pd.DataFrame(res)
    res.columns = [i.replace('\xa0', '').replace('\u3000', '')
                   for i in list(res.columns)]
    res.to_csv(data_dir.joinpath(list_name), index=False)
    return res

In [None]:
# Crawl listing pages 21 through 99 (range's upper bound is exclusive), one
# crawl_list call per page.
# NOTE(review): the start value 21 presumably resumes an earlier run that
# covered the first 20 pages — confirm before re-running from scratch.
for page_name in range(21, 100):
    # "%E5%86%85%E5%9C%B0" is the percent-encoded UTF-8 form of "内地"
    # ("mainland"), the category segment of the listing URL.
    url = f"https://www.ijq.tv/mingxing/list_%E5%86%85%E5%9C%B0_{page_name}.html"
    print(f'crawling page {page_name}')
    # crawl_list writes one CSV per listing page into data_dir.
    res = crawl_list(url)

crawling page 21


100%|██████████████████████████████████████████████████████████████████████████████████| 30/30 [01:34<00:00,  3.15s/it]


crawling page 22


100%|██████████████████████████████████████████████████████████████████████████████████| 30/30 [01:16<00:00,  2.56s/it]


crawling page 23


100%|██████████████████████████████████████████████████████████████████████████████████| 30/30 [01:16<00:00,  2.56s/it]


crawling page 24


100%|██████████████████████████████████████████████████████████████████████████████████| 30/30 [01:21<00:00,  2.72s/it]


crawling page 25


 57%|██████████████████████████████████████████████▍                                   | 17/30 [00:43<00:30,  2.34s/it]

error: https://www.ijq.tv/mingxing/15717965646803.html


100%|██████████████████████████████████████████████████████████████████████████████████| 30/30 [01:14<00:00,  2.47s/it]


crawling page 26


100%|██████████████████████████████████████████████████████████████████████████████████| 30/30 [01:18<00:00,  2.61s/it]


crawling page 27


  3%|██▊                                                                                | 1/30 [00:02<01:10,  2.44s/it]

In [None]:
# Merge every per-page CSV under ./data into a single DataFrame.
# - sorted() makes the row order reproducible (glob order is not guaranteed).
# - ignore_index=True gives the merged frame one continuous index instead of
#   repeating 0..n-1 for every file.
# - With no CSV files present pd.concat would raise, so fall back to an
#   empty frame.
frames = [pd.read_csv(csv_path) for csv_path in sorted(Path('./data').glob('*.csv'))]
data = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
data

In [None]:
# Number of entries currently in the poster directory.
sum(1 for _ in image_output_dir.glob("*"))