In [9]:
x = Path('../list')

In [11]:
x.parent.joinpath('list.csv')

PosixPath('../list.csv')

In [56]:
import urllib
from pathlib import Path
from bs4 import BeautifulSoup
from tqdm import tqdm
import pandas as pd


class Crawler:

    def __init__(self, image_dir, data_dir, page_dir, list_dir):
        self.IMAGE_DIR = Path(image_dir)
        self.DATA_DIR = Path(data_dir)
        self.PAGE_DIR = Path(page_dir)
        self.LIST_DIR = Path(list_dir)
        self.LIST_FILE = self.LIST_DIR.parent.joinpath('list.csv')
        output_dirs = [self.IMAGE_DIR, self.DATA_DIR,
                       self.PAGE_DIR, self.LIST_DIR]
        for d in output_dirs:
            d.mkdir(parents=True, exist_ok=True)

    def _list_urls(self, pat, pages):
        pat = urllib.parse.quote(pat)
        urls = [f"https://www.ijq.tv/mingxing/{pat}_{page_name}.html"
                for page_name in range(1, pages + 1)]
        return urls

    def check_pending_lists(self):

        urls_saved = [i.name.replace('.csv', '.html')
                      for i in self.LIST_DIR.glob("*.csv")]

        urls = []
        # Mainland
        urls += self._list_urls("list_内地", 213)
        # Hong Kong
        urls += self._list_urls("list_香港", 8)
        # Taiwan
        urls += self._list_urls("list_台湾", 9)
        # Korea
        urls += self._list_urls("list_韩国", 20)
        # Japan
        urls += self._list_urls("list_日本", 12)
        # US
        urls += self._list_urls(f"list_美国", 27)

        urls_pending = [i for i in urls if Path(i).name not in urls_saved]

        return urls_pending, urls_saved

    def crawl_list(self, url, save=True):
        with urllib.request.urlopen(url) as response:
            html = response.read()
        soup = BeautifulSoup(html, 'html.parser')
        page_list = soup.body.find_next(id="list_stars").find_all("li")
        res = []
        for p in page_list:
            a = p.find_next('a')
            page_url = "https://www.ijq.tv"+a.get('href', '')
            avatar_url = "https:" + a.img.get('src', '')
            name = a.img.get("alt")
            res += [{'name': name, 'page_url': page_url}]

        if save:
            filename = Path(url).name.replace('html', 'csv')
            pd.DataFrame(res).to_csv(
                self.LIST_DIR.joinpath(filename), index=False)
        return res

    def combine_list(self):
        urls_pending, _ = self.check_pending_lists()
        if len(urls_pending) == 0:
            print('='*60)
            print('COMBINE LIST')
            print('='*60)

            rows = []
            for p in tqdm(self.LIST_DIR.glob("*.csv")):
                rows += [pd.read_csv(p)]
            data = pd.concat(rows)
            data.to_csv(self.LIST_FILE, index=False)
            print(f'combined list is saved [{self.LIST_FILE.as_posix()}]')

    def check_pending_pages(self):
        pages = pd.read_csv(self.LIST_FILE)
        urls = pages['page_url'].tolist()
        saved_pages = [i.name for i in self.PAGE_DIR.glob('*.html')]
        urls_pending = [i for i in urls if Path(i).name not in saved_pages]
        return urls_pending, saved_pages

    def crawl_page(self, url, save=True):
        with urllib.request.urlopen(url) as response:
            html = response.read()
        filename = Path(url).name
        if save:
            with open(self.PAGE_DIR.joinpath(filename), 'wb') as f:
                f.write(html)
        return html

    def check_pending_parsed_pages(self):
        pages = [i.name for i in self.PAGE_DIR.glob("*.html")]
        pages_parsed = [i.name.replace('.csv', '.html')
                        for i in self.DATA_DIR.glob("*.csv")]
        pages_pending = [i for i in pages if i not in pages_parsed]
        return pages_pending, pages_parsed

    def parse_page(self, filename, save=True):

        with open(self.PAGE_DIR.joinpath(filename), 'r') as f:
            html = f.read()
        soup = BeautifulSoup(html, 'html.parser')
        info = self._get_info(soup)
        info['avatar'] = self._get_avatar(soup)
        info['bio'] = self._get_bio(soup)
        info['poster'] = self._get_poster(soup)

        if save:
            filename = filename.replace('.html', '.csv')
            pd.DataFrame([info]).to_csv(
                self.DATA_DIR.joinpath(filename), index=False)
        return info

    def combine_parsed_page(self):
        pages_pending, pages_parsed = self.check_pending_parsed_pages()
        if len(pages_pending) == 0:
            rows = []
            for d in tqdm(pages_parsed):
                d = d.replace('.html', '.csv')
                rows += [pd.read_csv(self.DATA_DIR.joinpath(d))]
            data = pd.concat(rows)
        else:
            print("there's pending parsing page")
            data = None
        return data

    def _get_info(self, soup):

        info = soup.body.find_next(id="v-details-list").find_all('p')
        res = {}
        for i in info:
            text = i.text
            name, value = i.text.split('：', 1)
            res[name.strip()] = value.strip()
        res['weibo'] = (info[14]).a.get('href', '')
        return res

    def _get_bio(self, soup):
        try:
            return soup.find(id="v-summary").find("div").text
        except:
            return ''

    def _get_poster(self, soup):
        try:
            res = soup.find(id="v-summary")
            res = res.find("div", {"class": "content", "class": "textindent2em"})
            res = res.find('img').get('src', '')
            return res
        except:
            return ''
        

    def _get_avatar(self, soup):
        poster = soup.body.find_next(id="v-poster").find_next('img')
        link = poster.get('src', '')
        if len(link) > 0:
            link = "https:" + link
        else:
            link = ""
        return link

    def save_image(link):
        image_path = self.IMAGE_DIR
        if link != "":
            urllib.request.urlretrieve(link, image_path)


crawler = Crawler(image_dir='../images',
                  data_dir='../data',
                  list_dir='../lists',
                  page_dir='../pages')

In [44]:
crawler.check_pending_pages()

(['https://www.ijq.tv/mingxing/1281.html',
  'https://www.ijq.tv/mingxing/1280.html',
  'https://www.ijq.tv/mingxing/1278.html',
  'https://www.ijq.tv/mingxing/1279.html',
  'https://www.ijq.tv/mingxing/1277.html',
  'https://www.ijq.tv/mingxing/1275.html',
  'https://www.ijq.tv/mingxing/1274.html',
  'https://www.ijq.tv/mingxing/1273.html',
  'https://www.ijq.tv/mingxing/1276.html',
  'https://www.ijq.tv/mingxing/1272.html',
  'https://www.ijq.tv/mingxing/1270.html',
  'https://www.ijq.tv/mingxing/1269.html',
  'https://www.ijq.tv/mingxing/1271.html',
  'https://www.ijq.tv/mingxing/1268.html',
  'https://www.ijq.tv/mingxing/1267.html',
  'https://www.ijq.tv/mingxing/1265.html',
  'https://www.ijq.tv/mingxing/1266.html',
  'https://www.ijq.tv/mingxing/1263.html',
  'https://www.ijq.tv/mingxing/1264.html',
  'https://www.ijq.tv/mingxing/1262.html',
  'https://www.ijq.tv/mingxing/1261.html',
  'https://www.ijq.tv/mingxing/1260.html',
  'https://www.ijq.tv/mingxing/1259.html',
  'https://

In [53]:
pages_pending, pages_parsed = crawler.check_pending_parsed_pages()

In [57]:
for p in pages_pending:
    crawler.parse_page(p)

In [58]:
crawler.combine_parsed_page()

100%|██████████| 5/5 [00:00<00:00, 108.57it/s]


Unnamed: 0,中 文 名,英 文 名,曾 用 名,民 族,国家地区,出生日期,出 生 地,身 高,体 重,血 型,...,毕业院校,职 业,经纪公司,微 博,代 表 作,相关明星,weibo,avatar,bio,poster
0,王文绮,Vicky.Q,,汉族,内地,1990-2-2,西安,174CM,50KG,B型,...,北京电影学院,演员,中盟世纪,王文绮的新浪微博,《冰封：重生之门温柔的诱惑》《裸漂》,江铠同,http://weibo.com/jackywwq?from=hissimilar_home,https://image.ijq.tv/201502/26/10-42-42-33-8.jpg,王文绮，1990年出生于西安，毕业于北京电影学院表演系，中国内地新生代女演员。王文琦是位身材...,https://image.ijq.tv/201703/11/10-27-53-39-26.jpg
0,余男,Yu Nan,,汉族,内地,1976-9-5,辽宁省大连市中山区,169CM,50KG,A型,...,北京电影学院表演系,演员,创新艺人经纪公司CAA,余男的新浪微博,《敢死队2》《月蚀》《惊蛰》《图雅的婚事》,旧爱王全安 搭档王学兵 搭档梁家辉 搭档陈思诚,http://weibo.com/u/1280435871?refer_flag=10010...,https://image.ijq.tv/201502/23/16-17-18-22-10.png,余男，1976年9月5日出生在辽宁省大连市，1999年毕业于北京电影学院表演系，同年参与了电...,https://image.ijq.tv/201705/16/11-40-31-20-10.jpg
0,余香凝,,,汉族,香港,,香港,169CM,45KG,,...,,演员,无线电视,余香凝的新浪微博,《骨妹》,刘嘉玲 谢君豪 吴肇轩,https://weibo.com/p/1005051839580430/info?mod=...,https://image.ijq.tv/201909/03/16-42-08-46-46.jpg,中国香港女演员余香凝，拥有着高挑的身材，精致的五官，凭借着先天的优势在娱乐圈混得如鱼，备受喜欢！,https://image.ijq.tv/201909/03/17-01-43-35-46.jpg
0,冯嘉怡,,,汉族,内地,1969-7-10,北京,180CM,78KG,AB型,...,,演员,,冯嘉怡的新浪微博,《我的左手右手》《幸福保卫战》《裸婚时代》《媳妇的美好时代》,搭档吴彦祖 搭档姚晨,http://weibo.com/u/1803492305,https://image.ijq.tv/201502/26/10-14-20-35-10.jpg,冯嘉怡，1969年7月10日出生于北京，2006年参演海清主演的电视剧《双面胶》，自此开始进...,https://image.ijq.tv/201502/26/10-00-33-14-10.jpg
0,王诗槐,,,汉族,内地,1957-11-27,安徽合肥,178CM,65KG,A型,...,上海戏剧学院（表演系）,演员,安徽省话剧团,,《淑女之家》,,/yingshi/444.html,https://image.ijq.tv/201502/26/10-33-25-38-12.jpg,"王诗槐,中国大陆男演员，1957年11月27日生于安徽合肥。1981年毕业于上海戏剧学院表演...",


(['15675001226429.html', '1284.html', '1286.html', '1287.html'], ['1288.html'])

In [48]:
crawler.check_pending_lists()

(['https://www.ijq.tv/mingxing/list_%E5%86%85%E5%9C%B0_2.html',
  'https://www.ijq.tv/mingxing/list_%E5%86%85%E5%9C%B0_3.html',
  'https://www.ijq.tv/mingxing/list_%E5%86%85%E5%9C%B0_4.html',
  'https://www.ijq.tv/mingxing/list_%E5%86%85%E5%9C%B0_5.html',
  'https://www.ijq.tv/mingxing/list_%E5%86%85%E5%9C%B0_6.html',
  'https://www.ijq.tv/mingxing/list_%E5%86%85%E5%9C%B0_7.html',
  'https://www.ijq.tv/mingxing/list_%E5%86%85%E5%9C%B0_8.html',
  'https://www.ijq.tv/mingxing/list_%E5%86%85%E5%9C%B0_9.html',
  'https://www.ijq.tv/mingxing/list_%E5%86%85%E5%9C%B0_10.html',
  'https://www.ijq.tv/mingxing/list_%E5%86%85%E5%9C%B0_11.html',
  'https://www.ijq.tv/mingxing/list_%E5%86%85%E5%9C%B0_12.html',
  'https://www.ijq.tv/mingxing/list_%E5%86%85%E5%9C%B0_13.html',
  'https://www.ijq.tv/mingxing/list_%E5%86%85%E5%9C%B0_14.html',
  'https://www.ijq.tv/mingxing/list_%E5%86%85%E5%9C%B0_15.html',
  'https://www.ijq.tv/mingxing/list_%E5%86%85%E5%9C%B0_16.html',
  'https://www.ijq.tv/mingxing/li

In [30]:
crawler.parse_page('1288.html')

{'中\xa0文\xa0名': '王文绮',
 '英\xa0文\xa0名': 'Vicky.Q',
 '曾\xa0用\xa0名': '',
 '民\u3000\u3000族': '汉族',
 '国家地区': '内地',
 '出生日期': '1990-2-2',
 '出\xa0生\xa0地': '西安',
 '身\u3000\u3000高': '174CM',
 '体\u3000\u3000重': '50KG',
 '血\u3000\u3000型': 'B型',
 '星\u3000\u3000座': '水瓶座',
 '毕业院校': '北京电影学院',
 '职\u3000\u3000业': '演员',
 '经纪公司': '中盟世纪',
 '微\u3000\u3000博': '王文绮的新浪微博',
 '代\xa0表\xa0作': '《冰封：重生之门温柔的诱惑》《裸漂》',
 '相关明星': '江铠同',
 'weibo': 'http://weibo.com/jackywwq?from=hissimilar_home',
 'avatar': 'https://image.ijq.tv/201502/26/10-42-42-33-8.jpg',
 'bio': '王文绮，1990年出生于西安，毕业于北京电影学院表演系，中国内地新生代女演员。王文琦是位身材高挑的气质女孩，在校期间显得出类拔萃，有着“北影校花”的美称。王文琦代表作品有：《冰封：重生之门》、《温柔的诱惑》、《裸漂》等。',
 'poster': 'https://image.ijq.tv/201703/11/10-27-53-39-26.jpg'}

In [20]:
crawler.combine_list()

25it [00:00, 239.62it/s]

COMBINE LIST


221it [00:00, 290.61it/s]


combined list is saved [../list.csv]


In [15]:
crawler.crawl_list('https://www.ijq.tv/mingxing/list_%E5%86%85%E5%9C%B0_1.html')

[{'name': '韩秀一',
  'page_url': 'https://www.ijq.tv/mingxing/161690937512015.html'},
 {'name': '池忠国',
  'page_url': 'https://www.ijq.tv/mingxing/160897411112007.html'},
 {'name': '刘殿座',
  'page_url': 'https://www.ijq.tv/mingxing/160897142112004.html'},
 {'name': '董学升',
  'page_url': 'https://www.ijq.tv/mingxing/160880506011999.html'},
 {'name': '梅方',
  'page_url': 'https://www.ijq.tv/mingxing/160880153811995.html'},
 {'name': '韩端',
  'page_url': 'https://www.ijq.tv/mingxing/160873467411992.html'},
 {'name': '王燊超',
  'page_url': 'https://www.ijq.tv/mingxing/160851895311991.html'},
 {'name': '汪嵩',
  'page_url': 'https://www.ijq.tv/mingxing/160851667711988.html'},
 {'name': '张恩华',
  'page_url': 'https://www.ijq.tv/mingxing/160828576711978.html'},
 {'name': '彭伟国',
  'page_url': 'https://www.ijq.tv/mingxing/160819876711975.html'},
 {'name': '朱广沪',
  'page_url': 'https://www.ijq.tv/mingxing/160819601311972.html'},
 {'name': '张鹭',
  'page_url': 'https://www.ijq.tv/mingxing/160791393811965.html

In [116]:
url = 'https://www.ijq.tv/mingxing/list_%E5%86%85%E5%9C%B0_1.html'
res = crawler.get_star_list(url, save = True)

In [3]:
url = 'https://www.ijq.tv/mingxing/15675001226429.html'
crawler.crawl_page(url)

In [4]:
crawler.parse_page('15675001226429.html')

{'中\xa0文\xa0名': '余香凝',
 '英\xa0文\xa0名': '',
 '曾\xa0用\xa0名': '',
 '民\u3000\u3000族': '汉族',
 '国家地区': '香港',
 '出生日期': '',
 '出\xa0生\xa0地': '香港',
 '身\u3000\u3000高': '169CM',
 '体\u3000\u3000重': '45KG',
 '血\u3000\u3000型': '',
 '星\u3000\u3000座': '巨蟹座',
 '毕业院校': '',
 '职\u3000\u3000业': '演员',
 '经纪公司': '无线电视',
 '微\u3000\u3000博': '余香凝的新浪微博',
 '代\xa0表\xa0作': '《骨妹》',
 '相关明星': '刘嘉玲\xa0谢君豪\xa0吴肇轩',
 'weibo': 'https://weibo.com/p/1005051839580430/info?mod=pedit_more',
 'avatar': 'https://image.ijq.tv/201909/03/16-42-08-46-46.jpg',
 'bio': '中国香港女演员余香凝，拥有着高挑的身材，精致的五官，凭借着先天的优势在娱乐圈混得如鱼，备受喜欢！',
 'poster': 'https://image.ijq.tv/201909/03/17-01-43-35-46.jpg'}