-
Notifications
You must be signed in to change notification settings - Fork 0
/
scraper.py
92 lines (64 loc) · 2.61 KB
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
from bs4 import BeautifulSoup
import requests
import enum
from pprint import pformat
import logging
# Log to app.log, truncated on every run (filemode='w'), timestamped, INFO and above.
logging.basicConfig(filename='app.log', filemode='w', format='%(asctime)s - %(message)s', level=logging.INFO)
class Scraper():
    """Scraping helpers for manganelo.com: search, manga info, chapter images.

    All methods are static/class methods; the class is a namespace only.
    """

    @staticmethod
    def getChapter(obj):
        """Return ('chapter_<n>', [image_src, ...]) for one chapter.

        obj: dict with 'name' (manga URL slug) and 'chapter' (chapter number).
        Raises ValueError when the page has no chapter-reader container
        (bad chapter id or a site layout change).
        """
        chapter_number = 'chapter_' + str(obj.get('chapter'))
        url = Scraper.urlBuilder(UrlType.chapter.name,
                                 obj.get('name'), chapter_number)
        logging.info(pformat(url))
        soup = Scraper.getSoup(url)
        image_div = soup.find('div', attrs={'class': 'container-chapter-reader'})
        if image_div is None:
            # Was an opaque AttributeError on None.findAll; fail with context.
            raise ValueError('chapter reader not found at ' + url)
        # find_all is the current BeautifulSoup spelling of findAll.
        final_dict = (chapter_number,
                      [img.get('src') for img in image_div.find_all('img')])
        logging.info(pformat(final_dict))
        return final_dict

    # Takes a search string and returns a dict mapping manga titles to URLs.
    @staticmethod
    def getSearchResults(name):
        """Search manganelo for *name*; return {title: href} per result."""
        # The site expects underscores instead of spaces in search terms.
        fname = name.replace(" ", "_")
        url = Scraper.urlBuilder(UrlType.search.name, fname)
        soup = Scraper.getSoup(url)
        search_results = dict()
        for item in soup.find_all('div', attrs={'class': 'search-story-item'}):
            link = item.find('a', attrs={'class': 'item-img'})
            if link is not None:  # skip malformed result cards instead of crashing
                search_results[link.get("title")] = link.get('href')
        logging.info('search result: %s', str(search_results))
        return search_results

    @staticmethod
    def getMangaInfo(name):
        """Return {'count': n, 'chapters': [{title: href}, ...]} for a manga.

        name: the manga's URL slug as used by manganelo.
        """
        url = Scraper.urlBuilder(UrlType.manga.name, name)
        soup = Scraper.getSoup(url)
        chapter_list = soup.find_all('li', attrs={'class': 'a-h'})
        manga_info = dict(count=len(chapter_list), chapters=[])
        for chapter in chapter_list:
            link = chapter.find('a')
            manga_info['chapters'].append({link.get('title'): link.get('href')})
        return manga_info

    @classmethod
    def getSoup(cls, url):
        """GET *url* and return a parsed BeautifulSoup document.

        Raises requests exceptions on network failure/timeout and on
        HTTP error status, instead of silently parsing an error page.
        """
        # Without a timeout, requests can block forever on a dead server.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        return BeautifulSoup(response.content, 'html.parser')

    @classmethod
    def urlBuilder(cls, enumType, *args):
        """Join the site base URL, *enumType* and *args* with '/' separators."""
        return 'https://manganelo.com/' + '/'.join((enumType,) + args)
# Kinds of manganelo.com URL path segments; each member's .name (and .value)
# is the literal path component fed to Scraper.urlBuilder.
UrlType = enum.Enum('UrlType', {'search': 'search',
                                'manga': 'manga',
                                'chapter': 'chapter'})