#! python3
# myXcbdScraper.py - download xkcd comics: either every comic or one chosen by number.
import requests, os, bs4, re
# Base address of the xkcd site; read by getSpecificComic() and main().
url = 'https://xkcd.com'
# Create the 'xkcd' output directory that downloaded images are written into.
# This runs at import time; exist_ok=True makes it a no-op if the directory exists.
os.makedirs('xkcd', exist_ok=True)
def imgdownloader(url):
    """Download the comic image shown on a single xkcd page into ``xkcd/``.

    Args:
        url: Address of the comic page. A value ending in ``'#'`` is treated
            as "no page" and the function returns without any network access.

    Returns:
        None. Progress and failures are reported with ``print``.

    Raises:
        requests.exceptions.HTTPError: if the page or the image request
            answers with a 4xx/5xx status (via ``raise_for_status``).
    """
    # Function-scope import keeps this block self-contained.
    from urllib.parse import urljoin

    if url.endswith('#'):
        return

    page = requests.get(url)
    page.raise_for_status()
    soup = bs4.BeautifulSoup(page.text, "lxml")

    # <img> tags nested inside the element with id="comic".
    comic = soup.select('#comic img')
    if not comic:
        # The page did not contain a comic.
        print("No comic was found..")
        return

    src = comic[0].get('src')
    if not src:
        # An <img> without a src attribute cannot be downloaded.
        print("No comic was found..")
        return

    # urljoin resolves protocol-relative ('//host/path'), relative and
    # absolute src values against the page URL, instead of blindly
    # prefixing 'http:'.
    comicimg = urljoin(url, src)

    # Only real comic images live under /comics/; anything else is treated
    # as "not a comic" (e.g. an interactive page).
    if "/comics/" not in comicimg:
        print("No comic was found..")
        return

    print('Download image %s' % comicimg)
    try:
        image_res = requests.get(comicimg)
    except requests.exceptions.MissingSchema:
        print("Error in downloading img!!")
        return
    image_res.raise_for_status()

    # 'with' guarantees the file is closed even if a write fails part-way.
    target = os.path.join('xkcd', os.path.basename(comicimg))
    with open(target, 'wb') as image:
        for chunk in image_res.iter_content(10000):
            image.write(chunk)
    print('Finished')
def getLatestComicNumber(url):
    """Return one more than the comic number found in the page's "prev" link.

    main() passes the site front page here and uses the result as the upper
    bound offered to the user, so this presumably equals the newest comic's
    number (front page shows comic N, its "prev" link points at N-1).

    Args:
        url: Address of the page to fetch.

    Returns:
        int: the first run of digits in the "prev" link's href, plus one.

    Raises:
        requests.exceptions.HTTPError: if the page request answers with a
            4xx/5xx status.
        IndexError: if the page has no ``a[rel="prev"]`` link or its href
            contains no digits.
    """
    res = requests.get(url)
    res.raise_for_status()
    soup = bs4.BeautifulSoup(res.text, "lxml")
    prev_href = soup.select('a[rel="prev"]')[0].get('href')
    # Raw string: '\d' in a normal string literal is an invalid escape sequence.
    numbers = re.findall(r'\d+', prev_href)
    return int(numbers[0]) + 1
def getNextComic(soup):
    """Return the URL of the page that the current page's "prev" link points to.

    Crawling runs backwards: batchDownloader() starts at the front page and
    keeps following "prev" links. It stops once the returned URL ends with
    '#', which is presumably what the first comic's "prev" link yields since
    there is nothing before comic 1.

    Args:
        soup: Parsed page exposing ``select(css_selector)``; the first match
            for ``a[rel="prev"]`` must expose ``get('href')``.

    Returns:
        str: ``'https://xkcd.com'`` followed by the link's href.

    Raises:
        IndexError: if the page has no ``a[rel="prev"]`` link.
    """
    prev_link = soup.select('a[rel="prev"]')[0]
    return 'https://xkcd.com' + prev_link.get('href')
def getSpecificComic(comic_number):
    """Download the single comic with the given number.

    Args:
        comic_number: Comic number as a ``str`` or an ``int``; surrounding
            whitespace in a string is ignored.

    Returns:
        None. Any exception raised while downloading is printed rather than
        propagated.
    """
    # str() lets callers pass an int; the original concatenation required str.
    page_url = url + '/' + str(comic_number).strip() + '/'
    try:
        imgdownloader(page_url)
    except Exception as e:
        # User-facing boundary: report the failure (e.g. HTTP 404 for an
        # unknown number) instead of crashing with a traceback.
        print(str(e))
def batchDownloader():
    """Download every comic image, walking backwards from the front page.

    Starts at ``https://xkcd.com`` and follows each page's "prev" link (via
    getNextComic) until the next URL ends with '#'. Images are written into
    the ``xkcd`` directory under their original file names.

    Returns:
        None. Progress is reported with ``print``.

    Raises:
        requests.exceptions.HTTPError: if a page or image request answers
            with a 4xx/5xx status; this aborts the whole run.
        IndexError: propagated from getNextComic when a page has no
            "prev" link.
    """
    # Function-scope import keeps this block self-contained.
    from urllib.parse import urljoin

    url = 'https://xkcd.com'
    # A URL ending in '#' means there is no earlier page to visit.
    while not url.endswith('#'):
        print('Current page: %s' % url)
        page = requests.get(url)
        page.raise_for_status()
        soup = bs4.BeautifulSoup(page.text, "lxml")

        # <img> tags nested inside the element with id="comic".
        comic = soup.select('#comic img')
        src = comic[0].get('src') if comic else None
        # urljoin resolves protocol-relative, relative and absolute src
        # values against the page URL instead of blindly prefixing 'http:'.
        comicimg = urljoin(url, src) if src else None

        # Only real comic images live under /comics/; anything else is
        # treated as "not a comic" (e.g. an interactive page).
        if comicimg and "/comics/" in comicimg:
            print('Download image %s' % comicimg)
            try:
                image_res = requests.get(comicimg)
            except requests.exceptions.MissingSchema:
                # Malformed image URL: report it and move on to the next page.
                print("Error in downloading img!!")
            else:
                image_res.raise_for_status()
                # 'with' guarantees the file is closed even if a write fails.
                target = os.path.join('xkcd', os.path.basename(comicimg))
                with open(target, 'wb') as image:
                    for chunk in image_res.iter_content(10000):
                        image.write(chunk)
        else:
            print("No comic was found..")

        # Always advance, whether or not this page produced an image.
        url = getNextComic(soup)

    # All reachable pages have been visited.
    print('Finished')
def main():
    """Prompt for a mode and run it: download everything or one comic.

    Option 1 calls batchDownloader(). Option 2 asks for a comic number
    between 1 and the value returned by getLatestComicNumber(url) and calls
    getSpecificComic() with it. Invalid input is reported with ``print``
    instead of raising.
    """
    raw_choice = input("Choose your option: \n1.Download all images\t2.Download Specific image\n")
    try:
        choice = int(raw_choice)
    except ValueError:
        print("Invalid option: please enter 1 or 2")
        return

    if choice == 1:
        batchDownloader()
    elif choice == 2:
        latest = getLatestComicNumber(url)
        raw_number = input("Enter any comic number between 1-" + str(latest))
        try:
            number = int(raw_number)
        except ValueError:
            print("Invalid comic number: %s" % raw_number.strip())
            return
        if not 1 <= number <= latest:
            print("Comic number must be between 1 and %d" % latest)
            return
        try:
            # Passed as str so this works with a getSpecificComic that
            # concatenates the number directly into the URL.
            getSpecificComic(str(number))
        except Exception as e:
            # User-facing boundary: show the error rather than a traceback.
            print(str(e))
    else:
        # Previously an unknown option fell through without any feedback.
        print("Invalid option: please enter 1 or 2")
# Run the interactive menu only when executed as a script, not when imported.
if __name__ == '__main__':
    main()