
Commit d55956d

fix(IMDB Scraper): Fixed bugs and made changes recommended in PR review
Cleaned output, fixed bugs, made changes recommended in PR review
1 parent 4b4c296 commit d55956d

File tree

3 files changed: +47 / -26 lines changed


IMDB-Scraper/README.md

Lines changed: 2 additions & 2 deletions
```diff
@@ -20,8 +20,8 @@ Collects the information given on IMDB for the given title
 ## Usage
 
 - Install dependencies
-- python scraper.py --t movie-name-here
-- sample : python scraper.py --t red
+- python scraper.py --t movie-name-here(in double quotes)
+- sample : python scraper.py --t "red"
 
 ## Screenshots
```
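The usage lines now quote the title so multi-word names reach the script as a single argument. For reference, here is a minimal sketch of how the --t flag could be wired up at the bottom of scraper.py; the actual __main__ block is outside this hunk, so the argparse wiring below is an assumption for illustration, not the repository's code.

```python
# Hypothetical sketch only: the repository's real __main__ block is not part of
# this diff. Assumes it sits at the bottom of scraper.py, where find_movie is
# already defined.
import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Scrape IMDB details for a title")
    # Quoting the value (e.g. --t "red sparrow") lets the shell pass a
    # multi-word title through as a single argument.
    parser.add_argument("--t", required=True, help='movie name, e.g. --t "red"')
    args = parser.parse_args()
    find_movie(args.t)
```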

IMDB-Scraper/scraper.py

Lines changed: 45 additions & 24 deletions
```diff
@@ -12,52 +12,73 @@
 base_id = "https://www.imdb.com/title"
 # base url is used when the user gives a title to search for
 base = "https://www.imdb.com/find?s=tt&q="
-# for parsing we have used the lxml parser for optimization purposes, if lxml does not work for you replace 'lxml' with 'html.parser'
 
 
 def get_info(soup):
     info = {}
+    labels = ["title", "year", "rating", "genre", "plot", "date", "country",
+              "language", "budget", "gross", "gross_usa", "opening_week_usa"]
     try:
         info["title"] = soup.find(
-            'div', attrs={"class": "title_wrapper"}).h1.text
-        info["year"] = soup.find('span', attrs={"id": "titleYear"}).a.text
+            'div', attrs={"class": "title_wrapper"}).h1.get_text(strip=True)
+        info["year"] = soup.find(
+            'span', attrs={"id": "titleYear"}).a.get_text(strip=True)
         info["rating"] = soup.find(
-            'span', attrs={"itemprop": "ratingValue"}).text
+            'span', attrs={"itemprop": "ratingValue"}).get_text(strip=True)
         subtext = soup.find("div", attrs={"class": "subtext"})
-        info["genre"] = subtext.a.text
+        info["genre"] = subtext.a.get_text(strip=True)
         article = soup.find('div', attrs={"id": "titleStoryLine"})
         info["plot"] = article.find(
-            'div', attrs={"class": "canwrap"}).p.span.text
+            'div', attrs={"class": "canwrap"}).p.span.get_text(strip=True)
         details = soup.find('div', attrs={"id": "titleDetails"})
         blocks = details.findAll('div', attrs={"class": "txt-block"})
-        # Strings have been stripped as the html of imdb is not clean
-        # Formatting of strings is preferrential you can change that to suit your needs
-        info["country"] = blocks[1].a.text
-        info["date"] = blocks[3].text[15:30]
-        info["budget"] = blocks[6].text[8:23]
-        info["gross_USA"] = blocks[8].text[11:]
-        info["gross_worldwide"] = blocks[9].text[28:]
+        for block in blocks:
+            heading = block.h4.get_text(strip=True)
+            if heading == "Release Date:":
+                info["date"] = block.get_text(strip=True).replace(
+                    "See more»", '').replace(heading, '')
+            if heading == "Country:":
+                info["country"] = block.a.get_text(strip=True)
+            if heading == "Language":
+                info["language"] = block.a.get_text(strip=True)
+            if heading == "Budget:":
+                info["budget"] = block.get_text(
+                    strip=True).replace(heading, '')
+            if heading == "Cumulative Worldwide Gross:":
+                info["gross"] = block.get_text(
+                    strip=True).replace(heading, '')
+            if heading == "Gross USA:":
+                info["gross_usa"] = block.get_text(
+                    strip=True).replace(heading, '')
+            if heading == "Opening Weekend USA:":
+                info["opening_week_usa"] = block.get_text(
+                    strip=True).replace(heading, '')
     except:
-        print("Something went wrong")
+        assert any(obj in labels for obj in info), "No info found"
 
-    print(info, end="\n\n\n")
+    if len(info) > 4:
+        print(info, end="\n\n\n")
 
 
 def find_movie(query):
     url = base+query
     resp = requests.get(url)
+    # for parsing we have used the lxml parser for optimization purposes, if lxml does not work for you replace 'lxml' with 'html.parser'
     soup1 = bs(resp.text, 'lxml')
-    # Since for every query imdb gives about 150-200 responses , we choose the top 5 and return the details for them
+    # Since for every query imdb gives about 150-200 responses , we choose the top 5 and return the details for them
    movie_list = soup1.findAll("tr", attrs={"class": "findResult"})[0:5]
-    for movie in movie_list:
-        # Through the table given , we extract the title id from the 'href' attribute of the <a> tag
-        title_id = movie.find(
-            'td', attrs={"class": "result_text"}).a.attrs["href"][6:]
+    if movie_list:
+        for movie in movie_list:
+            # Through the table given , we extract the title id from the 'href' attribute of the <a> tag
+            title_id = movie.find(
+                'td', attrs={"class": "result_text"}).a.attrs["href"][6:]
 
-        url = base_id+title_id
-        respo = requests.get(base_id+title_id)
-        soup = bs(respo.text, 'lxml')
-        get_info(soup)
+            url = base_id+title_id
+            respo = requests.get(base_id+title_id)
+            soup = bs(respo.text, 'lxml')
+            get_info(soup)
+    else:
+        print("No results found")
 
 
 if __name__ == "__main__":
```
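The core change in get_info is replacing positional lookups like blocks[6].text[8:23] with a loop that matches each txt-block by its h4 heading, so the extraction no longer depends on the order of IMDB's detail blocks. Below is a self-contained sketch of that pattern; the HTML snippet is invented and only mimics IMDB's txt-block markup at the time of this commit.

```python
# Self-contained illustration of the heading-keyed extraction; the HTML below is
# made up for the example and only mimics IMDB's txt-block layout.
from bs4 import BeautifulSoup

html = """
<div id="titleDetails">
  <div class="txt-block"><h4>Country:</h4> <a href="#">USA</a></div>
  <div class="txt-block"><h4>Budget:</h4> $58,000,000 <span>(estimated)</span></div>
</div>
"""

soup = BeautifulSoup(html, "html.parser")
details = soup.find("div", attrs={"id": "titleDetails"})

info = {}
for block in details.findAll("div", attrs={"class": "txt-block"}):
    heading = block.h4.get_text(strip=True)  # e.g. "Country:" or "Budget:"
    if heading == "Country:":
        info["country"] = block.a.get_text(strip=True)
    if heading == "Budget:":
        # drop the heading text so only the value remains
        info["budget"] = block.get_text(strip=True).replace(heading, '')

print(info)  # {'country': 'USA', 'budget': '$58,000,000(estimated)'}
```

Unlike the old fixed-index version, this keeps working if IMDB inserts, removes, or reorders a detail block, which is presumably what the indexing approach was tripping over.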

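In find_movie, the title id is taken from the result link by slicing off the leading "/title" with [6:]; the remainder keeps its leading slash, which is why it concatenates cleanly onto base_id (which has no trailing slash). A small worked example with an invented href of that shape:

```python
# Worked example of the [6:] slice used in find_movie; the href value is
# invented but follows the "/title/tt.../" shape of IMDB find-result links.
base_id = "https://www.imdb.com/title"

href = "/title/tt0398286/?ref_=fn_tt_tt_1"  # hypothetical search-result href
title_id = href[6:]                         # "/tt0398286/?ref_=fn_tt_tt_1"

# base_id has no trailing slash, so the retained leading "/" joins the two cleanly
print(base_id + title_id)  # https://www.imdb.com/title/tt0398286/?ref_=fn_tt_tt_1
```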
IMDB-Scraper/ss1.PNG

2.87 KB
