|
# Prefix for direct title pages; a title id (e.g. "/tt0111161/") is appended
# to fetch one specific movie's page.
base_id = "https://www.imdb.com/title"
# base url is used when the user gives a title to search for
base = "https://www.imdb.com/find?s=tt&q="
16 | 15 |
|
17 | 16 |
|
def get_info(soup):
    """Extract movie details from a parsed IMDb title page and print them.

    Scrapes title, year, rating, genre, plot and the "Details" box
    (release date, country, language, budget, grosses) out of *soup*,
    then prints the collected dict when at least 5 fields were found.

    Parameters:
        soup: BeautifulSoup object built from an IMDb title page.
    """
    info = {}
    # Detail-box headings whose value is the block's text minus the heading
    # itself (and the trailing "See more»" link), mapped to the dict key.
    heading_to_key = {
        "Release Date:": "date",
        "Budget:": "budget",
        "Cumulative Worldwide Gross:": "gross",
        "Gross USA:": "gross_usa",
        "Opening Weekend USA:": "opening_week_usa",
    }
    try:
        info["title"] = soup.find(
            'div', attrs={"class": "title_wrapper"}).h1.get_text(strip=True)
        info["year"] = soup.find(
            'span', attrs={"id": "titleYear"}).a.get_text(strip=True)
        info["rating"] = soup.find(
            'span', attrs={"itemprop": "ratingValue"}).get_text(strip=True)
        subtext = soup.find("div", attrs={"class": "subtext"})
        info["genre"] = subtext.a.get_text(strip=True)
        article = soup.find('div', attrs={"id": "titleStoryLine"})
        info["plot"] = article.find(
            'div', attrs={"class": "canwrap"}).p.span.get_text(strip=True)
        details = soup.find('div', attrs={"id": "titleDetails"})
        blocks = details.findAll('div', attrs={"class": "txt-block"})
        for block in blocks:
            heading = block.h4.get_text(strip=True)
            if heading == "Country:":
                info["country"] = block.a.get_text(strip=True)
            elif heading == "Language:":
                # FIX: original compared against "Language" (no colon), so
                # the language field was never captured.
                info["language"] = block.a.get_text(strip=True)
            elif heading in heading_to_key:
                info[heading_to_key[heading]] = block.get_text(
                    strip=True).replace("See more»", '').replace(heading, '')
    except AttributeError:
        # A missing element makes one of the .h1/.a/.find chains yield None
        # and raise AttributeError.  FIX: the original used a bare `except:`
        # holding an `assert` (stripped under -O); report only when nothing
        # at all was scraped, keeping partial results usable.
        if not info:
            print("No info found")
    if len(info) > 4:
        print(info, end="\n\n\n")
44 | 61 |
|
45 | 62 |
|
def find_movie(query):
    """Search IMDb for *query* and print details for the top 5 matches.

    Parameters:
        query: Title (or partial title) to search for; appended to the
            search URL as-is.
    """
    url = base + query
    resp = requests.get(url)
    # lxml parser is used for speed; replace 'lxml' with 'html.parser'
    # if lxml is not installed.
    soup1 = bs(resp.text, 'lxml')
    # Since for every query imdb gives about 150-200 responses, we choose
    # the top 5 and return the details for them.
    movie_list = soup1.findAll("tr", attrs={"class": "findResult"})[0:5]
    if not movie_list:
        # Guard clause instead of wrapping the whole loop in an else branch.
        print("No results found")
        return
    for movie in movie_list:
        # The title id is extracted from the 'href' attribute of the <a>
        # tag in the result cell; [6:] strips the leading "/title" prefix.
        title_id = movie.find(
            'td', attrs={"class": "result_text"}).a.attrs["href"][6:]
        # FIX: original computed base_id+title_id twice (once bound to a
        # variable, once inline in the request); build it once and reuse.
        title_url = base_id + title_id
        respo = requests.get(title_url)
        soup = bs(respo.text, 'lxml')
        get_info(soup)
61 | 82 |
|
62 | 83 |
|
63 | 84 | if __name__ == "__main__": |
|
0 commit comments