
Commit d55956d

fix(IMDB Scraper): Fixed bugs and made changes recommended in PR review
Cleaned output, fixed bugs, made changes recommended in PR review
1 parent 4b4c296 commit d55956d

File tree

3 files changed: +47 / -26 lines changed


IMDB-Scraper/README.md

Lines changed: 2 additions & 2 deletions
```diff
@@ -20,8 +20,8 @@ Collects the information given on IMDB for the given title
 ## Usage
 
 - Install dependencies
-- python scraper.py --t movie-name-here
-- sample : python scraper.py --t red
+- python scraper.py --t movie-name-here(in double quotes)
+- sample : python scraper.py --t "red"
 
 ## Screenshots
```
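The usage lines now quote the title so multi-word names reach the script as a single argument. For reference, here is a minimal sketch of how the --t flag could be wired up at the bottom of scraper.py; the actual __main__ block is outside this hunk, so the argparse wiring below is an assumption for illustration, not the repository's code.

```python
# Hypothetical sketch only: the repository's real __main__ block is not part of
# this diff. Assumes it sits at the bottom of scraper.py, where find_movie is
# already defined.
import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Scrape IMDB details for a title")
    # Quoting the value (e.g. --t "red sparrow") lets the shell pass a
    # multi-word title through as a single argument.
    parser.add_argument("--t", required=True, help='movie name, e.g. --t "red"')
    args = parser.parse_args()
    find_movie(args.t)
```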

IMDB-Scraper/scraper.py

Lines changed: 45 additions & 24 deletions
```diff
@@ -12,52 +12,73 @@
 base_id = "https://www.imdb.com/title"
 # base url is used when the user gives a title to search for
 base = "https://www.imdb.com/find?s=tt&q="
-# for parsing we have used the lxml parser for optimization purposes, if lxml does not work for you replace 'lxml' with 'html.parser'
 
 
 def get_info(soup):
     info = {}
+    labels = ["title", "year", "rating", "genre", "plot", "date", "country",
+              "language", "budget", "gross", "gross_usa", "opening_week_usa"]
     try:
         info["title"] = soup.find(
-            'div', attrs={"class": "title_wrapper"}).h1.text
-        info["year"] = soup.find('span', attrs={"id": "titleYear"}).a.text
+            'div', attrs={"class": "title_wrapper"}).h1.get_text(strip=True)
+        info["year"] = soup.find(
+            'span', attrs={"id": "titleYear"}).a.get_text(strip=True)
         info["rating"] = soup.find(
-            'span', attrs={"itemprop": "ratingValue"}).text
+            'span', attrs={"itemprop": "ratingValue"}).get_text(strip=True)
         subtext = soup.find("div", attrs={"class": "subtext"})
-        info["genre"] = subtext.a.text
+        info["genre"] = subtext.a.get_text(strip=True)
         article = soup.find('div', attrs={"id": "titleStoryLine"})
         info["plot"] = article.find(
-            'div', attrs={"class": "canwrap"}).p.span.text
+            'div', attrs={"class": "canwrap"}).p.span.get_text(strip=True)
         details = soup.find('div', attrs={"id": "titleDetails"})
         blocks = details.findAll('div', attrs={"class": "txt-block"})
-        # Strings have been stripped as the html of imdb is not clean
-        # Formatting of strings is preferrential you can change that to suit your needs
-        info["country"] = blocks[1].a.text
-        info["date"] = blocks[3].text[15:30]
-        info["budget"] = blocks[6].text[8:23]
-        info["gross_USA"] = blocks[8].text[11:]
-        info["gross_worldwide"] = blocks[9].text[28:]
+        for block in blocks:
+            heading = block.h4.get_text(strip=True)
+            if heading == "Release Date:":
+                info["date"] = block.get_text(strip=True).replace(
+                    "See more»", '').replace(heading, '')
+            if heading == "Country:":
+                info["country"] = block.a.get_text(strip=True)
+            if heading == "Language":
+                info["language"] = block.a.get_text(strip=True)
+            if heading == "Budget:":
+                info["budget"] = block.get_text(
+                    strip=True).replace(heading, '')
+            if heading == "Cumulative Worldwide Gross:":
+                info["gross"] = block.get_text(
+                    strip=True).replace(heading, '')
+            if heading == "Gross USA:":
+                info["gross_usa"] = block.get_text(
+                    strip=True).replace(heading, '')
+            if heading == "Opening Weekend USA:":
+                info["opening_week_usa"] = block.get_text(
+                    strip=True).replace(heading, '')
     except:
-        print("Something went wrong")
+        assert any(obj in labels for obj in info), "No info found"
 
-    print(info, end="\n\n\n")
+    if len(info) > 4:
+        print(info, end="\n\n\n")
 
 
 def find_movie(query):
     url = base+query
     resp = requests.get(url)
+    # for parsing we have used the lxml parser for optimization purposes, if lxml does not work for you replace 'lxml' with 'html.parser'
     soup1 = bs(resp.text, 'lxml')
-    # Since for every query imdb gives about 150-200 responses , we choose the top 5 and return the details for them
+    # Since for every query imdb gives about 150-200 responses , we choose the top 5 and return the details for them
    movie_list = soup1.findAll("tr", attrs={"class": "findResult"})[0:5]
-    for movie in movie_list:
-        # Through the table given , we extract the title id from the 'href' attribute of the <a> tag
-        title_id = movie.find(
-            'td', attrs={"class": "result_text"}).a.attrs["href"][6:]
+    if movie_list:
+        for movie in movie_list:
+            # Through the table given , we extract the title id from the 'href' attribute of the <a> tag
+            title_id = movie.find(
+                'td', attrs={"class": "result_text"}).a.attrs["href"][6:]
 
-        url = base_id+title_id
-        respo = requests.get(base_id+title_id)
-        soup = bs(respo.text, 'lxml')
-        get_info(soup)
+            url = base_id+title_id
+            respo = requests.get(base_id+title_id)
+            soup = bs(respo.text, 'lxml')
+            get_info(soup)
+    else:
+        print("No results found")
 
 
 if __name__ == "__main__":
```
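The core change in get_info is replacing positional lookups like blocks[6].text[8:23] with a loop that matches each txt-block by its h4 heading, so the extraction no longer depends on the order of IMDB's detail blocks. Below is a self-contained sketch of that pattern; the HTML snippet is invented and only mimics IMDB's txt-block markup at the time of this commit.

```python
# Self-contained illustration of the heading-keyed extraction; the HTML below is
# made up for the example and only mimics IMDB's txt-block layout.
from bs4 import BeautifulSoup

html = """
<div id="titleDetails">
  <div class="txt-block"><h4>Country:</h4> <a href="#">USA</a></div>
  <div class="txt-block"><h4>Budget:</h4> $58,000,000 <span>(estimated)</span></div>
</div>
"""

soup = BeautifulSoup(html, "html.parser")
details = soup.find("div", attrs={"id": "titleDetails"})

info = {}
for block in details.findAll("div", attrs={"class": "txt-block"}):
    heading = block.h4.get_text(strip=True)  # e.g. "Country:" or "Budget:"
    if heading == "Country:":
        info["country"] = block.a.get_text(strip=True)
    if heading == "Budget:":
        # drop the heading text so only the value remains
        info["budget"] = block.get_text(strip=True).replace(heading, '')

print(info)  # {'country': 'USA', 'budget': '$58,000,000(estimated)'}
```

Unlike the old fixed-index version, this keeps working if IMDB inserts, removes, or reorders a detail block, which is presumably what the indexing approach was tripping over.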

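In find_movie, the title id is taken from the result link by slicing off the leading "/title" with [6:]; the remainder keeps its leading slash, which is why it concatenates cleanly onto base_id (which has no trailing slash). A small worked example with an invented href of that shape:

```python
# Worked example of the [6:] slice used in find_movie; the href value is
# invented but follows the "/title/tt.../" shape of IMDB find-result links.
base_id = "https://www.imdb.com/title"

href = "/title/tt0398286/?ref_=fn_tt_tt_1"  # hypothetical search-result href
title_id = href[6:]                         # "/tt0398286/?ref_=fn_tt_tt_1"

# base_id has no trailing slash, so the retained leading "/" joins the two cleanly
print(base_id + title_id)  # https://www.imdb.com/title/tt0398286/?ref_=fn_tt_tt_1
```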
IMDB-Scraper/ss1.PNG

2.87 KB
