Merge pull request avinashkranjan#2924 from Mihan786Chistie/googleNews

avinashkranjan · web-flow · commit a16413385e93 · 2023-08-10T23:22:21.000+05:30
added Google News Scraper
diff --git a/GoogleNews-Scraper/README.md b/GoogleNews-Scraper/README.md
@@ -0,0 +1,15 @@
+## Google News
+
+### Scrape articles on a topic, with title, description, news source, date and link
+
+Create an instance of `GoogleNews` class.
+
+```python
+articles = GoogleNews("topic")
+```
+
+| Methods          | Details                                                                                  |
+| ---------------- | ---------------------------------------------------------------------------------------- |
+| `.getArticles()` | Returns a dictionary of articles with title, description, news source, date and link     |
+
+---
diff --git a/GoogleNews-Scraper/googleNews.py b/GoogleNews-Scraper/googleNews.py
@@ -0,0 +1,67 @@
+import requests
+from bs4 import BeautifulSoup
+
+
+class GoogleNews:
+    """
+    Class - `GoogleNews`
+    Example:
+    ```
+    articles = GoogleNews(topic = "topic")
+    ```\n
+    Methods :\n
+    1. ``.getArticles() | Response - Articles with title, descriptions, news source, date and link.
+    """
+
+    def __init__(self, topic):
+        self.topic = topic
+
+    def getArticles(self):
+        """
+        Class - `GoogleNews`
+        Example:
+        ```
+        articles = GoogleNews("github")
+        articles.getArticles()
+        ```
+        Returns:
+        {
+            "title": Tile of the article
+            "description": Description of the article
+            "news_source": News Source of the Article
+            "date": Date the article was posted
+            "link": Link to the article
+        }
+        """
+        url = "https://www.google.com/search?q=" + self.topic + "&tbm=nws"
+        try:
+            res = requests.get(url)
+            soup = BeautifulSoup(res.text, "html.parser")
+
+            articles_data = {"articles": []}
+
+            articles = soup.find_all("a", jsname="ACyKwe")
+            for a in articles:
+                title = a.find("div", class_="BNeawe vvjwJb AP7Wnd").getText()
+                date = a.find("span", class_="r0bn4c rQMQod").getText()
+                desc = (
+                    a.find("div", class_="BNeawe s3v9rd AP7Wnd")
+                    .getText()
+                    .replace(date, "")
+                )
+                news_source = a.find(
+                    "div", class_="BNeawe UPmit AP7Wnd lRVwie"
+                ).getText()
+                link = a["href"].replace("/url?q=", "")
+                articles_data["articles"].append(
+                    {
+                        "title": title,
+                        "description": desc,
+                        "news_source": news_source,
+                        "date": date,
+                        "link": link,
+                    }
+                )
+            return articles_data
+        except:
+            return None
diff --git a/GoogleNews-Scraper/requirements.txt b/GoogleNews-Scraper/requirements.txt
@@ -0,0 +1,2 @@
+bs4
+requests