From a86165158086e3c5ed0f25890b8b7838d7b371b9 Mon Sep 17 00:00:00 2001
From: Jaivardhan Shukla <93859359+jaivsh@users.noreply.github.com>
Date: Thu, 10 Aug 2023 22:49:44 +0530
Subject: [PATCH 1/3] Create bbc.py

---
 BBC Scraper/bbc.py | 111 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 111 insertions(+)
 create mode 100644 BBC Scraper/bbc.py

diff --git a/BBC Scraper/bbc.py b/BBC Scraper/bbc.py
new file mode 100644
index 0000000000..bca3b7185f
--- /dev/null
+++ b/BBC Scraper/bbc.py
@@ -0,0 +1,111 @@
+from bs4 import BeautifulSoup
+import requests
+
+
+class NewsCNN:
+    """
+    Create an instance of the `NewsCNN` class.\n
+    ```python
+    news = NewsCNN()
+    ```
+    | Methods                               | Details                                               |
+    | ------------------------------------- | ----------------------------------------------------- |
+    | `.news_by_location(country="india")`  | Returns the list of articles by a specific country.   |
+    | `.news_by_category(type)`             | Returns the list of articles by a specific category.  |
+    """
+
+    def __init__(self):
+        self.headers = {
+            "User-Agent": "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.162 Safari/537.36"
+        }
+
+    def news_by_location(self, country: str):
+        """
+        Returns the news articles for the given country or region.\n
+        Class - `NewsCNN`
+        Parameters: \n
+        - country: Name of the country\n
+        ```python
+        news = NewsCNN()
+        news.news_by_location("india")
+        ```
+        """
+
+        try:
+            sol = []
+            obj_keys = ["news", "link"]
+            location = country.lower()
+            URL = f"https://edition.cnn.com/world/{location}"
+            page = requests.get(URL, headers=self.headers)
+            parse = BeautifulSoup(page.content, "html.parser")
+            heads = parse.find_all("span", attrs={"data-editable": "headline"})
+            links1 = parse.find_all(
+                "a",
+                attrs={
+                    "class": "container__link container_lead-plus-headlines-with-images__link"
+                },
+            )
+            links2 = parse.find_all(
+                "a", attrs={"class": "container__link container_vertical-strip__link"}
+            )
+            links3 = parse.find_all(
+                "a",
+                attrs={"class": "container__link container_lead-plus-headlines__link"},
+            )
+
+            base = "https://edition.cnn.com"  # relative hrefs already start with a slash
+            allurls = []
+            allheads = []
+
+            for i in heads:
+                tmp = i.text
+                allheads.append(tmp)
+
+            for i in links1 + links2 + links3:
+                t = base + i["href"]
+                allurls.append(t)
+            allurls = list(dict.fromkeys(allurls))  # deduplicate while preserving order
+
+            # Pair headlines with links positionally; zip stops at the shorter list
+            for head, url in zip(allheads, allurls):
+                new_obj = dict(zip(obj_keys, [head, url]))
+                sol.append(new_obj)
+
+            return sol
+        except Exception:
+            return None
+
+    def news_by_category(self, type: str):
+        """
+        Returns a list of news articles from a specific category.
+
+        Parameters:
+        - type (str): The category of news articles to retrieve. Allowable types are: "politics", "business", "opinions", "health", "style".
+
+        Returns:
+        A list of dictionaries, each containing news article information including title and link, or an exception if an error occurs.
+
+        Example:
+        ```python
+        news = NewsCNN()
+        politics_articles = news.news_by_category("politics")
+        ```
+        """
+        try:
+            sol = []
+            type = type.lower()
+            url = f"https://edition.cnn.com/{type}"
+            page = requests.get(url, headers=self.headers)
+            parse = BeautifulSoup(page.content, "html.parser")
+            articles = parse.find_all(
+                "a", {"class": "container__link container_lead-plus-headlines__link"}
+            )
+            for article in articles:
+                text = article.find("span", {"data-editable": "headline"})
+                if text:
+                    link = "https://edition.cnn.com" + article["href"]
+                    data = {"Title": text.text, "Link": link}
+                    sol.append(data)
+            return sol
+        except Exception as e:
+            return e

From 5dc68cceb6bc6a2aa964467d61311076b4b5fc0c Mon Sep 17 00:00:00 2001
From: Jaivardhan Shukla <93859359+jaivsh@users.noreply.github.com>
Date: Thu, 10 Aug 2023 22:50:41 +0530
Subject: [PATCH 2/3] Create requirements.txt

---
 BBC Scraper/requirements.txt | 3 +++
 1 file changed, 3 insertions(+)
 create mode 100644 BBC Scraper/requirements.txt

diff --git a/BBC Scraper/requirements.txt b/BBC Scraper/requirements.txt
new file mode 100644
index 0000000000..5d3386da47
--- /dev/null
+++ b/BBC Scraper/requirements.txt
@@ -0,0 +1,3 @@
+beautifulsoup4==4.9.1
+bs4==0.0.1
+requests==2.31.0

From 143b50ded65d31dfbe3458427603d52e8a0b1c82 Mon Sep 17 00:00:00 2001
From: Jaivardhan Shukla <93859359+jaivsh@users.noreply.github.com>
Date: Thu, 10 Aug 2023 22:51:06 +0530
Subject: [PATCH 3/3] Delete BBC Scraper directory

---
 BBC Scraper/bbc.py           | 111 -----------------------------------
 BBC Scraper/requirements.txt |   3 -
 2 files changed, 114 deletions(-)
 delete mode 100644 BBC Scraper/bbc.py
 delete mode 100644 BBC Scraper/requirements.txt

diff --git a/BBC Scraper/bbc.py b/BBC Scraper/bbc.py
deleted file mode 100644
index bca3b7185f..0000000000
--- a/BBC Scraper/bbc.py
+++ /dev/null
@@ -1,111 +0,0 @@
-from bs4 import BeautifulSoup
-import requests
-
-
-class NewsCNN:
-    """
-    Create an instance of the `NewsCNN` class.\n
-    ```python
-    news = NewsCNN()
-    ```
-    | Methods                               | Details                                               |
-    | ------------------------------------- | ----------------------------------------------------- |
-    | `.news_by_location(country="india")`  | Returns the list of articles by a specific country.   |
-    | `.news_by_category(type)`             | Returns the list of articles by a specific category.  |
| - """ - - def __init__(self): - self.headers = { - "User-Agent": "Mozilla/5.0 (Windows NT 6.3; Win 64 ; x64) Apple WeKit /537.36(KHTML , like Gecko) Chrome/80.0.3987.162 Safari/537.36" - } - - def news_by_location(self, country: str): - """ - Returns the relevant news articles corresponding to that particular geo-continent or country\n - Class - `NewsCNN` - Parameters: \n - - country: Name of the country\n - ```python - news = newsCNN() - news.news_by_location() - ``` - """ - - try: - sol = [] - obj_keys = ["news", "link"] - location = country.lower() - URL = f"https://edition.cnn.com/world/{location}" - page = requests.get(URL) - parse = BeautifulSoup(page.content, "html.parser") - heads = parse.find_all("span", attrs={"data-editable": "headline"}) - links1 = parse.find_all( - "a", - attrs={ - "class": "container__link container_lead-plus-headlines-with-images__link" - }, - ) - links2 = parse.find_all( - "a", attrs={"class": "container__link container_vertical-strip__link"} - ) - links3 = parse.find_all( - "a", - attrs={"class": "container__link container_lead-plus-headlines__link"}, - ) - - base = "https://edition.cnn.com/" - allurls = [] - allheads = [] - - for i in heads: - tmp = i.text - allheads.append(tmp) - - for i in links1 + links2 + links3: - t = base + i["href"] - allurls.append(t) - allurls = list(set(allurls)) - - for i in range(len(allurls)): - obj_values = [allheads[i], allurls[i]] - new_obj = dict(zip(obj_keys, obj_values)) - sol.append(new_obj) - - return sol - except: - return None - - def news_by_category(self, type: str): - """ - Returns a list of news articles from a specific category. - - Parameters: - - type (str): The category of news articles to retrieve. Allowable types are: "politics", "business", "opinions", "health", "style". - - Returns: - A list of dictionaries, each containing news article information including title and link, or an exception if an error occurs. - - Example: - ```python - news = NewsCNN() - politics_articles = news.news_by_category("politics") - ``` - """ - try: - sol = [] - type = type.lower() - url = f"https://edition.cnn.com/{type}" - page = requests.get(url, headers=self.headers) - parse = BeautifulSoup(page.content, "html.parser") - articles = parse.find_all( - "a", {"class": "container__link container_lead-plus-headlines__link"} - ) - for article in articles: - text = article.find("span", {"data-editable": "headline"}) - if text: - link = "https://edition.cnn.com" + article["href"] - data = {"Title": text.text, "Link": link} - sol.append(data) - return sol - except Exception as e: - return e diff --git a/BBC Scraper/requirements.txt b/BBC Scraper/requirements.txt deleted file mode 100644 index 5d3386da47..0000000000 --- a/BBC Scraper/requirements.txt +++ /dev/null @@ -1,3 +0,0 @@ -beautifulsoup4==4.9.1 -bs4==0.0.1 -requests==2.31.0