
Commit acf90ea

Merge pull request avinashkranjan#2934 from jaivsh/second
Added CNN web scraper
2 parents 79a66fb + 2fdeb80 commit acf90ea

File tree

3 files changed: +120 −0 lines changed


CNN Scraper/README.md

Lines changed: 6 additions & 0 deletions
@@ -0,0 +1,6 @@
## CNN scraper

This script scrapes content from the CNN website. Its functions are as follows:

- news_by_location(): Provides news by location/country/continent.
- news_by_category(): Provides news articles by category.
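For quick orientation, here is a minimal usage sketch of the two methods; it assumes `cnn.py` from this commit is on the import path (e.g. you run from the `CNN Scraper/` directory), and the return shapes follow the implementation further below.

```python
# Minimal usage sketch -- assumes cnn.py from this commit is importable.
from cnn import NewsCNN

news = NewsCNN()

# Headlines from a country page, e.g. https://edition.cnn.com/world/india
india_news = news.news_by_location("india")

# Headlines from a section page, e.g. https://edition.cnn.com/politics
politics_news = news.news_by_category("politics")
```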

CNN Scraper/cnn.py

Lines changed: 111 additions & 0 deletions
@@ -0,0 +1,111 @@
from bs4 import BeautifulSoup
import requests


class NewsCNN:
    """
    Create an instance of the `NewsCNN` class.\n
    ```python
    news = NewsCNN()
    ```
    | Methods | Details |
    | ---------------------------- | -------------------------------------------------------------------------- |
    | `.news_by_location(country="india")` | Returns the list of articles for a specific country. |
    | `.news_by_category(type)` | Returns the list of articles for a specific category. |
    """

    def __init__(self):
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.162 Safari/537.36"
        }

    def news_by_location(self, country: str):
        """
        Returns the relevant news articles for a particular country or geo-continent.\n
        Class - `NewsCNN`
        Parameters: \n
        - country: Name of the country\n
        ```python
        news = NewsCNN()
        news.news_by_location("india")
        ```
        """

        try:
            sol = []
            obj_keys = ["news", "link"]
            location = country.lower()
            URL = f"https://edition.cnn.com/world/{location}"
            page = requests.get(URL, headers=self.headers)
            parse = BeautifulSoup(page.content, "html.parser")
            # Headlines, plus the three card layouts CNN uses for article links.
            heads = parse.find_all("span", attrs={"data-editable": "headline"})
            links1 = parse.find_all(
                "a",
                attrs={
                    "class": "container__link container_lead-plus-headlines-with-images__link"
                },
            )
            links2 = parse.find_all(
                "a", attrs={"class": "container__link container_vertical-strip__link"}
            )
            links3 = parse.find_all(
                "a",
                attrs={"class": "container__link container_lead-plus-headlines__link"},
            )

            base = "https://edition.cnn.com"
            allurls = []
            allheads = []

            for i in heads:
                allheads.append(i.text)

            for i in links1 + links2 + links3:
                allurls.append(base + i["href"])
            # De-duplicate while preserving the order the links appear on the page.
            allurls = list(dict.fromkeys(allurls))

            # Pair headlines with links positionally; zip stops at the shorter list.
            for head, url in zip(allheads, allurls):
                new_obj = dict(zip(obj_keys, [head, url]))
                sol.append(new_obj)

            return sol
        except Exception:
            return None

    def news_by_category(self, type: str):
        """
        Returns a list of news articles from a specific category.

        Parameters:
        - type (str): The category of news articles to retrieve. Allowable types are: "politics", "business", "opinions", "health", "style".

        Returns:
        A list of dictionaries, each containing news article information including title and link, or an exception if an error occurs.

        Example:
        ```python
        news = NewsCNN()
        politics_articles = news.news_by_category("politics")
        ```
        """
        try:
            sol = []
            type = type.lower()
            url = f"https://edition.cnn.com/{type}"
            page = requests.get(url, headers=self.headers)
            parse = BeautifulSoup(page.content, "html.parser")
            articles = parse.find_all(
                "a", {"class": "container__link container_lead-plus-headlines__link"}
            )
            for article in articles:
                text = article.find("span", {"data-editable": "headline"})
                if text:
                    link = "https://edition.cnn.com" + article["href"]
                    data = {"Title": text.text, "Link": link}
                    sol.append(data)
            return sol
        except Exception as e:
            return e
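Note that the two methods return differently shaped results: `news_by_location` yields `{"news", "link"}` dictionaries and returns `None` on any failure, while `news_by_category` yields `{"Title", "Link"}` dictionaries and returns the caught exception object. A small sketch of consuming both, assuming the `cnn.py` above is importable:

```python
# Sketch of consuming NewsCNN results; key names follow the code above.
from cnn import NewsCNN

news = NewsCNN()

by_location = news.news_by_location("india")
if by_location is None:
    # news_by_location swallows errors and returns None
    print("location lookup failed")
else:
    for item in by_location:
        print(item["news"], "->", item["link"])

by_category = news.news_by_category("politics")
if isinstance(by_category, Exception):
    # news_by_category returns the exception object on error
    print("category lookup failed:", by_category)
else:
    for item in by_category:
        print(item["Title"], "->", item["Link"])
```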

CNN Scraper/requirements.txt

Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
beautifulsoup4==4.9.1
bs4==0.0.1
requests==2.31.0
