In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup


def clean_text(st):
    """Return ``st`` with every newline removed and outer whitespace trimmed.

    Newlines are deleted outright (not replaced by spaces), so text on
    either side of a line break is joined directly.
    """
    pieces = st.split('\n')
    joined = ''.join(pieces)
    return joined.strip()


def process_claim_reviews_row(row):
    """Turn one listing row into a flat dict describing a claim review.

    ``row`` must support ``findAll`` (it is a tag taken from the parsed
    listing page). The row has to contain exactly one strong link with a
    single ``h3`` title inside it, exactly one claim excerpt ``div`` and
    exactly one verdict ``img``; any other count raises ``AssertionError``.

    The returned dict has the keys ``claim-review-link``, ``title``,
    ``claim``, ``verdict`` and ``article-link``. Resolving ``article-link``
    calls ``find_article_link``, which fetches the claim-review page.
    """
    def sole(matches):
        # Every lookup below is expected to hit exactly one element.
        assert len(matches) == 1
        return matches[0]

    anchor = sole(row.findAll('a', attrs={'class': 'strong'}))
    title_text = sole(anchor.findAll('h3', attrs={'class': 'noborder'})).text

    claim_text = sole(
        row.findAll('div', attrs={'class': 'feedpages-excerpt feedpages__claim__container__content__text mb1'})
    ).text

    verdict_src = sole(
        row.findAll('img', attrs={'class': 'fact-check-card__row__verdict__img'})
    )['src']

    review_href = anchor['href']
    context_href = find_article_link(review_href)

    record = {}
    record['claim-review-link'] = review_href
    record['title'] = clean_text(title_text)
    record['claim'] = clean_text(claim_text)

    # The verdict label is read off the image file name: drop the directory
    # part and the '.png' suffix, discard the first '_'-separated piece and
    # glue the rest together (a name shaped like 'x_Lacks_Context.png'
    # therefore becomes 'LacksContext').
    image_name = verdict_src.rsplit('/', 1)[-1]
    label_pieces = image_name.replace('.png', '').split('_')[1:]
    record['verdict'] = ''.join(label_pieces)

    record['article-link'] = context_href
    return record


def find_article_link(url, timeout=30):
    """Fetch a claim-review page and return its "See the claim in context" link.

    Parameters
    ----------
    url : str
        Address of a claim-review page; it must contain ``'claimreview'``.
    timeout : float or tuple, optional
        Passed to ``requests.get``. Without it a stalled connection would
        block forever and freeze the whole crawl.

    Returns
    -------
    str
        The ``href`` of the first anchor titled "See the claim in context"
        inside the details span. The value is returned as found and may be
        empty or not an absolute URL.

    Raises
    ------
    AssertionError
        If ``url`` does not contain ``'claimreview'`` or the page does not
        hold exactly one details span.
    IndexError
        If the details span has no matching anchor.
    requests.exceptions.RequestException
        On network failure, including an expired ``timeout``.
    """
    assert 'claimreview' in url

    res = requests.get(url, timeout=timeout)
    # NOTE(review): no parser is named, so BeautifulSoup picks whichever one
    # is installed; pin one once the intended parser is confirmed.
    soup = BeautifulSoup(res.content)

    span = soup.findAll('span', attrs={'class': 'fact-check-card__details__text small'})
    assert len(span) == 1
    link = span[0].findAll('a', attrs={'target': '_blank', 'title': 'See the claim in context'})
    return link[0]['href']


def main(url, timeout=30):
    """Scrape one page of the Climate Feedback claim-review listing.

    Parameters
    ----------
    url : int or str
        The page identifier appended to
        ``https://climatefeedback.org/claim-reviews/``. Despite the name it
        is not a full URL; the name is kept so existing calls keep working.
    timeout : float or tuple, optional
        Passed to ``requests.get`` so a stalled connection cannot hang the
        crawl indefinitely.

    Returns
    -------
    tuple
        ``(False, records)`` when the page's ``<main>`` element holds at
        least two ``div.row`` elements, where ``records`` is the list of
        dicts built by ``process_claim_reviews_row``; ``(True, [])``
        otherwise (presumably meaning the listing is exhausted).

    Raises
    ------
    AssertionError
        If the page does not contain exactly one ``<main>`` element, or if a
        row fails the checks in ``process_claim_reviews_row``.
    requests.exceptions.RequestException
        On network failure, including an expired ``timeout``.
    """
    # Build the address from the argument. Reading a global `page` here, as
    # before, only worked while the caller's global matched what it passed in.
    page_url = f'https://climatefeedback.org/claim-reviews/{url}'
    res = requests.get(page_url, timeout=timeout)

    print(f'{page_url} page')
    soup = BeautifulSoup(res.content)
    # Named `main_tags` so the local does not shadow this function's own name.
    main_tags = soup.findAll("main")
    assert len(main_tags) == 1
    rows = main_tags[0].findAll("div", attrs={"class": "row"})

    if len(rows) >= 2:
        data = [process_claim_reviews_row(r) for r in rows]
        return False, data

    return True, []

## Extracting the article link from a single claim-review page

In [2]:
# Spot-check find_article_link on one claim-review page; the returned link is
# shown as the cell's output.
url = 'https://climatefeedback.org/claimreview/greenhouse-gases-cause-global-warming-by-trapping-infra-red-radiations-not-by-causing-more-holes-in-the-ozone-layer/'
find_article_link(url)

'https://archive.fo/OKNmR'

In [3]:
# Walk the paginated listing, starting at page 1, until main() reports done.
done = False
page = 1
records = []  # one dict per claim review, accumulated across pages
while not done:
    done, rows = main(page)
    records.extend(rows)
    page += 1

# Build the table once from the accumulated dicts. A separate `records` list
# keeps `data` from changing type (list -> DataFrame) within the cell.
data = pd.DataFrame(records)

https://climatefeedback.org/claim-reviews/1 page
https://climatefeedback.org/claim-reviews/2 page
https://climatefeedback.org/claim-reviews/3 page
https://climatefeedback.org/claim-reviews/4 page
https://climatefeedback.org/claim-reviews/5 page
https://climatefeedback.org/claim-reviews/6 page
https://climatefeedback.org/claim-reviews/7 page
https://climatefeedback.org/claim-reviews/8 page
https://climatefeedback.org/claim-reviews/9 page
https://climatefeedback.org/claim-reviews/10 page
https://climatefeedback.org/claim-reviews/11 page
https://climatefeedback.org/claim-reviews/12 page
https://climatefeedback.org/claim-reviews/13 page


In [4]:
# Preview the first rows of the scraped claim-review table.
data.head()

Unnamed: 0,claim-review-link,title,claim,verdict,article-link
0,https://climatefeedback.org/claimreview/climat...,"Climate change, forest management and several ...",“Forest fires are caused by poor management. N...,Misleading,https://archive.is/Q2bMd
1,https://climatefeedback.org/claimreview/wester...,Western US wildfires are not the result of wid...,a distressing number of the [West coast] fires...,Inaccurate,https://archive.is/rSGaK
2,https://climatefeedback.org/claimreview/planti...,Planting trees can help mitigate some aspects ...,“Trees provide a solution to almost all enviro...,LacksContext,https://www.facebook.com/watch/?v=245623306548369
3,https://climatefeedback.org/claimreview/the-lo...,The long-term survival of polar bears is threa...,"""Global warming is driving polar bears toward ...",Correct,http://archive.md/xqtY1
4,https://climatefeedback.org/claimreview/greenh...,Greenhouse gases cause global warming by trapp...,Greenhouse gases emitted into the ocean are ca...,Incorrect,https://archive.fo/OKNmR


In [8]:
# (rows, columns) of the scraped table.
data.shape

(115, 5)

In [5]:
# Count the non-null entries of every other column for each verdict label.
data.groupby('verdict').count()

Unnamed: 0_level_0,claim-review-link,title,claim,article-link
verdict,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Accurate,4,4,4,4
Correct,2,2,2,2
CorrectBut,1,1,1,1
FlawedReasoning,7,7,7,7
Imprecise,1,1,1,1
Inaccurate,36,36,36,36
Incorrect,35,35,35,35
LacksContext,2,2,2,2
Misleading,13,13,13,13
MostlyAccurate,2,2,2,2


In [6]:
# Try urlparse on a sample URL: print the full parse result, then display
# only the host part (netloc) as the cell's value.
from urllib.parse import urlparse

par = urlparse('https://www.nytimes.com/2020/06/29/climat')
print(par)
par.netloc

ParseResult(scheme='https', netloc='www.nytimes.com', path='/2020/06/29/climat', params='', query='', fragment='')


'www.nytimes.com'

In [7]:
# Tally how many article links point at each host (the netloc of the URL).
(
    data['article-link']
    .map(lambda link: urlparse(link).netloc)
    .value_counts()
)

via.hypothes.is        34
archive.fo             23
web.archive.org        16
hyp.is                 11
www.facebook.com       10
archive.md              5
www.youtube.com         3
transcripts.cnn.com     2
archive.is              2
                        2
youtu.be                1
twitter.com             1
www.nbcnews.com         1
www.bbc.com             1
www.itv.com             1
www.nytimes.com         1
news3lv.com             1
Name: article-link, dtype: int64