# **This code is for checking how newspaper3k works for scraping**
Please replace the api_key(s), my_headers, and user_agent placeholders with your own values to run the demo yourself.

In [None]:
pip install openai

In [None]:
pip install tenacity

In [None]:
pip install newspaper3k

In [5]:
from datetime import date
from urllib.parse import urlencode

import newspaper
import pandas as pd
import requests
from newspaper import Article
from newspaper import Config

## **Make a sample dataset that has 50 different factcheckers' links**

In [6]:
"""
collect as many variety of factcheckers and links as possible.
Google doesn't provide the list of factcheckers. 
"""
my_headers = <put_your_header_here>
endpoint = 'https://factchecktools.googleapis.com/v1alpha1/claims:search'
query = 'Trump' # most factchecker should have examined 
language = 'en'
max_days = 10000 #Max age of returned search results, in days
page_size = 10000 #Number of pages in the search results
reviewPublisherSiteFilter = '' #Filter by review publisher
api_key = <put_your_api_key>
url = f'{endpoint}?query={query}&languageCode={language}&maxAgeDays={max_days}&key={api_key}&pageSize={page_size}&reviewPublisherSiteFilter={reviewPublisherSiteFilter}'
print(url)

https://factchecktools.googleapis.com/v1alpha1/claims:search?query=Trump&languageCode=en&maxAgeDays=10000&key=REDACTED&pageSize=10000&reviewPublisherSiteFilter=


In [7]:
# Run the search. The timeout stops a stalled connection from hanging the notebook.
req = requests.get(url, headers=my_headers, timeout=30)
print(req.status_code)
# Fail here with a clear HTTP error rather than with a KeyError on the payload below.
req.raise_for_status()
data = req.json()
# NOTE(review): the 'claims' key is presumably absent when nothing matches — confirm against the API docs.
claims = data.get('claims', [])
print(len(claims))
if claims:
    print(claims[0].keys())

200
7378
dict_keys(['text', 'claimReview'])


In [8]:
# Flatten the API response into parallel lists, one entry per claim.
# Missing optional fields are recorded as the string 'NA'.
# Comprehensions keep the loop variables local, so this cell no longer
# overwrites the `query` search term or shadows the imported `date`.

# Claim-level fields.
queries = [claim['text'] for claim in claims]
originators = [claim.get('claimant', 'NA') for claim in claims]
claim_dates = [claim.get('claimDate', 'NA') for claim in claims]

# Most claims only have one review, but some have many reviews all saying the
# same thing, so only the first is kept to standardise the dataframe.
# A claim with no review yields {} so every review field becomes 'NA'
# and all lists stay the same length.
reviews = [(claim.get('claimReview') or [{}])[0] for claim in claims]

# Review-level fields.
publisher_names = [review.get('publisher', {}).get('name', 'NA') for review in reviews]
publisher_sites = [review.get('publisher', {}).get('site', 'NA') for review in reviews]
review_urls = [review.get('url', 'NA') for review in reviews]
titles = [review.get('title', 'NA') for review in reviews]
review_dates = [review.get('reviewDate', 'NA') for review in reviews]
ratings = [review.get('textualRating', 'NA') for review in reviews]

In [9]:
# Combine the parallel lists into one frame: one row per claim, one column per field.
column_names = ['claim', 'originator', 'claim_date',
                'review_publisher', 'publisher_site', 'review_url',
                'review_title', 'review_date', 'verdict']
column_values = [queries, originators, claim_dates,
                 publisher_names, publisher_sites, review_urls,
                 titles, review_dates, ratings]
df = pd.DataFrame(dict(zip(column_names, column_values)))

In [10]:
# Count reviews per fact-checker to see who is in df and how many there are.
# Rows whose publisher name was missing are counted under the 'NA' placeholder.
publishers = df.groupby('review_publisher').size()
print(publishers)

num_publishers = len(publishers.index)
print(f'Number of publishers is: {num_publishers}')

review_publisher
AFP Fact Check                  167
AP News                           2
Africa Check                      3
Alt News                         10
Australian Associated Press       5
BBC                              35
BOOM FactCheck                   22
CBS News                        144
Check Your Fact                 329
Climate Feedback                  2
Digiteye India                    1
FACTLY                           10
FactCheck.org                  1019
FactRakers                        2
Full Fact                         7
Gigafact                         25
KHOU                              1
KSDK                              1
Lead Stories                    263
NA                               60
Newschecker                       8
Newsmeter                         1
Newsmobile                       38
Newsweek                        100
POLYGRAPH.info                    7
PolitiFact                     2109
Poynter                           3
Rappler    

In [11]:
# Pick one review URL per fact-checker in df.
# The fixed seed makes the same rows come back on every Restart & Run All,
# so scraped contents and summaries stay comparable between runs.
gdf = df.groupby('review_publisher')
sample_df = gdf.apply(lambda group: group.sample(n=1, random_state=42))
sample_df.shape

(50, 9)

In [12]:
# Remove the 'review_publisher' level from the row index; the column of the same name stays.
sample_df = sample_df.droplevel('review_publisher')

In [13]:
# Renumber the rows from 0 and keep the previous row labels in an 'index' column.
sample_df = sample_df.reset_index(drop=False)

In [14]:
# Structural check of the sample: cell count, column dtypes, shape, column labels.
for frame_property in (sample_df.size, sample_df.dtypes, sample_df.shape, sample_df.columns):
    print(frame_property)

500
index                int64
claim               object
originator          object
claim_date          object
review_publisher    object
publisher_site      object
review_url          object
review_title        object
review_date         object
verdict             object
dtype: object
(50, 10)
Index(['index', 'claim', 'originator', 'claim_date', 'review_publisher',
       'publisher_site', 'review_url', 'review_title', 'review_date',
       'verdict'],
      dtype='object')


# **Look into all websites and scrape contents**

In [None]:
contents = []

for url in sample_df['review_url']:
  if 'nytimes' in url:
    a = Article(url, language='en') 
    a.download()
    a.parse()
    contents.append(a.title + a.text[:10000])
    """
    NewYorkTimes allows request with user_agent only a few times \
    and then blocks. Above 'if' code solve the problem. 
    """
  else:
    user_agent = <put_your_user_agent_here>
    config = Config()
    config.browser_user_agent = user_agent
    config.request_timeout = 10 # avoid runtime error
    a = Article(url, config=config, language='en') 
    a.download()
    a.parse()
    contents.append(a.title + a.text[:10000])

sample_df['contents'] = contents

# **Ask GPT-4 to summarize them**

In [16]:
def gpt_request(query):
    
    """ Send a query to GPT-4 API and return the response """
    
    endpoint = "https://api.openai.com/v1/chat/completions"
    api_key = <put_your_api_key_here>
    
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }
    data = {
        "model": "gpt-4",
        # We don't need system message cos we don't ask fact checking
        "messages" : [{"role": "user", "content": query}],
        "max_tokens": 2000,
        "temperature": 0
    }

    response = requests.post(endpoint, headers=headers, json=data)
    response_json = response.json()
    # print(response_json)
    return response_json['choices'][0]['message']['content'].strip()

In [19]:
# Ask GPT-4 for a short summary of each scraped article, echoing every reply,
# then attach the replies to the sample as a new 'summaries' column.
summaries = []
for article_text in sample_df['contents']:
  prompt = f'Summarize following contents within 50 words, focusing on reasons for its verdict: {article_text}'
  summary = gpt_request(prompt)
  print(summary)
  summaries.append(summary)
sample_df['summaries'] = summaries
sample_df

A fabricated tweet attributed to former US President Donald Trump commenting on Oprah Winfrey's interview with Prince Harry and Meghan Markle circulated online. The claim is false as Trump's Twitter account remains suspended since January 2021, and the tweet uses an outdated format that Twitter stopped using in 2017.
The US Supreme Court has rejected a case calling for the removal of President Joe Biden and hundreds of elected officials over baseless 2020 election fraud claims. Social media users falsely claimed the court would hear the case, but it has been dismissed twice. Legal experts say it is unlikely the Supreme Court will ever hear such a case, as it relies on a false narrative.
US President Donald Trump did not tweet that South Africa's policy of black economic empowerment was "economic segregation". The claim was made by junk news website XpouZAR.com, which said the tweet had "disappeared before going to press". However, it is illegal for Trump to delete tweets while he is pr

Unnamed: 0,index,claim,originator,claim_date,review_publisher,publisher_site,review_url,review_title,review_date,verdict,contents,summaries
0,921,A genuine tweet from Donald Trump on Meghan Ma...,,,AFP Fact Check,factcheck.afp.com,https://factcheck.afp.com/facebook-posts-share...,Facebook posts share fabricated Trump tweet fo...,2021-03-11T06:00:00Z,Misleading,Facebook posts share fabricated Trump tweet fo...,A fabricated tweet attributed to former US Pre...
1,143,The U.S. Supreme Court will hear a case to rem...,social media users,,AP News,apnews.com,https://apnews.com/article/fact-check-supreme-...,Supreme Court will not hear case to oust Biden...,2023-03-02T00:00:00Z,False. The Supreme Court has not decided to he...,Supreme Court will not hear case to oust Biden...,The US Supreme Court has rejected a case calli...
2,3030,President Donald Trump said that BEE in South ...,XpouZAR.com,2019-02-24T00:00:00Z,Africa Check,africacheck.org,https://africacheck.org/fact-checks/meta-progr...,"No, Donald Trump didn’t tweet that BEE in Sout...",2019-03-22T00:00:00Z,False,"No, Donald Trump didn’t tweet that BEE in Sout...",US President Donald Trump did not tweet that S...
3,6431,"In new interview to Fox, President Trump sugge...",Rahul Kanwal,2016-06-20T00:00:00Z,Alt News,altnews.in,https://www.altnews.in/false-rahul-kanwals-cla...,False: Rahul Kanwal's claim about Trump sugges...,2020-04-12T00:00:00Z,False,False: Rahul Kanwal’s claim about Trump sugges...,"Rahul Kanwal, News director of India Today, cl..."
4,1555,A Facebook post claims that ABC chair Ita Butt...,A Facebook page,,Australian Associated Press,aap.com.au,https://www.aap.com.au/factcheck/abc-chief-ita...,ABC chief Ita Buttrose hasn’t endorsed Donald ...,2020-11-05T00:00:00Z,False – Content that has no basis in fact.,ABC chief Ita Buttrose hasn’t endorsed Donald ...,A Facebook post falsely claims that ABC chair ...
5,1071,"""When our nation was hit with the terrible pan...",Donald Trump,2021-01-20T00:00:00Z,BBC,bbc.co.uk,https://www.bbc.co.uk/news/55730719,President Trump's final day as president fact-...,2021-01-20T00:00:00Z,This requires context. The Pfizer/BioNTech vac...,President Trump's final day as president fact-...,Fact-checking President Trump's farewell speec...
6,1233,Donald Trump declared martial law.,Facebook posts,2020-12-30T07:32:58Z,BOOM FactCheck,boomlive.in,https://www.boomlive.in/world/fake-news-donald...,Did Donald Trump Declare Martial Law To Preven...,2020-12-30T07:32:58Z,False,Did Donald Trump Declare Martial Law To Preven...,An image circulating online claims to show a t...
7,5161,The U.S. is running a trade deficit of $817 bi...,Donald Trump,2018-06-09T00:00:00Z,CBS News,cbsnews.com,https://www.cbsnews.com/news/china-trade-defic...,Fact check: Trump ignores U.S. trade strengths...,2018-06-11T00:00:00Z,Not true.,Fact check: Trump ignores U.S. trade strengths...,President Donald Trump is presenting a lopside...
8,185,Elaine Chao tweet that Trump was a Chinese bitch,Viral Image,2023-01-11T00:00:00Z,Check Your Fact,checkyourfact.com,http://checkyourfact.com/2023/01/18/fact-check...,FACT CHECK: Did Elaine Chao Call Trump A ‘Chin...,2023-01-18T20:27:42Z,False,FACT CHECK: Did Elaine Chao Call Trump A ‘Chin...,A tweet claiming that former Transportation Se...
9,7069,California wildfires are being magnified & mad...,President Donald Trump,2018-07-08T00:00:00Z,Climate Feedback,climatefeedback.org,https://climatefeedback.org/claimreview/presid...,President Trump’s claim that water supply poli...,2018-08-05T00:00:00Z,Inaccurate,President Trump’s claim that water supply poli...,President Trump's claim that water supply poli...


# **Make a CSV file**

In [20]:
# Export step, currently disabled: uncomment to write the sample to a CSV file
# and, when running on Google Colab, download it through the browser.
# sample_df.to_csv('test_50urls_50words.csv')
# from google.colab import files
# files.download('test_50urls_50words.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>