In [1]:
pip install openai

Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install tenacity

Note: you may need to restart the kernel to use updated packages.


In [3]:
import requests
import json
import random 
import numpy
import pandas as pd
from datetime import date
import openai
from tenacity import retry, stop_after_attempt, wait_random_exponential

# **Document → claims**

In [12]:
sample_query = "List out the factual claims made in this text: Globalist billionaire Bill Gates, who has a history of making uncannily prescient investments just before disaster strikes in the world, invested in artificial eggs before the price of eggs spiked. The mystery surrounding the egg shortage and price spike is growing. Per the Consumer Price Index, egg prices have spiked 66% percent since last year. In response to the shortage of eggs, many consumers have turned to artificial plant-based eggs."

In [18]:
@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(10)) 
#To prevent API requests from timing out, gpt_request will retry max 10 times with intervals of 1-60 seconds

def gpt_request(query):
    
    """ Send a query to GPT-4 API and return the response """
    
    endpoint = "https://api.openai.com/v1/chat/completions"
    api_key = "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"
    
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }
    data = {
        "model": "gpt-3.5-turbo",
        # We don't need system message cos we don't ask fact checking
        "messages" : [{"role": "user", "content": query}],
        "max_tokens": 2000,
        "temperature": 0
    }

    response = requests.post(endpoint, headers=headers, json=data)
    response_json = response.json()
    # print(response_json)
    return response_json['choices'][0]['message']['content'].strip()

def summarise(text): 
    query = "Identify the factual claims made in the text below that would be of interest to fact checkers, \
    excluding any source attribution or contact information unless relevant to verifying another fact. \
    Each claim should be a self-contained sentence not dependent on context from other claims. \
    Leave out names of people unless they are crucial to the claim (i.e. the name is the claim) \
    or they are famous people. Also exclude any subjective facts (such as feelings or wishes) \
    that cannot be independently verified. \n\n"
    
    return gpt_request(query + text)
    
# Get a list of factual claims
user_input = str(input("Copy and paste some text here: "))
responses = summarise(user_input)
if "\n" in responses:
    responses = responses.split("\n")
responses

Copy and paste some text here: Globalist billionaire Bill Gates, who has a history of making uncannily prescient investments just before disaster strikes in the world, invested in artificial eggs before the price of eggs spiked. The mystery surrounding the egg shortage and price spike is growing. Per the Consumer Price Index, egg prices have spiked 66% percent since last year. In response to the shortage of eggs, many consumers have turned to artificial plant-based eggs.


['- Bill Gates invested in artificial eggs before the price of eggs spiked.',
 '- The price of eggs has spiked 66% since last year according to the Consumer Price Index.',
 '- Many consumers have turned to artificial plant-based eggs in response to the shortage of eggs.']

# Optimize

In [19]:
# modify claims in a certain style that can be requested to AI
modified_claims = []
for response in responses:
    response = 'Optimise following claim to search for relevant content in Google Fact Check Explorer: ' + response
    modified_claims.append(response)

modified_claims

['Optimise following claim to search for relevant content in Google Fact Check Explorer: - Bill Gates invested in artificial eggs before the price of eggs spiked.',
 'Optimise following claim to search for relevant content in Google Fact Check Explorer: - The price of eggs has spiked 66% since last year according to the Consumer Price Index.',
 'Optimise following claim to search for relevant content in Google Fact Check Explorer: - Many consumers have turned to artificial plant-based eggs in response to the shortage of eggs.']

In [20]:
# Ask AI for optimizing claims
optimized_claims = []
for i in modified_claims:
    response = gpt_request(i)
    optimized_claims.append(response)
    
optimized_claims

['"Bill Gates investment in artificial eggs before egg price increase"',
 '"Consumer Price Index reports 66% increase in egg prices since last year"',
 '"Artificial plant-based eggs as a response to egg shortage - fact check"']

# Search Data Commons

In [30]:
# Melissa's header
# my_headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36; Melissa Zhu/ZhuM17@cardiff.ac.uk. Working on data for class.'}
# Koh's header
my_headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 Edg/107.0.1418.56; Koh Yoshida/yoshidak1@cardiff.ac.uk. Working on data for class.'}

endpoint = 'https://factchecktools.googleapis.com/v1alpha1/claims:search'

links = []

for i in optimized_claims:
    
    query = i
    language = 'en'
    max_days = 10000 #Max age of returned search results, in days
    page_size = 10000 #Number of pages in the search results

    # reviewPublisherSiteFilter = 'factcheck.afp.com' #Filter by review publisher

    # api_key = 'AIzaSyDKpe6j4lqR_yuy6FZMI3NEdY9VP7Fa2jI' # Melissa's key
    api_key = 'AIzaSyAAB8N8l47u7kt_EfrPFUKpwfOPYRmIlGw' # Koh's key

    # url = f'{endpoint}?query={query}&languageCode={language}&maxAgeDays={max_days}&key={api_key}&pageSize={page_size}&reviewPublisherSiteFilter={reviewPublisherSiteFilter}'
    url = f'{endpoint}?query={query}&languageCode={language}&maxAgeDays={max_days}&key={api_key}&pageSize={page_size}'
    links.append(url)

# the links below contains sesrched results,the first link is for the first claim
# If computer got no hit with the claim, it just shows "{}"
links

['https://factchecktools.googleapis.com/v1alpha1/claims:search?query="Fact check: Did Bill Gates invest in artificial eggs?"&languageCode=en&maxAgeDays=10000&key=AIzaSyAAB8N8l47u7kt_EfrPFUKpwfOPYRmIlGw&pageSize=10000',
 'https://factchecktools.googleapis.com/v1alpha1/claims:search?query="Bill Gates prescient investments before disasters"&languageCode=en&maxAgeDays=10000&key=AIzaSyAAB8N8l47u7kt_EfrPFUKpwfOPYRmIlGw&pageSize=10000',
 'https://factchecktools.googleapis.com/v1alpha1/claims:search?query="Fact check: Has the price of eggs spiked recently?"&languageCode=en&maxAgeDays=10000&key=AIzaSyAAB8N8l47u7kt_EfrPFUKpwfOPYRmIlGw&pageSize=10000',
 'https://factchecktools.googleapis.com/v1alpha1/claims:search?query="Egg shortage mystery" fact check&languageCode=en&maxAgeDays=10000&key=AIzaSyAAB8N8l47u7kt_EfrPFUKpwfOPYRmIlGw&pageSize=10000',
 'https://factchecktools.googleapis.com/v1alpha1/claims:search?query="Consumer Price Index egg prices 66% increase last year"&languageCode=en&maxAgeDays=

# Extract results

In [32]:
claims = []
for link in links:
    req = requests.get(link, headers = my_headers)
#     print(req.status_code)
    data = req.json()
    claims.append(data)

claims

[{}, {}, {}, {}, {}, {}]

In [33]:
queries = []
originators = []
claim_dates = []
reviews = []
publisher_names = []
publisher_sites = []
review_urls = []
titles = []
review_dates = []
ratings = []

for claim in claims: 
    query = claim['text']
    queries.append(query)

for claim in claims: 
    if 'claimant' in claim:
        originator = claim['claimant']
        originators.append(originator)
    else: 
        originators.append('NA')
    
for claim in claims: 
    if 'claimDate' in claim:
        date = claim['claimDate']
        claim_dates.append(date)
    else: 
        claim_dates.append('NA')

for claim in claims: 
    first_review = claim['claimReview'][0]
    reviews.append(first_review) #Most only have one review but some of them have many reviews all saying the same thing so gonna keep it to one to standardise the dataframe

for review in reviews: 
    if 'publisher' in review and 'name' in review['publisher']: 
        publisher_name = review['publisher']['name']
        publisher_names.append(publisher_name)
    else: 
        publisher_names.append('NA')

for review in reviews: 
    if 'publisher' in review and 'site' in review['publisher']: 
        publisher_site = review['publisher']['site']
        publisher_sites.append(publisher_site)
    else: 
        publisher_sites.append('NA')

for review in reviews: 
    if 'url' in review: 
        review_url = review['url']
        review_urls.append(review_url)
    else: 
        review_urls.append('NA')
        
for review in reviews: 
    if 'title' in review: 
        title = review['title']
        titles.append(title)
    else: 
        titles.append('NA')

for review in reviews: 
    if 'reviewDate' in review: 
        review_date = review['reviewDate']
        review_dates.append(review_date)
    else: 
        review_dates.append('NA')

for review in reviews: 
    if 'textualRating' in review: 
        rating = review['textualRating']
        ratings.append(rating)
    else: 
        ratings.append('NA')
        
        
print(queries)
print(originators)
print(claim_dates)
print(publisher_names)
print(publisher_sites)
print(review_urls)
print(titles)
print(review_dates)
print(ratings)

KeyError: 'text'

In [None]:
# make a table
df = pd.DataFrame({'claim': queries, 
                   'originator': originators, 
                   'claim_date': claim_dates, 
                   'review_publisher': publisher_names, 
                   'publisher_site': publisher_sites, 
                   'review_url': review_urls, 
                   'review_title': titles, 
                   'review_date': review_dates, 
                   'verdict': ratings})

# Auto scraping (python library "newspaper")
### Just copied and pasted code from the documentation (https://newspaper.readthedocs.io/en/latest/)

In [34]:
pip install newspaper3k

Note: you may need to restart the kernel to use updated packages.


In [35]:
import newspaper
sample = newspaper.build('https://www.snopes.com/fact-check/alligator-eats-girl-photo/', language='en')

In [36]:
sample.size()

45

In [44]:
from newspaper import Article
url = 'https://www.snopes.com/fact-check/alligator-eats-girl-photo/'

a = Article(url, language='en') 

a.download()
a.parse()

title = a.title
contents = a.text[:10000]

print(title)
print(a.text[:10000])

Alligator Eats Girl Taking Photograph?
Advertisment:

Claim: Video shows a woman getting eaten by an alligator while posing for a photograph along a riverbank. Rating: About this rating False

A video purportedly showing a woman being attacked by an alligator while posing for a photograph along a riverbank has been circulating on the Internet since 2012.

The video does not show a real alligator attack, however: it's merely a commercial for Preview magazine.

When the video was originally uploaded to YouTube, it featured a few extra seconds of footage containing the tagline "choose your bag wisely," the hashtag #IMAPreviewGirl, and a link to Preview's web site. These aspects clearly identified the video as an advertisement; but (as is typical with such "reality" commercials) they were trimmed off by those who subsequently reposted it, making it difficult for viewers to recognize the clip as a commercial.

Although several versions of this video currently circulating on the Internet (ma

# Summarizing

In [49]:
# ask AI to summarize scraped contents
modified_contents = f'Summarize following contents within 50 words, focusing on reasons for its verdict: {contents}'
modified_contents

'Summarize following contents within 50 words, focusing on reasons for its verdict: Advertisment:\n\nClaim: Video shows a woman getting eaten by an alligator while posing for a photograph along a riverbank. Rating: About this rating False\n\nA video purportedly showing a woman being attacked by an alligator while posing for a photograph along a riverbank has been circulating on the Internet since 2012.\n\nThe video does not show a real alligator attack, however: it\'s merely a commercial for Preview magazine.\n\nWhen the video was originally uploaded to YouTube, it featured a few extra seconds of footage containing the tagline "choose your bag wisely," the hashtag #IMAPreviewGirl, and a link to Preview\'s web site. These aspects clearly identified the video as an advertisement; but (as is typical with such "reality" commercials) they were trimmed off by those who subsequently reposted it, making it difficult for viewers to recognize the clip as a commercial.\n\nAlthough several version

In [50]:
send_request(modified_contents)

"A video circulating on the internet since 2012, which appears to show a woman being attacked by an alligator while posing for a photograph, has been rated as false. The video is actually a commercial for Preview magazine, but the tagline and link to the magazine's website were trimmed off by those who subsequently reposted it, making it difficult for viewers to recognise it as an advertisement. Preview has made it clear that the clip is just a commercial and has listed the full creative team behind the clip on its website."