In [1]:
pip install openai

Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install tenacity

Note: you may need to restart the kernel to use updated packages.


In [3]:
import requests
import json
import random 
import numpy
import pandas as pd
from datetime import date
import openai
from tenacity import retry, stop_after_attempt, wait_random_exponential

# **Document → claims**

In [244]:
@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(10)) 
#To prevent API requests from timing out, gpt_request will retry max 10 times with intervals of 1-60 seconds

def gpt_request(query):
    
    """ Send a query to GPT-4 API and return the response """
    
    endpoint = "https://api.openai.com/v1/chat/completions"
    api_key = "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"
    
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }
    data = {
        "model": "gpt-4",
        # We don't need system message cos we don't ask fact checking
        "messages" : [{"role": "user", "content": query}],
        "max_tokens": 2000,
        "temperature": 0
    }

    response = requests.post(endpoint, headers=headers, json=data)
    response_json = response.json()
    # print(response_json)
    return response_json['choices'][0]['message']['content'].strip()

def summarise(text): 
    
    """ Given a string of text, ask GPT-4 to summarise the claims made """
    
    query = "List out all of the claims made in the text below that might be contentious or need to be checked. \
    Exclude any source attribution or contact information unless relevant to verifying another fact. \
    Exclude names of people, using other identifying details, unless the name is crucial to the claim or they are famous. \
    Also exclude any subjective facts (such as thoughts, feelings or wishes) that cannot be independently verified. \
    List each claim as a brief bullet point, each independent and self-explanatory without reference to details from the rest of the text. \n\n"
    
    return gpt_request(query + text)
    
# Get a list of factual claims
user_input = str(input("Copy and paste some text here: "))

claims = summarise(user_input)
if "\n" in claims:
    claims = claims.split("\n")
claims

Copy and paste some text here: Globalist billionaire Bill Gates, who has a history of making uncannily prescient investments just before disaster strikes in the world, invested in artificial eggs before the price of eggs spiked. The mystery surrounding the egg shortage and price spike is growing. Per the Consumer Price Index, egg prices have spiked 66% percent since last year. In response to the shortage of eggs, many consumers have turned to artificial plant-based eggs.


['- Bill Gates has a history of making uncannily prescient investments just before disaster strikes.',
 '- Bill Gates invested in artificial eggs before the price of eggs spiked.',
 '- Egg prices have spiked 66% since last year according to the Consumer Price Index.',
 '- There is a mystery surrounding the egg shortage and price spike.',
 '- Many consumers have turned to artificial plant-based eggs in response to the shortage.']

# Optimize

In [245]:
# modify claims in a certain style that can be requested to AI
modified_queries = []
for claim in claims:
    new_query = 'Optimise the following claim to search for related information, focusing on the actors and general topic while avoiding overly specific details: ' + claim
    modified_queries.append(new_query)

modified_queries

['Optimise the following claim to search for related information, focusing on the actors and general topic while avoiding overly specific details: - Bill Gates has a history of making uncannily prescient investments just before disaster strikes.',
 'Optimise the following claim to search for related information, focusing on the actors and general topic while avoiding overly specific details: - Bill Gates invested in artificial eggs before the price of eggs spiked.',
 'Optimise the following claim to search for related information, focusing on the actors and general topic while avoiding overly specific details: - Egg prices have spiked 66% since last year according to the Consumer Price Index.',
 'Optimise the following claim to search for related information, focusing on the actors and general topic while avoiding overly specific details: - There is a mystery surrounding the egg shortage and price spike.',
 'Optimise the following claim to search for related information, focusing on th

In [246]:
# Ask AI for optimizing claims
optimised_queries = []
for query in modified_queries:
    response = gpt_request(query)
    optimised_queries.append(response)
    
optimised_queries

['Bill Gates history of prescient investments before disasters',
 'Bill Gates investment in artificial eggs',
 'Egg prices increase Consumer Price Index',
 'Mystery behind egg shortage and price increase: actors and general topic',
 'Consumers shift to artificial plant-based eggs due to shortage']

# Search Google Fact Check Tools

In [247]:
# Melissa's header
my_headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36; Melissa Zhu/ZhuM17@cardiff.ac.uk. Working on data for class.'}
# Koh's header
# my_headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 Edg/107.0.1418.56; Koh Yoshida/yoshidak1@cardiff.ac.uk. Working on data for class.'}

endpoint = 'https://factchecktools.googleapis.com/v1alpha1/claims:search'

links = []

for optimised_query in optimised_queries:
    
    query = optimised_query
    language = 'en'
    max_days = 10000 #Max age of returned search results, in days
    page_size = 20 #Number of pages in the search results

    reviewPublisherSiteFilter = '' #Filter by review publisher (can be blank)

    api_key = 'AIzaSyDKpe6j4lqR_yuy6FZMI3NEdY9VP7Fa2jI' # Melissa's key
#     api_key = 'AIzaSyAAB8N8l47u7kt_EfrPFUKpwfOPYRmIlGw' # Koh's key

    # url = f'{endpoint}?query={query}&languageCode={language}&maxAgeDays={max_days}&key={api_key}&pageSize={page_size}&reviewPublisherSiteFilter={reviewPublisherSiteFilter}'
    url = f'{endpoint}?query={query}&key={api_key}&languageCode={language}&maxAgeDays={max_days}&pageSize={page_size}&reviewPublisherSiteFilter={reviewPublisherSiteFilter}'
    links.append(url)

# the links below contains searched results,the first link is for the first claim
# If computer got no hit with the claim, it just shows "{}"
links

['https://factchecktools.googleapis.com/v1alpha1/claims:search?query=Bill Gates history of prescient investments before disasters&key=AIzaSyDKpe6j4lqR_yuy6FZMI3NEdY9VP7Fa2jI&languageCode=en&maxAgeDays=10000&pageSize=20&reviewPublisherSiteFilter=',
 'https://factchecktools.googleapis.com/v1alpha1/claims:search?query=Bill Gates investment in artificial eggs&key=AIzaSyDKpe6j4lqR_yuy6FZMI3NEdY9VP7Fa2jI&languageCode=en&maxAgeDays=10000&pageSize=20&reviewPublisherSiteFilter=',
 'https://factchecktools.googleapis.com/v1alpha1/claims:search?query=Egg prices increase Consumer Price Index&key=AIzaSyDKpe6j4lqR_yuy6FZMI3NEdY9VP7Fa2jI&languageCode=en&maxAgeDays=10000&pageSize=20&reviewPublisherSiteFilter=',
 'https://factchecktools.googleapis.com/v1alpha1/claims:search?query=Mystery behind egg shortage and price increase: actors and general topic&key=AIzaSyDKpe6j4lqR_yuy6FZMI3NEdY9VP7Fa2jI&languageCode=en&maxAgeDays=10000&pageSize=20&reviewPublisherSiteFilter=',
 'https://factchecktools.googleapis.

# Extract results

In [265]:
results = []
for link in links:
    req = requests.get(link, headers = my_headers)
    data = req.json()
    if 'claims' in data: 
        claims = data['claims']
        results.append(claims)
        
for result in results: 
    if len(result) > 3:  #Limit to max 3 results/query for brevity
        result = result[:3]
results

[[{'text': 'Bill Gates is connected to a "mysterious egg shortage"',
   'claimant': 'Multiple organizations',
   'claimDate': '2023-01-30T00:00:00Z',
   'claimReview': [{'publisher': {'name': 'AFP Fact Check',
      'site': 'factcheck.afp.com'},
     'url': 'https://factcheck.afp.com/doc.afp.com.338B7KL',
     'reviewDate': '2023-02-02T20:15:00Z',
     'textualRating': 'Misleading',
     'languageCode': 'en'}]}],
 [{'text': 'Egg prices have increased due to a bird flu outbreak.',
   'claimReview': [{'publisher': {'name': 'VERIFYThis.com',
      'site': 'verifythis.com'},
     'url': 'https://www.verifythis.com/article/news/verify/economy-verify/why-are-egg-prices-so-high-bird-flu-inflation-behind-increase-fact-check/536-05b12714-1b90-489f-830c-7b0ae623f9c1',
     'title': 'Why are egg prices so high? Avian flu, inflation to blame',
     'reviewDate': '2023-01-11T20:49:00Z',
     'textualRating': 'True, egg prices have increased due to a bird flu outbreak and inflation.',
     'language

In [271]:
search_queries = []
claims = []
originators = []
claim_dates = []
reviews = []
publisher_names = []
publisher_sites = []
review_urls = []
titles = []
review_dates = []
ratings = []

for i in range(len(results)): 
    search_query = optimised_queries[i] #Match queries to results
    search_queries += len(results[i]) * [search_query]

for result in results: 
    for i in range(len(result)):
        claim = result[i]['text']
        claims.append(claim)

for result in results: 
    for i in range(len(result)):
        if 'claimant' in result[i]:
            originator = result[i]['claimant']
            originators.append(originator)
        else: 
            originators.append('NA')

for result in results: 
    for i in range(len(result)):
        if 'claimDate' in result[i]:
            date = result[i]['claimDate']
            claim_dates.append(date)
        else: 
            claim_dates.append('NA')

for result in results: 
    for i in range(len(result)):
        first_review = result[i]['claimReview'][0]
        reviews.append(first_review) #Most only have one review but some of them have many reviews all saying the same thing so gonna keep it to one to standardise the dataframe

for review in reviews: 
    if 'publisher' in review and 'name' in review['publisher']: 
        publisher_name = review['publisher']['name']
        publisher_names.append(publisher_name)
    else: 
        publisher_names.append('NA')

for review in reviews: 
    if 'publisher' in review and 'site' in review['publisher']: 
        publisher_site = review['publisher']['site']
        publisher_sites.append(publisher_site)
    else: 
        publisher_sites.append('NA')

for review in reviews: 
    if 'url' in review: 
        review_url = review['url']
        review_urls.append(review_url)
    else: 
        review_urls.append('NA')
        
for review in reviews: 
    if 'title' in review: 
        title = review['title']
        titles.append(title)
    else: 
        titles.append('NA')

for review in reviews: 
    if 'reviewDate' in review: 
        review_date = review['reviewDate']
        review_dates.append(review_date)
    else: 
        review_dates.append('NA')

for review in reviews: 
    if 'textualRating' in review: 
        rating = review['textualRating']
        ratings.append(rating)
    else: 
        ratings.append('NA')
        
print(search_queries)
print(claims)
print(originators)
print(claim_dates)
print(publisher_names)
print(publisher_sites)
print(review_urls)
print(titles)
print(review_dates)
print(ratings)

['Bill Gates history of prescient investments before disasters', 'Bill Gates investment in artificial eggs', 'Bill Gates investment in artificial eggs']
['Bill Gates is connected to a "mysterious egg shortage"', 'Egg prices have increased due to a bird flu outbreak.', 'Food prices have gone up 7% and egg prices have gone up 700%']
['Multiple organizations', 'NA', 'Social media']
['2023-01-30T00:00:00Z', 'NA', '2023-01-12T00:00:00Z']
['AFP Fact Check', 'VERIFYThis.com', 'USA Today']
['factcheck.afp.com', 'verifythis.com', 'usatoday.com']
['https://factcheck.afp.com/doc.afp.com.338B7KL', 'https://www.verifythis.com/article/news/verify/economy-verify/why-are-egg-prices-so-high-bird-flu-inflation-behind-increase-fact-check/536-05b12714-1b90-489f-830c-7b0ae623f9c1', 'https://www.usatoday.com/story/news/factcheck/2023/01/20/fact-check-false-claim-egg-prices-have-risen-700-inflation-consumer-price-index-bird-flu-food-costs/11076635002/']
['NA', 'Why are egg prices so high? Avian flu, inflatio

In [272]:
# make a table
df = pd.DataFrame({'query': search_queries,
                   'claim': claims, 
                   'originator': originators, 
                   'claim_date': claim_dates, 
                   'review_publisher': publisher_names, 
                   'publisher_site': publisher_sites, 
                   'review_url': review_urls, 
                   'review_title': titles, 
                   'review_date': review_dates, 
                   'verdict': ratings})
df

Unnamed: 0,query,claim,originator,claim_date,review_publisher,publisher_site,review_url,review_title,review_date,verdict
0,Bill Gates history of prescient investments be...,"Bill Gates is connected to a ""mysterious egg s...",Multiple organizations,2023-01-30T00:00:00Z,AFP Fact Check,factcheck.afp.com,https://factcheck.afp.com/doc.afp.com.338B7KL,,2023-02-02T20:15:00Z,Misleading
1,Bill Gates investment in artificial eggs,Egg prices have increased due to a bird flu ou...,,,VERIFYThis.com,verifythis.com,https://www.verifythis.com/article/news/verify...,"Why are egg prices so high? Avian flu, inflati...",2023-01-11T20:49:00Z,"True, egg prices have increased due to a bird ..."
2,Bill Gates investment in artificial eggs,Food prices have gone up 7% and egg prices hav...,Social media,2023-01-12T00:00:00Z,USA Today,usatoday.com,https://www.usatoday.com/story/news/factcheck/...,Fact check: False claim that egg prices have r...,2023-01-20T22:25:24Z,False


# Auto scraping (python library "newspaper")
### Just copied and pasted code from the documentation (https://newspaper.readthedocs.io/en/latest/)

In [34]:
pip install newspaper3k

Note: you may need to restart the kernel to use updated packages.


In [35]:
import newspaper
sample = newspaper.build('https://www.snopes.com/fact-check/alligator-eats-girl-photo/', language='en')

In [36]:
sample.size()

45

In [44]:
from newspaper import Article
url = 'https://www.snopes.com/fact-check/alligator-eats-girl-photo/'

a = Article(url, language='en') 

a.download()
a.parse()

title = a.title
contents = a.text[:10000]

print(title)
print(a.text[:10000])

Alligator Eats Girl Taking Photograph?
Advertisment:

Claim: Video shows a woman getting eaten by an alligator while posing for a photograph along a riverbank. Rating: About this rating False

A video purportedly showing a woman being attacked by an alligator while posing for a photograph along a riverbank has been circulating on the Internet since 2012.

The video does not show a real alligator attack, however: it's merely a commercial for Preview magazine.

When the video was originally uploaded to YouTube, it featured a few extra seconds of footage containing the tagline "choose your bag wisely," the hashtag #IMAPreviewGirl, and a link to Preview's web site. These aspects clearly identified the video as an advertisement; but (as is typical with such "reality" commercials) they were trimmed off by those who subsequently reposted it, making it difficult for viewers to recognize the clip as a commercial.

Although several versions of this video currently circulating on the Internet (ma

# Summarizing

In [49]:
# ask AI to summarize scraped contents
modified_contents = f'Summarize following contents within 50 words, focusing on reasons for its verdict: {contents}'
modified_contents

'Summarize following contents within 50 words, focusing on reasons for its verdict: Advertisment:\n\nClaim: Video shows a woman getting eaten by an alligator while posing for a photograph along a riverbank. Rating: About this rating False\n\nA video purportedly showing a woman being attacked by an alligator while posing for a photograph along a riverbank has been circulating on the Internet since 2012.\n\nThe video does not show a real alligator attack, however: it\'s merely a commercial for Preview magazine.\n\nWhen the video was originally uploaded to YouTube, it featured a few extra seconds of footage containing the tagline "choose your bag wisely," the hashtag #IMAPreviewGirl, and a link to Preview\'s web site. These aspects clearly identified the video as an advertisement; but (as is typical with such "reality" commercials) they were trimmed off by those who subsequently reposted it, making it difficult for viewers to recognize the clip as a commercial.\n\nAlthough several version

In [50]:
send_request(modified_contents)

"A video circulating on the internet since 2012, which appears to show a woman being attacked by an alligator while posing for a photograph, has been rated as false. The video is actually a commercial for Preview magazine, but the tagline and link to the magazine's website were trimmed off by those who subsequently reposted it, making it difficult for viewers to recognise it as an advertisement. Preview has made it clear that the clip is just a commercial and has listed the full creative team behind the clip on its website."