In [2]:
pip install openai

Note: you may need to restart the kernel to use updated packages.


In [3]:
pip install tenacity

Note: you may need to restart the kernel to use updated packages.


In [4]:
pip install newspaper3k

Collecting newspaper3k
  Downloading newspaper3k-0.2.8-py3-none-any.whl (211 kB)
[K     |████████████████████████████████| 211 kB 7.6 MB/s eta 0:00:01
Collecting tinysegmenter==0.3
  Downloading tinysegmenter-0.3.tar.gz (16 kB)
Collecting feedfinder2>=0.0.4
  Downloading feedfinder2-0.0.4.tar.gz (3.3 kB)
Collecting jieba3k>=0.35.1
  Downloading jieba3k-0.35.1.zip (7.4 MB)
[K     |████████████████████████████████| 7.4 MB 16.4 MB/s eta 0:00:01
Collecting feedparser>=5.2.1
  Downloading feedparser-6.0.10-py3-none-any.whl (81 kB)
[K     |████████████████████████████████| 81 kB 16.8 MB/s eta 0:00:01
Collecting sgmllib3k
  Downloading sgmllib3k-1.0.0.tar.gz (5.8 kB)
Building wheels for collected packages: tinysegmenter, feedfinder2, jieba3k, sgmllib3k
  Building wheel for tinysegmenter (setup.py) ... [?25ldone
[?25h  Created wheel for tinysegmenter: filename=tinysegmenter-0.3-py3-none-any.whl size=13553 sha256=97005349948752e9def93fefc1b07ccd53f92c8d95598b0aa73214daa2988d26
  Stored in 

In [5]:
import requests
import json
import random 
import numpy
import pandas as pd
from datetime import date
import openai
from tenacity import retry, stop_after_attempt, wait_random_exponential
import newspaper
from newspaper import Article
from newspaper import Config

# **Document → claims**

In [6]:
@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(10)) 
#To prevent API requests from timing out, gpt_request will retry max 10 times with intervals of 1-60 seconds

def gpt_request(query):
    
    """ Send a query to GPT-4 API and return the response """
    
    endpoint = "https://api.openai.com/v1/chat/completions"
    api_key = "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"
    
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}"
    }
    data = {
        "model": "gpt-4",
        # We don't need system message cos we don't ask fact checking
        "messages" : [{"role": "user", "content": query}],
        "max_tokens": 2000,
        "temperature": 0
    }

    response = requests.post(endpoint, headers=headers, json=data)
    response_json = response.json()
    # print(response_json)
    return response_json['choices'][0]['message']['content'].strip()

def summarise(text): 
    
    """ Given a string of text, ask GPT-4 to summarise the claims made """
    
    query = "List out all of the claims made in the text below that might be contentious or need to be checked. \
    Exclude any source attribution or contact information unless relevant to verifying another fact. \
    Exclude names of people, using other identifying details, unless the name is crucial to the claim or they are famous. \
    Also exclude any subjective facts (such as thoughts, feelings or wishes) that cannot be independently verified. \
    List each claim as a brief bullet point, each independent and self-explanatory without reference to details from the rest of the text. \n\n"
    
    return gpt_request(query + text)
    
# Get a list of factual claims
user_input = str(input("Copy and paste some text here: "))

claims = summarise(user_input)
if "\n" in claims:
    claims = claims.split("\n")
claims

Copy and paste some text here: Russian President Vladimir Putin arrives in South Africa. ICC fails to arrest Putin.


['- Russian President Vladimir Putin arrives in South Africa',
 '- ICC fails to arrest Putin']

# Optimize

In [7]:
# modify claims in a certain style that can be requested to AI
modified_queries = []
for claim in claims:
    new_query = 'Optimise the following claim to search for related information, focusing on the actors and general topic while avoiding overly specific details: ' + claim
    modified_queries.append(new_query)

modified_queries

['Optimise the following claim to search for related information, focusing on the actors and general topic while avoiding overly specific details: - Russian President Vladimir Putin arrives in South Africa',
 'Optimise the following claim to search for related information, focusing on the actors and general topic while avoiding overly specific details: - ICC fails to arrest Putin']

In [8]:
# Ask AI for optimizing claims
optimised_queries = []
for query in modified_queries:
    response = gpt_request(query)
    optimised_queries.append(response)
    
optimised_queries

['Russian President Vladimir Putin visits South Africa',
 'ICC unable to detain Putin']

# Search Google Fact Check Tools

In [9]:
# Melissa's header
my_headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36; Melissa Zhu/ZhuM17@cardiff.ac.uk. Working on data for class.'}
# Koh's header
# my_headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 Edg/107.0.1418.56; Koh Yoshida/yoshidak1@cardiff.ac.uk. Working on data for class.'}

endpoint = 'https://factchecktools.googleapis.com/v1alpha1/claims:search'

links = []

for optimised_query in optimised_queries:
    
    query = optimised_query
    language = 'en'
    max_days = 10000 #Max age of returned search results, in days
    page_size = 20 #Number of pages in the search results

    reviewPublisherSiteFilter = '' #Filter by review publisher (can be blank)

    api_key = 'AIzaSyDKpe6j4lqR_yuy6FZMI3NEdY9VP7Fa2jI' # Melissa's key
#     api_key = 'AIzaSyAAB8N8l47u7kt_EfrPFUKpwfOPYRmIlGw' # Koh's key

    # url = f'{endpoint}?query={query}&languageCode={language}&maxAgeDays={max_days}&key={api_key}&pageSize={page_size}&reviewPublisherSiteFilter={reviewPublisherSiteFilter}'
    url = f'{endpoint}?query={query}&key={api_key}&languageCode={language}&maxAgeDays={max_days}&pageSize={page_size}&reviewPublisherSiteFilter={reviewPublisherSiteFilter}'
    links.append(url)

# the links below contains searched results,the first link is for the first claim
# If computer got no hit with the claim, it just shows "{}"
links

['https://factchecktools.googleapis.com/v1alpha1/claims:search?query=Russian President Vladimir Putin visits South Africa&key=AIzaSyDKpe6j4lqR_yuy6FZMI3NEdY9VP7Fa2jI&languageCode=en&maxAgeDays=10000&pageSize=20&reviewPublisherSiteFilter=',
 'https://factchecktools.googleapis.com/v1alpha1/claims:search?query=ICC unable to detain Putin&key=AIzaSyDKpe6j4lqR_yuy6FZMI3NEdY9VP7Fa2jI&languageCode=en&maxAgeDays=10000&pageSize=20&reviewPublisherSiteFilter=']

# Extract results

In [10]:
results = []
for link in links:
    req = requests.get(link, headers = my_headers)
    data = req.json()
    if 'claims' in data: 
        claims = data['claims']
        results.append(claims)
        
for result in results: 
    if len(result) > 3:  #Limit to max 3 results/query for brevity
        result = result[:3]
results

[[{'text': 'Video shows Putin in South Africa in March 2023',
   'claimant': 'Multiple authors',
   'claimDate': '2023-03-31T00:00:00Z',
   'claimReview': [{'publisher': {'name': 'AFP Fact Check',
      'site': 'factcheck.afp.com'},
     'url': 'https://factcheck.afp.com/doc.afp.com.33CR2KD',
     'reviewDate': '2023-04-06T15:26:00Z',
     'textualRating': 'False',
     'languageCode': 'en'}]},
  {'text': "The viral video shows Vladimir Putin's recent visit to South Africa",
   'claimant': 'Social media',
   'claimDate': '2022-08-17T00:00:00Z',
   'claimReview': [{'publisher': {'name': 'Newsmobile',
      'site': 'newsmobile.in'},
     'url': 'https://newsmobile.in/articles/2022/08/02/fact-check-does-the-viral-video-show-putins-recent-visit-to-south-africa-heres-the-truth/',
     'title': 'Fact Check: Does The Viral Video Show Putin’s Recent Visit To ...',
     'reviewDate': '2022-08-02T00:00:00Z',
     'textualRating': 'Misleading',
     'languageCode': 'en'}]}]]

In [11]:
search_queries = []
claims = []
originators = []
claim_dates = []
reviews = []
publisher_names = []
publisher_sites = []
review_urls = []
titles = []
review_dates = []
ratings = []

for i in range(len(results)): 
    search_query = optimised_queries[i] #Match queries to results
    search_queries += len(results[i]) * [search_query]

for result in results: 
    for i in range(len(result)):
        claim = result[i]['text']
        claims.append(claim)

for result in results: 
    for i in range(len(result)):
        if 'claimant' in result[i]:
            originator = result[i]['claimant']
            originators.append(originator)
        else: 
            originators.append('NA')

for result in results: 
    for i in range(len(result)):
        if 'claimDate' in result[i]:
            date = result[i]['claimDate']
            claim_dates.append(date)
        else: 
            claim_dates.append('NA')

for result in results: 
    for i in range(len(result)):
        first_review = result[i]['claimReview'][0]
        reviews.append(first_review) #Most only have one review but some of them have many reviews all saying the same thing so gonna keep it to one to standardise the dataframe

for review in reviews: 
    if 'publisher' in review and 'name' in review['publisher']: 
        publisher_name = review['publisher']['name']
        publisher_names.append(publisher_name)
    else: 
        publisher_names.append('NA')

for review in reviews: 
    if 'publisher' in review and 'site' in review['publisher']: 
        publisher_site = review['publisher']['site']
        publisher_sites.append(publisher_site)
    else: 
        publisher_sites.append('NA')

for review in reviews: 
    if 'url' in review: 
        review_url = review['url']
        review_urls.append(review_url)
    else: 
        review_urls.append('NA')
        
for review in reviews: 
    if 'title' in review: 
        title = review['title']
        titles.append(title)
    else: 
        titles.append('NA')

for review in reviews: 
    if 'reviewDate' in review: 
        review_date = review['reviewDate']
        review_dates.append(review_date)
    else: 
        review_dates.append('NA')

for review in reviews: 
    if 'textualRating' in review: 
        rating = review['textualRating']
        ratings.append(rating)
    else: 
        ratings.append('NA')
        
print(search_queries)
print(claims)
print(originators)
print(claim_dates)
print(publisher_names)
print(publisher_sites)
print(review_urls)
print(titles)
print(review_dates)
print(ratings)

['Russian President Vladimir Putin visits South Africa', 'Russian President Vladimir Putin visits South Africa']
['Video shows Putin in South Africa in March 2023', "The viral video shows Vladimir Putin's recent visit to South Africa"]
['Multiple authors', 'Social media']
['2023-03-31T00:00:00Z', '2022-08-17T00:00:00Z']
['AFP Fact Check', 'Newsmobile']
['factcheck.afp.com', 'newsmobile.in']
['https://factcheck.afp.com/doc.afp.com.33CR2KD', 'https://newsmobile.in/articles/2022/08/02/fact-check-does-the-viral-video-show-putins-recent-visit-to-south-africa-heres-the-truth/']
['NA', 'Fact Check: Does The Viral Video Show Putin’s Recent Visit To ...']
['2023-04-06T15:26:00Z', '2022-08-02T00:00:00Z']
['False', 'Misleading']


In [12]:
# make a table
df = pd.DataFrame({'query': search_queries,
                   'claim': claims, 
                   'originator': originators, 
                   'claim_date': claim_dates, 
                   'review_publisher': publisher_names, 
                   'publisher_site': publisher_sites, 
                   'review_url': review_urls, 
                   'review_title': titles, 
                   'review_date': review_dates, 
                   'verdict': ratings})
df

Unnamed: 0,query,claim,originator,claim_date,review_publisher,publisher_site,review_url,review_title,review_date,verdict
0,Russian President Vladimir Putin visits South ...,Video shows Putin in South Africa in March 2023,Multiple authors,2023-03-31T00:00:00Z,AFP Fact Check,factcheck.afp.com,https://factcheck.afp.com/doc.afp.com.33CR2KD,,2023-04-06T15:26:00Z,False
1,Russian President Vladimir Putin visits South ...,The viral video shows Vladimir Putin's recent ...,Social media,2022-08-17T00:00:00Z,Newsmobile,newsmobile.in,https://newsmobile.in/articles/2022/08/02/fact...,Fact Check: Does The Viral Video Show Putin’s ...,2022-08-02T00:00:00Z,Misleading


# Auto scraping (newspaper3k)
#### documentation: https://newspaper.readthedocs.io/en/latest/

In [20]:
contents = []

for url in df['review_url']:
    if 'nytimes' in url:
        a = Article(url, language='en') 
        a.download()
        a.parse()
        contents.append(a.title + a.text[:10000])
        """
        NewYorkTimes allows request with user_agent only a few times \
        and then blocks. Above 'if' code solve the problem. 
        Conversely, some factcheckers requires user_agent, such as \
        Newsroom, so below 'else' part are needed so far as well.
        """
    elif 'verifythis' in url:

        contents.append('NA') # Exception as "Verifythis.com" is not accessible in the UK
        
    else:
        # Koh's user_agent
        user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 Edg/107.0.1418.56; Koh Yoshida/yoshidak1@cardiff.ac.uk. Collecting content for group project in my class.'
        config = Config()
        config.browser_user_agent = user_agent
        config.request_timeout = 10 # avoid runtime error
        a = Article(url, config=config, language='en') 
        a.download()
        a.parse()
        contents.append(a.title + a.text[:10000])

df['contents'] = contents
df

Unnamed: 0,query,claim,originator,claim_date,review_publisher,publisher_site,review_url,review_title,review_date,verdict,contents,summaries
0,Russian President Vladimir Putin visits South ...,Video shows Putin in South Africa in March 2023,Multiple authors,2023-03-31T00:00:00Z,AFP Fact Check,factcheck.afp.com,https://factcheck.afp.com/doc.afp.com.33CR2KD,,2023-04-06T15:26:00Z,False,Posts use old video to falsely claim Russian p...,A video claiming to show Russian President Vla...
1,Russian President Vladimir Putin visits South ...,The viral video shows Vladimir Putin's recent ...,Social media,2022-08-17T00:00:00Z,Newsmobile,newsmobile.in,https://newsmobile.in/articles/2022/08/02/fact...,Fact Check: Does The Viral Video Show Putin’s ...,2022-08-02T00:00:00Z,Misleading,Fact Check: Does The Viral Video Show Putin’s ...,A video claiming to show Russian President Vla...


# Summarizing

In [22]:
summaries = []

for i in range(len(df)):
    if df.contents[i] == 'NA':
        summaries.append("This website is not working")
    else: 
        modified_content = f'Summarise the following content in the following format: The claim that [claim] is **[true or false]** based on [source, in-line linked to {df.review_url[i]}]. [One line summary of reason.] {df.contents[i]}'
        response = gpt_request(modified_content)
        summaries.append(response)
    
df['summaries'] = summaries
df

Unnamed: 0,query,claim,originator,claim_date,review_publisher,publisher_site,review_url,review_title,review_date,verdict,contents,summaries
0,Russian President Vladimir Putin visits South ...,Video shows Putin in South Africa in March 2023,Multiple authors,2023-03-31T00:00:00Z,AFP Fact Check,factcheck.afp.com,https://factcheck.afp.com/doc.afp.com.33CR2KD,,2023-04-06T15:26:00Z,False,Posts use old video to falsely claim Russian p...,The claim that Russian President Vladimir Puti...
1,Russian President Vladimir Putin visits South ...,The viral video shows Vladimir Putin's recent ...,Social media,2022-08-17T00:00:00Z,Newsmobile,newsmobile.in,https://newsmobile.in/articles/2022/08/02/fact...,Fact Check: Does The Viral Video Show Putin’s ...,2022-08-02T00:00:00Z,Misleading,Fact Check: Does The Viral Video Show Putin’s ...,The claim that the viral video shows Putin's r...
