# Scraping - `Politifact`

In [170]:
from bs4 import BeautifulSoup

import numpy as np
import pandas as pd
import requests
import urllib.request
import time

## Create a function to scrape the site

In [171]:
def _parse_footer(footer_text):
    """Split a statement footer into an ``(author, date)`` pair.

    The footer is tokenised on whitespace and read as
    ``<prefix> <author name ...> <separator> <month> <day> <year>``.
    NOTE(review): this layout is inferred from the token positions the
    scraper has always relied on -- confirm against the live markup.

    The date is anchored to the *last* three tokens and the author is
    everything between the leading prefix token and the separator, so a
    byline with three or more words no longer shifts the separator into
    the date.

    Returns ``(np.nan, np.nan)`` when the footer has fewer than seven tokens.
    """
    tokens = footer_text.split()
    if len(tokens) < 7:
        return np.nan, np.nan
    author = ' '.join(tokens[1:-4])   # skip the prefix; stop before the separator
    date = ' '.join(tokens[-3:])      # month, day, year
    return author, date


def scrape_website(page_number, timeout=30):
    """Scrape one PolitiFact fact-check listing page into the global lists.

    Appends one entry per matched element to the module-level lists
    ``authors``, ``dates``, ``statements``, ``sources`` and ``targets``,
    which must exist before this function is called.

    Parameters
    ----------
    page_number : int or str
        Value substituted into the ``?page=`` query parameter.
    timeout : float, optional
        Seconds to wait for the server before giving up, so a stalled
        connection cannot hang the scraping loop indefinitely.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status. Failing loudly avoids
        silently skipping a page and leaving a gap in the dataset.
    requests.Timeout
        If no response arrives within ``timeout`` seconds.
    """
    URL = 'https://www.politifact.com/factchecks/list/?page=' + str(page_number)
    webpage = requests.get(URL, timeout=timeout)
    webpage.raise_for_status()

    soup = BeautifulSoup(webpage.text, "html.parser")  # Parse the text from the website

    # Collect every element of interest by tag and CSS class.
    statement_footer = soup.find_all('footer', attrs={'class': 'm-statement__footer'})
    statement_quote = soup.find_all('div', attrs={'class': 'm-statement__quote'})
    statement_meta = soup.find_all('div', attrs={'class': 'm-statement__meta'})
    target = soup.find_all('div', attrs={'class': 'm-statement__meter'})

    print(f"#### Scraping page: {page_number} ####")

    # Footer -> author and publication date.
    for footer in statement_footer:
        full_name, date = _parse_footer(footer.text.strip())
        dates.append(date)
        authors.append(full_name)

    # Quote block -> text of its first link (the statement).
    # A missing link is recorded as NaN rather than raising, so the lists
    # keep one entry per matched element.
    for quote in statement_quote:
        link = quote.find('a')
        statements.append(link.text.strip() if link is not None else np.nan)

    # Meta block -> text of its first link (the source of the statement).
    for meta in statement_meta:
        link = meta.find('a')
        sources.append(link.text.strip() if link is not None else np.nan)

    # Meter block -> alt text of the rating image (e.g. 'false', 'half-true').
    for meter in target:
        image_box = meter.find('div', attrs={'class': 'c-image'})
        image = image_box.find('img') if image_box is not None else None
        targets.append(image.get('alt') if image is not None else np.nan)

## Loop through pages `start` to `end - 1` to scrape the data

In [176]:
# Accumulators that scrape_website() appends to. They are re-created here
# so that re-running this cell starts from empty lists.
authors, dates, statements, sources, targets = [], [], [], [], []

# range() excludes `end`, so pages start .. end - 1 are fetched.
start, end = 1, 101
for page_number in range(start, end):
    scrape_website(page_number)

#### Scarping page: 1 ####
#### Scarping page: 2 ####
#### Scarping page: 3 ####
#### Scarping page: 4 ####
#### Scarping page: 5 ####
#### Scarping page: 6 ####
#### Scarping page: 7 ####
#### Scarping page: 8 ####
#### Scarping page: 9 ####
#### Scarping page: 10 ####
#### Scarping page: 11 ####
#### Scarping page: 12 ####
#### Scarping page: 13 ####
#### Scarping page: 14 ####
#### Scarping page: 15 ####
#### Scarping page: 16 ####
#### Scarping page: 17 ####
#### Scarping page: 18 ####
#### Scarping page: 19 ####
#### Scarping page: 20 ####
#### Scarping page: 21 ####
#### Scarping page: 22 ####
#### Scarping page: 23 ####
#### Scarping page: 24 ####
#### Scarping page: 25 ####
#### Scarping page: 26 ####
#### Scarping page: 27 ####
#### Scarping page: 28 ####
#### Scarping page: 29 ####
#### Scarping page: 30 ####
#### Scarping page: 31 ####
#### Scarping page: 32 ####
#### Scarping page: 33 ####
#### Scarping page: 34 ####
#### Scarping page: 35 ####
#### Scarping page: 36 ####
#

In [177]:
# Print the size of every scraped list between two separator lines;
# the five numbers should match before the lists are combined.
separator = '-' * 80
print(separator)
for scraped_values in (authors, statements, sources, dates, targets):
    print(len(scraped_values))
print(separator)

--------------------------------------------------------------------------------
3000
3000
3000
3000
3000
--------------------------------------------------------------------------------


## Create the `DataFrame`

In [178]:
# Assemble the scraped lists into one frame in a single constructor call.
# The dict order fixes the column order, and pandas raises ValueError if
# the lists differ in length, so misaligned scrapes fail here.
data = pd.DataFrame({
    'author': authors,
    'statement': statements,
    'source': sources,
    'date': dates,
    'target': targets,
})

print('-'*80)
print(f"data shape: {data.shape}")
print('-'*80)

--------------------------------------------------------------------------------
data shape: (3000, 5)
--------------------------------------------------------------------------------


In [175]:
# Preview the first rows to sanity-check the scraped columns.
data.head()

Unnamed: 0,author,statement,source,date,target
0,Emily Tian,“Washington public school forces unvaccinated ...,Facebook posts,"August 25, 2021",half-true
1,Samantha Putterman,75 doctors in South Florida walked out in prot...,Instagram posts,"August 25, 2021",false
2,Gabrielle Settles,"“It is the vaccinated, NOT the unvaccinated, s...",Facebook posts,"August 25, 2021",false
3,Warren Fiske,On ending Virginia's income tax.,Glenn Youngkin,"August 25, 2021",half-flip
4,Tom Kertscher,“80% of women who have been jabbed have lost t...,Facebook posts,"August 24, 2021",false


## `label` creation

In [147]:
# List the distinct rating labels that were scraped into `target`.
data['target'].unique()

array(['false', 'barely-true', 'pants-fire', 'half-true', 'mostly-true',
       'true', 'full-flop', 'half-flip', 'no-flip'], dtype=object)

In [148]:
# TRUE – The statement is accurate and there’s nothing significant missing.
# MOSTLY TRUE – The statement is accurate but needs clarification or additional information.
# HALF TRUE – The statement is partially accurate but leaves out important details or takes things out of context.
# MOSTLY FALSE – The statement contains an element of truth but ignores critical facts that would give a different impression.
# FALSE – The statement is not accurate.
# PANTS ON FIRE – The statement is not accurate and makes a ridiculous claim.

### Categorical labels

In [179]:
# Ordinal encoding of the rating: the position of a group in this list is
# the numeric code shared by every label in that group.
rating_groups = [
    ('true',),
    ('mostly-true',),
    ('half-true',),
    ('barely-true', 'mostly-false'),  # 'barely-true' is treated like 'mostly-false'
    ('false',),
    ('pants-fire', 'full-flop'),      # 'full-flop' is treated like 'pants-fire'
]
target_to_num_cat = {
    label: code
    for code, labels in enumerate(rating_groups)
    for label in labels
}

# Ratings that are absent from the mapping come out of .map() as NaN.
data['category_cat'] = data['target'].map(target_to_num_cat)

# Share of rows per code, in percent of ALL rows (NaN rows stay in the denominator).
rule = '-' * 80
print(rule)
print(data['category_cat'].value_counts() / len(data) * 100)
print(rule)

--------------------------------------------------------------------------------
4.0    47.566667
5.0    17.566667
3.0    14.266667
2.0     9.666667
1.0     6.400000
0.0     4.366667
Name: category_cat, dtype: float64
--------------------------------------------------------------------------------


### Binary label

In [180]:
# Binary encoding: 0 for the two most accurate ratings, 1 for everything else
# that is mapped.
accurate_ratings = ('true', 'mostly-true')
inaccurate_ratings = (
    'half-true',
    'barely-true',   # similar to 'mostly-false'
    'mostly-false',
    'false',
    'pants-fire',
    'full-flop',     # similar to 'pants-fire'
)
target_to_num_bin = {
    **dict.fromkeys(accurate_ratings, 0),
    **dict.fromkeys(inaccurate_ratings, 1),
}

# Ratings that are absent from the mapping come out of .map() as NaN.
data['category'] = data['target'].map(target_to_num_bin)

# Share of rows per class, in percent of ALL rows (NaN rows stay in the denominator).
rule = '-' * 80
print(rule)
print(data['category'].value_counts() / len(data) * 100)
print(rule)

--------------------------------------------------------------------------------
1.0    89.066667
0.0    10.766667
Name: category, dtype: float64
--------------------------------------------------------------------------------


In [181]:
# Preview the frame again, now including the two derived label columns.
data.head()

Unnamed: 0,author,statement,source,date,target,category_cat,category
0,Emily Tian,“Washington public school forces unvaccinated ...,Facebook posts,"August 25, 2021",half-true,2.0,1.0
1,Samantha Putterman,75 doctors in South Florida walked out in prot...,Instagram posts,"August 25, 2021",false,4.0,1.0
2,Gabrielle Settles,"“It is the vaccinated, NOT the unvaccinated, s...",Facebook posts,"August 25, 2021",false,4.0,1.0
3,Warren Fiske,On ending Virginia's income tax.,Glenn Youngkin,"August 25, 2021",half-flip,,
4,Tom Kertscher,“80% of women who have been jabbed have lost t...,Facebook posts,"August 24, 2021",false,4.0,1.0


In [152]:
# Count the missing values in each column.
data.isnull().sum()

author          9000
statement          0
source             0
date            9000
target             0
category_cat      22
category          22
dtype: int64