<a href="https://colab.research.google.com/github/CoderBose/amd-infoGuard/blob/main/AMD_DataScraper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install beautifulsoup4 requests
!pip install selenium
!pip install webdriver-manager

Collecting selenium
  Downloading selenium-4.20.0-py3-none-any.whl (9.5 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 9.5/9.5 MB 25.6 MB/s eta 0:00:00
Collecting trio~=0.17 (from selenium)
  Downloading trio-0.25.0-py3-none-any.whl (467 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 467.2/467.2 kB 29.9 MB/s eta 0:00:00
Collecting trio-websocket~=0.9 (from selenium)
  Downloading trio_websocket-0.11.1-py3-none-any.whl (17 kB)
Collecting outcome (from trio~=0.17->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl (10 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.9->selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl (24 kB)
Collecting h11<1,>=0.9.0 (from wsproto>=0.14->trio-websocket~=0.9->selenium)
  Downloading h11-0.14.0-py3-none-any.whl (58 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 58.3/58.3 kB 4.8 MB/s eta 0:00:00

In [None]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import csv

class PolitiFactArticleScraper:
    """Scrape PolitiFact article listings and article pages into a CSV file.

    The paginated listing (``base_url`` plus a ``page`` query parameter)
    supplies each article's title, date, author and URL; the article page
    itself supplies the tags and the body text.
    """

    base_url = "https://www.politifact.com/article/"
    # Seconds to wait for a single HTTP response, so that one stalled
    # connection cannot hang the whole scrape.
    request_timeout = 30

    def __init__(self):
        # One shared session so connections are reused across requests.
        self.session = requests.Session()

    def generate_page_url(self, page):
        """Return the listing URL for the given page number."""
        return f"{self.base_url}?page={page}"

    @staticmethod
    def _clean_text(node):
        """Return the stripped text of a parsed element, or "" if it is None."""
        return node.text.strip() if node is not None else ""

    @staticmethod
    def _split_meta(meta_text):
        """Split a teaser meta line into ``(author, date)``.

        The line is expected to look like ``"By <author> • <date>"``. The
        leading ``"By "`` is removed from the author. When the ``•``
        separator is absent, the whole text is returned as the author and
        the date is an empty string.
        """
        parts = [part.strip() for part in meta_text.split('•')]
        author = parts[0]
        date = parts[1] if len(parts) > 1 else ""
        if author.startswith("By "):
            author = author[3:]
        return author, date

    def _fetch(self, url):
        """GET *url* and return the response, or None if the request failed.

        Connection errors and timeouts are reported and swallowed so that a
        single bad request is handled like any other failed page instead of
        aborting a long scrape halfway through the output file.
        """
        try:
            return self.session.get(url, timeout=self.request_timeout)
        except requests.RequestException as exc:
            print("Request failed for", url, ":", exc)
            return None

    def get_articles_info(self, page=1):
        """Return the articles listed on one listing page.

        Args:
            page: Listing page number.

        Returns:
            A list of dicts with the keys ``title``, ``date``, ``author`` and
            ``url`` (absolute). The list is empty when the page could not be
            retrieved or contains no article teasers.
        """
        response = self._fetch(self.generate_page_url(page))
        if response is None:
            return []
        if response.status_code != 200:
            print("Failed to retrieve article information. Status code:", response.status_code)
            return []

        soup = BeautifulSoup(response.content, 'html.parser')
        articles_info = []
        for teaser in soup.find_all('div', class_='m-teaser'):
            title_node = teaser.find('h3', class_='m-teaser__title')
            link_node = teaser.find('a', href=True)
            if title_node is None or link_node is None:
                # Not a regular article teaser (or the markup changed):
                # skip it rather than crash the whole page.
                continue
            # The author is taken from the listing because the article pages
            # keep it in a different element.
            meta_text = self._clean_text(teaser.find('div', class_='m-teaser__meta'))
            author, date = self._split_meta(meta_text)
            articles_info.append({
                'title': title_node.text.strip(),
                'date': date,
                'author': author,
                # Teaser links may be relative; resolve them against the site.
                'url': urljoin(self.base_url, link_node['href']),
            })
        return articles_info

    def scrape_article_data(self, article_url):
        """Return the title, author, tags and body text of one article page.

        Args:
            article_url: Absolute URL of the article.

        Returns:
            A dict with the keys ``title``, ``author``, ``tags`` (list of
            strings) and ``text``; elements missing from the page yield empty
            values. An empty dict is returned when the page could not be
            retrieved.
        """
        response = self._fetch(article_url)
        if response is None:
            return {}
        if response.status_code != 200:
            print("Failed to retrieve article data from URL:", article_url)
            return {}

        soup = BeautifulSoup(response.content, 'html.parser')
        article_title = self._clean_text(soup.find('h1', class_='m-statement__quote'))
        tags = [tag.text.strip() for tag in soup.find_all('a', class_='c-tag')]

        author_wrap = soup.find('div', class_='m-author__wrap')
        author_link = author_wrap.find('a') if author_wrap is not None else None
        author = self._clean_text(author_link)

        body = soup.find('article', class_='m-textblock')
        paragraphs = body.find_all('p') if body is not None else []
        text = ' '.join(paragraph.text.strip() for paragraph in paragraphs)
        return {'title': article_title, 'author': author, 'tags': tags, 'text': text}

    def scrape_and_save_articles(self, filename, start_page=1, end_page=None):
        """Scrape listing pages and write one CSV row per article.

        Args:
            filename: Path of the CSV file to create (overwritten if present).
            start_page: First listing page to scrape.
            end_page: Last listing page to scrape (inclusive). ``None`` means
                keep going until a page yields no articles.
        """
        with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
            fieldnames = ['Title', 'Date', 'Author', 'Tags', 'Text']
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()

            # A plain counter is used instead of range(): range() only accepts
            # integers, so an open-ended range(start, float('inf')) raises
            # TypeError.
            page = start_page
            while end_page is None or page <= end_page:
                articles_info = self.get_articles_info(page)
                if not articles_info:
                    # An empty page means we ran past the last page or the
                    # request failed.
                    break
                for article in articles_info:
                    article_data = self.scrape_article_data(article['url'])
                    writer.writerow({
                        'Title': article['title'],
                        'Date': article['date'],
                        'Author': article.get('author', ''),
                        'Tags': ", ".join(article_data.get('tags', [])),
                        'Text': article_data.get('text', ''),
                    })
                    print("Article scraped from page", page, ":", article['title'])
                page += 1
        print("Articles scraped and saved to", filename)

# Run the scraper. Only the first 5 listing pages are fetched for now; the
# limit can be raised (e.g. to 100) for as long as the site does not start
# blocking us as a bot.
scraper = PolitiFactArticleScraper()
scraper.scrape_and_save_articles(
    filename="politifact_articles.csv",
    start_page=1,
    end_page=5,
)

Article scraped from page 1 : Are there really Chinese sleeper cells operating in the U.S.?
Article scraped from page 1 : Here’s how new Title IX regulations could affect LGBTQ+ students
Article scraped from page 1 : Donald Trump exagera la disminución del crimen en Venezuela y engaña sobre sus causas
Article scraped from page 1 : Biden in Tampa: Fact-checks of his claims on abortion, Trump
Article scraped from page 1 : El caso criminal contra Donald Trump en Manhattan no es un juicio ordenado por Joe Biden
Article scraped from page 1 : Fact-checking Aaron Rodgers, who repeated Robert F. Kennedy Jr.’s false claims about HIV/AIDS
Article scraped from page 1 : The Supreme Court will decide Donald Trump’s immunity case. Here are the arguments.
Article scraped from page 1 : Did a nongovernment organization in Mexico encourage migrants to vote for Biden? What we know
Article scraped from page 1 : Was Joe Biden’s uncle eaten by cannibals after World War II crash? Experts say it’s highly unli