# Scraping notebook
This notebook is used to collect the necessary data from Fox News / Twitter to be used in the following EDA notebook. Tucker Carlson's corpus of work is collected using web scraping tools, while Twitter is accessed via API calls.

In [2]:
#Imports cell

#Import basic libraries
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import csv

#Selenium and related libraries
import selenium
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.actions import mouse_button
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

## Generating article list
This section deploys a Selenium Chrome webdriver that crawls through the Fox News transcript section and collects any links related to Tucker Carlson to be used in the BeautifulSoup section below.

In [4]:
#Initialize the Selenium web driver on the foxnews page
PATH = '/Applications/chromedriver'
driver = webdriver.Chrome(PATH)
fox_URL = 'https://www.foxnews.com/category/shows/tucker-carlson-tonight/transcript'
driver.get(fox_URL)

  driver = webdriver.Chrome(PATH)


### Note:
 
***MUST click out of pop-up window when the page loads before running next cell!!***

In [5]:
#Click the load more button until the page has loaded every article
#It will stop early when there is an error due to no more "load more" buttons
for _ in range(10000):
    button = driver.find_element(By.CLASS_NAME, "button.load-more.js-load-more")
    button.click()
    time.sleep(.25)

NoSuchElementException: Message: no such element: Unable to locate element: {"method":"css selector","selector":".button.load-more.js-load-more"}
  (Session info: chrome=101.0.4951.54)
Stacktrace:
0   chromedriver                        0x00000001050909a0 chromedriver + 4426144
1   chromedriver                        0x000000010502a798 chromedriver + 4007832
2   chromedriver                        0x0000000104c86cc4 chromedriver + 191684
3   chromedriver                        0x0000000104cb58d4 chromedriver + 383188
4   chromedriver                        0x0000000104cde648 chromedriver + 550472
5   chromedriver                        0x0000000104caacf4 chromedriver + 339188
6   chromedriver                        0x00000001050677d0 chromedriver + 4257744
7   chromedriver                        0x000000010506b580 chromedriver + 4273536
8   chromedriver                        0x000000010506f6c4 chromedriver + 4290244
9   chromedriver                        0x000000010506bcf8 chromedriver + 4275448
10  chromedriver                        0x000000010504bb38 chromedriver + 4143928
11  chromedriver                        0x000000010508321c chromedriver + 4370972
12  chromedriver                        0x0000000105083380 chromedriver + 4371328
13  chromedriver                        0x0000000105097154 chromedriver + 4452692
14  libsystem_pthread.dylib             0x00000001be2114ec _pthread_start + 148
15  libsystem_pthread.dylib             0x00000001be20c2d0 thread_start + 8


In [78]:
#Once all of the pages have been accessed, loop through each 
# article and grab the necessary info (The href link)
articles = driver.find_elements(By.CLASS_NAME ,'article')
article_links = []
for article in articles:
    # article = article.find_element(By.CLASS_NAME, 'title')
    # article = article.find_element(By.TAG_NAME, 'a')
    article_links.append(article.find_element(By.TAG_NAME, 'a').get_attribute('href'))


['https://www.foxnews.com/opinion/tucker-position-passive-aggressive-party',
 'https://www.foxnews.com/opinion/tucker-biden-plan-torque-up-fear',
 'https://www.foxnews.com/opinion/tucker-why-is-everyone-mad-justice-alito',
 'https://www.foxnews.com/opinion/tucker-roe-most-embarrassing-court-decision-handed-down-last-century',
 'https://www.foxnews.com/opinion/tucker-inevitable-end-stage-russiagate',
 'https://beap.gemini.yahoo.com/mbclk?bv=1.0.0&es=U.ZFShkGIS_iMLdxtvf9T1OpWNFtu5Fy4OXbRIwmQK24EllUixfDWH4WoLvCNj5mZBA5clh7nQyoY7WKsN6hMq_RyltGZp3lcn6tRrY4D5vuk0GQ0YrtoQv7YLEXZb543XtRqN_qGA61jhfPx8eNWaGnQ1kZh0v_yEmaCgTzqUEszMo6awF4HjrFfmnchZP7j6_nZXQ_WgFfBiT4P8C07aQRGI_I12pY1YYoxfzVIlK5H7pLvQQHqAdUt_wVuQMwcj33mOCLNYRYtVC57rDFam4PN0O1Z19LDiyD1W7ho1pjnw3cv5eO3_KxTrIAVmVtkTx_cnSh.Mca2PqKJZ1rVmZtxGj5f6g3pBC_zzrxkXOWwTVrb0.DwjSY3_kTolf4cTtHtyiOGgJlih1GbrxUiCyOsvCLohHwlB4_1Z432SVLHeTHSzGqdxOY.i7SzdXVSxhyMxV6WKY25YP2iOmEIOpGq2o2clPLeOImJxq_hiR9RvEC6QaXXUkeamEATxDreCn_nlwn4VLoPTYgbLa_tKmb3ANLilW.loZ

In [79]:
#Remove any link that isn't a tucker carlson article
cleaned_links = []
for link in article_links:
    if 'tucker' in link:
        cleaned_links.append(link)


In [83]:
#Save the cleaned up links as a csv file for later use
np.savetxt("data/tucker_links.csv", 
           cleaned_links,
           delimiter =", ", 
           fmt ='% s')

# Accessing the articles with BeautifulSoup

In [3]:
#Open up the tucker links CSV and load it into the notebook
with open('data/tucker_links.csv') as f:
    tucker_csv = csv.reader(f, delimiter = ',')
    tucker_links = []
    for link in tucker_csv:
        tucker_links.append(link[0])

In [45]:
#Some functions to process the pages

#Parse fox news page
def tucker_parse(html_page):
    soup = BeautifulSoup(html_page.content, 'html.parser') # Pass the page contents to beautiful soup for parsing
    main = soup.body.find('div', id = '__nuxt').find('div', id = '__layout').find('div', id = 'wrapper').find('div', class_= 'page-content').div.main
    article_body = main.article.find('div', class_ = 'article-content-wrap sticky-columns').div.div
    return article_body

#Turn the parsed page into a string
def tucker_content(article):
    page_content = ''
    for paragraph in article.findAll('p'):
        if paragraph.a is None:
            page_content = page_content + ' ' + str(paragraph.text)
    return page_content

#Create a function that loops through every tucker page and gets text data as list
def tucker_documents(links,sleep_time):
    tucker_doc = []
    for link in links:
        html_page = requests.get(link)
        article = tucker_parse(html_page)
        page_content = tucker_content(article)
        tucker_doc.append(page_content)
        time.sleep(sleep_time)
    return tucker_doc

In [44]:
#Stopping notebook before it can run the very slow cell below
raise NotImplementedError('Do not run the next cell unless you really need to.')

NotImplementedError: Do not run the next cell unless you really need to.

### Note:
 
***This cell can take over 20 minutes to run!!! Do not run unless you need new article info***

In [None]:
#Run through every article on Tucker Carlson on Fox News
tucker_docs = tucker_documents(tucker_links, .25)

In [33]:
#Save the file as a CSV to be used in EDA notebook
with open('data/tucker_docs.csv', 'w', encoding='UTF8', newline='') as f:
    # create the csv writer
    writer = csv.writer(f)
    # write a row to the csv file
    writer.writerow(tucker_documents)

#Pickle the DataFrame
tucker_df = pd.DataFrame(tucker_documents, columns = ['Document'])
tucker_df.to_pickle('data/tucker_pickle')

