# Scraping notebook
This notebook is used to collect the necessary data from Fox News / Twitter to be used in the following EDA notebook. Tucker Carlson's corpus of work is collected using web scraping tools, while Twitter is accessed via API calls.

In [7]:
#Imports cell

#Import basic libraries
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import csv

#Selenium and related libraries
import selenium
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.actions import mouse_button
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

## Generating article list
This section deploys a Selenium Chrome webdriver that crawls through the Fox News transcript section and collects any links related to Tucker Carlson to be used in the BeautifulSoup section below.

In [79]:
#Initialize the Selenium web driver on the foxnews page
PATH = '/Applications/chromedriver'
driver = webdriver.Chrome(PATH)
fox_URL = 'https://www.foxnews.com/category/shows/tucker-carlson-tonight/transcript'
driver.get(fox_URL)

  driver = webdriver.Chrome(PATH)


### Note:
 
***MUST click out of pop-up window when the page loads before running next cell!!***

In [80]:
#Click the load more button until the page has loaded every article
#It will stop early when there is an error due to no more "load more" buttons
for _ in range(10000):
    button = driver.find_element(By.CLASS_NAME, "button.load-more.js-load-more")
    button.click()
    time.sleep(.25)

NoSuchElementException: Message: no such element: Unable to locate element: {"method":"css selector","selector":".button.load-more.js-load-more"}
  (Session info: chrome=101.0.4951.54)
Stacktrace:
0   chromedriver                        0x00000001030fc9a0 chromedriver + 4426144
1   chromedriver                        0x0000000103096798 chromedriver + 4007832
2   chromedriver                        0x0000000102cf2cc4 chromedriver + 191684
3   chromedriver                        0x0000000102d218d4 chromedriver + 383188
4   chromedriver                        0x0000000102d4a648 chromedriver + 550472
5   chromedriver                        0x0000000102d16cf4 chromedriver + 339188
6   chromedriver                        0x00000001030d37d0 chromedriver + 4257744
7   chromedriver                        0x00000001030d7580 chromedriver + 4273536
8   chromedriver                        0x00000001030db6c4 chromedriver + 4290244
9   chromedriver                        0x00000001030d7cf8 chromedriver + 4275448
10  chromedriver                        0x00000001030b7b38 chromedriver + 4143928
11  chromedriver                        0x00000001030ef21c chromedriver + 4370972
12  chromedriver                        0x00000001030ef380 chromedriver + 4371328
13  chromedriver                        0x0000000103103154 chromedriver + 4452692
14  libsystem_pthread.dylib             0x00000001be2114ec _pthread_start + 148
15  libsystem_pthread.dylib             0x00000001be20c2d0 thread_start + 8


In [78]:
#Once all of the pages have been accessed, loop through each 
# article and grab the necessary info (The href link)
articles = driver.find_elements(By.CLASS_NAME ,'article')
article_links = []
for article in articles:
    # article = article.find_element(By.CLASS_NAME, 'title')
    # article = article.find_element(By.TAG_NAME, 'a')
    article_links.append(article.find_element(By.TAG_NAME, 'a').get_attribute('href'))


['https://www.foxnews.com/opinion/tucker-position-passive-aggressive-party',
 'https://www.foxnews.com/opinion/tucker-biden-plan-torque-up-fear',
 'https://www.foxnews.com/opinion/tucker-why-is-everyone-mad-justice-alito',
 'https://www.foxnews.com/opinion/tucker-roe-most-embarrassing-court-decision-handed-down-last-century',
 'https://www.foxnews.com/opinion/tucker-inevitable-end-stage-russiagate',
 'https://beap.gemini.yahoo.com/mbclk?bv=1.0.0&es=U.ZFShkGIS_iMLdxtvf9T1OpWNFtu5Fy4OXbRIwmQK24EllUixfDWH4WoLvCNj5mZBA5clh7nQyoY7WKsN6hMq_RyltGZp3lcn6tRrY4D5vuk0GQ0YrtoQv7YLEXZb543XtRqN_qGA61jhfPx8eNWaGnQ1kZh0v_yEmaCgTzqUEszMo6awF4HjrFfmnchZP7j6_nZXQ_WgFfBiT4P8C07aQRGI_I12pY1YYoxfzVIlK5H7pLvQQHqAdUt_wVuQMwcj33mOCLNYRYtVC57rDFam4PN0O1Z19LDiyD1W7ho1pjnw3cv5eO3_KxTrIAVmVtkTx_cnSh.Mca2PqKJZ1rVmZtxGj5f6g3pBC_zzrxkXOWwTVrb0.DwjSY3_kTolf4cTtHtyiOGgJlih1GbrxUiCyOsvCLohHwlB4_1Z432SVLHeTHSzGqdxOY.i7SzdXVSxhyMxV6WKY25YP2iOmEIOpGq2o2clPLeOImJxq_hiR9RvEC6QaXXUkeamEATxDreCn_nlwn4VLoPTYgbLa_tKmb3ANLilW.loZ

In [79]:
#Remove any link that isn't a tucker carlson article
cleaned_links = []
for link in article_links:
    if 'tucker' in link:
        cleaned_links.append(link)


In [83]:
#Save the cleaned up links as a csv file for later use
np.savetxt("data/tucker_links.csv", 
           cleaned_links,
           delimiter =", ", 
           fmt ='% s')

# Accessing the articles with BeautifulSoup

In [3]:
#Open up the tucker links CSV and load it into the notebook
with open('data/tucker_links.csv') as f:
    tucker_csv = csv.reader(f, delimiter = ',')
    tucker_links = []
    for link in tucker_csv:
        tucker_links.append(link[0])

In [45]:
#Some functions to process the pages

#Parse fox news page
def tucker_parse(html_page):
    soup = BeautifulSoup(html_page.content, 'html.parser') # Pass the page contents to beautiful soup for parsing
    main = soup.body.find('div', id = '__nuxt').find('div', id = '__layout').find('div', id = 'wrapper').find('div', class_= 'page-content').div.main
    article_body = main.article.find('div', class_ = 'article-content-wrap sticky-columns').div.div
    return article_body

#Turn the parsed page into a string
def tucker_content(article):
    page_content = ''
    for paragraph in article.findAll('p'):
        if paragraph.a is None:
            page_content = page_content + ' ' + str(paragraph.text)
    return page_content

#Create a function that loops through every tucker page and gets text data as list
def tucker_documents(links,sleep_time):
    tucker_doc = []
    for link in links:
        html_page = requests.get(link)
        article = tucker_parse(html_page)
        page_content = tucker_content(article)
        tucker_doc.append(page_content)
        time.sleep(sleep_time)
    return tucker_doc

In [None]:
#Stopping notebook before it can run this cell
raise NotImplementedError('Do not run the next cell unless you really need to.')
#Run through every article on Tucker Carlson on Fox News
tucker_docs = tucker_documents(tucker_links, 2)

In [33]:
#Save the file as a CSV to be used in EDA notebook
with open('data/tucker_docs.csv', 'w', encoding='UTF8', newline='') as f:
    # create the csv writer
    writer = csv.writer(f)
    # write a row to the csv file
    writer.writerow(tucker_documents)

#Pickle the DataFrame
tucker_df = pd.DataFrame(tucker_documents, columns = ['Document'])
tucker_df.to_pickle('data/tucker_pickle')

## Twitter stuff

In [14]:
#Loads keys variables as key, secret, bearer
import tweepy
%run secrets/.keys.ipynb

### Generate a list of user IDs
THIS IS ALL CRAP DON'T USE THIS SECTION

In [24]:
#This cell creates a sample stream of users on twitter and parses their user IDs to be used later for creating a twitter sample set

#raise NotImplementedError('Do not run the next cell unless you really need to')

import requests
import os
import json
import ast

bearer_token = bearer
users = []

def create_url():
    return "https://api.twitter.com/2/tweets/sample/stream"


def bearer_oauth(r):
    """
    Method required by bearer token authentication.
    """

    r.headers["Authorization"] = f"Bearer {bearer_token}"
    r.headers["User-Agent"] = "v2SampledStreamPython"
    return r


def connect_to_endpoint(url):
    response = requests.request("GET", url, auth=bearer_oauth, stream=True)
    print(response.status_code)
    
    for response_line in response.iter_lines():

        if response_line:
            json_response = json.loads(response_line)
            print(response_line)
            #users.append(json.dumps(json_response, indent=4, sort_keys=True))

    if response.status_code != 200:
        raise Exception(
            "Request returned an error: {} {}".format(
                response.status_code, response.text
            )
        )


def main():
    url = create_url()
    timeout = 0

    while True:
        check = connect_to_endpoint(url)
        #users.append(ast.literal_eval(check)['data']['id'])
        timeout += 1



if __name__ == "__main__":
    main()



200
b'{"data":{"id":"1523670090541322240","text":"@riyad_jobs \xe2\x9c\x8d\xe2\x9c\x8d\xd8\xb3\xd9\x8a\xd8\xb1\xd8\xaa\xd9\x83 \xd8\xa7\xd9\x84\xd8\xb0\xd8\xa7\xd8\xaa\xd9\x8a\xd8\xa9 \xd8\xa5\xd8\xad\xd8\xaf\xd9\x89 \xd8\xa3\xd9\x87\xd9\x85 \xd8\xb9\xd9\x88\xd8\xa7\xd9\x85\xd9\x84 \xd8\xa7\xd9\x84\xd9\x82\xd8\xa8\xd9\x88\xd9\x84 \xd9\x88\xd8\xa7\xd9\x84\xd8\xa7\xd9\x86\xd8\xb7\xd8\xa8\xd8\xa7\xd8\xb9 \xd8\xa7\xd9\x84\xd8\xac\xd9\x8a\xd8\xaf \xd9\x84\xd8\xaf\xd9\x89 \xd9\x84\xd8\xac\xd9\x86\xd8\xa9 \xd8\xa7\xd9\x84\xd9\x85\xd9\x82\xd8\xa7\xd8\xa8\xd9\x84\xd8\xa7\xd8\xaa \xd9\x88\xd8\xb4\xd8\xb1\xd9\x83\xd8\xa7\xd8\xaa \xd8\xa7\xd9\x84\xd9\x88\xd8\xb8\xd8\xa7\xd8\xa6\xd9\x81.\\n\\n\xf0\x9f\x91\x8d\xd8\xaf\xd8\xb9\xd9\x86\xd8\xa7 \xd9\x86\xd8\xb5\xd9\x85\xd9\x85\xd9\x87\xd8\xa7 \xd9\x84\xd9\x83 \xd8\xa8\xd8\xb7\xd8\xb1\xd9\x8a\xd9\x82\xd8\xa9 \xd8\xa7\xd8\xad\xd8\xaa\xd8\xb1\xd8\xa7\xd9\x81\xd9\x8a\xd8\xa9 \xd9\x85\xd9\x88\xd8\xa7\xd9\x83\xd8\xa8\xd8\xa9 \xd9\x88\xd9\x85\xd8\xb9\xd8\xa7\

KeyboardInterrupt: 

In [12]:
#Build a list of all of the user IDs in the sample set

user_dicts = []
for user in users:
    try:
        user_dicts.append(ast.literal_eval(user))
    except:
        continue;

user_ids = [user['data']['id'] for user in user_dicts]


In [14]:
#Save the user IDs as a csv
raise NotImplementedError
pd.DataFrame(user_ids, columns = ['user_id']).to_csv('data/user_ids.csv')

NotImplementedError: 

### Generate a set of tweets for each user in the dataset
Take list of user IDs and generate a list of tweets to be used in the NLP system

In [15]:
client = tweepy.Client(bearer_token=bearer)

In [16]:
def get_tweets(user_id):
    tweets = client.get_users_tweets(id = user_id, max_results = 100)
    tweets_list = []
    try:
        for tweet in tweets.data:
            tweets_list.append(tweet.text)
            if len(tweet.context_annotations) > 0:
                tweets_meta.append(tweet.context_annotations)
        return tweets_list
    except:
        return None

### This is to find unique users
Already ran, don't run again!!!

In [6]:
raise NotImplementedError
query = 'roe v. wade -has:media -is:retweet'
#response = client.search_recent_tweets(query = query, tweet_fields=['created_at', 'lang', 'id'], expansions = 'author_id')
users_dict = {'id': [], 'lang': []}
for tweet in tweepy.Paginator(client.search_recent_tweets, query = query, tweet_fields=['created_at', 'lang', 'id', 'author_id']).flatten(limit = 20000):
    if tweet.author_id not in users_dict['id']:
        users_dict['id'].append(tweet.author_id)
        users_dict['lang'].append(tweet.lang)

TooManyRequests: 429 Too Many Requests
Too Many Requests

In [10]:
raise NotImplementedError
df = pd.DataFrame(users_dict)
df = df.loc[df['lang'] == 'en']
df.to_csv('data/user_ids.csv')

In [18]:
#Load up the saved data
df = pd.read_csv('data/user_ids.csv')

### This section pulls the tweet history from every user

In [19]:
#Initialize the parameters
n = 0
user_tweets_list = []
user_id_list = list(df['id'])


In [20]:
def get_next_tweets(n, checkpoint_step):
    user_tweets = []
    next_checkpoint = n + checkpoint_step
    while n < next_checkpoint:
        n += 1
        user_tweets.append(get_tweets(user_id_list[n]))
    
    return user_tweets, n

In [21]:
for _ in range(400):
    try:
        step_size = 15
        new_tweets, n = get_next_tweets(n, step_size)
        user_tweets_list = user_tweets_list + new_tweets

        tweet_df = pd.DataFrame(np.array(user_tweets_list), columns = ['tweet_history'])
        #tweet_df.to_csv('data/unique_tweets_list.csv')
    except:
        time.sleep(100)

  tweet_df = pd.DataFrame(np.array(user_tweets_list), columns = ['tweet_history'])


KeyboardInterrupt: 

In [13]:
tweet_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 840 entries, 0 to 839
Data columns (total 1 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   tweet_history  840 non-null    object
dtypes: object(1)
memory usage: 6.7+ KB
