In [None]:
import ast
import concurrent.futures
import datetime
import json
import os
import re
import time
from functools import partial

import bs4 as bs
import matplotlib.pyplot as plt
import pandas as pd
import requests


###### Link to sample data on Google Drive

https://drive.google.com/file/d/1rAqpbo3z-pjUNZizY0qgvMSIezLX_xlC/view

# 1: Python Requests Library

### We use the Python 'requests' library to interact with the internet
###### Requests Docs: https://requests.kennethreitz.org/en/master/#
###### Requests Tutorial: https://realpython.com/python-requests/
###### Requests Youtube tutorial: https://www.youtube.com/watch?v=tb8gHvYlCFs

In [None]:
# makes a 'GET' request to Yahoo Finance; a successful request has status code 200
# we can use 'response' to access everything the server sent back, including the page source
# NOTE: 'lxml' is the name of an HTML parser and belongs to BeautifulSoup. The second positional
# argument of requests.get() is 'params' (query-string data), so it is not passed here.
# 'timeout' stops the cell from hanging forever if the server never answers.
response = requests.get('https://finance.yahoo.com/quote/TSLA?p=TSLA', timeout=10)
# this should print '<Response [200]>', indicating a successful request and response
print(response)

### '.text' method gives us the page source

In [None]:
# we are assigning text value of the reponse to variable 'response_text' 
response_text = response.text

# notices that requests returns the page as a 'str'
print(f'response_text type: {type(response_text)}')

# prints out a spacer, makes output more readable
print('–'*50)
# print out 
print(response_text)

### The 'beautifulsoup' module allows us to search the webpage by tag/selector
Beautiful Soup module allows us to search webpages based on their html tags, classes, or ids
###### BS Docs: https://www.crummy.com/software/BeautifulSoup/bs4/doc/
###### BS Tutorial: https://www.digitalocean.com/community/tutorials/how-to-scrape-web-pages-with-beautiful-soup-and-python-3
###### BS Youtube Tutorial: https://www.youtube.com/watch?v=ng2o98k983k


In [None]:
# creates a 'bs4.BeautifulSoup' object. The parser ('lxml') is named explicitly so the result
# does not depend on which parsers happen to be installed, and bs4 does not warn about guessing one
source = bs.BeautifulSoup(response_text, 'lxml')

# notice that the 'source' type is a BeautifulSoup object, we can search this for specific elements
print('SOURCE TYPE:\n', type(source))

# spacer ('\n' means 'new line')
print('\n')

# prints out the parsed page source
print('SOURCE:\n', source)

##### Find all links in the webpage

In [None]:
# finds all elements with the 'a' tag
links = source.find_all('a')

# here we are iterating over the 'list' of links, and printing out each 'link' in the list
for link in links:

    # 'link' is any 'a' tag on the page source. This includes the url, its classes, ids and any other attributes
    print('LINK:\n', link)

    # we are filtering link to just the 'href' attribute, this contains the actual url.
    # '.get' is used instead of link['href'] because an 'a' tag is not required to have an
    # 'href'; link['href'] raises a KeyError for such tags, '.get' returns None instead
    print('URL:\n', link.get('href'))

    # print the text value of the link, this is what is displayed on the web page
    print('TEXT:\n', link.text)

    # divider, makes output more readable
    print("-"*100)

# 2: Get Company Financial Data

In [None]:
# takes any ticker symbol and returns financial data as a dictionary (‘dict’)
# ':str' and '-->' are examples of function annotations. Can be accessed with '.__annotations__'
# annotations give usefull information about the function
def get_company_data(ticker: str) -> dict:
    # this is called a 'doc string', a brief description of the function. Can be accessed with '.__doc__'
    '''
    Takes any ticker symbol and returns financial data as a dict

    Parameters: A ticker symbol (str)
    Returns: A dict of financial data from Yahoo Finance. It always contains 'ticker' and
             '!status'. On success '!status' is the HTTP status code, '!!company_name' is the
             name shown in the page header (None if the header is not found) and there is one
             entry per statistics table row (category -> value). On failure '!status' is a
             short description of what went wrong and no statistics are included.
    '''

    # url of the yahoo 'key statistics' page for this ticker
    url = f'https://finance.yahoo.com/quote/{ticker}/key-statistics?p={ticker}'

    # header passed with the request
    headers = {'User-Agent': 'Mozilla/5.0'}

    # makes request. 'lxml' is a parser name for BeautifulSoup; as the second positional argument
    # of requests.get() it was sent as query-string 'params', so it is passed to bs4 below instead.
    # Connection problems and timeouts are reported the same way as the other failures
    try:
        response = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException as error:
        return {'ticker': ticker, '!status': f'request failed: {error}'}

    # handles bad url, response should be 200, anything else is an error
    if response.status_code != 200:
        return {'ticker': ticker, '!status': f'code {response.status_code}'}

    # main bs page object
    source = bs.BeautifulSoup(response.text, 'lxml')
    # finds the first 'section' tag with a 'data-test' attribute of 'qsp-statistics'
    stats_section = source.find('section', {'data-test': 'qsp-statistics'})

    # handles invalid ticker symbol, ".find()" returns "None"
    if stats_section is None:
        return {'ticker': ticker, '!status': 'data == None'}

    # finds the company name: the 'h1' inside the 'div' with an id of 'quote-header-info'.
    # Either lookup can return None, so each step is checked before it is used
    header = source.find('div', {'id': 'quote-header-info'})
    title = header.find('h1') if header is not None else None
    company_name = title.text if title is not None else None

    # Python dictionary object. The '!' prefix makes these keys sort ahead of the statistic
    # names when the columns are ordered alphabetically
    info_dict = {'ticker': ticker, '!status': response.status_code, '!!company_name': company_name}

    # 'find_all' returns a 'list' of all 'tr' ('table row') elements in the statistics section
    for row in stats_section.find_all('tr'):

        # 'td' (table data) tag. Think of this as a cell in the table. These tables have two
        # columns: the first is the category name, the second is the data value
        cells = row.find_all('td')

        # rows without both cells (for example header rows) carry no category/value pair
        if len(cells) < 2:
            continue

        # add the category (key) and its value to the dictionary
        info_dict[cells[0].text.strip()] = cells[1].text.strip()

    # function returns the dictionary, if we set a variable equal to this function,
    # that variable will be a dictionary ('dict' object)
    return info_dict
    

In [None]:
# function annotations
print(get_company_data.__annotations__)
# function doc string
print(get_company_data.__doc__)

### We use the 'Pandas' (imported as 'pd') module to organize the data into a useable table
###### Pandas Docs: https://pandas.pydata.org/pandas-docs/stable/
###### Pandas Tutorial: https://pandas.pydata.org/pandas-docs/stable/getting_started/10min.html
###### Pandas Youtube Tutorial: https://www.youtube.com/watch?v=0UA49Ds1XXo

In [None]:
# Python 'list' that contains four tickers ('str')
ticker_list = ['tsla', 'goog', 'gs', 'fb']

# an empty Python 'list'. This is used to store data temporarily before being turned into a pandas 'DataFrame'
company_data_list = []

# we are iterating over the 'ticker_list'
for ticker in ticker_list:
    
    # feeds each of the tickers in the list into the 'get_company_data' function we defined earlier
    # and assigns the return value of this function ('dict') to a variable 'company_data'
    company_data = get_company_data(ticker)
    
    # adds the 'company_data' to the 'company_data_list' we created earlier
    company_data_list.append(company_data)
    
    print(ticker.upper())
    # NOTE(review): this prints the whole accumulated list on every pass, not only the newest entry
    print(company_data_list)


In [None]:
# variable 'df' is created, this will be our 'DataFrame'
# 'pd.DataFrame' is a pandas 'DataFrame' object. You can think of it  
# much like an excel spreadsheet (rows and columns)
# the dataframe is created by passing a 'list' of dictionaries ('dict') which pandas converts
df = pd.DataFrame(company_data_list)

# by default, pandas creates an numerical index (1,2,3...). We are setting the index to the 'ticker' column
df.set_index('ticker', inplace=True)

# shape give us the rows and columns of the 'df'
print(df.shape)

# jupyter will display the 'df' if it is the last line in a cell, without useing the 'print' function
df

# 3: Get List of S&P 500 Tickers
###### Wikipedia list of S&P 500 Companies:  https://en.wikipedia.org/wiki/List_of_S%26P_500_companies
###### Yahoo: https://finance.yahoo.com/quote/TSLA/key-statistics?p=TSLA

In [None]:
# creates a list of S&P 500 tickers from Wikipedia, returns dictionary of company ticker and name
# Note this function takes no parameters, the url is included in the function,
# will not work with other Wikipedia pages
def get_tickers() -> list:
    '''
    Reads the S&P 500 ticker symbols from the constituents table on Wikipedia.

    Parameters: None
    Returns: A list of ticker symbols (str), in the order the rows appear in the table
    Raises:
        requests.HTTPError: Wikipedia answered with an error status
        RuntimeError: the page does not contain a table with id 'constituents'
    '''
    # unlike the previous function the url is not going to change
    url = 'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies'

    # optional headers, in this case we are changing the user agent
    headers = {'User-Agent': 'Mozilla/5.0'}

    # request the Wikipedia page. The parser name ('lxml') is given to BeautifulSoup below;
    # as the second positional argument of requests.get() it would be sent as query 'params'
    response = requests.get(url, headers=headers, timeout=10)

    # stop here with a clear error instead of trying to parse an error page
    response.raise_for_status()

    # bs object, searchable by 'dom' elements
    source = bs.BeautifulSoup(response.text, 'lxml')

    # find the first element with a 'table' tag, and an 'id' of 'constituents'
    main_table = source.find('table', {'id': 'constituents'})
    if main_table is None:
        raise RuntimeError(f"no table with id 'constituents' found at {url}")

    # after finding the 'main_table', we select just the table body 'tbody';
    # if the markup has no explicit 'tbody' the rows are searched in the table itself
    table_body = main_table.find('tbody')
    if table_body is None:
        table_body = main_table

    # each ticker is added to this list
    company_list = []
    # iterate over the table rows 'tr', select each ticker symbol
    for row in table_body.find_all('tr'):

        # row contains list of 'td' (table data) elements
        row_cells = row.find_all('td')

        # skips any row missing essential data (first and second column), e.g. the header row
        if len(row_cells) <= 1:
            continue

        # first column in table. '.strip' removes white space
        company_list.append(row_cells[0].text.strip())

    # return Python list containing all tickers found in the table
    return company_list
              

##### Runs the 'get_tickers()' function, creates list of tickers

In [None]:
# creates a variable 'tickers_list' which is set equal to the output of 'get_tickers()' function
tickers_list = get_tickers()

# prints number of tickers in 'tickers_list'
print(f"{len(tickers_list)} tickers in list")

# prints the entire 'ticker_list'
print(tickers_list)

##### Gets financial data for the first 10 tickers

In [None]:
# gets data for first 10 on a single thread
# 't0' is start time
t0 = time.time()

# data for each company will be added to this list
company_data_list = []
# itterate over the first 10 tickers in the tickers list.
# '[:10]' is slice notation for first 10 elements in list
for ticker in tickers_list[:10]:
    
    # variable 'company_data' is set to the return value of 'get_company_data' function, a dictionary
    company_data = get_company_data(ticker)
    
    # add the company data dictionary to 'company_data_list'
    company_data_list.append(company_data)
    
    print(ticker)
    
# 't1' is current time when this line is evaluated
t1 = time.time()
print("{:.4} seconds".format(t1-t0))

# makes 'DataFrame' from 'company_data_list'
df = pd.DataFrame(company_data_list)

# set the ticker column as the index
df.set_index('ticker', inplace=True)

# df rows and columns
print(df.shape)

##### Prints the DataFrame

In [None]:
# df rows and columns
print(df.shape)

# prints df
df

# 4: Saving Data
### convert the data to a CSV from the pandas DataFrame

In [None]:
# Current Working Directory. The current folder this file is in
print(os.getcwd())

In [None]:
# all files in directory before making the csv, 'enumerate' function numbers what is being looped over
for index, file in enumerate(os.listdir()):
    print(index, file)

In [None]:
# save as CSV (Comma Separated Values)
df.to_csv('financial_data.csv')

# save a second copy stamped with the current date and time so an existing file is not overwritten.
# time.asctime() contains spaces and ':' characters, and ':' is not allowed in Windows file
# names, so a compact timestamp (year-month-day_hour-minute-second) is used instead
timestamp = time.strftime('%Y%m%d_%H%M%S')
df.to_csv(f'financial_data_{timestamp}.csv')


In [None]:
# all files in directory before making the csv
for index, file in enumerate(os.listdir()):
    print(index, file)

In [None]:
# create DataFrame from CSV, setting the index collumn to the first (0th) column in the CSV
df2 = pd.read_csv('financial_data.csv', index_col=0)
df2

# 5: Multithread Requests (Thread pool executor)
### Request multiple pages at once, saves time

In [None]:
# re run 'get_tickers()'
tickers_list = get_tickers()
print(f"{len(tickers_list)} tickers in list")

#####  'thread_function ' makes multiple requests at once

In [None]:
# asynchronously call "get_company_data". Acts as a 'wrapper' for the other functions
def thread_function(num, input_ticker_list=tickers_list, get_company_data_function=get_company_data) -> None:
    '''
    Worker for the thread pool: fetches the data for one ticker and stores it.

    Parameters:
        num: position of the ticker in 'input_ticker_list' (counting from 0, so num=5 is the 6th ticker)
        input_ticker_list: list of ticker symbols; the default is the 'tickers_list' that
                           existed when this function was defined
        get_company_data_function: function called with the ticker to fetch its data
    Returns: None, the result is appended to the shared 'company_data_list'
    '''
    # look up the ticker by position and fetch its data
    fetched = get_company_data_function(input_ticker_list[num])

    # store the result in the shared list
    company_data_list.append(fetched)
    

##### Runs 'thread_function' with 'tickers_list'

In [None]:
# start time
t0 = time.time()

# 'list' of dictionaries 'dict'; 'thread_function' appends to this shared list
company_data_list = []

# 'multithread' the requests. This means we are making multiple requests at once.
# 'max_workers=5' is the max number of threads we can have open at any one time.
# Leaving the 'with' block waits until all submitted calls have finished.
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:

    # 'mapping' the function to a range of integers. Each 'int' corresponds to an index in the list,
    # so this fetches the first 10 tickers of the function's default ticker list.
    # NOTE: the iterator returned by 'map' is never read, so an exception raised inside a
    # worker is not re-raised here; that ticker is simply missing from 'company_data_list'.
    executor.map(thread_function, range(10))

# create the 'DataFrame'
df = pd.DataFrame(company_data_list)
df.set_index('ticker', inplace=True)


# end time
t1 = time.time()
# prints time it took for this cell to execute (requests plus building the DataFrame)
print("{:.4} seconds".format(t1-t0))
print(df.shape)

In [None]:
print(df.shape)
df

#### filters the 'tickers_list' based on starting letter

In [None]:
# filters 'tickers_list' for tickers that start with given letter. * = 'args' = multiple arguments
def starts_with(*letters, tickers_list=tickers_list):
    '''
    Returns the tickers that begin with any of the given letters.

    Parameters:
        *letters: one or more prefixes (str); each is upper-cased before comparing
        tickers_list: list of ticker symbols to search; the default is the 'tickers_list'
                      that existed when this function was defined
    Returns: A list grouped by letter in the order the letters were given; within a group
             the tickers keep their order from 'tickers_list'
    '''
    return [
        symbol
        for letter in letters
        for symbol in tickers_list
        if symbol.startswith(letter.upper())
    ]

letter_1 = 's'
letter_2 = 't'
letter_3 = 'r'

letter_list = starts_with(letter_1, letter_2, letter_3)

print(len(letter_list))
print(letter_list)



In [None]:
# 'list' of dictionaries 'dict'
company_data_list = []

# max workers = max number of threads to be open at any one time
# 'multithread' the requests. This means we are makeing multiple requests at once.
# 'max_workers=5' is the max number of threads we can have open at ay one time.
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:

    # 'mapping' the function to a range of integers. Eaxh 'int' corasponds to a index in the list
    # 'partial' function allows us to pass one parameter to the functin before mapping it 
    executor.map(partial(thread_function, input_ticker_list=letter_list), range(len(letter_list)))

# create the data frame
df = pd.DataFrame(company_data_list)
df.set_index('ticker', inplace=True)

print(df.shape)

In [None]:
df1 = df[df.index.str.startswith(letter_1.upper())]
df2 = df[df.index.str.startswith(letter_2.upper())]
df3 = df[df.index.str.startswith(letter_3.upper())]

In [None]:
print(df1.head().index)
print(df2.head().index)
print(df3.head().index)

In [None]:
df3['Profit Margin']

In [None]:
# print(df["Profit Margin "])
df['PM_num'] = df['Profit Margin'].str.replace('%','') 

In [None]:
df['PM_num']

In [None]:

def get_pm(df):
    '''
    Returns the average of the 'Profit Margin' column, in percent.

    Parameters: A DataFrame with a 'Profit Margin' column of strings such as '12.5%'
    Returns: The mean (float) of all entries that are percentage strings. Entries that are
             not strings (NaN, None) or contain no '%' (e.g. 'N/A') are skipped.
             float('nan') is returned when there is no usable entry at all.
    '''
    profit_margins = []
    for raw_margin in df['Profit Margin']:
        # missing values arrive as float NaN or None; placeholders like 'N/A' have no '%'
        if not isinstance(raw_margin, str) or '%' not in raw_margin:
            continue
        # drop the percent sign and any thousands separators ('1,234.50%') before converting
        profit_margins.append(float(raw_margin.replace('%', '').replace(',', '')))

    # nothing to average: report 'not a number' instead of dividing by zero
    if not profit_margins:
        return float('nan')

    return sum(profit_margins) / len(profit_margins)

get_pm(df)

# 6: Visualize Data

In [None]:
y = [get_pm(df1), get_pm(df2), get_pm(df3)]
x = [letter_1, letter_2, letter_3]

plt.bar(x,y)
plt.show()

In [None]:
# x values (ticker symbols)
labels = []
# y values (profit margin in percent)
values = []

for index, row in df1.iterrows():
    pm = row['Profit Margin']

    # skip missing values (NaN is a float) and entries without a percent sign (e.g. 'N/A')
    if type(pm) == float or ('%' not in pm):
        continue

    pm = float(pm.replace('%',''))

    # only 'labels' and 'values' are filled here. The former 'profit_margins.append(pm)' line
    # raised a NameError: that list exists only inside the 'get_pm' function
    labels.append(index)
    values.append(pm)


In [None]:

fig = plt.figure(figsize=(18,8))
ax = fig.add_subplot(111)

ax.bar(labels, values)

plt.xticks(rotation=45)
plt.ylabel('Profit margin (%)')
plt.title('Profit margin by company')

plt.grid()

# WSJ Headline Word Frequency

In [None]:
# list the saved csv files
files = os.listdir(os.path.join(os.getcwd(), 'wsj_csvs'))

# keep the name (without extension) of every '.csv' file; anything else in the folder,
# such as '.DS_Store', is ignored.
# 'os.path.splitext' is used because str.strip('.csv') does not remove a suffix: it removes
# any of the characters '.', 'c', 's', 'v' from both ends of the name
file_list = []
for file in files:
    name, extension = os.path.splitext(file)
    if extension == '.csv':
        file_list.append(name)

print(len(file_list))
print(min(file_list))
print(max(file_list))

In [None]:
# create a list of pandas DataFrames from the WSJ CSVs
df_list = []
for file in files:
    # the folder listing can contain other entries (e.g. '.DS_Store'); only CSVs are read
    if not file.endswith('.csv'):
        continue
    df_list.append(pd.read_csv(f"wsj_csvs/{file}", index_col=0))
print(len(df_list))


In [None]:
# create a master DataFrame with every wsj_csv
df = pd.concat(df_list)
df.reset_index(inplace=True, drop=True)

# turn the 'date' column (formatted like 20120131) into real dates and use it as the index
df['date'] = pd.to_datetime(df['date'].copy(), format='%Y%m%d')
df.set_index('date', inplace=True, drop=True)
# 'sort_index' returns a sorted copy unless 'inplace=True' is given; without it the
# sorted result was thrown away and 'df' stayed in file order
df.sort_index(inplace=True)
# one row per article, so summing 'count' later gives the number of articles
df['count'] = 1
df

In [None]:
keyword_one = 'Obama'
keyword_two = 'Romney'
frequency = '14D'
window = 4
# df = df.loc['2012-01-01':'2019-10-24']


def keyword_frequency(articles, keyword, frequency, window):
    '''
    Counts the articles that mention 'keyword' in their summary or headline.

    Parameters:
        articles: DataFrame with a date index and 'summary', 'headline' and 'count' columns
        keyword: text to look for (passed to 'str.contains')
        frequency: resampling period, e.g. '14D'
        window: number of periods in the simple moving average
    Returns: (matching articles, DataFrame with 'count' per period and its 'SMA')
    '''
    # 'na=False' treats a missing summary/headline as "no match"; without it the mask
    # contains NaN and cannot be used to select rows
    mentions = (
        articles['summary'].str.contains(keyword, na=False) |
        articles['headline'].str.contains(keyword, na=False)
    )
    matches = articles[mentions].copy()
    resampled = pd.DataFrame(matches['count'].resample(frequency).sum())
    resampled['SMA'] = resampled['count'].rolling(window=window).mean()
    return matches, resampled


# keyword one
df_1, df_1_re = keyword_frequency(df, keyword_one, frequency, window)

# keyword two
df_2, df_2_re = keyword_frequency(df, keyword_two, frequency, window)

print(len(df_1))
print(len(df_2))

In [None]:
# the style has to be chosen before the figure is created;
# calling it after plotting does not restyle the figure that was already drawn
plt.style.use('ggplot')

fig = plt.figure(figsize=(18,8))
ax = fig.add_subplot(111)

# raw counts in a light colour, moving average in the matching dark colour
ax.plot(df_1_re['count'], color='pink', label=keyword_one)
ax.plot(df_1_re['SMA'], color='red', label=keyword_one+' SMA')

ax.plot(df_2_re['count'], color='lightblue', label=keyword_two)
ax.plot(df_2_re['SMA'], color='blue', label=keyword_two+' SMA')

ax.legend()

plt.xticks(rotation=0)
ax.set_ylabel('Articles per period')
ax.set_title('Word frequency in WSJ headlines')
plt.show()

# NOTE: Following sections will NOT work with Anaconda alone

# 7: Using An In Memory Data Base (Redis)

In [None]:
import redis

In [None]:
# must start redis server in terminal first
r_db = redis.Redis(port=6377, db=0)

In [None]:
# set a key value pair 'name' : 'Stefan'
r_db.mset({"name": "Stefan"})

In [None]:
# key value from db
r_db.mget('name')[0].decode('UTF-8')

In [None]:
# clearS db
r_db.flushall()

In [None]:
print(f"{len(tickers_list)} tickers in list")

In [None]:
# asynchronously call "get_company_data"
def thread_map(num, input_tickers_list=tickers_list, get_company_data_function=get_company_data):
    '''
    Worker for the thread pool: fetches the data for one ticker and stores it in redis.

    Parameters:
        num: position of the ticker in 'input_tickers_list'
        input_tickers_list: list of ticker symbols; the default is the 'tickers_list' that
                            existed when this function was defined
        get_company_data_function: function called with the ticker to fetch its data
    Returns: None, the data is written to 'r_db' under the ticker as key
    '''
    symbol = input_tickers_list[num]
    # the dict is stored as its Python text form (str(dict)), not as JSON
    serialized = str(get_company_data_function(symbol))
    r_db.mset({symbol: serialized})
    

In [None]:
t0 = time.time()
with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        executor.map(thread_map, range(40))
        
t1 = time.time()
print("{:.4} seconds".format(t1-t0))


In [None]:
# create dataframe from data in redis db.
# Each value was stored with str(dict), i.e. as a Python literal and not as JSON, so it is read
# back with 'ast.literal_eval'. Swapping "'" for '"' and calling json.loads fails as soon as a
# value contains an apostrophe or a double quote (e.g. a company name) or is None
df = pd.DataFrame(
    [ast.literal_eval(r_db.get(ticker).decode('UTF-8')) for ticker in r_db.keys()]
).set_index('ticker')

print(f"{len(df.index)} Rows, {len(df.columns)} Columns")
df

# Common issues
### Sometimes content on a webpage won't appear in the 'requests' response

In [None]:
# 'lxml' is a BeautifulSoup parser name, not a requests option: the second positional argument
# of requests.get() is 'params', so it was being sent as part of the query string
response = requests.get('https://www.wsj.com/news/archive/20041001', timeout=10)

In [None]:
source = bs.BeautifulSoup(response.text)
# fine all elements with an article tag
articles = source.find_all('article')
print(len(articles))
for article in articles:
    print(article.text)
    print('-'*30)

# 8: Selenium (render full web page before extracting data)

In [None]:
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.keys import Keys
import datetime
import getpass

In [None]:
# define webdriver use firefox browser
options = Options()
# run without broswer window
#options.add_argument('--headless')
driver = webdriver.Firefox(options=options)

In [None]:
driver.get('https://www.wsj.com/news/archive/20080608')

### Get newspaper title and article summary from WSJ archives

In [None]:
# creates a list of dates for wsj archive url
def create_date() -> list:
    '''
    Builds the dates used in the WSJ archive urls.

    Parameters: None
    Returns: A list of 'YYYYMMDD' strings, one per day, from 1996-04-07
             (the day after the anchor date below) up to and including today
    '''
    # the anchor itself is not part of the list, the first entry is the following day
    anchor = datetime.date(1996, 4, 6)
    days_since_anchor = (datetime.date.today() - anchor).days
    return [
        (anchor + datetime.timedelta(days=offset)).strftime('%Y%m%d')
        for offset in range(1, days_since_anchor + 1)
    ]


In [None]:
# list list of possible dates fro url
dates = create_date()
print(f"{len(dates)} total dates")
print(dates[1:3],dates[-3:-1])

In [None]:
# current directory
print(os.listdir())

In [None]:
csvs_folder = 'wsj_csvs2'
errors_folder = 'wsj_errors2'

In [None]:
# makes directory for each days csvs
if not os.path.exists(csvs_folder):
    os.mkdir(csvs_folder)
print(os.listdir())

In [None]:
# makes errors directory for each days csvs
if not os.path.exists(errors_folder):
    os.mkdir(errors_folder)
print(os.listdir())

In [None]:
# takes a formatted date, appends to WSJ archive url, returns df of days articles
def get_days_news(date):
    '''
    Loads the WSJ archive page for one day in the selenium 'driver' and saves its articles.

    Parameters: A date formatted as it appears in the archive url (e.g. '20080608')
    Returns: A DataFrame with 'section', 'headline', 'summary' and 'date' columns, which is
             also written to '<csvs_folder>/<date>.csv'. If no articles show up after 20
             one-second retries, '<errors_folder>/<date>.txt' is written and None is returned.
    '''
    story_selector = "article[class*='WSJTheme--story']"

    def current_stories():
        # re-read the page from the browser and pick out the article rows
        return bs.BeautifulSoup(driver.page_source).select(story_selector)

    driver.get(f'https://www.wsj.com/news/archive/{date}')
    stories = current_stories()

    # the page may still be loading: retry once per second, give up after 20 tries
    attempts = 0
    while not stories:
        time.sleep(1)
        stories = current_stories()
        attempts += 1
        if attempts >= 20:
            # record the date that could not be loaded
            error_path = os.path.join(os.getcwd(), errors_folder, f"{date}.txt")
            with open(error_path, 'w') as error_file:
                error_file.write(date)
            return

    time.sleep(1)

    # the three parts of each article row and the selector that finds them
    part_selectors = {
        'section': "div[class*='WSJTheme--flashline']",
        'headline': "h3[class*='WSJTheme--headline']",
        'summary': "p[class*='WSJTheme--summary']",
    }

    records = []
    for story in stories:
        record = {}
        for part, css in part_selectors.items():
            found = story.select(css)
            # text of the first match, or the string 'None' when the part is missing
            record[part] = found[0].text if found else 'None'

        # for date column
        record['date'] = date
        records.append(record)

    # one row per article
    frame = pd.DataFrame(records)

    # add to csv
    frame.to_csv(os.path.join(os.getcwd(), csvs_folder, f"{date}.csv"))

    return frame
 

In [None]:
for index, date in enumerate(dates[1000:1005]):
    get_days_news(date)
    time.sleep(2)
    print(index, date) 

# Reddit

In [None]:
# define webdriver use firefox browser
options = Options()
# run without broswer window
#options.add_argument('--headless')
driver = webdriver.Firefox(options=options)

In [None]:
driver.get('https://www.reddit.com')

In [None]:
# get all images on page
def get_img_links(num, max_scrolls=100):
    '''
    Scrolls the page open in the selenium 'driver' until enough images are loaded.

    Parameters:
        num: number of images to wait for
        max_scrolls: maximum number of scroll attempts (half a second apart). Without a limit
                     the loop never ends on a page that has fewer than 'num' matching images
    Returns: A list with the 'src' url of every matching image found (can be shorter than
             'num' if the limit was reached)
    '''
    images = []
    scrolls = 0
    while len(images) < num and scrolls < max_scrolls:
        raw_source = driver.page_source
        source = bs.BeautifulSoup(raw_source)
        images = source.select("img[class*='ImageBox-image'][class*='media-element']")
        # scroll to the bottom so the page loads more content
        driver.execute_script('window.scrollTo(0,document.body.scrollHeight);')
        time.sleep(.5)
        scrolls += 1
        print(len(images))
    # an 'img' tag without a 'src' attribute has nothing to download
    return [image['src'] for image in images if image.has_attr('src')]

In [None]:
# save img to disk
def save_img(url):
    '''
    Downloads the image at 'url' into the local 'images' folder.

    Parameters: The image url (str); a query string after '?' is ignored for the file name
    Returns: The file name without its extension
    Raises: requests.HTTPError if the server answers with an error status
    '''
    # file name = last path segment of the url. Splitting the whole url on '.' and '/' only
    # worked for urls with exactly one folder level and no extra dots in the name
    name, ext = os.path.splitext(os.path.basename(url.split('?')[0]))

    response = requests.get(url, timeout=10)
    # do not save an error page as if it were an image
    response.raise_for_status()

    path = 'images'
    os.makedirs(path, exist_ok=True)
    # 'ext' already includes the leading '.'
    with open(os.path.join(path, f'{name}{ext}'), 'wb') as image_file:
        image_file.write(response.content)
    return name

In [None]:
# creats list of all urls on the a redit page
image_urls = get_img_links(40)

In [None]:
for image_url in image_urls:
    save_img(image_url)
    print(image_url)

# Youtube

In [None]:
driver.get('https://accounts.google.com/signin/v2/identifier?service=youtube&uilel=3&passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Faction_handle_signin%3Dtrue%26app%3Ddesktop%26hl%3Den%26next%3D%252F&hl=en&ec=65620&flowName=GlifWebSignIn&flowEntry=ServiceLogin')

In [None]:
# username/ password inputs
username_input = driver.find_element_by_id('identifierId')
username = getpass.getpass(prompt='Username', stream=None)
username_input.clear()
username_input.send_keys(username)
submit = driver.find_element_by_id("identifierNext")
submit.click()


In [None]:
password_input = driver.find_element_by_xpath("//input[@type='password']")
password = getpass.getpass(prompt='Password', stream=None)
password_input.clear()
password_input.send_keys(password)
time.sleep(1)
submit = driver.find_element_by_id("passwordNext")
submit.click()

In [None]:
driver.get('https://www.youtube.com/feed/trending?gl=US')

In [None]:
source = bs.BeautifulSoup(driver.page_source)

In [None]:
titles = source.select("a[id='video-title'][class*='ytd-video-renderer'][aria-label*='']")
print(len(titles))
for i in titles:
    print (i['href'])

In [None]:
# opens the first five videos, clicks the like button and clicks it again one second later
for title in titles[0:5]:
    rel_link = title['href']
    link = f'https://www.youtube.com{rel_link}'
    driver.get(link)
    time.sleep(.5)
    buttons = []
    timeout = 0
    # the button is not there right away: look for it up to 5 times, half a second apart
    while len(buttons) == 0:
        buttons = driver.find_elements_by_xpath("//button[starts-with(@aria-label,'like this')]")
        print(buttons)
        timeout += 1
        if timeout == 5:
            break
        time.sleep(.5)

    # the search above can give up with an empty list; 'buttons[0]' would raise an IndexError
    if not buttons:
        print(f'like button not found: {link}')
        print('-----------')
        continue

    buttons[0].click()
    time.sleep(1)
    buttons[0].click()
    print(link)
    print('-----------')

In [None]:
# makes csv from wikipedia
# ('lxml' is the BeautifulSoup parser; passed to requests.get() it would be sent as query 'params')
resp = requests.get('https://en.wikipedia.org/wiki/List_of_largest_companies_by_revenue', timeout=10)
source = bs.BeautifulSoup(resp.text, 'lxml')
# first table on the page whose class contains 'wikitable'
table = source.select("table[class*='wikitable']")[0]
rows = table.find_all('tr')
header = table.find_all('th')
dict_list = []
for row in rows:
    # a row without any 'td' is a header row; turning it into data would add a row
    # whose values are just the column names
    if not row.find_all('td'):
        continue
    data_dict = {}
    tds = row.find_all(['td', 'th'])
    # the i-th cell of the row belongs to the i-th header
    for i, td in enumerate(tds):
        data_dict[header[i].text.strip()] = td.text.strip()

    print(data_dict)
    dict_list.append(data_dict)

df = pd.DataFrame(dict_list)
# 'set_index' returns a new DataFrame, so the result has to be kept
df = df.set_index('Rank')
df.to_csv('company_data.csv')