In [2]:
import time
import os
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys


def write_page(url, file_path):
    """Takes in the URL and writes the html file to the path specified."""
    driver = webdriver.Chrome()
    driver.get(url)
    
    try:
        xpath_convert_to_html = r'//*[@id="form-information-html"]'
        xpath_button_click =r'//*[@id="menu-dropdown-link"]'
        
        driver.find_element("xpath", xpath_button_click).click()
        correct_url =  driver.find_element("xpath", xpath_convert_to_html).get_attribute('href')
        
        driver.quit()
        driver = webdriver.Chrome()
        driver.get(correct_url)
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(driver.page_source)
        driver.quit()
    except:
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(driver.page_source)
        driver.quit()

def download_files_10k(ticker, dest_folder):
    """Downloads all the html 10-k files for the given ticker into the destination folder."""
    # Create the destination folder if it doesn't exist
    if not os.path.exists(dest_folder):
        os.makedirs(dest_folder)

    # Construct the URL to search for the ticker's filings
    url = r'https://www.sec.gov/edgar/searchedgar/companysearch'

    # Open the search page and enter the ticker in the search box
    driver = webdriver.Chrome()
    driver.get(url)
    xpath_search_box = r'//*[@id="edgar-company-person"]'
    driver.find_element("xpath", xpath_search_box).send_keys(ticker,Keys.ENTER)

    # Wait for page to load and expand 10-K dropdown
    time.sleep(2)
    xpath_expand_selected = r'//*[@id="filingsStart"]/div[2]/div[3]/h5/a'
    driver.find_element("xpath", xpath_expand_selected).click()

    # views only 10-K and 10-Q data
    time.sleep(1)
    xpath_obtain_all_data = r'//*[@id="filingsStart"]/div[2]/div[3]/div/button[1]'
    driver.find_element("xpath", xpath_obtain_all_data).click()

    # Searches 10-K to only show the relevant filings
    xpath_search_10K = r'//*[@id="searchbox"]'
    driver.find_element("xpath", xpath_search_10K).send_keys('10-K',Keys.ENTER)

    # Download each 10-K filing
    
    # Download each 10-K filing
    table_xpath = r'//*[@id="filingsTable"]'
    wait = WebDriverWait(driver, 1)
    wait.until(EC.presence_of_element_located((By.XPATH, table_xpath)))
    table = driver.find_element(By.XPATH, table_xpath)
    rows = table.find_elements(By.XPATH, './tbody/tr')

    counter = 1
    for row in rows:
        cells = row.find_elements(By.XPATH, './td')
        filing_text = cells[2].text      
        xpath_url = '//*[@id="filingsTable"]/tbody/tr['+str(counter)+']/td[2]/div/a[1]'
        filing_url = driver.find_element("xpath", xpath_url).get_attribute('href')
        file_name = f"{ticker}_10-K_{filing_text}.html"
        file_path = os.path.join(dest_folder, file_name)
        counter += 1
        write_page(filing_url, file_path)
        
    driver.quit()

In [43]:
from bs4 import BeautifulSoup
import re
import os
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer

def clean_html_text(html_text):
    '''
    Function uses BeautifulSoup to parse html text and replaces any tags
    or punctuation with a space. '\W' refers to matching any non-word 
    defined by regex and '+' refers to matching more than one non-words
    '''
    soup = BeautifulSoup(html_text, 'html.parser')
    stopword_list = stopwords.words('english')
    paragraphs = soup.find_all('p')
    return_list = []
    
    for p in paragraphs:
        
        if not p.find_previous('h1'):
            
            text = p.get_text(strip=True)
            cleaned = re.sub(r'[^A-Za-z0-9]', ' ', text.lower())
            clean_text_tokenized = word_tokenize(cleaned)
            
            for i in stopword_list:
                if i in clean_text_tokenized:
                    clean_text_tokenized.remove(i)
                    
            lemmatizer = WordNetLemmatizer()
            clean_text_lemmatized = [lemmatizer.lemmatize(j) for j in clean_text_tokenized]
            regex = re.compile(r'\d+')
            
            no_numbers = [item for item in clean_text_lemmatized if not regex.search(item)]
            no_letters = [word for word in no_numbers if len(word) != 1]
            
            if len(no_letters) > 8:
                return_list.append(no_letters)
    
    result = '\n'.join([' '.join(inner_list) for inner_list in return_list])
    
    return result   

def write_clean_html_text_files(input_folder, dest_folder):
    '''
    Function reads the files in the input folder and and calls the clean_html_text function
    and stores a text file in the destination directory.
    
    '''
    
    if not os.path.exists(dest_folder):
        os.makedirs(dest_folder)
    
    for filename in os.listdir(input_folder):
        input_file_path = os.path.join(input_folder, filename)
        new_name = filename.split('.')
        
        with open(input_file_path,'r' ,encoding="utf8") as f:
            html_text = f.read()
            cleaned_text = clean_html_text(html_text)
            
        dest_filename = f'{new_name[0]}.txt' 
        dest_file_path = os.path.join(dest_folder, dest_filename)
    
        
        with open(dest_file_path, 'w') as f:
            f.write(cleaned_text)


In [44]:
folder = r'D:\Users\Arian\Documents\Kubrick\Week 10\EDGAR Project\data\10k_filings_raw'
ticker = 'AAPL'
download_files_10k(ticker, folder)

In [45]:
input_folder = r'D:\Users\Arian\Documents\Kubrick\Week 10\EDGAR Project\data\10k_filings_raw'
output_folder = r'D:\Users\Arian\Documents\Kubrick\Week 10\EDGAR Project\data\10k_filings_clean'
write_clean_html_text_files(input_folder, output_folder)

In [5]:
input_folder = r'D:\Users\Arian\Documents\Kubrick\Week 10\EDGAR Project\data\10k_filings_raw\AAPL_10-K_2012-10-31.html'
import requests
import bs4 as BeautifulSoup
# with open(input_folder,'r' ,encoding="utf8") as f:
#     html_text = f.read()

html_text = r'https://www.sec.gov/Archives/edgar/data/320193/000162828016020309/a201610-k9242016.htm'
user_agent = r'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36'

response = requests.get(html_text, headers={'User-Agent':user_agent})

soup = BeautifulSoup(response.content, 'html.parser')
for tag in soup.find_all():
    print(tag.name)

stopword_list = stopwords.words('english')

paragraphs = soup.find_all('p')
return_list = []
for p in paragraphs:
    if not p.find_previous('h1'):
        text = p.get_text(strip=True)
        clean_text = re.sub(r'[^A-Za-z0-9]', ' ', text.lower())
        clean_text_tokenized = word_tokenize(clean_text)
        for i in stopword_list:
            if i in clean_text_tokenized:
                clean_text_tokenized.remove(i)
        lemmatizer = WordNetLemmatizer()
        clean_text_lemmatized = [lemmatizer.lemmatize(j) for j in clean_text_tokenized]
        
        regex = re.compile(r'\d+')
        
        no_numbers = [item for item in clean_text_lemmatized if not regex.search(item)]
        
        no_letters = [word for word in no_numbers if len(word) != 1]
        if len(no_letters) > 8:
            return_list.append(no_letters)
        
result = '\n'.join([' '.join(inner_list) for inner_list in return_list])
print(result)
        
        # if len(text.strip()) != 0:
        #     print(p.text.lower())


# Finds what the html contains:
divs = soup.find_all('div')
for div in divs:
    spans = div.find_all('span')
    for span in spans:
        print(span.text)

TypeError: 'module' object is not callable

In [1]:
# Part 3A:
import csv
import pandas as pd
import yahoofinancials
from yahoofinancials import YahooFinancials



csv_file = 'sp-100-index-03-07-2023.csv'
def get_sp100():
    # sp100 = []
    # with open(csv_file,newline='\n'):
    #     csv_reader = csv.reader(csv_file, delimiter=',')
    #     for row in csv_reader:
    #         sp100.append(row)
    #     return df, sp100
    df =pd.read_csv(csv_file)
    
    return list(df['Symbol'])



def get_yahoo_data(start_date,end_date,tickers):


    for ticker in tickers:
        data = YahooFinancials(ticker).get_historical_price_data(start_date, end_date, 'daily')
        prices = pd.DataFrame(data[ticker]['prices'])
        prices['1daily_return'] = prices['open'] - prices['close'].shift(1)
        return prices

In [4]:

df_returns = get_yahoo_data('2000-01-01', '2020-08-01', 'MMM')
display(df_returns)

Unnamed: 0,date,high,low,open,close,volume,adjclose,formatted_date,1daily_return
0,946909800,25.53125,23.87500,25.1250,25.18750,2493200,14.863492,2000-01-03,
1,946996200,24.87500,24.09375,24.6875,24.46875,1527000,14.439350,2000-01-04,-0.50000
2,947082600,25.00000,24.12500,24.5625,24.93750,1755400,14.715969,2000-01-05,0.09375
3,947169000,25.00000,24.15625,25.0000,24.71875,1663200,14.586878,2000-01-06,0.06250
4,947255400,25.50000,24.53125,24.6250,25.40625,2078400,14.992577,2000-01-07,-0.09375
...,...,...,...,...,...,...,...,...,...
5173,1595856600,6.44000,6.12000,6.4400,6.21000,26162900,5.942723,2020-07-27,-0.01000
5174,1595943000,6.55000,6.15000,6.2000,6.43000,23990700,6.153255,2020-07-28,-0.01000
5175,1596029400,6.72000,6.42000,6.4600,6.49000,22529400,6.210672,2020-07-29,0.03000
5176,1596115800,6.45000,6.18000,6.3500,6.22000,18802400,5.952293,2020-07-30,-0.14000
