<a href="https://colab.research.google.com/github/DebraBeat/stock_project/blob/main/stock_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup, SoupStrainer
import re
import csv
from google.colab import drive
import os
import datetime
import time
import random

In [2]:
# Downloaded csv file and locally parsed symbols from here: https://www.nasdaq.com/market-activity/stocks/screener
# Code to parse symbols:
# import csv
# path = r"C:\Users\zeeri\Downloads\nasdaq_screener_1717802878001.csv"
# tickers = []

# with open(path, newline='') as csvfile:
#     reader = csv.reader(csvfile, delimiter=',')

#     for row in reader:
#         tickers.append(row[0])

# print(tickers[1:])

# Get symbol data
# Run ls to make sure you're in the right directory

'''
Put ourselves into the google drive directory for our project and get a list
of symbols to use
'''

drive.mount('/content/drive', force_remount=True)
os.chdir("drive/My Drive/stock_project")
!ls
raw_symbols = []
symbols = []
with open('symbols', 'r') as csvfile:
  reader = csv.reader(csvfile, delimiter=',')

  for row in reader:
    raw_symbols.append(row)

# symbols is a 2d list of one element, so make it the first element
raw_symbols = raw_symbols[0]

# sanitize symbols
for symbol in raw_symbols:
  if symbol.isalnum():
    symbols.append(symbol)

print(len(raw_symbols))
print(len(symbols))

Mounted at /content/drive
496through499.csv  df1.csv  df3.csv  df5.csv  df7.csv  stock_valuations.csv  test.csv
df0.csv		   df2.csv  df4.csv  df6.csv  df.csv   symbols
7159
6757


In [4]:
'''
Define our user agent so yahoo finance doesn't think we're a web crawler.
Define the key parts of our URL to request
'''
header = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36'}

head = 'https://finance.yahoo.com/quote/'
stats_tail = '/key-statistics/'
hist_tail = 'history?period1=942883200&period2=1717718400&interval=1mo&filter=history&frequency=1mo&includeAdjustedClose=true'

In [8]:
'''
This cell is a prototype / test of fetching data from yahoo finance, to be used below.
Make sure to run this to create table_elements
'''


# response = requests.get('https://finance.yahoo.com/quote/NVDA/key-statistics/', headers=header)
response = requests.get(head + 'NVDA' + stats_tail, headers=header)
print(head + symbols[0] + stats_tail)
soup = BeautifulSoup(response.text, 'html.parser')
data_cells = soup.find_all('td')
table_elements = []

for tag in data_cells:
  table_elements.append(tag.contents[0])
print(table_elements)

for i, s in enumerate(table_elements):
  try:
    if s[0].isalpha():
      # print(f'{s}: {table_elements[i+1]}')
      pass
  except KeyError as error:
    # print('--')
    pass

https://finance.yahoo.com/quote/A/key-statistics/
['Market Cap', '3.19T ', '2.13T ', '1.52T ', '1.01T ', '1.15T ', '686.27B ', 'Enterprise Value', '3.17T ', '2.11T ', '1.51T ', '1.00T ', '1.15T ', '685.00B ', 'Trailing P/E', '75.84 ', '72.42 ', '81.17 ', '98.50 ', '243.38 ', '159.48 ', 'Forward P/E', '50.00 ', '35.71 ', '30.40 ', '24.51 ', '62.89 ', '61.73 ', 'PEG Ratio (5yr expected)', '1.49 ', '1.16 ', '0.60 ', '0.82 ', '2.66 ', '3.45 ', 'Price/Sales', '40.52 ', '35.37 ', '34.14 ', '31.09 ', '45.06 ', '25.79 ', 'Price/Book', '64.88 ', '49.45 ', '45.57 ', '36.57 ', '47.05 ', '31.05 ', 'Enterprise Value/Revenue', '39.71 ', '34.64 ', '33.63 ', '30.62 ', '44.46 ', '25.39 ', 'Enterprise Value/EBITDA', '62.51 ', '59.31 ', '66.16 ', '77.37 ', '178.68 ', '114.42 ', 'Fiscal Year Ends  ', '1/28/2024', 'Most Recent Quarter  (mrq)', '4/28/2024', 'Profit Margin  ', '53.40%', 'Operating Margin  (ttm)', '64.93%', 'Return on Assets  (ttm)', '49.10%', 'Return on Equity  (ttm)', '115.66%', 'Revenue  (

In [6]:
'''
This cell is a prototype / test of creation of the statistics dataframe, to be used below.
'''

cols = ['Company']
d = {'Company': symbols[0]}
for i, s in enumerate(table_elements):
  try:
    if s[0].isalpha():
      cols.append(s)
      d[s] = table_elements[i+1]
  except KeyError:
    cols.append(np.nan)

df = pd.DataFrame(data=d, columns=cols, index=[0])
df.head()

Unnamed: 0,Company,Market Cap,Enterprise Value,Trailing P/E,Forward P/E,PEG Ratio (5yr expected),Price/Sales,Price/Book,Enterprise Value/Revenue,Enterprise Value/EBITDA,...,Shares Short (prior month 4/15/2024),Forward Annual Dividend Rate,Forward Annual Dividend Yield,Trailing Annual Dividend Rate,Trailing Annual Dividend Yield,Payout Ratio,Dividend Date,Ex-Dividend Date,Last Split Factor,Last Split Date
0,A,3.19T,3.17T,75.84,50.0,1.49,40.52,64.88,39.71,62.51,...,290.75M,0.04,0.03%,0.02,0.01%,0.94%,6/28/2024,6/11/2024,10:1,6/10/2024


In [20]:
'''
Define our function for retriveing stock statistics (AKA data or valuations
or metrics). Take in a DataFrame, create a new row for a new stock, and
concatenate it onto the end of the DataFrame. Then return the new DataFrame.
'''

def append_stock_stats(df, symbol, i):
  response = requests.get(head + symbol + stats_tail, headers=header)
  soup = BeautifulSoup(response.text, 'html.parser')
  data_cells = soup.find_all('td')
  table_elements = []
  d = {'Company': symbol}

  for tag in data_cells:
    try:
      table_elements.append(tag.contents[0])
    except IndexError as err:
      table_elements.append(np.nan)
      print(f'{err}')

  for j, s in enumerate(table_elements):
    try:
      if s[0].isalpha() and j < len(table_elements) - 1:
        d[s] = table_elements[j+1]
    except KeyError:
      # print(f'KeyError, Symbol: {symbol}, i: {i}')
      # print(f'j: {j}, s: {s}')
      pass
    except IndexError:
      print(f'IndexError, Symbol: {symbol}, i: {i}, j: {j}')
      print(f'j: {j}, s: {s}')
      raise
    except TypeError:
      print(f'TypeError, Symbol: {symbol}, i: {i}, j: {j}')
      print(f'j: {j}, s: {s}')
      raise
    except requests.TooManyRedirects:
      print(f'TooManyRedirects, Symbol: {symbol}, i: {i}, j: {j}')
      print(f'j: {j}, s: {s}')
      raise

  row = pd.DataFrame(data=d, columns=cols, index=[i])
  df = pd.concat([df, row])

  return df

In [None]:
'''
Create and populate our DataFrame of stock data.
'''

# Create columns for reference
cols = ['Company']
for s in table_elements:
  try:
    if s[0].isalpha():
      cols.append(s)
  except KeyError:
    cols.append(np.nan)

# Create empty DataFrame with just column names
df = pd.DataFrame(columns=cols)

# Populate the DataFrame
symbol_index = 100
df_num = 15
for i in range(1590, len(symbols)):
  try:
    df = append_stock_stats(df, symbols[i], i)
  except:
    continue


  # In case we've used up our allotted requests per whatever,
  # we wait, then delete the last row and try again
  wait_time = 0
  while pd.isna(df['Market Cap'].iloc[-1]):
    time.sleep(5 + wait_time)
    wait_time += 1
    df.drop(df.tail(1).index, inplace=True)
    try:
      df = append_stock_stats(df, symbols[i], i)
    except:
      continue

  print(f'Row {i} fetched, Current stock: {symbols[i]}')

  if i % 100 == 0:
    !ls

  if i % 99 == 0 and i > 0:
    filename = f'df{df_num}.csv'
    df_num += 1
    df.to_csv(filename)
    df = pd.DataFrame(columns=cols)
    print(f'stocks {i-100} through {i} written')
    !ls


  # # Add a random delay because I am unreasonably cautious about getting\
  # # banned from yahoo finance
  delay = random.random() * 0.5 + 0.5
  time.sleep(delay)

df.head()

Row 1590 fetched, Current stock: CTBI
Row 1591 fetched, Current stock: CTCX
Row 1592 fetched, Current stock: CTCXW
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
TypeError, Symbol: CTDD, i: 1593, j: 1
j: 1, s: nan
Row 1594 fetched, Current stock: CTGO
Row 1595 fetched, Current stock: CTHR
Row 1596 fetched, Current stock: CTKB
Row 1597 fetched, Current stock: CTLP
Row 1598 fetched, Current stock: CTLT
Row 1599 fetched, Current stock: CTM
Row 1600 fetched, Current stock: CTMX
1585through1588.csv  df10.csv  df13.csv  df2.csv  df5.csv  df8.csv  stock_valuations.csv
496through499.csv    df11.csv  df14.csv  df3.csv  df6.csv  df9.csv  symbols
df0.csv		     df12.csv  df1.csv	 df4.csv  df7.csv  df.csv   test.csv
Row 1601 fetched, Current stock: CTNM
Row 1602 fetched, Current stock: CTNT
Row 1603 fetched, Current stock: CTO
Row 1604 fetched, Current stock: CTOS
Row 1605 fetched, Current stock: CTR
Ro