<a href="https://colab.research.google.com/github/DebraBeat/stock_project/blob/main/stock_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup, SoupStrainer
import re
import csv
from google.colab import drive
import os
import datetime
import time
import random
import subprocess

In [2]:
# Downloaded csv file and locally parsed symbols from here: https://www.nasdaq.com/market-activity/stocks/screener
# Code to parse symbols:
# import csv
# path = r"C:\Users\zeeri\Downloads\nasdaq_screener_1717802878001.csv"
# tickers = []

# with open(path, newline='') as csvfile:
#     reader = csv.reader(csvfile, delimiter=',')

#     for row in reader:
#         tickers.append(row[0])

# print(tickers[1:])

# Get symbol data
# Run ls to make sure you're in the right directory

'''
Put ourselves into the google drive directory for our project and get a list
of symbols to use
'''

drive.mount('/content/drive', force_remount=True)
os.chdir("drive/My Drive/stock_project")
!ls
raw_symbols = []
symbols = []
with open('symbols', 'r') as csvfile:
  reader = csv.reader(csvfile, delimiter=',')

  for row in reader:
    raw_symbols.append(row)

# symbols is a 2d list of one element, so make it the first element
raw_symbols = raw_symbols[0]

# sanitize symbols
for symbol in raw_symbols:
  if symbol.isalnum():
    symbols.append(symbol)

print(len(raw_symbols))
print(len(symbols))

Mounted at /content/drive
1585through1588.csv  df19.csv  df2.csv	 df40.csv  df51.csv  df62.csv	   package-lock.json
496through499.csv    df1.csv   df30.csv  df41.csv  df52.csv  df63.csv	   price_df.csv
df0.csv		     df20.csv  df31.csv  df42.csv  df53.csv  df64.csv	   prices.txt
df10.csv	     df21.csv  df32.csv  df43.csv  df54.csv  df65.csv	   stock_valuations.csv
df11.csv	     df22.csv  df33.csv  df44.csv  df55.csv  df66.csv	   symbols
df12.csv	     df23.csv  df34.csv  df45.csv  df56.csv  df6.csv	   test.csv
df13.csv	     df24.csv  df35.csv  df46.csv  df57.csv  df7.csv
df14.csv	     df25.csv  df36.csv  df47.csv  df58.csv  df8.csv
df15.csv	     df26.csv  df37.csv  df48.csv  df59.csv  df9.csv
df16.csv	     df27.csv  df38.csv  df49.csv  df5.csv   df.csv
df17.csv	     df28.csv  df39.csv  df4.csv   df60.csv  download
df18.csv	     df29.csv  df3.csv	 df50.csv  df61.csv  node_modules
7159
6757


In [None]:
'''
Define our user agent so yahoo finance doesn't think we're a web crawler.
Define the key parts of our URL to request
'''
header = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36'}

head = 'https://finance.yahoo.com/quote/'
stats_tail = '/key-statistics/'
hist_tail = 'history?period1=942883200&period2=1717718400&interval=1mo&filter=history&frequency=1mo&includeAdjustedClose=true'

In [None]:
'''
This cell is a prototype / test of fetching data from yahoo finance, to be used below.
Make sure to run this to create table_elements
'''


# response = requests.get('https://finance.yahoo.com/quote/NVDA/key-statistics/', headers=header)
response = requests.get(head + 'NVDA' + stats_tail, headers=header)
print(head + symbols[0] + stats_tail)
soup = BeautifulSoup(response.text, 'html.parser')
data_cells = soup.find_all('td')
table_elements = []

for tag in data_cells:
  table_elements.append(tag.contents[0])
print(table_elements)

for i, s in enumerate(table_elements):
  try:
    if s[0].isalpha():
      # print(f'{s}: {table_elements[i+1]}')
      pass
  except KeyError as error:
    # print('--')
    pass

In [None]:
'''
This cell is a prototype / test of creation of the statistics dataframe, to be used below.
'''

cols = ['Company']
d = {'Company': symbols[0]}
for i, s in enumerate(table_elements):
  try:
    if s[0].isalpha():
      cols.append(s)
      d[s] = table_elements[i+1]
  except KeyError:
    cols.append(np.nan)

df = pd.DataFrame(data=d, columns=cols, index=[0])
df.head()

In [None]:
'''
Define our function for retriveing stock statistics (AKA data or valuations
or metrics). Take in a DataFrame, create a new row for a new stock, and
concatenate it onto the end of the DataFrame. Then return the new DataFrame.
'''

def append_stock_stats(df, symbol, i):
  response = requests.get(head + symbol + stats_tail, headers=header)
  soup = BeautifulSoup(response.text, 'html.parser')
  data_cells = soup.find_all('td')
  table_elements = []
  d = {'Company': symbol}

  for tag in data_cells:
    try:
      table_elements.append(tag.contents[0])
    except IndexError as err:
      table_elements.append(np.nan)
      print(f'{err}')

  for j, s in enumerate(table_elements):
    try:
      if s[0].isalpha() and j < len(table_elements) - 1:
        d[s] = table_elements[j+1]
    except KeyError:
      # print(f'KeyError, Symbol: {symbol}, i: {i}')
      # print(f'j: {j}, s: {s}')
      pass
    except IndexError:
      print(f'IndexError, Symbol: {symbol}, i: {i}, j: {j}')
      print(f'j: {j}, s: {s}')
      raise
    except TypeError:
      print(f'TypeError, Symbol: {symbol}, i: {i}, j: {j}')
      print(f'j: {j}, s: {s}')
      raise
    except requests.TooManyRedirects:
      print(f'TooManyRedirects, Symbol: {symbol}, i: {i}, j: {j}')
      print(f'j: {j}, s: {s}')
      raise

  row = pd.DataFrame(data=d, columns=cols, index=[i])
  df = pd.concat([df, row])

  return df

In [None]:
'''
Create and populate our DataFrame of stock data.
Note this cell takes about 12 hours to run :)
'''

# Create columns for reference
cols = ['Company']
for s in table_elements:
  try:
    if s[0].isalpha():
      cols.append(s)
  except KeyError:
    cols.append(np.nan)

# Create empty DataFrame with just column names
df = pd.DataFrame(columns=cols)

# Populate the DataFrame
symbol_index = 100
df_num = 58
for i in range(0, len(symbols)):
  try:
    df = append_stock_stats(df, symbols[i], i)
  except:
    continue


  # In case we've used up our allotted requests per whatever,
  # we wait, then delete the last row and try again
  wait_time = 0
  while pd.isna(df['Market Cap'].iloc[-1]):
    time.sleep(90 + wait_time)
    wait_time += 1
    df.drop(df.tail(1).index, inplace=True)
    try:
      df = append_stock_stats(df, symbols[i], i)
    # If for whatever reason we can't get a particular stock's data
    # we continue on without it's data
    except:
      continue

  print(f'Row {i} fetched, Current stock: {symbols[i]}')

  if i % 100 == 0:
    !ls

  if i % 99 == 0 and i > 0:
    filename = f'df{df_num}.csv'
    df_num += 1
    df.to_csv(filename)
    df = pd.DataFrame(columns=cols)
    print(f'stocks {i-100} through {i} written')
    !ls


  # # Add a random delay because I am unreasonably cautious about getting\
  # # banned from yahoo finance
  delay = random.random() * 0.5 + 0.5
  time.sleep(delay)

df.head()

In [None]:
'''
Get the stock metrics csv and clean it up a little. It will be cleaned more after
the stock prices have been added
'''

copy = pd.read_csv('df.csv', index_col=0)

for i in range(3, 67):
  df = pd.read_csv(f'df{i}.csv', index_col=0)
  copy = pd.concat([copy, df], axis=0, join='outer')
  print(copy.shape)

df = pd.read_csv('1585through1588.csv')
copy = pd.concat([copy, df], axis=0, join='outer')
df = pd.read_csv('496through499.csv')
copy = pd.concat([copy, df], axis=0, join='outer')

copy.reset_index()
copy.drop(columns=['Unnamed: 0', 'Unnamed: 11'])
copy.to_csv('df.csv')

In [None]:
'''
Install the program dukascopy-node which will allow us to easily fetch
historical stock price data
'''

subprocess.run(["npm install dukascopy-node --save"])

In [None]:
'''
Create the symbols we need to use for dukascopy, and then get stock price data
for each company
'''
dukascopy_symbols = []

for symbol in symbols:
  dukascopy_symbols.append(symbol.lower() + 'ususd')

for symbol in dukascopy_symbols:
  string = f'npx dukascopy-node -i {symbol} -from 2023-06-01 -to 2024-06-01 -t mn1 -f csv --date-format "YYYY-MM-DD HH:mm:ss"'
  subprocess.run([string], shell=True)
  print(string)

In [None]:
'''
Get the names of each csv price file. From this, put them all into one DataFrame.

Here's the most devious test cases for filenamesI could think of:
filename = 'ususususd-mn1-bid-2023-06-01-2024-06-01.csv'
filename = 'aususd-mn1-bid-2023-06-01-2024-06-01.csv'
'''
!ls download > prices.txt

price_df = pd.DataFrame(columns=['symbol', 'timestamp', 'close'])

with open('prices.txt', 'r') as f:
  lines = f.readlines()
  for line in lines:
    line = line[:-1]
    # print(line)
    ticker = line[:-39]
    temp = pd.read_csv(filename)
    temp = temp[['timestamp', 'close']]
    temp.insert(0, 'symbol', ticker)
    price_df = pd.concat([price_df, temp])
price_df.to_csv('price_df.csv')

In [81]:
def string_to_dollar(value: str) -> int:
  if type(value) != str or value[:2] == '--':
    return np.NaN

  pre_decimal = 0
  post_decimal = 0
  figures = {'': 1,
             'k': 1000,
             'M': 1000000,
             'B': 1000000000,
             'T': 1000000000000}
  i = 0
  j = 1

  while i < len(value) and value[i].isnumeric():
    pre_decimal = pre_decimal * 10 + int(value[i])
    i += 1

  while i < len(value) and not value[i].isalpha():
    if value[i].isnumeric():
      post_decimal += int(value[i]) / 10**j
      j += 1
    i += 1

  try:
    return (pre_decimal + post_decimal) * figures[value[i]]
  except:
    return float(value.replace(',', '').strip())

# TEST CASES:
# print(string_to_dollar('666.56K')) Random one with K instead of M
# print(string_to_dollar('38.81B')) A
# print(string_to_dollar('7.39B')) AA
# print(string_to_dollar('666.56M')) AACT
# print(string_to_dollar('2.96T')) AAPL
# print(string_to_dollar('--')) None fetched / avaliable

In [23]:
def string_to_percent(value):
  if type(value) != str or value[:2] == '--':
    return np.NaN

  value = value.strip()
  res_string = ''
  for c in value:
    if c != ',' and c != '%':
      res_string += c

  return float(res_string)


In [82]:
'''
Clean up our training data DataFrame
'''

df = pd.read_csv('df.csv', index_col=0)
# Drop rows incorrectly added
df.drop(columns=['Unnamed: 0', 'Unnamed: 11'], inplace=True)

# Convert from object to string dtypes for all columns
df = df.convert_dtypes()

# Drop irrelevant column
df.drop(columns=['Most Recent Quarter  (mrq)'], inplace=True)

# Rename some columns with weird whitespace stuff
df.rename(columns={col : col.replace(" (", "(").strip() for col in df.columns},
            inplace=True)

# In the dollar columns, convert from string to int (aka dollars)
dollar_cols = ['Market Cap', 'Enterprise Value', 'Trailing P/E', 'Forward P/E',
               'PEG Ratio(5yr expected)', 'Price/Sales', 'Price/Book',
               'Enterprise Value/Revenue', 'Revenue Per Share (ttm)']
for column in dollar_cols:
  df[column] = df[column].apply(string_to_dollar)

# In the datetime columns, convert from string to datetime
datetime_cols = ['Fiscal Year Ends']
for column in datetime_cols:
  df[column] = pd.to_datetime(df[column], errors='coerce')

# In the percentage columns, convert from string to float
percentage_cols = ['Profit Margin', 'Operating Margin (ttm)',
                   'Return on Assets (ttm)', 'Return on Equity (ttm)']
for column in percentage_cols:
  df[column] = df[column].apply(string_to_percent)
df['Revenue Per Share (ttm)']

0    22.53
1    58.79
2     7.06
3      NaN
4      NaN
     ...  
4      NaN
0     5.82
1     1.51
2    82.22
3     0.03
Name: Revenue Per Share (ttm), Length: 6716, dtype: float64