-
Notifications
You must be signed in to change notification settings - Fork 0
/
investing_in_stocks_with_Machine_learning_acquiring_data.py
205 lines (149 loc) · 8.19 KB
/
investing_in_stocks_with_Machine_learning_acquiring_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
import pandas as pd
import os
import time
from datetime import datetime
from time import mktime
import matplotlib
import matplotlib.pyplot as plt
from matplotlib import style
#style.use('dark_background')
import re
import urllib
# Root folder that holds the data; Key_Stats reads the '_KeyStats' sub-folder
# beneath it. Set the INTRAQUARTER_PATH environment variable to point at a
# different location; when it is unset the original hard-coded location is
# used, so existing setups keep working unchanged.
path = os.environ.get(
    "INTRAQUARTER_PATH",
    "C:/Investing_in_stocks_with_ML/intraQuarter/intraQuarter",
)
# Markup that sits between a statistic's label and its value in a snapshot.
_STAT_MARKER = ':</td><td class="yfnc_tabledata1">'

# (opening, closing) markup pairs that may surround the share price, tried in
# the order given.
_PRICE_MARKERS = (
    ('</small><big><b>', '</b></big>'),
    ('<span class="time_rtq_ticker">', '</span>'),
)

# A decimal number, optionally written with thousands separators ("1,025.50").
# The separator-aware alternative comes first so that such a price is not
# truncated to the digits after the last comma.
_PRICE_PATTERN = re.compile(r'\d{1,3}(?:,\d{3})+\.\d{1,8}|\d{1,8}\.\d{1,8}')

# Three days, in seconds: how far back to look when the index file has no row
# for the snapshot's own date (presumably to step back over a weekend).
_SP500_FALLBACK_SECONDS = 259200

# Column order of the result table.
_COLUMNS = [
    'Date',
    'Unix',
    'Ticker',
    'DE Ratio',
    'Price',
    'stock_p_change',
    'SP500',
    'sp500_p_change',
    'Difference',
    'Status',
]


def _extract_stat(source, gather):
    """Return the number that follows the *gather* label, or None if absent."""
    parts = source.split(gather + _STAT_MARKER)
    if len(parts) < 2:
        return None
    try:
        return float(parts[1].split('</td>')[0])
    except ValueError:
        # The cell holds something that is not a number (e.g. "N/A").
        return None


def _extract_stock_price(source):
    """Return the share price found in *source*, or None if none is found."""
    for opener, closer in _PRICE_MARKERS:
        parts = source.split(opener)
        if len(parts) < 2:
            continue
        fragment = parts[1].split(closer)[0]
        try:
            return float(fragment)
        except ValueError:
            pass
        # The fragment may still contain nested markup around the number.
        match = _PRICE_PATTERN.search(fragment)
        if match:
            return float(match.group(0).replace(',', ''))
    return None


def _sp500_close(sp500_df, unix_time):
    """Return the index 'Adj Close' for the snapshot date, or None."""
    for offset in (0, _SP500_FALLBACK_SECONDS):
        day = datetime.fromtimestamp(unix_time - offset).strftime('%Y-%m-%d')
        rows = sp500_df[sp500_df.index == day]
        if len(rows) == 0:
            continue
        try:
            return float(rows['Adj Close'].iloc[0])
        except (TypeError, ValueError):
            continue
    return None


def Key_Stats(gather="Total Debt/Equity (mrq)", save_csv=False):
    """Compare each stock's price change with the S&P 500 and plot the gap.

    Every sub-folder of ``path + '/_KeyStats'`` is treated as one ticker.
    Each file in it named ``YYYYmmddHHMMSS.html`` is a snapshot from which the
    *gather* statistic and the share price are extracted; the index value for
    the same date is looked up in ``YAHOO-INDEX_GSPC.csv`` (column
    "Adj Close"). Percentage changes are measured against the first usable
    snapshot of the ticker, and a snapshot is labelled "outperform" when the
    stock's change exceeds the index's, otherwise "underperform".

    Snapshots from which any of the three values cannot be obtained are
    skipped. After all tickers are processed, the 'Difference' series of each
    ticker is plotted (red when its last status is "underperform", green
    otherwise) and the figure is shown.

    Args:
        gather: Label of the statistic to extract from each snapshot.
        save_csv: When true, also write the collected table to a CSV file
            named after *gather* with spaces, parentheses and slashes
            removed. Off by default, so no file is written.

    Returns:
        None.

    Note:
        Errors raised while reading the index CSV or a snapshot file are not
        caught here.
    """
    statspath = path + '/_KeyStats'
    # os.walk yields the root folder first; the remaining entries are the
    # per-ticker folders.
    stock_dirs = [entry[0] for entry in os.walk(statspath)][1:]

    # DataFrame.from_csv no longer exists in pandas; this is its documented
    # replacement (first column as a parsed date index).
    sp500_df = pd.read_csv("YAHOO-INDEX_GSPC.csv", index_col=0, parse_dates=True)

    rows = []
    ticker_list = []
    for each_dir in stock_dirs:
        # basename works with either path separator, unlike splitting on "\\".
        ticker = os.path.basename(os.path.normpath(each_dir))
        ticker_list.append(ticker)

        # Baselines for the percentage changes; reset for every ticker.
        start_price = None
        start_sp500 = None

        # File names are timestamps, so sorting puts the earliest snapshot
        # first and makes it the baseline.
        for file_name in sorted(os.listdir(each_dir)):
            try:
                date_stamp = datetime.strptime(file_name, '%Y%m%d%H%M%S.html')
            except ValueError:
                # Not a snapshot file; ignore it rather than abort the run.
                continue
            unix_time = time.mktime(date_stamp.timetuple())

            with open(os.path.join(each_dir, file_name), 'r') as handle:
                source = handle.read()

            value = _extract_stat(source, gather)
            if value is None:
                continue

            sp500_value = _sp500_close(sp500_df, unix_time)
            if sp500_value is None:
                continue

            stock_price = _extract_stock_price(source)
            if stock_price is None:
                # Skip the snapshot instead of carrying on with a price left
                # over from the previous file.
                print('wtf stock price lol', ticker, file_name, value)
                continue

            if not start_price:
                start_price = stock_price
            if not start_sp500:
                start_sp500 = sp500_value
            if not start_price or not start_sp500:
                # A zero baseline makes the percentage change undefined.
                continue

            # Percentage change: (new - old) / old * 100.
            stock_p_change = ((stock_price - start_price) / start_price) * 100
            sp500_p_change = ((sp500_value - start_sp500) / start_sp500) * 100
            difference = stock_p_change - sp500_p_change
            status = "outperform" if difference > 0 else "underperform"

            rows.append({
                'Date': date_stamp,
                'Unix': unix_time,
                'Ticker': ticker,
                'DE Ratio': value,
                'Price': stock_price,
                'stock_p_change': stock_p_change,
                'SP500': sp500_value,
                'sp500_p_change': sp500_p_change,
                'Difference': difference,
                'Status': status,
            })

    # Build the table once; DataFrame.append was removed from pandas and
    # copied the whole frame on every call.
    df = pd.DataFrame(rows, columns=_COLUMNS)

    plotted = False
    for each_ticker in ticker_list:
        plot_df = df[df['Ticker'] == each_ticker]
        if plot_df.empty:
            continue
        plot_df = plot_df.set_index('Date')
        if plot_df['Status'].iloc[-1] == 'underperform':
            color = 'r'
        else:
            color = 'g'
        plot_df['Difference'].plot(label=each_ticker, color=color)
        plotted = True
    if plotted:
        plt.legend()
    plt.show()

    # File name derived from the statistic label: spaces, parentheses and
    # forward slashes are removed and '.csv' is appended.
    save = gather.replace(' ', '').replace(')', '').replace('(', '').replace('/', '') + '.csv'
    if save_csv:
        df.to_csv(save)
# Run the analysis only when this file is executed as a script, so that
# importing it from another module does not start reading files and plotting.
if __name__ == "__main__":
    Key_Stats()