-
Notifications
You must be signed in to change notification settings - Fork 0
/
investing_in_stocks_with_Machine_learning_acquiring_data_2.py
339 lines (276 loc) · 15.3 KB
/
investing_in_stocks_with_Machine_learning_acquiring_data_2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
import pandas as pd
import os
import time
from datetime import datetime
from time import mktime
import matplotlib
import matplotlib.pyplot as plt
from matplotlib import style
#style.use('dark_background')
import re
import urllib
# Root folder that holds the downloaded data. Key_Stats reads the "_KeyStats"
# sub-folder located directly beneath this folder.
path = "C:/Investing_in_stocks_with_ML/intraQuarter/intraQuarter"
# Key_Stats: takes the list of statistics to gather (extended with more features),
# builds the path to the "_KeyStats" directory and lists its sub-directories with a one-line comprehension.
def Key_Stats(gather=('Total Debt/Equity (mrq)',
                      'Trailing P/E',
                      'Price/Sales',
                      'Price/Book',
                      'Profit Margin',
                      'Operating Margin',
                      'Return on Assets',
                      'Return on Equity',
                      'Revenue Per Share',
                      'Market Cap',
                      'Enterprise Value',
                      'Forward P/E',
                      'PEG Ratio',
                      'Enterprise Value/Revenue',
                      'Enterprise Value/EBITDA',
                      'Revenue',
                      'Gross Profit',
                      'EBITDA',
                      'Net Income Avl to Common ',
                      'Diluted EPS',
                      'Earnings Growth',
                      'Revenue Growth',
                      'Total Cash',
                      'Total Cash Per Share',
                      'Total Debt',
                      'Current Ratio',
                      'Book Value Per Share',
                      'Cash Flow',
                      'Beta',
                      'Held by Insiders',
                      'Held by Institutions',
                      'Shares Short (as of',
                      'Short Ratio',
                      'Short % of Float',
                      'Shares Short (prior ')):
    """Extract key statistics from saved HTML snapshots into ``key_stats.csv``.

    Every sub-directory of ``path + '/_KeyStats'`` (``path`` is the module-level
    data folder) is treated as one ticker. Each file in it whose name matches
    ``%Y%m%d%H%M%S.html`` is one snapshot. For every snapshot the function:

    * searches the page for each label in ``gather`` and converts the value that
      follows it to a float (``M``/``B`` suffixes are scaled to millions/billions);
    * looks up the S&P 500 "Adj Close" for the snapshot date in
      ``YAHOO-INDEX_GSPC.csv`` (falling back to three days earlier);
    * parses the stock price from the page;
    * computes the percentage change of the stock and of the index relative to
      the ticker's first usable snapshot, their difference, and an
      "outperform"/"underperform" status.

    Snapshots with a missing statistic, index value or price are skipped. The
    remaining rows are written to ``key_stats.csv`` in the working directory.

    Args:
        gather: Labels of the statistics to extract, in output-column order.
            Each label becomes a column of the same name, except
            'Total Debt/Equity (mrq)', which is written as 'DE Ratio'.

    Returns:
        None. The result is the CSV file.

    Raises:
        OSError: If ``YAHOO-INDEX_GSPC.csv`` or a snapshot cannot be read, or the
            output file cannot be written.
        KeyError: If the index CSV has no "Adj Close" column.
    """
    gather = list(gather)  # allow any iterable; it is traversed more than once
    statspath = os.path.join(path, '_KeyStats')

    # os.walk yields the root directory first; everything after it is data.
    # Sorting makes the output order reproducible across filesystems.
    stock_dirs = sorted([entry[0] for entry in os.walk(statspath)][1:])

    # Compile one pattern per statistic once instead of once per snapshot.
    stat_patterns = [re.compile(re.escape(label) + _STAT_VALUE_SUFFIX)
                     for label in gather]
    stat_columns = [_STAT_COLUMN_ALIASES.get(label, label) for label in gather]
    columns = (['Date',
                'Unix',
                'Ticker',
                'Price',
                'stock_p_change',
                'SP500',
                'sp500_p_change',
                'Difference']
               + stat_columns
               + ['Status'])

    # S&P 500 history, indexed by date (first CSV column).
    sp500_df = pd.read_csv("YAHOO-INDEX_GSPC.csv", index_col=0, parse_dates=True)

    rows = []
    for each_dir in stock_dirs:
        # First path component below _KeyStats, independent of the OS separator.
        ticker = os.path.relpath(each_dir, statspath).split(os.sep)[0]

        # Baselines for the percentage changes; set from the first snapshot of
        # this ticker that yields both a price and an index value.
        starting_stock_value = None
        starting_sp500_value = None

        # File names are timestamps, so sorting them processes snapshots in
        # chronological order and makes the baseline the earliest snapshot.
        for file_name in sorted(os.listdir(each_dir)):
            try:
                date_stamp = datetime.strptime(file_name, '%Y%m%d%H%M%S.html')
            except ValueError:
                # Not a snapshot (stray file or sub-directory): ignore it.
                continue
            unix_time = time.mktime(date_stamp.timetuple())

            # Only ASCII digits and markers are extracted, so undecodable bytes
            # are replaced rather than aborting the whole run.
            with open(os.path.join(each_dir, file_name), 'r',
                      errors='replace') as handle:
                source = handle.read()

            value_list = [_extract_stat(pattern, source)
                          for pattern in stat_patterns]

            sp500_value = _sp500_close(sp500_df, unix_time)
            if sp500_value is None:
                continue

            stock_price = _parse_stock_price(source)
            if stock_price is None:
                # Skip instead of silently reusing the previous snapshot's price.
                print('wtf stock price lol', ticker, file_name)
                continue

            # A zero baseline cannot be used as a divisor, so it is treated the
            # same as "not set yet" and replaced by the current value.
            if not starting_stock_value:
                starting_stock_value = stock_price
            if not starting_sp500_value:
                starting_sp500_value = sp500_value
            if not starting_stock_value or not starting_sp500_value:
                continue

            # Percentage change: (new - old) / old * 100.
            stock_p_change = ((stock_price - starting_stock_value)
                              / starting_stock_value) * 100
            sp500_p_change = ((sp500_value - starting_sp500_value)
                              / starting_sp500_value) * 100
            difference = stock_p_change - sp500_p_change
            status = "outperform" if difference > 0 else "underperform"

            # Only complete feature rows are kept.
            if "N/A" in value_list:
                continue

            rows.append([date_stamp,
                         unix_time,
                         ticker,
                         stock_price,
                         stock_p_change,
                         sp500_value,
                         sp500_p_change,
                         difference]
                        + value_list
                        + [status])

    # Build the frame once; appending row by row copies the frame every time.
    df = pd.DataFrame(rows, columns=columns)
    df.to_csv("key_stats.csv")


# Output column names for statistics whose column differs from the page label.
_STAT_COLUMN_ALIASES = {'Total Debt/Equity (mrq)': 'DE Ratio'}

# Appended to an escaped label: lazily skips to the first number (optionally
# suffixed with M/B) or "N/A" that closes a table cell.
# NOTE(review): the pattern has no optional leading minus sign, so a negative
# value would be captured without its sign - confirm against the source pages.
_STAT_VALUE_SUFFIX = r'.*?(\d{1,8}\.\d{1,8}M?B?|N/A)%?</td>'

# First decimal number inside a price fragment.
# NOTE(review): a price written with a thousands separator ("1,234.56") would
# be captured as "234.56" - confirm whether such prices occur in the data.
_PRICE_NUMBER = re.compile(r'(\d{1,8}\.\d{1,8})')

# Look-back used when the index has no entry for the snapshot date (3 days).
_SP500_FALLBACK_SECONDS = 259200


def _extract_stat(pattern, source):
    """Return the statistic matched by *pattern* in *source* as a float, or "N/A"."""
    found = pattern.search(source)
    if found is None:
        return "N/A"
    text = found.group(1)
    if text == "N/A":
        return text
    try:
        if "B" in text:
            return float(text.replace("B", '')) * 1000000000
        if "M" in text:
            return float(text.replace("M", '')) * 1000000
        return float(text)
    except ValueError:
        # e.g. both suffixes present ("1.5MB"): treat as unavailable.
        return "N/A"


def _sp500_close(sp500_df, unix_time):
    """Return the "Adj Close" for the day of *unix_time*, else 3 days earlier, else None."""
    for offset in (0, _SP500_FALLBACK_SECONDS):
        day = datetime.fromtimestamp(unix_time - offset).strftime('%Y-%m-%d')
        closes = sp500_df.loc[sp500_df.index == day, "Adj Close"]
        if len(closes) == 0:
            continue
        try:
            return float(closes.iloc[0])
        except (TypeError, ValueError):
            continue
    return None


def _parse_stock_price(source):
    """Return the stock price found in the page *source*, or None if none is found."""
    parts = source.split('</small><big><b>')
    if len(parts) > 1:
        fragment = parts[1].split('</b></big>')[0]
        try:
            return float(fragment)
        except ValueError:
            # The fragment contains extra markup; pull the number out of it.
            found = _PRICE_NUMBER.search(fragment)
            if found is not None:
                return float(found.group(1))

    # Alternative page layout.
    parts = source.split('<span class="time_rtq_ticker">')
    if len(parts) > 1:
        found = _PRICE_NUMBER.search(parts[1].split('</span>')[0])
        if found is not None:
            return float(found.group(1))
    return None
#time.sleep(10)
# Module-level call: runs the extraction with the default list of statistics
# whenever this file is executed (or imported).
Key_Stats()