forked from Kiize/Quantitative-Finance
-
Notifications
You must be signed in to change notification settings - Fork 0
/
build_dataset.py
143 lines (112 loc) · 4.73 KB
/
build_dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
"""
Code to download stock histories to create the dataset to analyze.
For SP500 data is possible to use:
https://pkgstore.datahub.io/core/s-and-p-500-companies/10/datapackage.json
"""
import time
import numpy as np
import datapackage
import pandas as pd
import yfinance as yf
def load(start_date, end_date, list_ticker=None, json_link=None,
save=False, path_data=None, path_info=None):
'''
Function to load history of titles via yfinance.
It's possibile to pass a list of ticker or a link to a .json file
with all information; for example for all SP500 is possible to use:
https://pkgstore.datahub.io/core/s-and-p-500-companies/10/datapackage.json
It's also possible to save the data in some file; in this case the function
will return None; If we don't want to save the data the function will return
the all data in a dictionary.
One between list_ticker or json_link is mandatory
Parameters
----------
start_date : string
start of the title's history, in the form: "yyyy-mm-dd"
end_date : string
end of the title's history, in the form: "yyyy-mm-dd"
list_ticker : list, optional, default None
list of ticker that we want
json_link : string, optional, default None
link to json file wiht necessary information
save : bool, optional, default False
set True to save data in .npy file
path_data : string, optional, default None
path for save data; mandatory for save=True
path_info : string, optional, default None
path to save some other information that are in the file .json
Return
------
history : dict
dictionary with all the desired title's history
'''
if list_ticker is None and json_link is None:
err_msg = "You must pass a list of tiker or \
\na link to json file with all information"
raise Exception(err_msg)
if list_ticker :
# Dictionary which will contain the histories of the various titles
history = {}
ticker_len = len(list_ticker)
for index, ticker in enumerate(list_ticker):
# Loop over the entire data set and read the symbol
print(f"[{(index + 1)}/{ticker_len}] download of: {ticker}")
# Download the history of the title, in the time range of interest
hist = yf.download(ticker, start=start_date, end=end_date,
interval='1d', progress=False)
# Check
if hist.empty:
print(f"The history of {ticker} is not available")
else:
history[ticker] = hist
# Save
if save:
np.save(path_data, history)
else:
return history
if json_link is not None:
package = datapackage.Package(json_link)
resources = package.resources
# Only the data written in the table are of interest,
# which contain: symbol, name and sector
for resource in resources:
if resource.tabular:
dataset = pd.read_csv(resource.descriptor['path'])
# Now dataset contains symbols, names and sectors of stocks.
# We reorder it in alphabetical order of the symbol and readjust
# the index that labels it
dataset = dataset.sort_values('Symbol').reset_index(drop=True)
# Save info on a csv
dataset.to_csv(f'{path_info}.csv', index=False)
# Dictionary which will contain the histories of the various titles
history = {}
dataset_len = len(dataset)
for index, row in dataset.iterrows():
# Loop over the entire data set and read the symbol
ticker = row['Symbol']
print(f"[{(index + 1)}/{dataset_len}] download of: {ticker}")
# Download the history of the title, in the time range of interest
hist = yf.download(ticker, start=start_date, end=end_date,
interval='1d', progress=False)
# Check
if hist.empty:
print(f"The history of {ticker} is not available")
else:
history[ticker] = hist
# Save
if save:
np.save(path_data, history)
else:
return history
if __name__ == "__main__":
sp500 = r"https://pkgstore.datahub.io/core/s-and-p-500-companies/10/datapackage.json"
path_file = r"data/16_19"
path_info = r"data/16_19_information"
start = time.time()
start_date = "2016-01-01"
end_date = "2019-12-31"
load(start_date, end_date, json_link=sp500,
save=True, path_data=path_file, path_info=path_info)
mins = (time.time()-start)//60
sec = (time.time()-start) % 60
print(f"Elapsed time: {mins} min {sec:.2f} sec")