In [1]:
import numpy as np
import pandas as pd
import os

In [139]:
# Minimum average traded value (close price * volume) a stock needs to be kept
volume_threshold = 10_000_000.0
# Folder in which the consolidated csv file is written.
# NOTE(review): both paths are absolute and machine-specific; adjust them
# before running the notebook on another machine.
save_data_path = "/home/dimitribouche/Bureau/ENSAE/OnlineLearning"
# Folder holding one quote file per stock
data_path = "/home/dimitribouche/Bureau/ENSAE/OnlineLearning/NYSE"

In [99]:
def load_stocks(path):
    """Load the close price and volume of every stock file found in a folder.

    Each regular file of ``path`` is read as a comma-separated table whose
    first column is used as the index and converted to datetimes. Only the
    "Close" and "Volume" columns are kept.

    Parameters
    ----------
    path : str
        Folder containing one quote file per stock.

    Returns
    -------
    dict
        Maps the stock name (the part of the file name before its first ".")
        to a DataFrame with the columns "Close" and "Volume".

    Raises
    ------
    KeyError
        If a non-empty file has no "Close" or "Volume" column.
    """
    stock_data = dict()
    for stock in os.listdir(path):
        file_path = os.path.join(path, stock)
        # Sub-folders (e.g. ".ipynb_checkpoints") are not quote files and
        # cannot be parsed: skip them instead of crashing
        if not os.path.isfile(file_path):
            continue
        name = stock.split(".")[0]
        try:
            quotes = pd.read_csv(file_path, index_col=0).loc[:, ["Close", "Volume"]]
        except pd.errors.EmptyDataError:
            # Empty files carry no quotes: ignore them
            continue
        quotes.index = pd.to_datetime(quotes.index)
        stock_data[name] = quotes
    return stock_data

In [100]:
# Read every stock file of the data folder into a dict of DataFrames
nyse_stocks = load_stocks(path=data_path)

In [101]:
def date_index_union(stock_data):
    """Return the union of the indexes of all the given stock tables.

    Parameters
    ----------
    stock_data : dict
        Maps a stock name to a DataFrame; only the index of each DataFrame
        is used.

    Returns
    -------
    pandas.Index
        Union of all the indexes. An empty ``DatetimeIndex`` is returned when
        ``stock_data`` is empty.
    """
    indexes = [quotes.index for quotes in stock_data.values()]
    if not indexes:
        # Nothing to merge: return an empty index rather than failing on an
        # unassigned local variable
        return pd.DatetimeIndex([])
    # Start from the first index itself so that its name is carried over
    date_union = indexes[0]
    for index in indexes[1:]:
        date_union = date_union.union(index)
    return date_union
        

In [144]:
# Union of the dates of all the loaded stocks
date_union = date_index_union(nyse_stocks)
# Position in date_union at which the studied period starts (2001-09-21 in
# the table saved with this notebook); earlier dates are discarded
first_date_position = 10000
most_recent = date_union[first_date_position:]

In [103]:
def pick_big_volumes(thresh_volume, stock_data):
    """Select the stocks whose average traded value exceeds a threshold.

    The traded value of a stock on a given date is its "Close" price times
    its "Volume"; the average is taken over all the dates of the stock.

    Parameters
    ----------
    thresh_volume : float
        Threshold that the average traded value must strictly exceed.
    stock_data : dict
        Maps a stock name to a DataFrame with "Close" and "Volume" columns.

    Returns
    -------
    pandas.Index
        Names of the stocks above the threshold. Stocks whose average is NaN
        (e.g. no quotes at all) are never selected.
    """
    # Compute all the averages first and build the Series in one shot with an
    # explicit dtype, instead of growing a dtype-less empty Series item by item
    mean_traded_value = {
        name: (quotes["Close"] * quotes["Volume"]).mean()
        for name, quotes in stock_data.items()
    }
    volumes = pd.Series(mean_traded_value, dtype="float64")
    return volumes[volumes > thresh_volume].index
    

In [145]:
# Names of the stocks whose average traded value is above the threshold
big_stocks = pick_big_volumes(thresh_volume=volume_threshold, stock_data=nyse_stocks)

In [146]:
def consolidate_data(stock_data, date_index, stocks_to_keep):
    """Gather the close prices of the selected stocks in a single table.

    Parameters
    ----------
    stock_data : dict
        Maps a stock name to a DataFrame with a "Close" column.
    date_index : pandas.Index
        Dates used as the rows of the result.
    stocks_to_keep : iterable of str
        Names of the stocks to include, in the order of the output columns.

    Returns
    -------
    pandas.DataFrame
        One row per date of ``date_index`` and one column per kept stock.
        A date on which a stock has no quote holds NaN.
    """
    # Align every price series on date_index and build the frame in a single
    # call: inserting the columns one at a time fragments the DataFrame and
    # makes pandas emit a PerformanceWarning beyond 100 columns
    closes = {
        stock: stock_data[stock]["Close"].reindex(date_index)
        for stock in stocks_to_keep
    }
    return pd.DataFrame(closes, index=date_index)

In [147]:
# Build the table of close prices of the big stocks over the dates of most_recent
conso_data = consolidate_data(
    stock_data=nyse_stocks,
    date_index=most_recent,
    stocks_to_keep=big_stocks,
)

In [148]:
# Number of leading dates of the period on which a stock must be quoted to be kept.
# NOTE(review): `tolerance` was not defined anywhere in this notebook, so this
# cell raised a NameError on a fresh kernel; 10 is a placeholder value —
# confirm the intended one.
tolerance = 10
# Drop the stocks (columns) that have a missing price on any of the first
# `tolerance` dates, then fill the remaining gaps with the last valid price.
# DataFrame.ffill() replaces the deprecated fillna(method="ffill").
cleaned = (
    conso_data
    .dropna(axis=1, subset=conso_data.index[:tolerance])
    .ffill()
)

In [149]:
# Preview the first rows only instead of rendering the whole table
cleaned.head(10)

Unnamed: 0_level_0,nwl,fhn,cmi,cma,wfc,cms,ups,nem,lmt,tgna,...,tmo,brk-a,brk-b,pg,eog,hd,au,mck,hpq,tjx
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2001-09-21,17.690,29.537,6.4215,43.127,16.944,15.085,40.068,19.856,32.755,29.588,...,15.048,62500.00,41.68,26.820,6.7267,28.305,15.4450,32.051,5.6889,6.7747
2001-09-24,18.018,29.955,6.8668,44.278,17.225,15.085,40.161,19.260,32.606,30.367,...,15.488,64000.00,42.46,27.806,6.4309,30.783,14.8480,32.289,6.1219,6.8337
2001-09-25,18.181,30.751,6.9695,45.840,17.372,14.887,41.359,19.565,32.881,29.482,...,15.602,65000.00,43.40,28.423,6.3165,31.031,14.9640,33.058,6.0882,6.9436
2001-09-26,18.392,31.850,6.7469,45.112,17.423,14.494,39.861,20.832,32.606,29.018,...,15.058,67800.00,45.24,28.472,6.3116,30.389,15.5610,34.293,6.0844,6.9388
2001-09-27,18.852,32.147,6.8838,45.967,17.431,14.287,40.083,21.083,33.613,29.949,...,15.244,69750.00,46.30,28.983,6.4688,31.543,15.3880,35.111,6.1605,6.9342
2001-09-28,18.945,32.113,7.0640,46.753,17.735,14.368,41.900,21.039,33.958,30.844,...,15.630,70000.00,46.60,29.144,6.8556,32.801,15.3970,35.509,6.1033,7.4318
2001-10-01,19.075,31.814,6.8754,46.376,17.735,14.368,40.826,22.134,33.858,30.073,...,15.468,70000.00,46.70,28.983,6.6934,32.329,15.4740,36.127,5.9316,7.4817
2001-10-02,19.347,32.331,7.1667,45.744,17.796,14.454,40.387,21.891,34.936,30.042,...,15.630,70000.00,46.74,29.335,6.8509,33.382,16.0430,36.363,5.7987,7.6780
2001-10-03,19.895,31.859,7.2006,45.608,17.859,14.793,41.195,20.581,35.239,30.639,...,16.240,72700.00,48.44,29.321,7.0030,34.903,15.6770,36.555,6.0844,7.9289
2001-10-04,19.903,30.760,7.2605,45.325,17.654,15.085,41.917,20.562,35.397,31.283,...,16.888,71850.00,47.72,28.504,7.3756,33.501,15.7730,36.269,6.0844,7.8922


In [91]:
# Write the cleaned price table to DataNyse.csv inside the save folder
cleaned.to_csv("/".join((save_data_path, "DataNyse.csv")))