In [1]:
import os
import sys
import cv2
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from tqdm import tqdm

# figuring out

In [None]:
df = pd.read_csv("data/csv/traning/AAPL.csv", 
                 parse_dates=['Date'],
                 index_col='Date').drop(columns=['Unnamed: 0'])
df.head(20)

In [None]:
window_size = 5
for i in range(window_size, len(df) - window_size, window_size):
    start = i - window_size
    end = i 
    horizon_start = i 
    horizon_end = i - 1 + window_size
    past_start = i - (window_size*2)
    image_path_not_completed = f"5day/AAPL/{start}"
    
    print(image_path_not_completed, " ---------------------------------------------------")
    
    current_df = df.iloc[start:end].copy()
    horizon_start_value = df.iloc[horizon_start]
    horizon_end_value = df.iloc[horizon_end]
    print("Start", start)
    print("End", end)
    print("Horizon start", horizon_start)
    print("Horizon end", horizon_end)
    display(current_df)
    print(f"Horizon Start: {horizon_start_value}")
    print(f"Horizon End: {horizon_end_value}")
    print("PAST --->", past_start)
    break

In [None]:
# Reset index temporarily
result = df.reset_index(drop=False).iloc[1::5]
result

# doing stuff

Load in price data

In [2]:
csv_folder_dir = "data/csv/out_of_sample/"
csv_list = [x for x in os.listdir(csv_folder_dir) if not x.startswith(".")]

prices_df = None

for csv_name in csv_list:
    csv_dir = os.path.join(csv_folder_dir, csv_name)
    current_df = pd.read_csv(csv_dir, 
                             usecols=['Date', 'Close'],
                             parse_dates=['Date'])
    current_df.rename(columns={'Close': f"{csv_name[:-4]}"}, inplace=True)
    
    # Merge the current DataFrame with the final DataFrame
    if prices_df is None:
        prices_df = current_df  # Initialize with the first DataFrame
    else:
        prices_df = pd.merge(prices_df, current_df, on='Date', how='outer')  # Merge on 'Date'    

In [3]:
prices_df.head(3)

Unnamed: 0,Date,CSCO,UAL,TROW,ISRG,NVR,TPR,DVN,CE,MRO,...,WM,DOV,CRM,PGR,WAT,IEX,BWA,LRCX,BLK,PPL
0,2021-01-04,43.959999,41.630001,147.690002,266.666656,4040.649902,32.709999,16.129999,125.93,6.83,...,114.830002,123.339996,220.309998,97.129997,250.149994,195.309998,33.503521,47.801998,710.820007,27.58
1,2021-01-05,43.98,43.400002,149.089996,268.350006,4008.909912,32.52,17.360001,131.600006,7.45,...,115.209999,124.059998,221.520004,96.949997,255.429993,194.929993,34.295776,49.455002,714.580017,27.610001
2,2021-01-06,44.400002,43.540001,151.529999,264.263336,3966.48999,34.32,18.32,136.550003,7.5,...,117.760002,127.559998,216.149994,98.790001,262.470001,202.179993,35.360916,49.648998,734.960022,28.110001


Create dataframe with model inference

In [4]:
from tensorflow.keras.models import load_model

model = load_model("models/5-day.keras")
model.summary()

2024-11-27 15:42:56.595676: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M1 Pro
2024-11-27 15:42:56.595701: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 16.00 GB
2024-11-27 15:42:56.595706: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 5.33 GB
2024-11-27 15:42:56.595725: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-11-27 15:42:56.595738: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [5]:
%%time
image_dir = "data/images_small/out_of_sample/5-day/"
dirist = [x for x in os.listdir(image_dir) if not x.startswith(".")]
predictions_dict = {}
for folder_name in tqdm(dirist):
    folder_dir = os.path.join(image_dir, folder_name)
    current_images = []
    # load images
    for file_name in os.listdir(folder_dir):
        file_dir = os.path.join(folder_dir, file_name)
        img = cv2.imread(file_dir, cv2.IMREAD_GRAYSCALE)
        current_images.append(img)
    # get model prediction
    current_images = np.array(current_images)
    current_predictions = model.predict(current_images, verbose=0).flatten()
    predictions_dict[folder_name] = current_predictions

#-------------
predictions_df = pd.DataFrame(predictions_dict)

del model

  0%|          | 0/489 [00:00<?, ?it/s]2024-11-27 15:42:56.911386: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.
100%|██████████| 489/489 [00:26<00:00, 18.55it/s]

CPU times: user 15.4 s, sys: 5.84 s, total: 21.2 s
Wall time: 26.4 s





In [6]:
new_index = range(5, 5 * (len(predictions_df) + 1), 5)
predictions_df.index = new_index
predictions_df.head(3)

Unnamed: 0,CTAS,WELL,VZ,AMZN,CNP,RCL,CAT,TFC,AAPL,PANW,...,EXPE,HUM,HST,NVR,STT,CCI,SCHW,STZ,MSCI,GLW
5,0.493433,0.506692,0.503856,0.504588,0.503391,0.516497,0.502029,0.49916,0.493861,0.498304,...,0.489093,0.508085,0.510367,0.495663,0.516739,0.518014,0.493344,0.511776,0.502274,0.501523
10,0.51254,0.498483,0.516194,0.509003,0.49333,0.499638,0.501157,0.51179,0.498543,0.505561,...,0.508218,0.517695,0.515029,0.494546,0.50989,0.485286,0.506887,0.492358,0.508084,0.524672
15,0.512908,0.504336,0.52798,0.497231,0.508439,0.504626,0.502002,0.510966,0.485071,0.490222,...,0.507604,0.49778,0.490733,0.504038,0.485943,0.502652,0.498867,0.495877,0.500681,0.483182


---
### For the sake of clarity:
- `predictions_df` --> One row in this df, represents the **probability to have a positive returns in the next time period** (in this case 5 days), its a probability that we calculate based on one week worth of data, and we know that the week starts at the index.

*EXAMPLE:* So the first row of `predictions_df` has `index = 5`, so we know that it its the probability to have a positive returns in the time frame described from indeces `10` to `15` in the `prices_df`.
> 
> `predictions_df.index(5)` is calculated based on `price_df.iloc[5:10]` and its the probability to have a positive return in the next 5 day window `price_df.iloc[10:15]`

---

We start our simulation at index 10 of our price_df, we use only the information we have up untill that point and among all the firms in the sp500, we 
- **buy** the top 10 with highest probability to have a positive return
- **short** the ones with the lowest probability to have a positive return

In [7]:
for today in range(5, len(prices_df), 5):
    # First we select the companies
    
    print(today)

5
10
15
20
25
30
35
40
45
50
55
60
65
70
75
80
85
90
95
100
105
110
115
120
125
130
135
140
145
150
155
160
165
170
175
180
185
190
195
200
205
210
215
220
225
230
235
240
245
250
255
260
265
270
275
280
285
290
295
300
305
310
315
320
325
330
335
340
345
350
355
360
365
370
375
380
385
390
395
400
405
410
415
420
425
430
435
440
445
450
455
460
465
470
475
480
485
490
495
500
505
510
515
520
525
530
535
540
545
550
555
560
565
570
575
580
585
590
595
600
605
610
615
620
625
630
635
640
645
650
655
660
665
670
675
680
685
690
695
700
705
710
715
720
725
730
735
740
745
750
755
760
765
770
775
780
785
790
795
800
805
810
815
820
825
830
835
840
845
850
855
860
865
870
875
880
885
890
895
900
905
910
915
920
925
930
935
940
945
950
955
960
965
970
975


In [8]:
predictions_df.iloc[5]

CTAS    0.510378
WELL    0.503129
VZ      0.511674
AMZN    0.489840
CNP     0.522878
          ...   
CCI     0.502417
SCHW    0.510327
STZ     0.517320
MSCI    0.501626
GLW     0.499121
Name: 30, Length: 489, dtype: float32

In [9]:
predictions_df.loc[5]

CTAS    0.493433
WELL    0.506692
VZ      0.503856
AMZN    0.504588
CNP     0.503391
          ...   
CCI     0.518014
SCHW    0.493344
STZ     0.511776
MSCI    0.502274
GLW     0.501523
Name: 5, Length: 489, dtype: float32