In [3]:
import gc  
import os  
import time  
import warnings 
from itertools import combinations  
from warnings import simplefilter 
# import lightgbm as lgb
import numpy as np  
import pandas as pd
# import plotly.graph_objs as go

# from sklearn.metrics import mean_absolute_error 
# from sklearn.model_selection import KFold, TimeSeriesSplit 


In [27]:
# Disable truncation so displayed DataFrames show every row and every column
pd.options.display.max_rows = None
pd.options.display.max_columns = None

# Load the rides CSV from the working directory into a DataFrame
df = pd.read_csv("6XWX_bike_rides.csv")


In [28]:
# Parse the `date` column into datetimes so the `.dt` accessor can be used
df['date'] = pd.to_datetime(df['date'])

# Keep only the rows whose date falls in calendar year 2022
df_2022 = df.loc[df['date'].dt.year == 2022]

# Report the mean `group_size` over those rows, rounded to two decimals
avg_group_size_2022 = df_2022['group_size'].mean()
print(f"The average group size for rides in 2022 was: {avg_group_size_2022:.2f}")

The average group size for rides in 2022 was: 3.76


# Load data and perform feature extraction

In [90]:
def read_csv_files_in_folder(folder_path):
    """Recursively load every ``.csv`` file found under ``folder_path``.

    Parameters
    ----------
    folder_path : str or path-like
        Root directory to walk (sub-directories are included).

    Returns
    -------
    dict
        Maps each file name with the trailing ``.csv`` removed to the
        DataFrame produced by ``pd.read_csv``. If two files in different
        sub-directories share a name, the one visited later replaces the
        earlier entry.
    """
    suffix = ".csv"
    frames = {}
    for current_dir, _subdirs, filenames in os.walk(folder_path):
        csv_names = [name for name in filenames if name.endswith(suffix)]
        for name in csv_names:
            full_path = os.path.join(current_dir, name)
            frames[name[: -len(suffix)]] = pd.read_csv(full_path)
    return frames


def add_psar(data, af_step=0.02, af_max=0.2):
    """Add Parabolic SAR columns to ``data`` in place.

    The frame is walked row by row, starting in an up-trend with the SAR at
    the first ``Low`` and the extreme point (EP) at the first ``High``. On
    each row the SAR moves toward the EP by the previous row's acceleration
    factor (AF). When the row's price crosses the SAR, the trend flips: the
    SAR jumps to the old EP, the EP restarts at the row's ``Low``/``High``
    and the AF resets to ``af_step``. Otherwise a new extreme raises the AF
    by ``af_step`` (capped at ``af_max``).

    Note: the SAR is not clamped against the preceding rows' lows/highs.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``High``, ``Low`` and ``Close`` columns. Rows are used
        in positional order. The frame is modified in place.
    af_step : float, default 0.02
        Initial acceleration factor and the increment applied on each new
        extreme point.
    af_max : float, default 0.2
        Upper bound for the acceleration factor.

    Returns
    -------
    list of str
        ``["psar", "psar_diff"]`` - the names of the columns written.
        ``psar`` holds the SAR (first row is set to the first ``Close``);
        ``psar_diff`` is ``Close - psar``. An empty frame gets two empty
        columns instead of raising.
    """
    n_rows = len(data)
    psar_array = np.zeros(n_rows)
    af_array = np.full(n_rows, af_step)

    # Guard: with no rows there is no seed value to read.
    if n_rows > 0:
        # Extract the columns once; indexing data.iloc[i][...] inside the
        # loop would build a whole row Series on every access.
        highs = data["High"].to_numpy(dtype=float)
        lows = data["Low"].to_numpy(dtype=float)

        # Seed state: assume an up-trend starting at the first bar.
        bull = True
        psar = lows[0]
        ep = highs[0]

        for i in range(1, n_rows):
            prev_af = af_array[i - 1]
            if bull:
                psar = psar + prev_af * (ep - psar)
                if lows[i] < psar:
                    # Price fell through the SAR: flip to a down-trend.
                    bull = False
                    psar = ep
                    ep = lows[i]
                    af_array[i] = af_step
                elif highs[i] > ep:
                    # New high in the up-trend: accelerate.
                    ep = highs[i]
                    af_array[i] = min(prev_af + af_step, af_max)
                else:
                    af_array[i] = prev_af
            else:
                psar = psar - prev_af * (psar - ep)
                if highs[i] > psar:
                    # Price rose through the SAR: flip to an up-trend.
                    bull = True
                    psar = ep
                    ep = highs[i]
                    af_array[i] = af_step
                elif lows[i] < ep:
                    # New low in the down-trend: accelerate.
                    ep = lows[i]
                    af_array[i] = min(prev_af + af_step, af_max)
                else:
                    af_array[i] = prev_af

            psar_array[i] = psar

        # The first row has no computed SAR; use the close so psar_diff is 0.
        psar_array[0] = data["Close"].iloc[0]

    data["psar"] = psar_array
    data["psar_diff"] = data["Close"] - psar_array
    return ["psar", "psar_diff"]


def add_rel_ma_ema(data, day_range):
    """Add the close price relative to its simple and exponential averages.

    For every window ``d`` in ``day_range`` two columns are written to
    ``data`` in place:

    * ``ma{d}``  - ``Close`` divided by its ``d``-row rolling mean
    * ``ema{d}`` - ``Close`` divided by its exponentially weighted mean
      with ``span=d``

    Returns the list of created column names, in creation order.
    """
    close = data["Close"]
    created = []
    for window in day_range:
        ma_name, ema_name = f"ma{window}", f"ema{window}"
        data[ma_name] = close / close.rolling(window).mean()
        data[ema_name] = close / close.ewm(span=window).mean()
        created += [ma_name, ema_name]
    return created

def add_rel_volume_ma(data, day_range):
    """Add the volume relative to its rolling averages.

    For every window ``d`` in ``day_range`` a ``vol{d}`` column holding
    ``Volume`` divided by its ``d``-row rolling mean is written to ``data``
    in place. Returns the list of created column names.
    """
    volume = data["Volume"]
    created = []
    for window in day_range:
        name = f"vol{window}"
        created.append(name)
        data[name] = volume / volume.rolling(window).mean()
    return created

def add_past_day_change(data, col, day_range):
    """Add the change of selected columns versus ``d`` rows earlier.

    For every lag ``d`` in ``day_range`` and every source column ``c`` a
    new column ``{c}_{d}dChg`` holding ``data[c] - data[c].shift(d)`` is
    written to ``data`` in place (the first ``d`` rows are NaN).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the source columns; modified in place.
    col : str or iterable of str
        Source column name(s). A single name may be given as a plain
        string; it is treated as a one-element list rather than being
        iterated character by character.
    day_range : iterable of int
        Lags, in rows, to difference against.

    Returns
    -------
    list of str
        Names of the created columns, grouped by lag in ``day_range`` order.
    """
    # A bare string would otherwise be split into its characters below.
    columns = [col] if isinstance(col, str) else list(col)

    past_col = []
    for d in day_range:
        new_col = [f"{name}_{d}dChg" for name in columns]
        # DataFrame-to-new-columns assignment maps columns by position.
        data[new_col] = data[columns] - data[columns].shift(d)
        past_col.extend(new_col)
    return past_col

In [94]:
# Look-back windows in days; 55 days is treated as long enough to capture
# short-term trends.
day_range = [3, 5, 8, 13, 21, 34, 55]

# Load every CSV under the `data` folder (keyed by file name without the
# extension) and preview the first five rows of the AAPL frame.
all_data = read_csv_files_in_folder(folder_path="data")
all_data["AAPL"].head()


Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2019-01-02,38.7225,39.712502,38.557499,39.48,37.845039,148158800
1,2019-01-03,35.994999,36.43,35.5,35.547501,34.075386,365248800
2,2019-01-04,36.1325,37.137501,35.950001,37.064999,35.530045,234428400
3,2019-01-07,37.174999,37.2075,36.474998,36.982498,35.450966,219111200
4,2019-01-08,37.389999,37.955002,37.130001,37.6875,36.126766,164101200


In [67]:
# Writes the `psar` and `psar_diff` columns into the AAPL frame in place;
# psar_col keeps the names of the columns that were added.
psar_col = add_psar(all_data["AAPL"], af_step=0.02, af_max=0.2)

In [92]:
# Writes `ma{d}` / `ema{d}` ratio columns for every window in day_range into
# the AAPL frame in place; ma_ema_col keeps the added column names.
ma_ema_col = add_rel_ma_ema(all_data["AAPL"], day_range)

In [95]:
# Writes a `vol{d}` relative-volume column for every window in day_range into
# the AAPL frame in place; rel_vol_col keeps the added column names.
rel_vol_col = add_rel_volume_ma(all_data["AAPL"], day_range)

In [97]:
# For each relative-volume column, adds its change versus 1, 3 and 5 rows
# earlier (`{col}_{d}dChg`) to the AAPL frame in place.
past_col = add_past_day_change(all_data["AAPL"], rel_vol_col, day_range=[1,3,5])

In [98]:
# Show the last 10 rows to inspect the columns added by the cells above.
all_data["AAPL"].tail(10)

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,vol3,vol5,vol8,...,vol21_3dChg,vol34_3dChg,vol55_3dChg,vol3_5dChg,vol5_5dChg,vol8_5dChg,vol13_5dChg,vol21_5dChg,vol34_5dChg,vol55_5dChg
1295,2024-02-26,182.240005,182.759995,180.649994,181.160004,181.160004,40867400,0.886627,0.8752,0.810779,...,-0.003823,0.009915,-0.007729,0.008337,-0.051796,-0.164634,-0.040146,-0.128319,-0.123273,-0.156902
1296,2024-02-27,181.100006,183.919998,179.559998,182.630005,182.630005,54318900,1.161438,1.160026,1.07848,...,0.048267,0.06512,0.040172,0.207674,0.20158,0.025853,0.159528,0.030493,0.06397,0.015532
1297,2024-02-28,182.509995,183.119995,180.130005,181.419998,181.419998,48953900,1.018881,1.01332,1.01341,...,0.072635,0.080123,0.072558,0.159033,0.229627,0.199956,0.227708,0.137883,0.159216,0.135407
1298,2024-02-29,181.270004,182.570007,179.529999,180.75,180.75,136682600,1.70885,2.096729,2.309657,...,1.595314,1.656536,1.668859,0.64519,1.101155,1.302986,1.379064,1.401121,1.469584,1.468051
1299,2024-03-01,179.550003,180.529999,177.380005,179.660004,179.660004,73488000,0.850803,1.037056,1.191893,...,0.243982,0.272007,0.306511,-0.123412,0.106018,0.33022,0.324369,0.417371,0.4549,0.476912
1300,2024-03-04,176.149994,176.899994,173.789993,175.100006,175.100006,81510100,0.838349,1.031895,1.222882,...,0.462725,0.48537,0.532381,-0.048278,0.156695,0.412103,0.498852,0.604431,0.634671,0.675516
1301,2024-03-05,170.759995,172.039993,169.619995,170.119995,170.119995,95132400,1.140993,1.091551,1.321116,...,-0.752873,-0.812916,-0.768789,-0.020445,-0.068475,0.242636,0.37464,0.599981,0.591548,0.65909
1302,2024-03-06,171.059998,171.240005,168.679993,169.119995,169.119995,68587700,0.839061,0.753048,0.915203,...,-0.090969,-0.13932,-0.107172,-0.17982,-0.260272,-0.098208,0.080037,0.253767,0.235457,0.297182
1303,2024-03-07,169.149994,170.729996,168.490005,169.0,169.0,71765100,0.914263,0.918927,0.910669,...,-0.182243,-0.193318,-0.157853,-0.794588,-1.177802,-1.398988,-1.316391,-1.173126,-1.215183,-1.151195
1304,2024-03-08,169.0,173.699997,168.940002,170.729996,170.729996,76114600,1.054865,0.968108,0.933586,...,-0.370444,-0.334183,-0.314533,0.204061,-0.068947,-0.258307,-0.1192,-0.014445,-0.014642,0.038047


In [79]:
import plotly.graph_objs as go

# Moving-average windows to plot. Each needs an existing `ma{d}` column on the
# AAPL frame (written by add_rel_ma_ema), i.e. Close divided by its d-row
# rolling mean.
ma_windows = [3, 13, 55]

x_values = all_data["AAPL"]["Date"]
y_values = [all_data["AAPL"][f"ma{d}"] for d in ma_windows]

# One line per window, named after its column so the legend identifies it.
trace_data = [
    go.Scatter(x=x_values, y=y, mode='lines', name=y.name)
    for y in y_values
]

layout = go.Layout(
    title='AAPL close relative to its moving averages',
    xaxis=dict(title='Date'),
    yaxis=dict(title='Close / moving average'),
)

fig = go.Figure(data=trace_data, layout=layout)

# Display the figure
fig.show()