In [None]:
import yfinance as yf
import matplotlib.pyplot as plt

# Set constant variables
TICKERS = ['AAPL']  # You can expand this list
START_DATE = '2022-01-01'
END_DATE = '2022-06-01'
OUTPUT_FILE = 'df.csv'

In [14]:
import numpy as np
import pandas as pd

def compute_features(data, shares) -> pd.DataFrame:
    """ 
        Calculate addtional features from a raw dataframe fetched from yf

        data: yf dataframe consisting of five columns ['Open', 'High', 'Close', 'Low', 'Volume']
        shares: number of outstanding shares 
    """
    # Compute additional features
    data['Amount'] = data['Close'] * data['Volume']
    data['Relative Volume(20d)'] = data['Volume'] / data['Volume'].rolling(20).mean()

    if not np.isnan(shares):
        data['Turnover'] = data['Volume'] / shares
    else:
        data['Turnover'] = np.nan

     # Compute daily price changes
    data['Open Change'] = data['Open'].pct_change()
    data['High Change'] = data['High'].pct_change()
    data['Close Change'] = data['Close'].pct_change()
    data['Low Change'] = data['Low'].pct_change()
    
    # Create price prediction target
    data['Today Return'] = data['Close Change']
    data['Tomorrow Return'] = data['Today Return'].shift(-1)

    return data

In [19]:
from tqdm import tqdm

all_data = []
for ticker in tqdm(TICKERS, desc="Processing Tickers"):
    raw = yf.download(ticker, start=START_DATE, end=END_DATE)
    if raw.empty:
        continue

    # Create a deepy copy of raw data
    data = raw.copy()

    # Fetch shares outstanding for computing turnover
    ticker_info = yf.Ticker(ticker).info
    shares = ticker_info.get('sharesOutstanding', np.nan)

    df = compute_features(data, shares)
    df['Ticker'] = ticker
    df.reset_index(inplace=True)
    all_data.append(df)


[*********************100%***********************]  1 of 1 completed
Processing Tickers: 100%|██████████| 1/1 [00:00<00:00,  3.26it/s]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103 entries, 0 to 102
Data columns (total 16 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   (Date, )                  103 non-null    datetime64[ns]
 1   (Close, AAPL)             103 non-null    float64       
 2   (High, AAPL)              103 non-null    float64       
 3   (Low, AAPL)               103 non-null    float64       
 4   (Open, AAPL)              103 non-null    float64       
 5   (Volume, AAPL)            103 non-null    int64         
 6   (Amount, )                103 non-null    float64       
 7   (Relative Volume(20d), )  84 non-null     float64       
 8   (Turnover, )              103 non-null    float64       
 9   (Open Change, )           102 non-null    float64       
 10  (High Change, )           102 non-null    float64       
 11  (Close Change, )          102 non-null    float64       
 12  (Low Change, )        




In [9]:
yf.Ticker('AAPL').info['sharesOutstanding']

14935799808

In [34]:
import os
import tqdm
import pandas as pd
from pyproj import Transformer

dataframes = []

# 3. Loop through the extracted .dat files
datadir = "/Users/joeycullmann/Downloads/files/file-000001.dat"
df = pd.read_csv(datadir, sep='\s+')

transformer = Transformer.from_crs("EPSG:3857", "EPSG:4326", always_xy=True)
df['lon'], df['lat'] = transformer.transform(df['x'].values, df['y'].values)

df.head()

Unnamed: 0,x,y,k,tid,lon,lat
0,-13617805.7,4516997.306,0,1,-122.33083,37.55675
1,-13617679.91,4517273.941,1,1,-122.3297,37.55872
2,-13617691.04,4517365.218,2,1,-122.3298,37.55937
3,-13617733.35,4517401.729,3,1,-122.33018,37.55963
4,-13617882.51,4517400.325,4,1,-122.33152,37.55962


In [36]:
from tqdm import tqdm
import os
import pandas as pd
from pyproj import Transformer

basedir = "/Users/joeycullmann/Downloads/files/"
trajectory_rows = []

for file in tqdm(os.listdir(basedir)):
    if file.endswith(".dat"):
        path = os.path.join(basedir, file)
        try:
            df = pd.read_csv(path, sep='\s+')
            transformer = Transformer.from_crs("EPSG:3857", "EPSG:4326", always_xy=True)
            df['lon'], df['lat'] = transformer.transform(df['x'].values, df['y'].values)

            tid = df['tid'][0]
            coords = list(zip(df['lon'], df['lat']))
            trajectory_rows.append({'trip_id': tid, 'geometry': coords})
        except Exception as e:
            print(f"Error reading {file}: {e}")

# Step 3: Create a DataFrame
traj_df = pd.DataFrame(trajectory_rows)
traj_df.to_csv("sf_trajs.csv", index=False)

100%|██████████| 20200/20200 [00:14<00:00, 1434.35it/s]


0