In [13]:
import os
import sys
import pandas as pd

In [14]:
# Add the project root (the parent of the current working directory) to sys.path
# so project-local packages can be imported; adjust the relative path as needed.
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))  # go up one directory
# Only append when missing: re-running this cell would otherwise add the same
# entry to sys.path again on every execution.
if project_root not in sys.path:
    sys.path.append(project_root)

In [15]:
from io import StringIO
from dotenv import load_dotenv
from azure.storage.blob import BlobServiceClient

from src.utils.helper import format_to_decimal, check_or_create_logfile, append_to_log

In [None]:
def clean_data(df: pd.DataFrame) -> pd.DataFrame:
    """Normalise the raw price columns of ``df`` in place and return it.

    * ``Volume`` -- thousands separators (``,``, ``'`` and ``.``) are removed
      and the column is converted to ``int``.
    * ``Date``   -- parsed with the ``%m/%d/%Y`` format.
    * ``Open``/``High``/``Close``/``Low`` -- passed through the imported
      ``format_to_decimal`` helper and converted to ``float``.

    Args:
        df: Frame containing at least the columns listed above.

    Returns:
        The same DataFrame object, with the columns overwritten.

    Raises:
        ValueError: if a Volume value still is not an integer literal after
            the separators have been stripped (e.g. a missing value).
    """
    volume = df["Volume"]
    if not pd.api.types.is_numeric_dtype(volume):
        # All separators must be removed with the *string* accessor. Chaining
        # Series.replace after .str.replace only matches whole cell values, so
        # apostrophes and dots were previously left in place.
        # NOTE(review): '.' is treated as a thousands separator, as in the
        # original code -- confirm the source never carries decimal volumes.
        volume = volume.astype(str).str.replace(r"[,'.]", "", regex=True)
    # A column that the CSV reader already parsed as numeric has no .str
    # accessor and needs no cleaning, so it is converted directly.
    df["Volume"] = volume.astype(int)

    df["Date"] = pd.to_datetime(df["Date"], format='%m/%d/%Y')

    for column in ("Open", "High", "Close", "Low"):
        df[column] = df[column].apply(format_to_decimal).astype(float)
    return df

In [17]:
def add_features(df: pd.DataFrame, source: str) -> pd.DataFrame:
    """Add source, calendar and price-derived feature columns to ``df`` in place.

    Columns added, in this order:
    ``Source`` (inserted as the first column), ``Month``, ``Day``, ``Year``,
    ``DayOfWeek``, ``MA_5``, ``MA_20``, ``EMA_5``, ``EMA_20``, ``STD_5``,
    ``STD_20``, ``DailyReturn``, ``Volatility``, ``PriceChange``.

    Every derived price value is passed through the imported
    ``format_to_decimal`` helper before being stored.

    Args:
        df: Frame with a datetime ``Date`` column and numeric ``Open`` and
            ``Close`` columns.
        source: Label written to every row of the new ``Source`` column.

    Returns:
        The same DataFrame object, with the extra columns.

    Raises:
        ValueError: if ``df`` already has a ``Source`` column
            (``DataFrame.insert`` rejects duplicates by default).
    """
    df.insert(0, "Source", source)

    # Calendar parts of the trading date.
    dates = df["Date"].dt
    df["Month"] = dates.month
    df["Day"] = dates.day
    df["Year"] = dates.year
    df["DayOfWeek"] = dates.dayofweek

    close = df["Close"]

    # Simple and exponential moving averages over 5 and 20 rows.
    df["MA_5"] = close.rolling(5).mean().apply(format_to_decimal)
    df["MA_20"] = close.rolling(20).mean().apply(format_to_decimal)
    df["EMA_5"] = close.ewm(span=5, adjust=False).mean().apply(format_to_decimal)
    df["EMA_20"] = close.ewm(span=20, adjust=False).mean().apply(format_to_decimal)

    # Rolling standard deviation. These columns previously used .mean() and
    # therefore just duplicated MA_5 / MA_20.
    df["STD_5"] = close.rolling(5).std().apply(format_to_decimal)
    df["STD_20"] = close.rolling(20).std().apply(format_to_decimal)

    # Volatility is derived from the unformatted returns so it is not affected
    # by whatever rounding or type conversion format_to_decimal applies.
    daily_return = close.pct_change()
    df["DailyReturn"] = daily_return.apply(format_to_decimal)
    df["Volatility"] = daily_return.rolling(20).std().apply(format_to_decimal)
    df["PriceChange"] = (close - df["Open"]).apply(format_to_decimal)

    return df

In [18]:
def upload_silver_to_blob(df, serviceClient, containerName, blobName):
    """Upload ``df`` as CSV text to a blob, overwriting any existing content.

    The CSV is written without the index and with missing values rendered as
    the literal ``nan``.

    Args:
        df: DataFrame to serialise.
        serviceClient: Object exposing ``get_blob_client(container=..., blob=...)``.
        containerName: Name of the target container.
        blobName: Name (path) of the target blob inside the container.
    """
    # With no target passed, to_csv returns the CSV text instead of writing it.
    csv_text = df.to_csv(index=False, encoding='utf-8', na_rep='nan')

    target_blob = serviceClient.get_blob_client(container=containerName, blob=blobName)
    target_blob.upload_blob(csv_text, overwrite=True)

In [19]:
def append_latest_to_silver(df, serviceClient, containerName, blobName):
    """Append ``df`` to the CSV already stored in a blob and re-upload the result.

    If the blob does not exist yet, ``df`` alone is uploaded. If it exists, its
    content is downloaded, parsed (``Date`` as datetime; ``nan``/``NaN`` as
    missing), concatenated with ``df`` (existing rows first) and written back
    with ``upload_silver_to_blob``.

    Any failure while downloading or parsing the existing blob propagates to
    the caller. Previously such errors were only printed and the function then
    overwrote the blob with the new rows alone, discarding the stored history.

    Args:
        df: New rows to append.
        serviceClient: Blob service client used to reach the blob.
        containerName: Name of the container holding the blob.
        blobName: Name (path) of the blob inside the container.
    """
    blob_client = serviceClient.get_blob_client(container=containerName, blob=blobName)

    if blob_client.exists():
        existing_data = blob_client.download_blob().readall().decode('utf-8')
        existing_df = pd.read_csv(
            StringIO(existing_data),
            parse_dates=["Date"],
            keep_default_na=True,
            na_values=['nan', 'NaN'],
        )

        df = pd.concat([existing_df, df], ignore_index=True)
        # Treat empty strings as missing so they are written out like other gaps.
        df = df.replace('', pd.NA)
        # NOTE(review): rows are not de-duplicated, so appending the same data
        # twice stores it twice -- confirm whether a uniqueness key should apply.
    else:
        print(f"No existing data found at {blobName}; uploading new data only.")

    upload_silver_to_blob(df, serviceClient, containerName, blobName)

In [24]:
def main(initial_upload: bool = True):
    """Build the silver dataset from the bronze CSV blobs and store it in Azure.

    Configuration is read from the environment (after ``load_dotenv()``):
    ``AZURE_CONNECTION_STRING``, ``CONTAINER_NAME``, ``BRONZE_LOCATION`` (blob
    name prefix of the input files) and ``SILVER_LOCATION`` (prefix prepended
    to ``silver_output.csv``).

    For every blob under the bronze prefix, the first row of the CSV is
    cleaned with ``clean_data`` and enriched with ``add_features``. The
    blob's base name with ``.csv`` removed is used as the ``Source`` label.

    Args:
        initial_upload: When True (default, the previous hard-coded behaviour)
            the silver blob is overwritten with the combined frame. When False
            the combined frame is appended to the existing silver blob via
            ``append_latest_to_silver``.

    Returns:
        ``(combinedData, data)``: the combined frame for all blobs and the
        frame of the last blob processed. Both are empty DataFrames when no
        blob matches the bronze prefix.

    Raises:
        RuntimeError: if any required environment variable is not set.
    """
    load_dotenv()

    # Fail early with a clear message. An unset variable would otherwise
    # surface as an obscure client error or as a blob literally named
    # "Nonesilver_output.csv".
    required = ("AZURE_CONNECTION_STRING", "CONTAINER_NAME", "BRONZE_LOCATION", "SILVER_LOCATION")
    settings = {name: os.getenv(name) for name in required}
    missing = [name for name, value in settings.items() if value is None]
    if missing:
        raise RuntimeError(f"Missing required environment variables: {', '.join(missing)}")

    containerName = settings["CONTAINER_NAME"]
    bronzeLocation = settings["BRONZE_LOCATION"]
    silverFile = settings["SILVER_LOCATION"]

    serviceClient = BlobServiceClient.from_connection_string(settings["AZURE_CONNECTION_STRING"])
    blobContainer = serviceClient.get_container_client(containerName)

    frames = []            # per-blob frames, concatenated once after the loop
    data = pd.DataFrame()  # keeps the return value defined when no blob matches

    for blob in blobContainer.list_blobs(name_starts_with=bronzeLocation):
        blob_client = blobContainer.get_blob_client(blob.name)
        stockPrices = blob_client.download_blob().readall().decode('utf-8')

        data = pd.read_csv(StringIO(stockPrices), keep_default_na=True, parse_dates=["Date"])

        # Only the first row of each file is kept. The copy lets the helpers
        # mutate the frame without touching a slice of the parsed CSV.
        # NOTE(review): with a single row every rolling feature is NaN --
        # confirm this is intended, especially for an initial upload.
        data = data.head(1).copy()
        data = clean_data(data)
        data = add_features(data, os.path.basename(blob.name).replace('.csv',''))

        frames.append(data)

    combinedData = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

    if combinedData.empty:
        # Never replace the silver file with an empty CSV.
        print(f"No data found under '{bronzeLocation}'; nothing uploaded.")
        return combinedData, data

    silverBlob = f"{silverFile}silver_output.csv"
    if initial_upload:
        upload_silver_to_blob(combinedData, serviceClient, containerName, silverBlob)
    else:
        append_latest_to_silver(combinedData, serviceClient, containerName, silverBlob)
    return combinedData, data

In [25]:
# Run the pipeline. main() returns a (combinedData, data) tuple, which is kept
# in `s` and echoed as the cell output.
s = main()
s

(  Source       Date  Open     High      Low    Close    Volume  Month  Day  \
 0    bdo 2025-08-05   678   144.90   142.60   143.00   1638980      8    5   
 1  creit 2025-08-05   678     3.70     3.67     3.68   1121000      8    5   
 2  globe 2025-08-05   678  1740.00  1705.00  1710.00     85700      8    5   
 3  mreit 2025-08-05   678    14.50    14.00    14.38   1392700      8    5   
 4    rcr 2025-08-05   678     8.02     7.80     7.78  18777700      8    5   
 
    Year  DayOfWeek MA_5 MA_20    EMA_5   EMA_20 STD_5 STD_20 DailyReturn  \
 0  2025          1  nan   nan   143.00   143.00   nan    nan         nan   
 1  2025          1  nan   nan     3.68     3.68   nan    nan         nan   
 2  2025          1  nan   nan  1710.00  1710.00   nan    nan         nan   
 3  2025          1  nan   nan    14.38    14.38   nan    nan         nan   
 4  2025          1  nan   nan     7.78     7.78   nan    nan         nan   
 
   Volatility PriceChange  
 0        nan     -535.00  
 1  