In [10]:
import os
# Show the kernel's working directory; the relative paths used in later cells resolve against it.
working_dir = os.getcwd()
print(working_dir)


c:\Users\Searc\OneDrive\CSE\CS7643\Project


In [None]:
import os
import pickle

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

def calculate_metrics(input_csv: str, output_csv: str, scaler_pkl: str, eps: float = 10.0) -> None:
    """
    Build model features from one scraped price file, save them, and fit per-feature scalers.

    The input is sorted by date, enriched with derived columns (volatility, RSI, P/E,
    capped percent changes, log transforms), stripped of the raw price columns and
    written to ``output_csv``. One ``StandardScaler`` per feature is then fitted on that
    feature's non-NaN values and the ``{feature name: scaler}`` dict is pickled to
    ``scaler_pkl``. The features written to ``output_csv`` are not scaled; the scalers
    are only fitted and saved.

    Args:
        input_csv (str): Path to the input CSV. Its first column is discarded; the rest
            must contain 'Date', 'Open', 'Close', 'Volume' and 'SharesOutstanding'.
            'High', 'Low', 'Dividends' and 'Stock Splits' are optional and are removed
            from the output when present.
        output_csv (str): Path of the feature CSV to write. Missing parent folders are created.
        scaler_pkl (str): Path of the pickle file that receives the fitted scalers.
            Missing parent folders are created.
        eps (float): Divisor applied to 'Close' to produce 'P/E Ratio' (presumably an
            earnings-per-share figure; the same constant is used for every row).

    Raises:
        KeyError: If a required column is missing from the input.
    """
    df = pd.read_csv(input_csv)
    # Discard the first column (presumably the index saved by the scraper - verify against the data)
    df = df.iloc[:, 1:]

    df['Date'] = pd.to_datetime(df['Date'])
    # Sort chronologically and renumber the rows, so that every series derived below
    # shares one index and is assigned back to the row it was computed for.
    df = df.sort_values(['Date']).reset_index(drop=True)

    # A missing 'Dividends' column is treated as zero dividends
    df['Adjusted Close'] = df['Close'] + df.get('Dividends', 0)
    # Market capitalisation is shares outstanding times share price
    # (it was previously multiplied by traded volume).
    df['Market Cap'] = df['SharesOutstanding'] * df['Close']
    df['Volatility'] = df['Open'].rolling(window=7).std()

    #RSI
    delta = df['Close'].diff()
    # Series.where keeps df's index; the first row (NaN delta) counts as 0 gain and 0 loss
    gain = delta.where(delta > 0, 0.0)
    loss = (-delta).where(delta < 0, 0.0)
    avg_gain = gain.rolling(window=14).mean()
    avg_loss = loss.rolling(window=14).mean()
    rs = avg_gain / avg_loss
    df['RSI'] = 100 - (100 / (1 + rs))

    #P/E
    df['P/E Ratio'] = df['Close'] / eps

    #Capped Percent Change
    def compute_percent_change(col):
        """Row-over-row relative change of df[col], limited to the range [-1, 1]."""
        pct_change = df[col].pct_change().clip(-1, 1)
        # No finite percentage exists when the previous value is 0; record the change as 1.0
        previous_is_zero = df[col].shift(1) == 0
        pct_change[previous_is_zero] = 1.0
        return pct_change

    df['% Change Adj Close'] = compute_percent_change('Adjusted Close')
    df['% Change Open'] = compute_percent_change('Open')
    df['% Change Volume'] = compute_percent_change('Volume')
    df['% Change Market Cap'] = compute_percent_change('Market Cap')
    df['% Change Volatility'] = compute_percent_change('Volatility')
    df['% Change RSI'] = compute_percent_change('RSI')

    #Log Transform
    def log_transform(col):
        """Natural log of df[col]; zeros become NaN instead of -inf."""
        return np.log(df[col].replace(0, np.nan))

    df['Log Open'] = log_transform('Open')
    df['Log Adjusted Close'] = log_transform('Adjusted Close')
    df['Log Volume'] = log_transform('Volume')
    df['Log Market Cap'] = log_transform('Market Cap')

    # Remove the raw inputs. errors='ignore' keeps this consistent with the optional
    # handling of 'Dividends' above: files without the optional columns are accepted.
    df = df.drop(
        columns=['Open', 'Adjusted Close', 'Volume', 'Market Cap', 'High', 'Low', 'Close',
                 'Dividends', 'Stock Splits'],
        errors='ignore',
    )

    # Create the destination folders so the function also works on a fresh checkout
    for target_path in (output_csv, scaler_pkl):
        parent_dir = os.path.dirname(target_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    df.to_csv(output_csv, index=False)

    feature_columns = [
        'Log Open', 'Log Adjusted Close', 'Log Volume', 'Log Market Cap',
        'Volatility', 'RSI', 'P/E Ratio',
        '% Change Adj Close', '% Change Open', '% Change Volume', '% Change Market Cap',
        '% Change Volatility', '% Change RSI'
    ]

    # Fit one scaler per feature on its non-NaN values (rolling windows leave leading NaNs)
    scalers = {}
    for feature in feature_columns:
        scaler = StandardScaler()
        reshaped_data = df[feature].dropna().values.reshape(-1, 1)
        scaler.fit(reshaped_data)
        scalers[feature] = scaler

    with open(scaler_pkl, 'wb') as f:
        pickle.dump(scalers, f)

# Smoke test: run the pipeline once on a single ticker file
calculate_metrics(
    input_csv="scraped_data/A_numerical_features.csv",
    output_csv="processed/A_numerical_features_output.csv",
    scaler_pkl="scalers.pkl",
)


  df['Date'] = pd.to_datetime(df['Date'])


In [None]:
import glob
import os
import tempfile

import pandas as pd

input_folder = "scraped_data/"
output_folder = "processed/"
pkl_folder = "pkl/"

os.makedirs(output_folder, exist_ok=True)
os.makedirs(pkl_folder, exist_ok=True)

# All CSV/Excel files in the input folder. Extensions are matched case-insensitively
# and the list is sorted so every run processes the files in the same order.
excel_extensions = ('.xlsx', '.xls')
supported_extensions = ('.csv',) + excel_extensions
file_list = sorted(
    f for f in glob.glob(os.path.join(input_folder, "*.*"))
    if f.lower().endswith(supported_extensions)
)

print(f"{len(file_list)} files")

# Data Processing
# Excel workbooks are converted to CSV inside a scratch folder rather than the input
# folder, so a conversion can neither overwrite a same-named source CSV nor leave
# files behind that the next run would pick up as new inputs.
with tempfile.TemporaryDirectory() as scratch_folder:
    for file_path in file_list:
        file_name = os.path.basename(file_path)
        base_name, ext = os.path.splitext(file_name)
        output_csv = os.path.join(output_folder, f"{base_name}_processed.csv")
        scaler_pkl = os.path.join(pkl_folder, f"{base_name}.pkl")

        try:
            source_csv = file_path
            if ext.lower() in excel_extensions:
                df = pd.read_excel(file_path)
                source_csv = os.path.join(scratch_folder, f"{base_name}.csv")
                # NOTE(review): calculate_metrics discards the first CSV column, so with
                # index=False the workbook's own first column is the one dropped -
                # confirm the workbooks carry a leading index column.
                df.to_csv(source_csv, index=False)

            calculate_metrics(source_csv, output_csv, scaler_pkl)
        except Exception as e:
            # Best effort: report the failing file and continue with the rest of the batch
            print(f"Error processing {file_name}: {e}")

print("Batch processing complete!")
