FileNotFoundError: [Errno 2] No such file or directory: 'aapl_historical_data.csv'

In [10]:
import pandas as pd
import numpy as np
import json
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import yfinance as yf

# 1. Load and validate the dataset
# Daily AAPL history for the analysis window, fetched through yfinance.
aapl = yf.download("AAPL", start="2023-01-01", end="2024-03-15")

# NOTE(review): yf.download appears to return an empty frame (rather than raising) when the
# request fails. Stop early with a clear message; otherwise the date-range check further down
# would fail on NaT index bounds with a much less helpful error.
if aapl.empty:
    raise RuntimeError("No AAPL data was downloaded; check the network connection and date range")

# Check for duplicate data
# Rows are compared on their column values only; the date index is not part of the comparison.
duplicates = aapl[aapl.duplicated()]
print(f"Number of duplicate rows: {duplicates.shape[0]}")
aapl = aapl.drop_duplicates()

# Check for missing values
missing_values = aapl.isna().sum()
print("Missing values per column:\n", missing_values)
aapl = aapl.dropna()

# Check for data overload
print(f"Dataset contains {aapl.shape[0]} rows and {aapl.shape[1]} columns")
if aapl.shape[0] > 100000:
    # A fixed seed keeps the down-sample reproducible, and re-sorting by the index restores
    # chronological order, which random sampling would otherwise scramble.
    aapl = aapl.sample(10000, random_state=42).sort_index()

# Check for missing trading days
# freq="B" is a plain Monday-Friday calendar, so weekdays on which the exchange was closed
# (market holidays) are reported here as "missing" even though no data exists for them.
full_range = pd.date_range(start=aapl.index.min(), end=aapl.index.max(), freq="B")
missing_dates = full_range.difference(aapl.index)
print("Missing trading days:\n", missing_dates)


# 2. Descriptive statistics
# One row per column of `aapl`: the describe() figures plus skewness and kurtosis.
summary_stats = aapl.describe().T
for stat_name, stat_values in (("skewness", aapl.skew()), ("kurtosis", aapl.kurtosis())):
    summary_stats[stat_name] = stat_values

# 3. PCA - performed only when the feature count is strictly above the threshold
dimension_threshold = 5  # Example threshold
num_features = len(aapl.columns)

pca_summary = None  # Stays None whenever PCA is skipped
if num_features > dimension_threshold:
    # Standardise the features before projecting them
    scaler = StandardScaler()
    scaled_data = scaler.fit_transform(aapl.dropna())

    # Keep the first two principal components
    pca = PCA(n_components=2)
    principal_components = pca.fit_transform(scaled_data)

    pca_summary = dict(
        explained_variance_ratio=pca.explained_variance_ratio_.tolist(),
        components=pca.components_.tolist(),
    )

# 4. Bootstrapping
def bootstrap_means(data, num_samples=1000, seed=None):
    """Bootstrap the mean of ``data``.

    Draws ``num_samples`` resamples of ``len(data)`` observations each (with
    replacement) and records the mean of every resample.

    Parameters
    ----------
    data : 1-D array-like of numbers
        Observations to resample (list, NumPy array, pandas Series, ...).
    num_samples : int, default 1000
        Number of bootstrap resamples to draw.
    seed : optional
        Passed to ``np.random.default_rng`` for a reproducible, isolated
        random stream. When ``None`` (the default) NumPy's global random
        state is used, so a prior ``np.random.seed(...)`` call still applies.

    Returns
    -------
    tuple
        ``(mean, std)`` of the bootstrap means. ``(nan, nan)`` is returned when
        ``data`` is empty or ``num_samples`` is less than 1.
    """
    # Convert once up front instead of on every resample.
    values = np.asarray(data)
    n_obs = len(values)

    # Nothing to resample: return NaNs directly rather than averaging empty arrays,
    # which yields the same NaNs but floods the output with RuntimeWarnings.
    if n_obs == 0 or num_samples < 1:
        nan = np.float64(np.nan)
        return nan, nan

    # The np.random module and a Generator instance both expose a compatible .choice().
    rng = np.random if seed is None else np.random.default_rng(seed)
    boot_means = [
        np.mean(rng.choice(values, size=n_obs, replace=True)) for _ in range(num_samples)
    ]
    return np.mean(boot_means), np.std(boot_means)

# Seed NumPy's global generator so the bootstrap estimates printed below are the same on
# every Restart & Run All instead of changing from run to run.
np.random.seed(42)

# Bootstrap the mean of every column; the result is keyed by the column labels of `aapl`.
boot_results = {col: bootstrap_means(aapl[col].dropna()) for col in aapl.columns}
print(boot_results)


[*********************100%***********************]  1 of 1 completed


Number of duplicate rows: 0
Missing values per column:
 Price   Ticker
Close   AAPL      0
High    AAPL      0
Low     AAPL      0
Open    AAPL      0
Volume  AAPL      0
dtype: int64
Dataset contains 301 rows and 5 columns
Missing trading days:
 DatetimeIndex(['2023-01-16', '2023-02-20', '2023-04-07', '2023-05-29',
               '2023-06-19', '2023-07-04', '2023-09-04', '2023-11-23',
               '2023-12-25', '2024-01-01', '2024-01-15', '2024-02-19'],
              dtype='datetime64[ns]', freq=None)
{('Close', 'AAPL'): (np.float64(173.1048673951919), np.float64(0.9609155207254457)), ('High', 'AAPL'): (np.float64(174.3546251106806), np.float64(0.9724479459553281)), ('Low', 'AAPL'): (np.float64(171.4887370026786), np.float64(0.9606015730615576)), ('Open', 'AAPL'): (np.float64(172.76781896252575), np.float64(0.9735209754551845)), ('Volume', 'AAPL'): (np.float64(59421807.14983388), np.float64(1038936.5932042039))}


In [None]:
# 5. Store Statistical Summary in AWS S3
# boto3 is imported here (it was never imported before) so that the analysis cells above
# still run in environments where boto3 is not installed.
import boto3


def _to_jsonable(obj):
    """Recursively convert ``obj`` into plain types that ``json.dumps`` accepts.

    Tuple dictionary keys (e.g. MultiIndex column labels such as
    ``("Close", "AAPL")``) are joined with "_" into a single string and any
    other key is passed through ``str``. Lists and tuples become lists, and
    NumPy scalars become the equivalent Python scalars. Everything else is
    returned unchanged.
    """
    if isinstance(obj, dict):
        return {
            "_".join(map(str, key)) if isinstance(key, tuple) else str(key): _to_jsonable(value)
            for key, value in obj.items()
        }
    if isinstance(obj, (list, tuple)):
        return [_to_jsonable(item) for item in obj]
    if isinstance(obj, np.generic):
        return obj.item()
    return obj


statistical_memory = {
    "summary_stats": summary_stats.to_dict(),
    "pca_summary": pca_summary,
    "bootstrap_estimates": boot_results,
}

# json.dumps raises TypeError on tuple keys, and the MultiIndex column labels put tuple keys
# into both summary_stats.to_dict() and boot_results, so normalise before serialising.
# Note that, despite its name, compressed_data is plain (uncompressed) JSON text.
compressed_data = json.dumps(_to_jsonable(statistical_memory))

# Upload to AWS S3 (assuming credentials are set)
s3 = boto3.client("s3")
bucket_name = "your-bucket-name"  # placeholder - replace with a real bucket before running
s3.put_object(
    Bucket=bucket_name,
    Key="aapl_statistical_memory.json",
    Body=compressed_data.encode("utf-8"),  # pass the payload as bytes
)

print("Statistical memory successfully uploaded to AWS S3!")
