In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn as sns

In [None]:
def clean_and_filter_data(df, start_date, end_date):
    """Normalise a raw OHLCV frame and keep only rows inside a date range.

    Steps, in order:
      1. Column labels are lower-cased and stripped, then renamed to the
         standard names via ``rename_map`` (e.g. "1. open" -> "open",
         "date" -> "timestamp", "volume usdt" -> "volume").
      2. Only timestamp/open/high/low/close/volume are kept.
      3. Timestamps are converted to text, anything after the first "." is
         discarded (drops fractional seconds), and the result is parsed with
         ``pd.to_datetime``.
      4. Rows with ``start_date <= timestamp <= end_date`` are kept
         (both bounds inclusive).
      5. The timestamp becomes the index, sorted from oldest to newest.
      6. Value columns are coerced to numeric; rows with any value that could
         not be converted (NaN) are dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data. It is not modified; all work happens on a copy.
    start_date, end_date
        Inclusive bounds compared against the parsed timestamps.

    Returns
    -------
    pandas.DataFrame
        Indexed by "timestamp", with columns open, high, low, close, volume
        and no NaN values.

    Raises
    ------
    KeyError
        If any required column is absent after renaming.
    """
    required = ["timestamp", "open", "high", "low", "close", "volume"]
    value_cols = required[1:]
    rename_map = {"1. open": "open",                                            # Map for renaming columns to standard
                  "2. high": "high",
                  "3. low": "low",
                  "4. close": "close",
                  "5. volume": "volume",
                  "date": "timestamp",
                  "volume usdt": "volume",
                  }

    out = df.copy()
    # str() keeps non-string labels (e.g. integer headers) from raising AttributeError.
    out.columns = [str(col).lower().strip() for col in out.columns]
    out = out.rename(columns=rename_map)

    # Fail early with an actionable message instead of an opaque indexing error.
    missing = [col for col in required if col not in out.columns]
    if missing:
        raise KeyError(
            f"Missing required column(s) {missing} after renaming; "
            f"available columns: {list(out.columns)}"
        )

    # Explicit copy so the column assignments below never target a slice of another frame.
    out = out[required].copy()

    # Remove fractional seconds, then parse to datetime.
    out["timestamp"] = pd.to_datetime(out["timestamp"].astype(str).str.split(".").str[0])

    # Inclusive date-range filter, then index by timestamp sorted oldest to newest.
    in_range = (out["timestamp"] >= start_date) & (out["timestamp"] <= end_date)
    out = out.loc[in_range].set_index("timestamp").sort_index()

    # Unparseable values become NaN and are removed by dropna().
    out[value_cols] = out[value_cols].apply(pd.to_numeric, errors="coerce")
    return out.dropna()

In [3]:
# Load the raw CSVs: one group per asset, ordered hourly, 4-hourly, daily, weekly.
# Each name on the left is bound to the frame read from the path in the same position.

# BTC
btc_hourly, btc_4hourly, btc_daily, btc_weekly = map(pd.read_csv, (
    "./datasets/crypto/hourly/BTC.csv",
    "./datasets/crypto/4hourly/BTC.csv",
    "./datasets/crypto/daily/BTC.csv",
    "./datasets/crypto/weekly/BTC.csv",
))

# ETH
eth_hourly, eth_4hourly, eth_daily, eth_weekly = map(pd.read_csv, (
    "./datasets/crypto/hourly/ETH.csv",
    "./datasets/crypto/4hourly/ETH.csv",
    "./datasets/crypto/daily/ETH.csv",
    "./datasets/crypto/weekly/ETH.csv",
))

# AAPL
aapl_hourly, aapl_4hourly, aapl_daily, aapl_weekly = map(pd.read_csv, (
    "./datasets/stocks/hourly/AAPL.csv",
    "./datasets/stocks/4hourly/AAPL.csv",
    "./datasets/stocks/daily/AAPL.csv",
    "./datasets/stocks/weekly/AAPL.csv",
))

# TSLA
tsla_hourly, tsla_4hourly, tsla_daily, tsla_weekly = map(pd.read_csv, (
    "./datasets/stocks/hourly/TSLA.csv",
    "./datasets/stocks/4hourly/TSLA.csv",
    "./datasets/stocks/daily/TSLA.csv",
    "./datasets/stocks/weekly/TSLA.csv",
))

# AMZN
amzn_hourly, amzn_4hourly, amzn_daily, amzn_weekly = map(pd.read_csv, (
    "./datasets/stocks/hourly/AMZN.csv",
    "./datasets/stocks/4hourly/AMZN.csv",
    "./datasets/stocks/daily/AMZN.csv",
    "./datasets/stocks/weekly/AMZN.csv",
))

In [4]:
# Inclusive date window applied to every series below.
startDate = "2018-02-01 00:00:00"
endDate = "2023-09-30 23:00:00"

# Each asset's four timeframes (hourly, 4-hourly, daily, weekly) go through
# clean_and_filter_data with the same window; results keep the "_filtered" suffix.

# BTC
btc_hourly_filtered, btc_4hourly_filtered, btc_daily_filtered, btc_weekly_filtered = [
    clean_and_filter_data(raw, startDate, endDate)
    for raw in (btc_hourly, btc_4hourly, btc_daily, btc_weekly)
]

# ETH
eth_hourly_filtered, eth_4hourly_filtered, eth_daily_filtered, eth_weekly_filtered = [
    clean_and_filter_data(raw, startDate, endDate)
    for raw in (eth_hourly, eth_4hourly, eth_daily, eth_weekly)
]

# AAPL
aapl_hourly_filtered, aapl_4hourly_filtered, aapl_daily_filtered, aapl_weekly_filtered = [
    clean_and_filter_data(raw, startDate, endDate)
    for raw in (aapl_hourly, aapl_4hourly, aapl_daily, aapl_weekly)
]

# TSLA
tsla_hourly_filtered, tsla_4hourly_filtered, tsla_daily_filtered, tsla_weekly_filtered = [
    clean_and_filter_data(raw, startDate, endDate)
    for raw in (tsla_hourly, tsla_4hourly, tsla_daily, tsla_weekly)
]

# AMZN
amzn_hourly_filtered, amzn_4hourly_filtered, amzn_daily_filtered, amzn_weekly_filtered = [
    clean_and_filter_data(raw, startDate, endDate)
    for raw in (amzn_hourly, amzn_4hourly, amzn_daily, amzn_weekly)
]

In [28]:
# Write every cleaned frame to CSV.
# NOTE(review): these destinations are the same paths the raw data was read from,
# so the raw files are replaced by their cleaned, date-filtered versions — confirm
# this is intended, since the original rows cannot be recovered afterwards.
for _frame, _destination in (
    # BTC
    (btc_hourly_filtered, "./datasets/crypto/hourly/BTC.csv"),
    (btc_4hourly_filtered, "./datasets/crypto/4hourly/BTC.csv"),
    (btc_daily_filtered, "./datasets/crypto/daily/BTC.csv"),
    (btc_weekly_filtered, "./datasets/crypto/weekly/BTC.csv"),
    # ETH
    (eth_hourly_filtered, "./datasets/crypto/hourly/ETH.csv"),
    (eth_4hourly_filtered, "./datasets/crypto/4hourly/ETH.csv"),
    (eth_daily_filtered, "./datasets/crypto/daily/ETH.csv"),
    (eth_weekly_filtered, "./datasets/crypto/weekly/ETH.csv"),
    # AAPL
    (aapl_hourly_filtered, "./datasets/stocks/hourly/AAPL.csv"),
    (aapl_4hourly_filtered, "./datasets/stocks/4hourly/AAPL.csv"),
    (aapl_daily_filtered, "./datasets/stocks/daily/AAPL.csv"),
    (aapl_weekly_filtered, "./datasets/stocks/weekly/AAPL.csv"),
    # TSLA
    (tsla_hourly_filtered, "./datasets/stocks/hourly/TSLA.csv"),
    (tsla_4hourly_filtered, "./datasets/stocks/4hourly/TSLA.csv"),
    (tsla_daily_filtered, "./datasets/stocks/daily/TSLA.csv"),
    (tsla_weekly_filtered, "./datasets/stocks/weekly/TSLA.csv"),
    # AMZN
    (amzn_hourly_filtered, "./datasets/stocks/hourly/AMZN.csv"),
    (amzn_4hourly_filtered, "./datasets/stocks/4hourly/AMZN.csv"),
    (amzn_daily_filtered, "./datasets/stocks/daily/AMZN.csv"),
    (amzn_weekly_filtered, "./datasets/stocks/weekly/AMZN.csv"),
):
    _frame.to_csv(_destination)