# 01_data_preparation.ipynb

## Notebook Purpose
This notebook is designed to load, clean, and preprocess historical cryptocurrency data. It will also calculate technical indicators that will be used for further analysis and model training.

## Instructions
1. **Import Necessary Libraries**:
   - Import `pandas` for data manipulation.
   - Import the `load_data`, `preprocess_data`, and `calculate_indicators` functions from `scripts/utils.py`.

2. **Load Data**:
   - Use the `load_data` function to load the CSV file containing historical cryptocurrency data.

3. **Preprocess Data**:
   - Use the `preprocess_data` function to clean and preprocess the loaded data.
   - Ensure any missing values are handled appropriately.

4. **Calculate Technical Indicators**:
   - Use the `calculate_indicators` function to add technical indicators (e.g., SMA, EMA, RSI) to the data.

5. **Save Preprocessed Data**:
   - Save the cleaned and preprocessed data, including the calculated technical indicators, to a new CSV file for later use.

6. **Review Data**:
   - Display the first few rows of the preprocessed data to ensure it looks correct.

## Example Code
```python
# Import necessary libraries
import pandas as pd
from scripts.utils import load_data, preprocess_data, calculate_indicators

# Load data
data_path = 'data/historical_data/btc_usd.csv'  # Update this path based on the selected cryptocurrency
data = load_data(data_path)

# Preprocess data
data = preprocess_data(data)

# Calculate technical indicators
data = calculate_indicators(data)

# Save the preprocessed data
data.to_csv('data/historical_data/btc_usd_preprocessed.csv')

# Display the first few rows of the preprocessed data
data.head()
```

In [None]:
# Cell 1: Import necessary libraries and verify
try:
    import pandas as pd
    import numpy as np
    from pandas_datareader import data as pdr
    from datetime import datetime
    import matplotlib.pyplot as plt
    import seaborn as sns
    import requests
    from dotenv import load_dotenv
    import os
    import ccxt
    %matplotlib inline
    print("Libraries loaded successfully. Let's proceed!")
except ImportError as e:
    print(f"Uh-oh! Please verify the installation of: {e.name}")


In [None]:
# Cell 2: Load environment variables and fetch API keys
import os
from dotenv import load_dotenv
import requests

# Load environment variables
# Must run before the os.getenv() calls below so that values supplied through
# python-dotenv are visible to them.
load_dotenv()

# Fetch API keys
# os.getenv returns None for any variable that is not set, so each constant
# below may be None; the fetch functions in later cells read these globals.
COINBASE_API_KEY = os.getenv("COINBASE_API_KEY")
COINBASE_API_SECRET = os.getenv("COINBASE_API_SECRET")
ALPHA_VANTAGE_API_KEY = os.getenv("ALPHA_VANTAGE_API_KEY")
CRYPTOCOMPARE_API_KEY = os.getenv("CRYPTOCOMPARE_API_KEY")


In [None]:
# Cell 3: Function to fetch data from Alpha Vantage
def fetch_alpha_vantage_data(symbol):
    """Fetch daily USD history for a digital currency from Alpha Vantage.

    Args:
        symbol: Ticker sent as the ``symbol`` query parameter (e.g. ``"BTC"``).

    Returns:
        A DataFrame with a ``time`` column (parsed from the response's date
        keys) plus every column the API returned. The OHLCV columns are
        renamed to ``open``/``high``/``low``/``close``/``volume`` and converted
        to numbers. Returns ``None`` when the request fails, the body is not
        JSON, or the response has no daily time series.
    """
    base_url = "https://www.alphavantage.co/query"
    series_key = 'Time Series (Digital Currency Daily)'
    params = {
        "function": "DIGITAL_CURRENCY_DAILY",
        "symbol": symbol,
        "market": "USD",
        "apikey": ALPHA_VANTAGE_API_KEY
    }
    # rename() ignores labels that are absent, so both header styles can be
    # listed. NOTE(review): the plain "1. open" style is what newer responses
    # appear to use -- confirm against the current API documentation.
    column_names = {
        '1a. open (USD)': 'open',
        '2a. high (USD)': 'high',
        '3a. low (USD)': 'low',
        '4a. close (USD)': 'close',
        '1. open': 'open',
        '2. high': 'high',
        '3. low': 'low',
        '4. close': 'close',
        '5. volume': 'volume'
    }

    try:
        # A timeout keeps a stalled connection from hanging the notebook.
        response = requests.get(base_url, params=params, timeout=30)
        response.raise_for_status()
        data = response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers a body that is not valid JSON.
        print(f"API request failed: {e}")
        return None

    if not isinstance(data, dict) or series_key not in data:
        print(f"No 'Time Series (Digital Currency Daily)' data found for {symbol}")
        return None

    df = pd.DataFrame.from_dict(data[series_key], orient='index')
    df = df.rename(columns=column_names)

    # The JSON payload carries prices as strings; convert them so the frame
    # can be combined with the numeric frames from the other sources.
    for column in ('open', 'high', 'low', 'close', 'volume'):
        if column in df.columns:
            df[column] = pd.to_numeric(df[column], errors='coerce')

    df.index = pd.to_datetime(df.index)
    df.reset_index(inplace=True)
    df = df.rename(columns={'index': 'time'})
    return df


In [None]:
# Cell 4: Function to fetch data from Coinbase
def fetch_coinbase_data(currency_pair):
    """Fetch the current spot price for a currency pair from Coinbase.

    Args:
        currency_pair: Pair identifier placed in the URL path, e.g.
            ``"BTC-USD"``.

    Returns:
        The decoded JSON response, or ``None`` when the request fails or the
        body is not valid JSON.
    """
    base_url = "https://api.coinbase.com/v2/prices/"
    url = f"{base_url}{currency_pair}/spot"
    # No credential headers are sent. The previous version put
    # COINBASE_API_SECRET verbatim into the CB-ACCESS-SIGN header; that header
    # is meant to carry a per-request signature, so this only exposed the
    # secret on the wire. The spot-price endpoint is documented as public.
    try:
        # A timeout keeps a stalled connection from hanging the notebook.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        return response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers a body that is not valid JSON.
        print(f"API request failed: {e}")
        return None


In [None]:
# Cell 5: Function to fetch data from CryptoCompare
def fetch_cryptocompare_data(symbol, start_date, end_date):
    """Fetch daily USD candles for ``symbol`` from CryptoCompare.

    Args:
        symbol: Ticker sent as the ``fsym`` query parameter (e.g. ``"BTC"``).
        start_date: First day to keep; anything ``pd.Timestamp`` accepts.
            ``None`` disables the lower bound and requests the maximum window.
        end_date: Last day to request; sent to the API as ``toTs``.

    Returns:
        A DataFrame with columns ``time``, ``open``, ``high``, ``low``,
        ``close`` and ``volume`` (taken from the API's ``volumeto`` field),
        restricted to rows on or after ``start_date``. Returns ``None`` when
        the request fails or no rows are available for the range.

    Note:
        A single call is capped at 2000 days, so a longer range yields only
        the most recent part of it.
    """
    base_url = "https://min-api.cryptocompare.com/data/v2/histoday"
    max_limit = 2000  # CryptoCompare allows fetching up to 2000 days in one call
    seconds_per_day = 86400

    end_epoch = int(pd.Timestamp(end_date).timestamp())
    if start_date is None:
        start_epoch = None
        limit = max_limit
    else:
        start_epoch = int(pd.Timestamp(start_date).timestamp())
        # Ask only for the days in the requested range (rounded up) instead of
        # always pulling the full 2000-day window ending at end_date.
        span_days = -(-(end_epoch - start_epoch) // seconds_per_day)
        limit = min(max_limit, max(1, span_days))

    params = {
        "fsym": symbol,
        "tsym": "USD",
        "toTs": end_epoch,
        "limit": limit,
        "api_key": CRYPTOCOMPARE_API_KEY
    }
    try:
        # A timeout keeps a stalled connection from hanging the notebook.
        response = requests.get(base_url, params=params, timeout=30)
        response.raise_for_status()
        data = response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers a body that is not valid JSON.
        print(f"API request failed: {e}")
        return None

    payload = data.get('Data') if isinstance(data, dict) else None
    rows = payload.get('Data') if isinstance(payload, dict) else None
    if not rows:
        # Covers both a missing 'Data' section and an empty list of candles,
        # which would otherwise fail on the 'time' column lookup below.
        print(f"No data available for {symbol} from CryptoCompare")
        return None

    df = pd.DataFrame(rows)
    df['time'] = pd.to_datetime(df['time'], unit='s')
    df = df.rename(columns={'volumeto': 'volume'})
    df = df[['time', 'open', 'high', 'low', 'close', 'volume']]

    if start_epoch is not None:
        # Enforce the lower bound locally so the result honours start_date
        # regardless of how many rows the API chose to return.
        df = df[df['time'] >= pd.to_datetime(start_epoch, unit='s')]
        df = df.reset_index(drop=True)
        if df.empty:
            print(f"No data available for {symbol} from CryptoCompare")
            return None
    return df


In [None]:
# Cell 6: Function to save data to a CSV file
def save_data_to_csv(data, filename):
    """Write ``data`` to ``filename`` as CSV without the index column.

    Nothing is written when ``data`` is ``None`` or an empty frame; a message
    describing the outcome is printed in either case.
    """
    nothing_to_write = data is None or data.empty
    if nothing_to_write:
        print(f"No data to save for {filename}")
        return

    data.to_csv(filename, index=False)
    print(f"Data saved to {filename}")


In [None]:
# Cell 7: Load manually downloaded data
def load_manual_data(cryptos=None, data_dir='../data/historical_data'):
    """Load manually downloaded ``<TICKER>-USD.csv`` files into DataFrames.

    Args:
        cryptos: Tickers to load. Defaults to ``["BTC", "ETH", "SOL"]``.
        data_dir: Directory holding the CSV files.

    Returns:
        A dict mapping each ticker that loaded successfully to its DataFrame.
        The ``Date`` column is parsed as dates and renamed to ``time``;
        capitalized ``Open``/``High``/``Low``/``Close``/``Volume`` headers, if
        present, are renamed to lowercase so they line up with the lowercase
        columns produced by the API fetchers in this notebook. Tickers whose
        file is missing or unreadable are reported and left out.
    """
    if cryptos is None:
        cryptos = ["BTC", "ETH", "SOL"]

    # rename() ignores labels that are not present, so files that already use
    # lowercase headers pass through unchanged.
    column_names = {
        'Date': 'time',
        'Open': 'open',
        'High': 'high',
        'Low': 'low',
        'Close': 'close',
        'Volume': 'volume',
    }

    manual_data = {}
    for crypto in cryptos:
        csv_path = f'{data_dir}/{crypto}-USD.csv'
        try:
            data = pd.read_csv(csv_path, parse_dates=['Date'])
        except (OSError, ValueError) as e:
            # OSError: missing/unreadable file. ValueError: parse failures,
            # empty files, or a CSV without the 'Date' column.
            print(f"Error loading manually downloaded data for {crypto}: {e}")
            continue
        manual_data[crypto] = data.rename(columns=column_names)
        print(f"Manual data loaded for {crypto}")
    return manual_data


In [None]:
# Cell 8: Fetch data from APIs
def fetch_api_data():
    """Query every API source for BTC, ETH and SOL and collect the results.

    Returns:
        A dict keyed by ``"<TICKER>_alpha_vantage"``, ``"<TICKER>_coinbase"``
        and ``"<TICKER>_cryptocompare_<YEAR>"`` (years 2018 through 2023).
        Each value is whatever the corresponding fetch function returned;
        sources that returned ``None`` get no entry.
    """
    collected = {}
    for coin in ("BTC", "ETH", "SOL"):
        # Alpha Vantage: daily history for the ticker.
        av_result = fetch_alpha_vantage_data(coin)
        if av_result is not None:
            collected[f"{coin}_alpha_vantage"] = av_result

        # Coinbase: queried with the "<TICKER>-USD" pair.
        spot_result = fetch_coinbase_data(f"{coin}-USD")
        if spot_result is not None:
            collected[f"{coin}_coinbase"] = spot_result

        # CryptoCompare: one request per calendar year.
        for yr in range(2018, 2024):
            yearly = fetch_cryptocompare_data(coin, f"{yr}-01-01", f"{yr}-12-31")
            if yearly is not None:
                collected[f"{coin}_cryptocompare_{yr}"] = yearly

    return collected


In [None]:
# Cell 9: Combine data from all sources
def combine_data_sources(api_data, manual_data):
    """Merge the API and manually downloaded frames for each cryptocurrency.

    Args:
        api_data: Dict whose keys start with ``"<TICKER>_<source>"`` where the
            source is ``alpha_vantage``, ``coinbase`` or ``cryptocompare``.
            Values that are not DataFrames (for example a raw JSON dict) are
            skipped because they cannot be concatenated.
        manual_data: Dict mapping ticker to a DataFrame.

    Returns:
        A dict with one entry per ticker (BTC, ETH, SOL). Each value is the
        concatenation of that ticker's frames, sorted by ``time`` with exact
        duplicate rows removed, or an empty DataFrame when no source supplied
        any rows.
    """
    combined_data = {}
    cryptos = ["BTC", "ETH", "SOL"]
    sources = ['alpha_vantage', 'coinbase', 'cryptocompare']
    for crypto in cryptos:
        # Gather candidates in source order, then concatenate once.
        candidates = []
        for source in sources:
            prefix = f"{crypto}_{source}"
            candidates.extend(
                value for key, value in api_data.items() if key.startswith(prefix)
            )
        if crypto in manual_data:
            candidates.append(manual_data[crypto])

        # pd.concat rejects non-DataFrame objects, and empty frames add
        # nothing, so both are dropped here.
        frames = [
            item for item in candidates
            if isinstance(item, pd.DataFrame) and not item.empty
        ]
        if not frames:
            # Sorting a column-less frame by 'time' would raise KeyError.
            print(f"No data to combine for {crypto}")
            combined_data[crypto] = pd.DataFrame()
            continue

        combined_df = pd.concat(frames)
        # NOTE(review): only fully identical rows are removed; rows for the
        # same time from different sources with different values are kept.
        combined_data[crypto] = combined_df.sort_values('time').drop_duplicates()
    return combined_data


In [None]:
# Cell 10: Save combined data to CSV
def save_combined_data(combined_data, output_dir='../data/cleaned_data'):
    """Write each combined frame to ``<output_dir>/<TICKER>_cleaned.csv``.

    Args:
        combined_data: Dict mapping ticker to the DataFrame to save.
        output_dir: Destination directory; created if it does not exist.
    """
    # to_csv does not create missing directories, so make sure the target
    # exists before writing the first file.
    os.makedirs(output_dir, exist_ok=True)
    for crypto, df in combined_data.items():
        file_path = f'{output_dir}/{crypto}_cleaned.csv'
        df.to_csv(file_path, index=False)
        print(f"Combined data saved to {file_path}")

# Execute the data preparation steps
# These statements run when the cell executes and leave their results in the
# module-level names api_data, manual_data and combined_data.

# Step 1: query the remote APIs.
print("Fetching API data...")
api_data = fetch_api_data()

# Step 2: read the manually downloaded CSV files.
print("Loading manual data...")
manual_data = load_manual_data()

# Step 3: merge both inputs into one frame per cryptocurrency.
print("Combining data sources...")
combined_data = combine_data_sources(api_data, manual_data)

# Step 4: write the merged frames to disk.
print("Saving combined data...")
save_combined_data(combined_data)

print("Data preparation complete.")
