In [9]:
# Cell 1: Import Libraries
import pandas as pd
import numpy as np
import os

# Cell 2: Load CSV Data
def load_data(file_path):
    """
    Load CSV data from the provided file path.
"""
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"The file at path {file_path} does not exist.")
    
    try:
        # Read CSV with 'date' column as the index
        data = pd.read_csv(file_path, parse_dates=['date'], index_col='date')
    except Exception as e:
        raise ValueError(f"Failed to load data: {e}")

    return data

# Cell 3: Check for Missing Values
def check_missing_values(data):
    """
    Check for missing or NaN values in the dataset.
    """
    return data.isnull().sum()

# Cell 4: Fill Missing Values
def fill_missing_values(data):
    """
    Fill missing values in the dataset using the mean of the last two prices.
    """
    # Replace missing values in the 'close' column with the mean of the last two prices
    if 'close' in data.columns:
        data['close'] = data['close'].fillna(method='ffill').fillna(method='bfill')
    
    # Forward fill and backward fill for other columns
    data = data.ffill().bfill()

    return data

# Cell 5: Detect and Remove Outliers
def remove_outliers(data):
    """
    Detect and remove outliers using the IQR method.

    """
    numeric_columns = data.select_dtypes(include=[np.number]).columns
    for col in numeric_columns:
        Q1 = data[col].quantile(0.25)
        Q3 = data[col].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        data = data[(data[col] >= lower_bound) & (data[col] <= upper_bound)]
    return data

# Cell 6: Clean Data
def clean_data(data):
    """
    Perform additional data cleaning tasks.

    """
    # Remove rows with non-positive values for numeric columns
    numeric_columns = data.select_dtypes(include=[np.number]).columns
    data = data[(data[numeric_columns] > 0).all(axis=1)]

    # Remove outliers
    data = remove_outliers(data)

    return data

# Cell 7: Preprocess Data
def preprocess_data(file_path):
    """
    Complete framework for loading, cleaning, and processing data.
    """
    print("Loading data...")
    data = load_data(file_path)
    
    print("Checking for missing values...")
    missing_values = check_missing_values(data)
    if missing_values.sum() > 0:
        print(f"Missing values detected:\n{missing_values}")
        print("Filling missing values...")
        data = fill_missing_values(data)
        print("Missing values filled.")
    else:
        print("No missing values detected.")
    
    print("Cleaning data...")
    data = clean_data(data)
    print("Data cleaning complete.")

    return data

# Cell 8: Example Usage in Kaggle
if __name__ == "__main__":
    # Provide the path to your CSV file
    file_path = "/kaggle/input/btcinusd/BTC-2021min.csv"  # Update with the correct Kaggle dataset path

    try:
        processed_data = preprocess_data(file_path)
        print("Processed Data Sample:")
        print(processed_data.head())
    except Exception as e:
        print(f"Error: {e}")


Loading data...
Checking for missing values...
No missing values detected.
Cleaning data...
Data cleaning complete.
Processed Data Sample:
                           unix   symbol      open      high       low  \
date                                                                     
2022-03-01 03:41:00  1646106060  BTC/USD  43018.23  43046.59  43018.23   
2022-03-01 03:40:00  1646106000  BTC/USD  43022.24  43022.24  43016.03   
2022-03-01 03:39:00  1646105940  BTC/USD  43035.16  43035.16  42999.44   
2022-03-01 03:38:00  1646105880  BTC/USD  43077.82  43077.82  43049.46   
2022-03-01 03:37:00  1646105820  BTC/USD  43078.73  43092.09  43078.73   

                        close  Volume BTC    Volume USD  
date                                                     
2022-03-01 03:41:00  43046.58    0.142977   6154.673021  
2022-03-01 03:40:00  43016.03    0.009230    397.037957  
2022-03-01 03:39:00  42999.44    0.820950  35300.390268  
2022-03-01 03:38:00  43049.46    0.022210    956.143

In [15]:
processed_data.head

<bound method NDFrame.head of                            unix   symbol      open      high       low  \
date                                                                     
2022-03-01 03:41:00  1646106060  BTC/USD  43018.23  43046.59  43018.23   
2022-03-01 03:40:00  1646106000  BTC/USD  43022.24  43022.24  43016.03   
2022-03-01 03:39:00  1646105940  BTC/USD  43035.16  43035.16  42999.44   
2022-03-01 03:38:00  1646105880  BTC/USD  43077.82  43077.82  43049.46   
2022-03-01 03:37:00  1646105820  BTC/USD  43078.73  43092.09  43078.73   
...                         ...      ...       ...       ...       ...   
2021-01-01 00:06:00  1609459560  BTC/USD  28996.71  29025.68  28996.71   
2021-01-01 00:05:00  1609459500  BTC/USD  29021.86  29023.38  28982.33   
2021-01-01 00:04:00  1609459440  BTC/USD  29048.13  29057.73  29035.61   
2021-01-01 00:03:00  1609459380  BTC/USD  29037.68  29069.39  29019.00   
2021-01-01 00:02:00  1609459320  BTC/USD  29069.80  29073.02  29028.14   

       