In [141]:
# === Step 0: Import Libraries ===
import pandas as pd
import numpy as np

# Machine Learning
from sklearn.preprocessing import StandardScaler

# Models & Pickle for saving
import joblib
import json
import os

print("✅ Libraries imported successfully.")

✅ Libraries imported successfully.


In [142]:
# === Step 1: Load Dataset ===
# Location of the raw CSV; change this path if the dataset lives elsewhere.
CSV_PATH = "./Dataset/Bitcoin - Dataset.csv"
df = pd.read_csv(filepath_or_buffer=CSV_PATH)

# Confirm the load, then preview the first rows under a banner line.
print("✅ Dataset loaded successfully.\n")
print("=== INITIAL HEAD ===", df.head(), sep="\n")


✅ Dataset loaded successfully.

=== INITIAL HEAD ===
         Date        Open        High         Low       Close   Adj Close  \
0  2014-09-17  465.864014  468.174011  452.421997  457.334015  457.334015   
1  2014-09-18  456.859985  456.859985  413.104004  424.440002  424.440002   
2  2014-09-19  424.102997  427.834991  384.532013  394.795990  394.795990   
3  2014-09-20  394.673004  423.295990  389.882996  408.903992  408.903992   
4  2014-09-21  408.084991  412.425995  393.181000  398.821014  398.821014   

     Volume  
0  21056800  
1  34483200  
2  37919700  
3  36863600  
4  26580100  


In [143]:
# === Step 2: Inspect Dataset ===
# Column dtypes and non-null counts. DataFrame.info() writes its report to
# stdout itself and returns None, so it is called directly; wrapping it in
# print() would append a stray "None" line after the report.
print("\n=== DATAFRAME INFO ===")
df.info()

# Number of missing entries in each column.
print("\n=== MISSING VALUES ===")
print(df.isnull().sum())

# Descriptive statistics produced by DataFrame.describe().
print("\n=== STATISTICAL SUMMARY ===")
print(df.describe())



=== DATAFRAME INFO ===
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2713 entries, 0 to 2712
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Date       2713 non-null   object 
 1   Open       2713 non-null   float64
 2   High       2713 non-null   float64
 3   Low        2713 non-null   float64
 4   Close      2713 non-null   float64
 5   Adj Close  2713 non-null   float64
 6   Volume     2713 non-null   int64  
dtypes: float64(5), int64(1), object(1)
memory usage: 148.5+ KB
None

=== MISSING VALUES ===
Date         0
Open         0
High         0
Low          0
Close        0
Adj Close    0
Volume       0
dtype: int64

=== STATISTICAL SUMMARY ===
               Open          High           Low         Close     Adj Close  \
count   2713.000000   2713.000000   2713.000000   2713.000000   2713.000000   
mean   11311.041069  11614.292482  10975.555057  11323.914637  11323.914637   
std    16106.428891  16537.390649

In [144]:
# === Step 4: Handle Missing Values ===
# Parse the date strings into datetime values so the .dt accessor can be
# used on this column in later steps. (No filling is applied to dates.)
df["Date"] = pd.to_datetime(df["Date"])

# Replace missing entries in each numeric column with that column's mean.
# The filled frame is assigned back explicitly: calling
# df[col].fillna(..., inplace=True) operates on an intermediate Series, which
# pandas flags as chained assignment and which does not update df once
# Copy-on-Write is enabled.
fill_cols = ["Open", "High", "Low", "Close", "Adj Close", "Volume"]
df[fill_cols] = df[fill_cols].fillna(df[fill_cols].mean())

print("\n✅ Missing values handled successfully.")


✅ Missing values handled successfully.


In [151]:
# === Step 5: Remove Duplicates ===
# Capture the frame's shape on either side of the de-duplication so the
# report below shows whether any rows were dropped.
before = df.shape
df = df.drop_duplicates(keep="first")  # keep="first" is the pandas default
after = df.shape

print(
    f"Dropped duplicates: By removing duplicates Before {before} → After {after}",
    "✅ Duplicates removed successfully.",
    sep="\n",
)

Dropped duplicates: By removing duplicates Before (2713, 28) → After (2713, 28)
✅ Duplicates removed successfully.


In [146]:
# === Step 6: Handle Outliers (IQR Capping) ===
def iqr_capping(series, k=1.5):
    """Clip a series to bounds derived from its interquartile range (IQR).

    Parameters
    ----------
    series : pandas.Series
        Values to cap; must provide ``quantile`` and ``clip``.
    k : float, default 1.5
        Multiple of the IQR placed beyond each quartile to form the bounds.

    Returns
    -------
    pandas.Series
        The result of ``series.clip``: values below ``Q1 - k*IQR`` are raised
        to that bound and values above ``Q3 + k*IQR`` are lowered to it.
    """
    first_quartile = series.quantile(0.25)
    third_quartile = series.quantile(0.75)
    margin = k * (third_quartile - first_quartile)
    return series.clip(lower=first_quartile - margin, upper=third_quartile + margin)

# Cap every listed column in place using iqr_capping's default k.
# NOTE(review): "Adj Close" is not in this list, so it keeps its raw
# values — confirm that is intended.
capped_columns = ("Open", "High", "Low", "Close", "Volume")
for column_name in capped_columns:
    df[column_name] = df[column_name].pipe(iqr_capping)

print("\n✅ Outliers capped using IQR method.")



✅ Outliers capped using IQR method.


In [147]:
# === Step 7: Feature Engineering & Encoding ===
# Calendar components taken from the datetime "Date" column.
date_parts = df["Date"].dt
df["Year"] = date_parts.year
df["Month"] = date_parts.month
df["DayOfWeek"] = date_parts.dayofweek

# Replace Month and DayOfWeek with indicator columns; drop_first=True omits
# the first category of each encoded column.
df = pd.get_dummies(df, columns=["Month", "DayOfWeek"], drop_first=True)

# Price-difference features: each new column is (left column - right column).
price_differences = {
    "Open-Close": ("Open", "Close"),
    "High-Low": ("High", "Low"),
    "Low-Close": ("Low", "Close"),
}
for feature_name, (left_col, right_col) in price_differences.items():
    df[feature_name] = df[left_col].sub(df[right_col])

print("\n✅ Features engineered and categorical variables encoded.")


✅ Features engineered and categorical variables encoded.


In [148]:
# === Step 8: Feature Scaling ===
# Columns to standardize: the capped price/volume columns plus the derived
# difference features.
num_cols = ["Open", "High", "Low", "Close", "Volume", "Open-Close", "High-Low", "Low-Close"]

# Learn the scaling parameters from these columns, then overwrite the columns
# with their standardized values.
# NOTE(review): the scaler is fitted on every row of df, and the fitted
# scaler is not saved in the cells visible here — confirm whether it needs to
# be persisted or fitted on a training split only.
scaler = StandardScaler()
scaler.fit(df[num_cols])
df[num_cols] = scaler.transform(df[num_cols])

print("\n✅ Numerical features scaled successfully.")


✅ Numerical features scaled successfully.


In [149]:
# === Step 10: Final Dataset Check ===
print("\n=== FINAL HEAD ===")
print(df.head())

print("\n=== FINAL INFO ===")
print(df.info())

print("\n=== FINAL MISSING VALUES ===")
print(df.isnull().sum())


=== FINAL HEAD ===
        Date      Open      High       Low     Close   Adj Close    Volume  \
0 2014-09-17 -0.885645 -0.884096 -0.886190 -0.886821  457.334015 -0.806132   
1 2014-09-18 -0.886700 -0.885381 -0.890919 -0.890669  424.440002 -0.805367   
2 2014-09-19 -0.890539 -0.888680 -0.894355 -0.894138  394.795990 -0.805171   
3 2014-09-20 -0.893987 -0.889196 -0.893711 -0.892487  408.903992 -0.805232   
4 2014-09-21 -0.892416 -0.890431 -0.893315 -0.893667  398.821014 -0.805817   

   Year  Month_2  Month_3  ...  Month_12  DayOfWeek_1  DayOfWeek_2  \
0  2014    False    False  ...     False        False         True   
1  2014    False    False  ...     False        False        False   
2  2014    False    False  ...     False        False        False   
3  2014    False    False  ...     False        False        False   
4  2014    False    False  ...     False        False        False   

   DayOfWeek_3  DayOfWeek_4  DayOfWeek_5  DayOfWeek_6  Open-Close  High-Low  \
0        Fa

In [150]:
# === Step 11: Save Cleaned Dataset ===
# Destination for the processed frame.
OUT_PATH = "./Dataset/Bitcoin-Cleaned-Dataset.csv"

# Create the output directory if it is missing, then write the frame without
# its index column.
os.makedirs("Dataset", exist_ok=True)
df.to_csv(path_or_buf=OUT_PATH, index=False)

print(f"\n✅ Cleaned dataset saved to {OUT_PATH}")



✅ Cleaned dataset saved to ./Dataset/Bitcoin-Cleaned-Dataset.csv
