In [257]:
# === Step 0: Import Libraries ===
import pandas as pd
import numpy as np

# Machine Learning
from sklearn.preprocessing import StandardScaler

# Models & Pickle for saving
import joblib
import json
import os

print("✅ Libraries imported successfully.")

✅ Libraries imported successfully.


In [258]:
# === Step 1: Load Dataset ===
CSV_PATH = "./Dataset/Bitcoin - Dataset.csv"  # Change path if needed
df = pd.read_csv(CSV_PATH)
print("✅ Dataset loaded successfully.\n")
print("=== INITIAL HEAD ===")
print(df.head())


✅ Dataset loaded successfully.

=== INITIAL HEAD ===
         Date        Open        High         Low       Close   Adj Close  \
0  2014-09-17  465.864014  468.174011  452.421997  457.334015  457.334015   
1  2014-09-18  456.859985  456.859985  413.104004  424.440002  424.440002   
2  2014-09-19  424.102997  427.834991  384.532013  394.795990  394.795990   
3  2014-09-20  394.673004  423.295990  389.882996  408.903992  408.903992   
4  2014-09-21  408.084991  412.425995  393.181000  398.821014  398.821014   

     Volume  
0  21056800  
1  34483200  
2  37919700  
3  36863600  
4  26580100  


In [259]:
# === Step 2: Initial Data Exploration ===
print("\n=== DATAFRAME INFO ===")
print(df.info())

print("\n=== MISSING VALUES ===")
print(df.isnull().sum())

print("\n=== STATISTICAL SUMMARY ===")
print(df.describe())



=== DATAFRAME INFO ===
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2713 entries, 0 to 2712
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Date       2713 non-null   object 
 1   Open       2713 non-null   float64
 2   High       2713 non-null   float64
 3   Low        2713 non-null   float64
 4   Close      2713 non-null   float64
 5   Adj Close  2713 non-null   float64
 6   Volume     2713 non-null   int64  
dtypes: float64(5), int64(1), object(1)
memory usage: 148.5+ KB
None

=== MISSING VALUES ===
Date         0
Open         0
High         0
Low          0
Close        0
Adj Close    0
Volume       0
dtype: int64

=== STATISTICAL SUMMARY ===
               Open          High           Low         Close     Adj Close  \
count   2713.000000   2713.000000   2713.000000   2713.000000   2713.000000   
mean   11311.041069  11614.292482  10975.555057  11323.914637  11323.914637   
std    16106.428891  16537.390649

In [260]:
# Step 3: Automatically detect numeric-like columns ===
# Keep only columns that contain numbers (ignore text/date)
numeric_like_cols = [
    col for col in df.columns
    if df[col].astype(str).str.replace(r"[^0-9.\-]", "", regex=True).str.strip().ne("").any()
]

print("\n=== NUMERIC-LIKE COLUMNS DETECTED ===")
print(numeric_like_cols)


=== NUMERIC-LIKE COLUMNS DETECTED ===
['Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume']


In [261]:
# Step 4: Convert them safely to numeric ===
for col in numeric_like_cols:
    df[col] = pd.to_numeric(df[col], errors="coerce")
print("\n✅ Numeric-like columns converted to numeric types.")



✅ Numeric-like columns converted to numeric types.


In [262]:
# Step 5: Handle missing values ===
df = df.fillna(df.median(numeric_only=True))
print("\n✅ Missing values handled.")


✅ Missing values handled.


In [263]:
# Step 6: Remove duplicates ===
before = df.shape
df = df.drop_duplicates()
after = df.shape
print(f"\nDropped duplicates:    {before} → {after}")


Dropped duplicates:    (2713, 7) → (2713, 7)


In [264]:
# Step 7: Create engineered features (if columns exist) ===
if set(["High", "Low"]).issubset(df.columns):
    df["Price_Range"] = df["High"] - df["Low"]
    df["Avg_Price"] = (df["High"] + df["Low"]) / 2
if set(["Open", "Close"]).issubset(df.columns):
    df["Day_Return"] = df["Close"] - df["Open"]
if set(["High", "Low", "Open"]).issubset(df.columns):
    df["Volatility"] = np.where(df["Open"] != 0, (df["High"] - df["Low"]) / df["Open"], 0)
print("\n✅ Engineered features created.")

# Optional if "Turnover" or "Trade Quantity" exists
if "Turnover (Lacs)" in df.columns and "Total Trade Quantity" in df.columns:
    df["Turnover_per_Trade"] = df["Turnover (Lacs)"] / (df["Total Trade Quantity"] + 1)


✅ Engineered features created.


In [265]:
# Step 8: Feature Scaling ===
scale_cols = ["Open", "High", "Low", "Price_Range", "Avg_Price", "Day_Return", "Volatility"]
scaler = StandardScaler()
df[scale_cols] = scaler.fit_transform(df[scale_cols])
print("\n✅ Feature scaling completed.")


✅ Feature scaling completed.


In [266]:
# Step 9: Save scaler
os.makedirs("models", exist_ok=True)
joblib.dump(scaler, "models/bitcoin_scaler.pkl")
print("\n✅ Scaler saved to models/bitcoin_scaler.pkl")


✅ Scaler saved to models/bitcoin_scaler.pkl


In [267]:
# Step 10: Save training columns ===
TRAIN_COLUMNS = df.drop(columns=["Close", "Date"]).columns.tolist()
json.dump(TRAIN_COLUMNS, open("models/bitcoin_train_columns.json", "w"))
print("\n✅ Training columns saved to models/bitcoin_train_columns.json")


✅ Training columns saved to models/bitcoin_train_columns.json


In [268]:

# Step 11: Final Snapshot ===
print("\n=== FINAL HEAD ===")
print(df.head())

print("\n=== FINAL INFO ===")
print(df.info())

print("\n=== FINAL MISSING VALUES ===")
print(df.isnull().sum())


=== FINAL HEAD ===
   Date      Open      High       Low       Close   Adj Close    Volume  \
0   NaN -0.673469 -0.674119 -0.674314  457.334015  457.334015  21056800   
1   NaN -0.674028 -0.674804 -0.676833  424.440002  424.440002  34483200   
2   NaN -0.676062 -0.676559 -0.678664  394.795990  394.795990  37919700   
3   NaN -0.677890 -0.676833 -0.678321  408.903992  408.903992  36863600   
4   NaN -0.677057 -0.677491 -0.678110  398.821014  398.821014  26580100   

   Price_Range  Avg_Price  Day_Return  Volatility  
0    -0.535231  -0.674374   -0.027585   -0.330762  
1    -0.511172  -0.675950   -0.058374    1.247730  
2    -0.511561  -0.677743   -0.054362    1.408970  
3    -0.520058  -0.677717    0.001749    0.964565  
4    -0.532230  -0.677953   -0.028531    0.009244  

=== FINAL INFO ===
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2713 entries, 0 to 2712
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   D

In [269]:
# Step 12: Save Cleaned Dataset ===
os.makedirs("Dataset", exist_ok=True)
OUT_PATH = "./Dataset/Bitcoin-Cleaned-Dataset.csv"
df.to_csv(OUT_PATH, index=False)
print(f"\n✅ Cleaned dataset saved to {OUT_PATH}")


✅ Cleaned dataset saved to ./Dataset/Bitcoin-Cleaned-Dataset.csv
