In [685]:
#  === Step 2: Drop the Date column (not useful for numeric models)
df = df.drop(columns=["Date"], errors="ignore")
print("✅ Date column dropped successfully.")

✅ Date column dropped successfully.


In [686]:
#  === Step 3: Define features and target variable
X = df.drop(columns=["Close"])  # Features: everything except Close price
y = df["Close"]                 # Target: the closing price
print("✅ Features and target variable defined.")

✅ Features and target variable defined.


In [687]:
#  === Step 4: Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Display dataset split information
print("✅ Data successfully split into training and testing sets.")
print(f"Training set size: {X_train.shape[0]} samples")
print(f"Testing set size: {X_test.shape[0]} samples")

✅ Data successfully split into training and testing sets.
Training set size: 2170 samples
Testing set size: 543 samples


In [688]:
# === Step 5: Train Models ===
models = {
    "Linear Regression": LinearRegression(),
    "Random Forest Regressor": RandomForestRegressor(n_estimators=100, random_state=42),
}
for name, model in models.items():
    model.fit(X_train, y_train)
    print(f"✅ {name} trained successfully.")

✅ Linear Regression trained successfully.
✅ Random Forest Regressor trained successfully.


In [689]:
# Step 6: Predict using the trained models
# Use Linear Regression and Random Forest to predict the target values for the test set
y_pred_lr = models["Linear Regression"].predict(X_test)
y_pred_rf = models["Random Forest Regressor"].predict(X_test)

# Show results
print("✅ Predictions are ready!")
print("\nLinear Regression predictions:", y_pred_lr)
print("\nRandom Forest predictions:", y_pred_rf)

✅ Predictions are ready!

Linear Regression predictions: [  386.549011    4017.268555    7448.307617    1133.25
 43099.699219     243.593994     386.354004   17706.900391
 55907.19921899 41821.261719     777.94397     2548.290039
  4073.26001      281.653992   11657.200195    9771.489258
  1205.01001    50429.859375   57229.828125     447.610992
  8745.894531    8820.52246099 11600.099609   57401.097656
   447.976013   10948.990234     455.67099     5903.439941
  1929.819946    8550.760742    7068.47998    65466.839844
  9525.750977    6423.759766     456.078003     416.437988
 41500.875        973.497009     580.182007     254.320007
   374.785004     758.700012    6371.299805    7037.580078
   217.110992    8206.145508   61888.832031   10400.915039
  9800.636719     422.744995     657.070984    3894.130859
  9264.813477    9412.612305    1100.22998      228.572998
   375.490997    1187.810059   35615.871094    9344.365234
   376.522003    1211.670044     233.542999    4087.659912
  7

In [690]:
# === Step 7: Metrics Function ===
def evaluate_model(model_name, y_pred, y_true):
    r2 = r2_score(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)

    print(f"\n{model_name} Performance:")
    print(f"  R²   : {r2:.3f}")
    print(f"  MAE  : {mae:.0f}")
    print(f"  MSE  : {mse:.0f}")
    print(f"  RMSE : {rmse:.0f}")

evaluate_model("Linear Regression", y_pred_lr, y_test)
evaluate_model("Random Forest Regressor", y_pred_rf, y_test)





Linear Regression Performance:
  R²   : 1.000
  MAE  : 0
  MSE  : 0
  RMSE : 0

Random Forest Regressor Performance:
  R²   : 1.000
  MAE  : 31
  MSE  : 9434
  RMSE : 97


In [691]:
# === Step 8: Single-row Sanity Check ===
i = 3
x_one_df = X_test.iloc[[i]]
y_true = y_test.iloc[i]
p_lr_one = float(models["Linear Regression"].predict(x_one_df)[0])
p_rf_one = float(models["Random Forest Regressor"].predict(x_one_df)[0])
print("\n=== Single-row Sanity Check (Index 3) ===")
print(f"  Actual Close Price : {y_true:.0f}")
print(f"  LR Predicted       : {p_lr_one:.0f}")
print(f"  RF Predicted       : {p_rf_one:.0f}")


=== Single-row Sanity Check (Index 3) ===
  Actual Close Price : 1133
  LR Predicted       : 1133
  RF Predicted       : 1140


In [692]:
# === Step 9: Save  Models  === (New)
joblib.dump(models["Linear Regression"], "./Models/bitcoin_lr_model.joblib")
joblib.dump(models["Random Forest Regressor"], "./Models/bitcoin_rf_model.joblib")

print("\n✅ Models saved successfully to the Models/ directory.")


✅ Models saved successfully to the Models/ directory.


In [693]:
# === Step 10: Custom Input Prediction using helper ===
import os
import json
import pandas as pd
import joblib

ART_DIR = "./Models"  # correct folder in this repo

# artifact paths (use the filenames that exist in the repo)
lr_path = os.path.join(ART_DIR, "bitcoin_lr_model.joblib")
rf_path = os.path.join(ART_DIR, "bitcoin_rf_model.joblib")
scaler_path = os.path.join(ART_DIR, "bitcoin_scaler.pkl")
train_cols_path = os.path.join(ART_DIR, "bitcoin_train_columns.json")

# Load models/artifacts with helpful errors
if not os.path.exists(lr_path) or not os.path.exists(rf_path):
    raise FileNotFoundError(f"Expected model files not found in {ART_DIR}. "
                            f"Found: {os.listdir(ART_DIR)}")

lr = joblib.load(lr_path)
rf = joblib.load(rf_path)
scaler = joblib.load(scaler_path) if os.path.exists(scaler_path) else None

if not os.path.exists(train_cols_path):
    raise FileNotFoundError(f"Training columns file not found: {train_cols_path}")
train_cols = json.load(open(train_cols_path))

# Helper function to prepare features from raw input
def prepare_features_from_raw(raw):
    df = pd.DataFrame([raw])
    # Example derived features (keep these if your model expects similar columns)
    # Adjust names/logic to match the exact derived features you used during training
    if "High" in df.columns and "Low" in df.columns:
        df["High-Low"] = df["High"] - df["Low"]
    if "Low" in df.columns and "Close" in df.columns:
        df["Low-Close"] = df["Low"] - df["Close"]

    # Example: day-of-week one-hot (adjust indices/naming to your training set)
    for i in range(1, 8):  # 1..7 safer, adjust to your training encoding
        df[f"DayOfWeek_{i}"] = 1 if i == raw.get("DayOfWeek", 1) else 0

    # Example: month one-hot — adapt months to what you used in training
    months_used = list(range(1, 13))  # include all months by default; change if needed
    for m in months_used:
        df[f"Month_{m}"] = 1 if m == raw.get("Month", 1) else 0

    # Return dataframe of derived features (we'll align to train_cols later)
    return df

# Example raw custom input (change values as needed)
custom = {
    "Open": 45000, "High": 46000, "Low": 44000, "Close": 45500, "Volume": 35000,
    "DayOfWeek": 1, "Month": 10
}

# Prepare features from raw input
x_new = prepare_features_from_raw(custom)

# Ensure all training columns exist and are in the correct order
for c in train_cols:
    if c not in x_new.columns:
        # initialize missing columns to 0.0 (or NaN -> fill with 0.0)
        x_new[c] = 0.0

# Reorder columns exactly to training order
x_new = x_new[train_cols].copy()

# Apply scaler safely if available
X_input = x_new.copy()
if scaler is not None:
    try:
        # If scaler has mean_ attribute, use its length to decide which columns it was fit on
        scaler_mean = getattr(scaler, "mean_", None)
        if scaler_mean is not None and scaler_mean.shape[0] == len(train_cols):
            X_input = pd.DataFrame(scaler.transform(X_input), columns=train_cols)
        elif scaler_mean is not None and scaler_mean.shape[0] < len(train_cols):
            # scaler was fit on a prefix of columns; apply to that prefix only
            n = scaler_mean.shape[0]
            prefix_cols = train_cols[:n]
            X_input[prefix_cols] = scaler.transform(X_input[prefix_cols])
        else:
            # fallback - attempt to transform whole row (may raise if shapes mismatch)
            X_input = pd.DataFrame(scaler.transform(X_input), columns=train_cols)
    except Exception as e:
        print("⚠️ Warning: scaler transform failed, using unscaled input. Error:", e)
        X_input = x_new.copy()

# Make predictions
lr_pred = lr.predict(X_input)
rf_pred = rf.predict(X_input)

print("\n=== Custom Input Prediction ===")
print("Linear Regression:", float(lr_pred[0]))
print("Random Forest    :", float(rf_pred[0]))

Feature names unseen at fit time:
- Adj Close
- Volume
Feature names seen at fit time, yet now missing:
- Day_Return
- Volatility


=== Custom Input Prediction ===
Linear Regression: 2.787997226642443
Random Forest    : 21653.515100409993
