In [287]:
# model.py

# House Price Prediction – Model Training Script
# Author: Ali Omar Abdi
# Course: Data Science & Machine Learning Bootcamp
# Task: Regression (Supervised Learning)

# === Step 0: Import Libraries ===
import pandas as pd
import numpy as np

# Machine Learning
from sklearn.model_selection import train_test_split
from sklearn.discriminant_analysis import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor




# Evaluation Metrics
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Visualization Libraries
import matplotlib.pyplot as plt
import seaborn as sns

# Hyperparameter Tuning
from sklearn.model_selection import RandomizedSearchCV

# Models & Pickle for saving
import joblib


print("✅ Libraries imported successfully.")

✅ Libraries imported successfully.


In [288]:
# === Step 0: Load the dataset
csv_path = "./Dataset/Bitcoin-Cleaned-Dataset.csv"
df = pd.read_csv(csv_path)

print("✅ Dataset loaded successfully.")

✅ Dataset loaded successfully.


In [289]:
# Step: 1 Prepare X and y
X = df.drop(columns=["Close", "Date"]).copy()
y = df["Close"].copy()

print("✅ Features and target variable prepared.")

✅ Features and target variable prepared.


In [290]:
# === Step 2: Train-Test Split ===
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("✅ Data split into training and testing sets.")

✅ Data split into training and testing sets.


In [291]:
# === Step 3: Feature Scaling ===
scaler = StandardScaler()
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

print("✅ Feature scaling applied.")

✅ Feature scaling applied.


In [292]:
# === Step 4: Train Models ===
# Train Random Forest


rf = RandomForestRegressor(n_estimators=200, random_state=42)
rf.fit(X_train_scaled, y_train)
y_pred_rf = rf.predict(X_test_scaled)

# Train XGBoost
xgb = XGBRegressor(n_estimators=200, random_state=42)
xgb.fit(X_train_scaled, y_train)
y_pred_xgb = xgb.predict(X_test_scaled)

# Train Linear Regression
lr = LinearRegression()
lr.fit(X_train_scaled, y_train)
y_pred_lr = lr.predict(X_test_scaled)

# Train SVM
svm = SVR(kernel='rbf')
svm.fit(X_train_scaled, y_train)
y_pred_svm = svm.predict(X_test_scaled)

# Decision Tree Regressor
dt = DecisionTreeRegressor(random_state=42)
dt.fit(X_train_scaled, y_train)
y_pred_dt = dt.predict(X_test_scaled)



print("✅ Models trained successfully.")

✅ Models trained successfully.


In [293]:
# === Step 5 : Train Models and Make Predictions ===

# 1) Define all models in a dictionary for easy training & evaluation
models = {
    "Random Forest": RandomForestRegressor(n_estimators=200, random_state=42),
    "XGBoost": XGBRegressor(n_estimators=200, random_state=42),
    "Linear Regression": LinearRegression(),
    "SVM": SVR(kernel='rbf'),
    "Decision Tree": DecisionTreeRegressor(random_state=42)
}

# 2) Train models and store predictions
predictions = {}  # to store test predictions
for name, model in models.items():
    model.fit(X_train_scaled, y_train)            # Train the model
    y_pred = model.predict(X_test_scaled)         # Predict on test set
    predictions[name] = y_pred
    print(f"✅ {name} trained successfully.")

# 3) Compare actual vs predicted values for Linear Regression (example)
mlr_diff = pd.DataFrame({
    'Actual value': y_test,
    'Predicted value': predictions["Linear Regression"]
})

print("\n=== Linear Regression: Actual vs Predicted ===")

print(mlr_diff.head())


✅ Random Forest trained successfully.
✅ XGBoost trained successfully.
✅ Linear Regression trained successfully.
✅ SVM trained successfully.
✅ Decision Tree trained successfully.

=== Linear Regression: Actual vs Predicted ===
      Actual value  Predicted value
506     386.549011       386.549011
1535   4017.268555      4017.268555
1905   7448.307617      7448.307617
930    1133.250000      1133.250000
2676  43099.699219     43099.699219


In [294]:
# Step 6: Evaluate the models' accuracy on the training and testing datasets
# === 1) List of models 
models = {
    "Random Forest": rf,
    "XGBoost": xgb,
    "Linear Regression": lr,
    "SVM": svm,
    "Decision Tree Regressor": dt
}

# === 2) Evaluate accuracy on training and testing datasets ===
for name, model in models.items():
    train_acc = model.score(X_train_scaled, y_train) * 100
    test_acc = model.score(X_test_scaled, y_test) * 100
 
    print(f"{name} - Accuracy on Training Dataset: {train_acc:.2f}%")
    print(f"{name} - Accuracy on Testing Dataset: {test_acc:.2f}%")
    print("-" * 50)

print("✅ All models evaluated successfully.")


Random Forest - Accuracy on Training Dataset: 100.00%
Random Forest - Accuracy on Testing Dataset: 100.00%
--------------------------------------------------
XGBoost - Accuracy on Training Dataset: 100.00%
XGBoost - Accuracy on Testing Dataset: 99.98%
--------------------------------------------------
Linear Regression - Accuracy on Training Dataset: 100.00%
Linear Regression - Accuracy on Testing Dataset: 100.00%
--------------------------------------------------
SVM - Accuracy on Training Dataset: -7.09%
SVM - Accuracy on Testing Dataset: -8.03%
--------------------------------------------------
Decision Tree Regressor - Accuracy on Training Dataset: 100.00%
Decision Tree Regressor - Accuracy on Testing Dataset: 99.99%
--------------------------------------------------
✅ All models evaluated successfully.


In [295]:
# === Step 7: Model Evaluation ===
# Evaluate Random Forest
rf_r2 = r2_score(y_test, y_pred_rf)
rf_mse = mean_squared_error(y_test, y_pred_rf)
rf_mae = mean_absolute_error(y_test, y_pred_rf)
print(f"Random Forest - R2: {rf_r2:.3f}, MSE: {rf_mse:.0f}, MAE: {rf_mae:.0f}")

# Evaluate XGBoost
xgb_r2 = r2_score(y_test, y_pred_xgb)
xgb_mse = mean_squared_error(y_test, y_pred_xgb)
xgb_mae = mean_absolute_error(y_test, y_pred_xgb)
print(f"XGBoost - R2: {xgb_r2:.3f}, MSE: {xgb_mse:.2f}, MAE: {xgb_mae:.2f}")

# Evaluate Linear Regression
lr_r2 = r2_score(y_test, y_pred_lr)
lr_mse = mean_squared_error(y_test, y_pred_lr)
lr_mae = mean_absolute_error(y_test, y_pred_lr)
print(f"Linear Regression - R2: {lr_r2:.3f}, MSE: {lr_mse:.0f}, MAE: {lr_mae:.0f}")


# Evaluate SVM
svm_r2 = r2_score(y_test, y_pred_svm)
svm_mse = mean_squared_error(y_test, y_pred_svm)
svm_mae = mean_absolute_error(y_test, y_pred_svm)
print(f"SVM - R2: {svm_r2:.3f}, MSE: {svm_mse:.0f}, MAE: {svm_mae:.0f}")

# Evaluate Decision Tree
dt_r2 = r2_score(y_test, y_pred_dt)
dt_mse = mean_squared_error(y_test, y_pred_dt)
dt_mae = mean_absolute_error(y_test, y_pred_dt)
print(f"Decision Tree - R2: {dt_r2:.3f}, MSE: {dt_mse:.0f}, MAE: {dt_mae:.0f}")

print("\n✅ Models evaluated successfully.")

Random Forest - R2: 1.000, MSE: 9692, MAE: 31
XGBoost - R2: 1.000, MSE: 53295.84, MAE: 89.79
Linear Regression - R2: 1.000, MSE: 0, MAE: 0
SVM - R2: -0.080, MSE: 296752779, MAE: 9857
Decision Tree - R2: 1.000, MSE: 15949, MAE: 42

✅ Models evaluated successfully.


In [296]:

# step 8: Single-row sanity check
i = 3
x_one_df = X_test.iloc[[i]]
y_true = y_test.iloc[i]
p_lr_one = float(lr.predict(x_one_df)[0])
p_rf_one = float(rf.predict(x_one_df)[0])
pr_XGBoost_one = float(xgb.predict(x_one_df)[0])
pr_Decision_reegr_one= float(dt.predict(x_one_df)[0])
print("\nSingle-row sanity check:")
print(f"  Actual Price: ${y_true:,.0f}")
print(f"  Linear Regression Prediction: ${p_lr_one:,.0f}")
print(f"  Random Forest Prediction: ${p_rf_one:,.0f}")
print(f"  XGBoost Prediction: ${pr_XGBoost_one:,.0f}")
print(f"  Decision Tree Prediction: ${pr_Decision_reegr_one:,.0f}")





Single-row sanity check:
  Actual Price: $1,133
  Linear Regression Prediction: $9,085,507
  Random Forest Prediction: $25,182
  XGBoost Prediction: $17,028
  Decision Tree Prediction: $65,993
