# 03_model_generation.ipynb

## Notebook Purpose
This notebook is designed to develop and train machine learning models using the preprocessed cryptocurrency data. The trained models will be used for making predictions in subsequent notebooks.

## Instructions
1. **Import Necessary Libraries**:
   - Import `pandas` for data manipulation.
   - Import functions from `models.py` for training models.

2. **Load Preprocessed Data**:
   - Load the preprocessed CSV file created in the first notebook.

3. **Train Machine Learning Models**:
   - Use the `train_model` function to train a machine learning model (e.g., Random Forest) on the historical data.
   - Split the data into training and testing sets.

4. **Save the Trained Model**:
   - Save the trained model to a file for later use in making predictions.

5. **Evaluate Model Performance**:
   - Evaluate the model's performance using appropriate metrics (e.g., R^2 score).

## Example Code
```python
# Import necessary libraries
import pandas as pd
from scripts.models import train_model
import joblib

# Load preprocessed data
data_path = 'data/historical_data/btc_usd_preprocessed.csv'  # Update this path based on the selected cryptocurrency
data = pd.read_csv(data_path, parse_dates=['Date'], index_col='Date')

# Train model
model, X_test, y_test = train_model(data)

# Save the model and test data
joblib.dump(model, 'models/trained_model.pkl')
X_test.to_csv('data/historical_data/X_test.csv')
y_test.to_csv('data/historical_data/y_test.csv')

# Display model performance
print(f"Model trained. R^2 score on test data: {model.score(X_test, y_test)}")
```


In [None]:
# Import necessary libraries and verify
# Import everything the notebook needs in one guarded block.
# NOTE: an ImportError is only printed, not re-raised, so execution continues;
# any later cell that uses a name that failed to import will then raise NameError.
try:
    import pandas as pd
    from sklearn.model_selection import train_test_split
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.metrics import mean_squared_error, r2_score
    import joblib
    import os
    print("Libraries imported successfully.")
except ImportError as e:
    print(f"Error importing libraries: {e}")


In [None]:
# Load preprocessed data
def load_data(data_path):
    """Read a CSV file into a DataFrame indexed by its parsed ``time`` column.

    Args:
        data_path: Location of the CSV file to read.

    Returns:
        The loaded DataFrame, or ``None`` when the file does not exist
        (the error is printed rather than raised).
    """
    try:
        frame = pd.read_csv(data_path, index_col='time', parse_dates=['time'])
    except FileNotFoundError as e:
        # A missing file is reported and signalled to the caller via None.
        print(f"Error loading data: {e}")
        return None

    print(f"Data loaded successfully from {data_path}.")
    return frame

# Location of the cleaned price data; `data` is the notebook-level DataFrame
# that every later cell reads.
data_path = 'data/cleaned_data/BTC_cleaned.csv'  # Update this path based on the selected cryptocurrency
data = load_data(data_path)

# load_data returns None when the file is missing, so only preview on success.
# NOTE(review): `display` is not defined in this file; presumably the
# IPython/Jupyter rich-display helper - confirm when running outside a notebook.
if data is not None:
    display(data.head())


In [None]:
# Prepare features and target variable
def prepare_features_target(data):
    """Build the feature matrix and the target series from price data.

    The target is the period-over-period percentage change of ``close``.
    Rows containing any missing value (including the first row, whose
    return is undefined) are discarded. The input frame is not modified.

    Args:
        data: DataFrame that has a ``close`` column.

    Returns:
        Tuple ``(X, y)``: ``X`` holds every column except ``close`` and
        ``returns``; ``y`` is the ``returns`` series.
    """
    # assign() returns a new frame, so the caller's DataFrame stays untouched.
    with_returns = data.assign(returns=data['close'].pct_change())
    complete_rows = with_returns.dropna()

    y = complete_rows['returns']
    X = complete_rows.drop(columns=['close', 'returns'])
    return X, y

# Only derive features when the load step produced a DataFrame; otherwise
# X and y are left undefined.
if data is not None:
    X, y = prepare_features_target(data)
    print(f"Features and target variable prepared. X shape: {X.shape}, y shape: {y.shape}")


In [None]:
# Split data into training and testing sets
def split_data(X, y, *, test_size=0.2, random_state=42, shuffle=True):
    """Split features and target into training and testing partitions.

    The defaults reproduce the original fixed behaviour (shuffled 80/20
    split seeded with 42). For time-indexed data, pass ``shuffle=False`` to
    split without shuffling, so the rows keep their original order.

    Args:
        X: Feature matrix.
        y: Target values aligned with ``X``.
        test_size: Forwarded to ``train_test_split``; share of the samples
            reserved for testing.
        random_state: Forwarded to ``train_test_split``; seed for the shuffle.
        shuffle: Forwarded to ``train_test_split``; whether to shuffle the
            rows before splitting.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=random_state,
        shuffle=shuffle,
    )
    return X_train, X_test, y_train, y_test

# Runs only when the data was loaded; binds the notebook-level train/test
# partitions used by the training, evaluation and save cells.
if data is not None:
    X_train, X_test, y_train, y_test = split_data(X, y)
    print(f"Data split into training and testing sets. X_train shape: {X_train.shape}, X_test shape: {X_test.shape}")


In [None]:
# Train the machine learning model
def train_model(X_train, y_train):
    """Fit a random-forest regressor on the training partition.

    Args:
        X_train: Training feature matrix.
        y_train: Training target values.

    Returns:
        The fitted ``RandomForestRegressor`` (100 trees, ``random_state=42``).
    """
    forest = RandomForestRegressor(random_state=42, n_estimators=100)
    # fit() returns the estimator itself, so the fitted model is returned directly.
    return forest.fit(X_train, y_train)

# Train only when the data was loaded; `model` is the notebook-level fitted
# estimator used by the evaluation and save cells.
if data is not None:
    model = train_model(X_train, y_train)
    print("Model trained successfully.")


In [None]:
# Evaluate model performance
def evaluate_model(model, X_test, y_test):
    """Score a fitted model on held-out data and print the metrics.

    Args:
        model: Object exposing ``predict(X)``.
        X_test: Feature matrix to predict on.
        y_test: True target values for ``X_test``.

    Returns:
        Tuple ``(mse, r2)``: mean squared error and R^2 of the predictions.
    """
    predictions = model.predict(X_test)

    # Both metrics compare the same predictions against the true targets.
    r2 = r2_score(y_test, predictions)
    mse = mean_squared_error(y_test, predictions)

    print(f"Model evaluation - MSE: {mse}, R2: {r2}")
    return mse, r2

# Evaluate only when the data was loaded; `mse` and `r2` stay available at
# notebook level (the final cell prints `r2`).
if data is not None:
    mse, r2 = evaluate_model(model, X_test, y_test)


In [None]:
# Save the trained model and test data
def save_model_data(model, X_test, y_test, model_path, X_test_path, y_test_path):
    """Persist the trained model and the held-out test split to disk.

    Missing parent directories of path-like destinations are created first,
    so saving works on a fresh checkout where the output folders do not yet
    exist.

    Args:
        model: Trained estimator, serialized with ``joblib.dump``.
        X_test: Test features; written with ``to_csv``.
        y_test: Test target; written with ``to_csv``.
        model_path: Destination of the serialized model.
        X_test_path: Destination CSV for ``X_test``.
        y_test_path: Destination CSV for ``y_test``.
    """
    for destination in (model_path, X_test_path, y_test_path):
        # Destinations that are not filesystem paths (e.g. open buffers) are
        # left alone; there is no directory to create for them.
        if not isinstance(destination, (str, os.PathLike)):
            continue
        parent_dir = os.path.dirname(os.fspath(destination))
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    joblib.dump(model, model_path)
    X_test.to_csv(X_test_path)
    y_test.to_csv(y_test_path)
    print(f"Model and test data saved to {model_path}, {X_test_path}, {y_test_path}")

# Save only when the data was loaded. All three destinations are relative
# paths, so they resolve against the current working directory.
if data is not None:
    model_path = 'models/trained_model.pkl'
    X_test_path = 'data/historical_data/X_test.csv'
    y_test_path = 'data/historical_data/y_test.csv'
    
    save_model_data(model, X_test, y_test, model_path, X_test_path, y_test_path)


In [None]:
# Display model performance
# Report the R^2 computed by the evaluation cell (`r2`); skipped when the
# data failed to load.
if data is not None:
    print(f"Model trained. R^2 score on test data: {r2}")
