# LightGBM Model Training for Mean Reversion Strategy
## Train and save ML model for cryptocurrency trading

In [None]:
# ========== IMPORTS ==========
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import roc_auc_score, precision_recall_curve, classification_report
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import sys
from pathlib import Path

# Make the `strategies` directory importable. The path is built from the
# parent of the current working directory, so this assumes the notebook is
# started from a folder that sits next to `strategies/` — TODO confirm layout.
sys.path.insert(0, str(Path.cwd().parent / 'strategies'))
# Project-local helpers used by the feature-engineering cell below.
from features import create_features, select_features

print("Imports complete!")

## 1. Load Data

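First, download data using freqtrade. The loader below reads a `.json` file, so request the JSON storage format explicitly (recent freqtrade releases default to a different format):
```bash
freqtrade download-data --exchange binance --pairs BTC/USDT ETH/USDT --timeframe 5m --days 180 --data-format-ohlcv json
```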

In [None]:
# Load BTC/USDT 5-minute candles from the freqtrade data directory.
# The location is resolved relative to the current working directory.
data_path = Path.cwd().parent / 'data' / 'binance' / 'BTC_USDT-5m.json'

if not data_path.exists():
    # NOTE: later cells use `df_btc` unconditionally, so they cannot run
    # until the file has been downloaded and this cell re-executed.
    print(f"Data file not found at {data_path}")
    print("Please run: freqtrade download-data --exchange binance --pairs BTC/USDT --timeframe 5m --days 180")
else:
    df_btc = pd.read_json(data_path)

    # freqtrade stores JSON candles as a bare list of
    # [timestamp_ms, open, high, low, close, volume] rows, which pandas loads
    # with integer column labels. Name the columns in that case; files that
    # already carry a 'date' column are left untouched.
    ohlcv_columns = ['date', 'open', 'high', 'low', 'close', 'volume']
    if 'date' not in df_btc.columns:
        if df_btc.shape[1] != len(ohlcv_columns):
            raise ValueError(
                f"Expected {len(ohlcv_columns)} OHLCV columns in {data_path}, "
                f"found {df_btc.shape[1]}"
            )
        df_btc.columns = ohlcv_columns

    # Timestamps are epoch milliseconds; use them as the index.
    df_btc['date'] = pd.to_datetime(df_btc['date'], unit='ms')
    df_btc.set_index('date', inplace=True)

    print(f"Loaded {len(df_btc)} candles")
    print(f"Date range: {df_btc.index.min()} to {df_btc.index.max()}")
    print(f"Columns: {df_btc.columns.tolist()}")
    # A bare expression nested inside if/else is not rendered by the
    # notebook, so print the preview explicitly.
    print(df_btc.head())

## 2. Feature Engineering

In [None]:
# Build the feature frame from the raw candles with the project helper.
print("Creating features...")
df = create_features(df_btc)

# Ask the project helper which columns of `df` are used as model inputs.
feature_columns = select_features(df)
print(f"Selected {len(feature_columns)} features")
print(f"Features: {feature_columns}")

## 3. Create Labels

Label = 1 if the highest close 5–20 candles ahead is more than 1% above the current close; otherwise 0.

In [None]:
def create_labels(df, threshold=0.01, horizon_min=5, horizon_max=20):
    """
    Create binary labels for mean reversion.

    For each row ``i`` the label is 1.0 when the highest close in the
    look-ahead window ``[i + horizon_min, i + horizon_max]`` (both inclusive)
    is more than ``threshold`` above ``close[i]`` as a fraction, else 0.0.

    The last ``horizon_max`` rows have no complete look-ahead window, so
    their outcome is unknown. They are labelled NaN rather than 0 so that a
    later ``dropna()`` removes them instead of training on false negatives.

    Args:
        df: DataFrame with a numeric ``'close'`` column.
        threshold: Minimum fractional price increase (0.01 == 1%).
        horizon_min: First look-ahead offset, in rows.
        horizon_max: Last look-ahead offset, in rows.

    Returns:
        numpy.ndarray of float with ``len(df)`` entries (1.0, 0.0 or NaN).

    Raises:
        ValueError: If not ``0 <= horizon_min <= horizon_max``.
    """
    if not 0 <= horizon_min <= horizon_max:
        raise ValueError(
            "horizons must satisfy 0 <= horizon_min <= horizon_max, "
            f"got horizon_min={horizon_min}, horizon_max={horizon_max}"
        )

    close = df['close'].astype(float)
    n_rows = len(close)
    window = horizon_max - horizon_min + 1

    # A rolling max ending at row j spans [j - window + 1, j]. Shifting it
    # back by horizon_max aligns row i with the window
    # [i + horizon_min, i + horizon_max]. min_periods=1 makes NaN closes
    # inside a window get skipped, matching Series.max().
    future_max = (
        close.rolling(window=window, min_periods=1)
        .max()
        .shift(-horizon_max)
        .to_numpy()
    )
    current = close.to_numpy()
    max_return = (future_max - current) / current

    # Comparisons against NaN are False, so undefined returns become 0.0.
    labels = (max_return > threshold).astype(float)
    # Rows without a full look-ahead window have an unknown outcome.
    labels[max(n_rows - horizon_max, 0):] = np.nan

    return labels

# Label every row: more than a 1% rise within 5-20 candles ahead.
df['label'] = create_labels(df, threshold=0.01, horizon_min=5, horizon_max=20)

# Class balance check: the mean of a 0/1 column is the positive fraction.
print(f"Positive labels: {df['label'].sum()} ({df['label'].mean()*100:.1f}%)")
print(f"Negative labels: {(1-df['label']).sum()} ({(1-df['label']).mean()*100:.1f}%)")

## 4. Prepare Data

In [None]:
# Drop every row that has a NaN in any column of `df`
# (not only in the selected feature columns).
df = df.dropna()

# Split features and labels
X = df[feature_columns]
y = df['label']

# Temporal train/test split (80/20): the first 80% of rows train the model,
# the remaining 20% are held out. Rows are not shuffled.
# NOTE(review): labels look up to 20 candles ahead (horizon_max in the
# create_labels call), so the last training rows' labels depend on prices
# inside the test period — consider leaving a gap at the boundary.
split_idx = int(len(df) * 0.8)
X_train = X.iloc[:split_idx]
y_train = y.iloc[:split_idx]
X_test = X.iloc[split_idx:]
y_test = y.iloc[split_idx:]

# Sanity check on sizes and on the date span covered by each part.
print(f"Training samples: {len(X_train)}")
print(f"Test samples: {len(X_test)}")
print(f"Train date range: {X_train.index.min()} to {X_train.index.max()}")
print(f"Test date range: {X_test.index.min()} to {X_test.index.max()}")

## 5. Train LightGBM Model

In [None]:
# LightGBM hyper-parameters (scikit-learn API names)
params = {
    'objective': 'binary',
    'metric': 'auc',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'max_depth': 5,
    'learning_rate': 0.01,
    'n_estimators': 1000,
    'min_child_samples': 20,
    'subsample': 0.8,
    # Row subsampling is only performed when a bagging frequency is set;
    # without it LightGBM silently ignores `subsample`.
    'subsample_freq': 1,
    'colsample_bytree': 0.8,
    'reg_alpha': 0.1,
    'reg_lambda': 0.1,
    'random_state': 42,
    'n_jobs': -1,
    'verbose': -1
}

# Create model
model = lgb.LGBMClassifier(**params)

# Early stopping needs its own validation data. Stopping on the test set
# would tune the number of trees on the very rows used for evaluation and
# inflate the reported metrics. Hold out the most recent 20% of the training
# window instead (temporal split, no shuffling).
val_start = int(len(X_train) * 0.8)
X_fit, y_fit = X_train.iloc[:val_start], y_train.iloc[:val_start]
X_val, y_val = X_train.iloc[val_start:], y_train.iloc[val_start:]

# Train with early stopping: stop after 20 rounds without improvement on the
# validation AUC, logging the metric every 50 rounds.
print("Training model...")
model.fit(
    X_fit, y_fit,
    eval_set=[(X_val, y_val)],
    callbacks=[lgb.early_stopping(20), lgb.log_evaluation(50)]
)

print(f"\nBest iteration: {model.best_iteration_}")
print("Training complete!")

## 6. Evaluate Model

In [None]:
# Predicted probability of the positive ("Trade") class for each test row
y_pred_proba = model.predict_proba(X_test)[:, 1]
y_pred = (y_pred_proba > 0.6).astype(int)  # 60% threshold

# Metrics
auc = roc_auc_score(y_test, y_pred_proba)
print("="*50)
print("MODEL EVALUATION")
print("="*50)
print(f"\nROC-AUC Score: {auc:.4f}")

print("\nClassification Report (threshold=0.6):")
# Fix the label set so both rows are always reported, and report 0 without a
# warning when the threshold leaves a class with no predictions.
print(classification_report(
    y_test, y_pred,
    labels=[0, 1],
    target_names=['No Trade', 'Trade'],
    zero_division=0,
))

# Precision-Recall at different thresholds
print("\nPrecision-Recall at different thresholds:")
actual_positive = np.asarray(y_test) == 1
n_actual_positive = np.sum(actual_positive)
for threshold in [0.5, 0.6, 0.7, 0.8]:
    y_pred_t = (y_pred_proba > threshold).astype(int)
    predicted_positive = y_pred_t == 1
    true_positives = np.sum(predicted_positive & actual_positive)
    # Guard both denominators so an empty class yields 0 instead of a
    # division-by-zero NaN.
    precision = true_positives / max(np.sum(predicted_positive), 1)
    recall = true_positives / max(n_actual_positive, 1)
    print(f"  Threshold {threshold}: Precision={precision:.3f}, Recall={recall:.3f}")

## 7. Feature Importance

In [None]:
# Pair every feature name with the importance reported by the fitted model,
# most important first.
feature_importance = pd.DataFrame({
    'feature': feature_columns,
    'importance': model.feature_importances_
})
feature_importance = feature_importance.sort_values('importance', ascending=False)
top_features = feature_importance.head(20)

banner = "=" * 50
print(banner)
print("FEATURE IMPORTANCE (Top 20)")
print(banner)
print(top_features.to_string(index=False))

# Horizontal bar chart of the top 20, drawn on an explicit figure/axes pair.
bar_positions = range(len(top_features))
fig, ax = plt.subplots(figsize=(10, 8))
ax.barh(bar_positions, top_features['importance'])
ax.set_yticks(bar_positions)
ax.set_yticklabels(top_features['feature'])
ax.set_xlabel('Importance')
ax.set_title('Top 20 Feature Importances')
# Put the most important feature at the top of the chart.
ax.invert_yaxis()
fig.tight_layout()

# Write the chart next to the notebook's parent directory, then display it.
plot_path = Path.cwd().parent / 'feature_importance.png'
fig.savefig(plot_path, dpi=150)
print(f"\nFeature importance plot saved to {plot_path}")
plt.show()

## 8. Save Model

In [None]:
# Output directory for model artifacts (created on first run).
models_dir = Path.cwd().parent / 'models'
models_dir.mkdir(exist_ok=True)

# Persist the fitted classifier.
model_path = models_dir / 'mean_reversion_lgb.pkl'
joblib.dump(model, model_path)
print(f"Model saved to {model_path}")

# Persist the feature names, one per line, in the order used for training.
feature_list_path = models_dir / 'feature_list.txt'
with open(feature_list_path, 'w') as f:
    f.writelines(f"{feature}\n" for feature in feature_columns)
print(f"Feature list saved to {feature_list_path}")

# Closing summary.
banner = "=" * 50
print("\n" + banner)
print("TRAINING COMPLETE!")
print(banner)
print(f"\nModel performance: AUC = {auc:.4f}")
print("Ready for integration with Freqtrade")