In [1]:
import yfinance as yf
import pandas as pd
import numpy as np
import talib
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import matplotlib.pyplot as plt


In [2]:
# Define the major currency pairs to fetch data for
currency_pairs = ['EURUSD=X', 'GBPUSD=X', 'USDJPY=X', 'AUDUSD=X', 'USDCHF=X', 'USDCAD=X', 'NZDUSD=X']

# Fetch the data using yfinance
forex_data = {}
for pair in currency_pairs:
    forex_data[pair] = yf.download(pair, start="2010-01-01", end="2025-01-01")


[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


In [3]:
import talib
import pandas as pd

# Function to add technical indicators to the data
def add_technical_indicators(pair_data, pair_name):
    # Ensure 'Close' column is a 1D NumPy array
    close_prices = pair_data['Close'].values  # This should be already 1D
    if close_prices.ndim != 1:
        print(f"Warning: Close prices for {pair_name} is not 1D, reshaping...")
        close_prices = close_prices.flatten()  # Ensure it's 1D
    
    # Double-check the type and shape
    print(f"Close prices shape for {pair_name}: {close_prices.shape}")
    
    # Add indicators to the DataFrame
    try:
        pair_data['SMA_50'] = talib.SMA(close_prices, timeperiod=50)
        pair_data['SMA_200'] = talib.SMA(close_prices, timeperiod=200)
        pair_data['RSI'] = talib.RSI(close_prices, timeperiod=14)
        pair_data['MACD'], pair_data['MACD_signal'], _ = talib.MACD(close_prices, fastperiod=12, slowperiod=26, signalperiod=9)
        pair_data['Momentum'] = talib.MOM(close_prices, timeperiod=10)
        pair_data['ROC'] = talib.ROC(close_prices, timeperiod=10)
    except Exception as e:
        print(f"Error applying technical indicators to {pair_name}: {e}")
    
    # Drop rows with NaN values caused by indicators (since they can't be calculated for the first few rows)
    pair_data.dropna(inplace=True)  # Drop rows with missing values

    return pair_data

# Apply the technical indicators to each currency pair
for pair in forex_data:
    forex_data[pair] = add_technical_indicators(forex_data[pair], pair)

# Now, let's check the columns to verify if the indicators were added
for pair in forex_data:
    print(f"Columns in {pair}: {forex_data[pair].columns}")


Close prices shape for EURUSD=X: (3908,)
Close prices shape for GBPUSD=X: (3908,)
Close prices shape for USDJPY=X: (3908,)
Close prices shape for AUDUSD=X: (3907,)
Close prices shape for USDCHF=X: (3905,)
Close prices shape for USDCAD=X: (3907,)
Close prices shape for NZDUSD=X: (3908,)
Columns in EURUSD=X: MultiIndex([(      'Close', 'EURUSD=X'),
            (       'High', 'EURUSD=X'),
            (        'Low', 'EURUSD=X'),
            (       'Open', 'EURUSD=X'),
            (     'Volume', 'EURUSD=X'),
            (     'SMA_50',         ''),
            (    'SMA_200',         ''),
            (        'RSI',         ''),
            (       'MACD',         ''),
            ('MACD_signal',         ''),
            (   'Momentum',         ''),
            (        'ROC',         '')],
           names=['Price', 'Ticker'])
Columns in GBPUSD=X: MultiIndex([(      'Close', 'GBPUSD=X'),
            (       'High', 'GBPUSD=X'),
            (        'Low', 'GBPUSD=X'),
            (    

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Function to prepare data for training
def prepare_data(pair_data):
    # Calculate daily price change (up or down)
    pair_data['Price_Change'] = pair_data['Close'].diff().shift(-1)
    pair_data['Price_Change'] = pair_data['Price_Change'].apply(lambda x: 1 if x > 0 else 0)
    
    # Select features (technical indicators) and target (Price_Change)
    X = pair_data[['SMA_50', 'SMA_200', 'RSI', 'MACD', 'MACD_signal', 'Momentum', 'ROC']]
    y = pair_data['Price_Change']
    
    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Scale the features using StandardScaler
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    return X_train_scaled, X_test_scaled, y_train, y_test

# Prepare the data for each currency pair
train_data = {}
for pair in forex_data:
    X_train, X_test, y_train, y_test = prepare_data(forex_data[pair])
    train_data[pair] = (X_train, X_test, y_train, y_test)

# Check the shapes of the data
for pair in train_data:
    X_train, X_test, y_train, y_test = train_data[pair]
    print(f"Data for {pair}:")
    print(f"X_train shape: {X_train.shape}, X_test shape: {X_test.shape}")
    print(f"y_train shape: {y_train.shape}, y_test shape: {y_test.shape}\n")


Data for EURUSD=X:
X_train shape: (2967, 7), X_test shape: (742, 7)
y_train shape: (2967,), y_test shape: (742,)

Data for GBPUSD=X:
X_train shape: (2967, 7), X_test shape: (742, 7)
y_train shape: (2967,), y_test shape: (742,)

Data for USDJPY=X:
X_train shape: (2967, 7), X_test shape: (742, 7)
y_train shape: (2967,), y_test shape: (742,)

Data for AUDUSD=X:
X_train shape: (2966, 7), X_test shape: (742, 7)
y_train shape: (2966,), y_test shape: (742,)

Data for USDCHF=X:
X_train shape: (2964, 7), X_test shape: (742, 7)
y_train shape: (2964,), y_test shape: (742,)

Data for USDCAD=X:
X_train shape: (2966, 7), X_test shape: (742, 7)
y_train shape: (2966,), y_test shape: (742,)

Data for NZDUSD=X:
X_train shape: (2967, 7), X_test shape: (742, 7)
y_train shape: (2967,), y_test shape: (742,)



In [5]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Function to train and evaluate the model
def train_and_evaluate_model(X_train, X_test, y_train, y_test):
    # Initialize the RandomForestClassifier
    model = RandomForestClassifier(random_state=42)
    
    # Train the model
    model.fit(X_train, y_train)
    
    # Predict on the test set
    y_pred = model.predict(X_test)
    
    # Calculate accuracy and other metrics
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    
    return model, accuracy, report

# Train and evaluate the model for each currency pair
model_results = {}
for pair in train_data:
    X_train, X_test, y_train, y_test = train_data[pair]
    model, accuracy, report = train_and_evaluate_model(X_train, X_test, y_train, y_test)
    model_results[pair] = (model, accuracy, report)

# Display the results for each pair
for pair in model_results:
    model, accuracy, report = model_results[pair]
    print(f"Results for {pair}:")
    print(f"Accuracy: {accuracy:.2f}")
    print(f"Classification Report:\n{report}")
    print("\n")


Results for EURUSD=X:
Accuracy: 0.51
Classification Report:
              precision    recall  f1-score   support

           0       0.52      0.51      0.52       377
           1       0.51      0.52      0.51       365

    accuracy                           0.51       742
   macro avg       0.51      0.51      0.51       742
weighted avg       0.52      0.51      0.51       742



Results for GBPUSD=X:
Accuracy: 0.54
Classification Report:
              precision    recall  f1-score   support

           0       0.53      0.55      0.54       369
           1       0.54      0.52      0.53       373

    accuracy                           0.54       742
   macro avg       0.54      0.54      0.53       742
weighted avg       0.54      0.54      0.53       742



Results for USDJPY=X:
Accuracy: 0.47
Classification Report:
              precision    recall  f1-score   support

           0       0.43      0.40      0.42       352
           1       0.49      0.53      0.51       390

In [None]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid for GridSearchCV
param_grid = {
    'n_estimators': [50, 100, 200],  # Number of trees in the forest
    'max_depth': [10, 20, 30, None],  # Maximum depth of each tree
    'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split a node
    'min_samples_leaf': [1, 2, 4],  # Minimum number of samples required to be at a leaf node
    'bootstrap': [True, False]  # Whether bootstrap samples are used when building trees
}

# Perform GridSearchCV on the training data
grid_search = GridSearchCV(estimator=RandomForestClassifier(random_state=42),
                           param_grid=param_grid,
                           cv=3,  # 3-fold cross-validation
                           n_jobs=-1,  # Use all available cores
                           verbose=2)

# Train the model on the training data
grid_search.fit(X_train, y_train)

# Get the best parameters and the best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print(f"Best Parameters: {best_params}")
print(f"Best Score (Accuracy): {best_score:.2f}")


Fitting 3 folds for each of 216 candidates, totalling 648 fits


In [19]:
import joblib

# Save the model to a file
joblib.dump(grid_search.best_estimator_, 'forex_model.pkl')
print("Model saved successfully!")


Model saved successfully!
