# Data Loading and Preprocessing
Load the CSV file, convert the `Time` column to datetime, parse the `factors` and `score` columns into separate features, and clean the data.

In [6]:
!pip install pandas
!pip install numpy




[notice] A new release of pip available: 22.3.1 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip available: 22.3.1 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [7]:
# Import necessary libraries

import pandas as pd
import numpy as np


def parse_kv_column(series, taken=(), prefix=""):
    """Expand 'name=value|name=value' strings into one numeric column per name.

    Parameters
    ----------
    series : pandas.Series
        Cells holding '|'-separated 'name=value' tokens. Non-string cells
        (e.g. NaN) and tokens without '=' contribute no values.
    taken : iterable of str, optional
        Column names that already exist in the destination frame.
    prefix : str, optional
        Prepended to a parsed name only when that name is in ``taken``, so
        concatenating the result never creates duplicate column labels.

    Returns
    -------
    pandas.DataFrame
        Indexed like ``series``; values converted with ``pd.to_numeric`` and
        unparseable or missing entries left as NaN. If a name repeats inside
        one cell, the last occurrence wins.
    """
    records = []
    for cell in series:
        pairs = {}
        if isinstance(cell, str):
            for token in cell.split("|"):
                key, sep, value = token.partition("=")
                key = key.strip()
                if sep and key:
                    pairs[key] = value.strip()
        records.append(pairs)

    parsed = pd.DataFrame(records, index=series.index)
    parsed = parsed.apply(pd.to_numeric, errors="coerce")

    taken = set(taken)
    return parsed.rename(columns=lambda name: f"{prefix}{name}" if name in taken else name)


# Load the tab-separated transaction log
# NOTE(review): absolute, machine-specific path; consider a configurable data directory.
file_path = r"C:\Users\StdUser\Desktop\MyProjects\Backtesting\logs\SYM_10030436_transactions.csv"
df = pd.read_csv(file_path, delimiter='\t')

# Convert 'Time' column to datetime
df['Time'] = pd.to_datetime(df['Time'], format='%Y.%m.%d %H:%M')

# Everything except the two packed string columns is kept as-is
base_df = df.drop(columns=['factors', 'score'])

# Parse 'factors' and 'score' columns into separate numeric features
factors_df = parse_kv_column(df['factors'], taken=base_df.columns, prefix='factors_')
score_df = parse_kv_column(
    df['score'],
    taken=list(base_df.columns) + list(factors_df.columns),
    prefix='score_',
)

# Replace the packed columns with their parsed counterparts
df = pd.concat([base_df, factors_df, score_df], axis=1)

# Handle missing values (if any)
df = df.fillna(0)

# Display the first few rows of the cleaned dataframe
df.head()

ModuleNotFoundError: No module named 'pandas'

# Feature Engineering
Extract numerical values from the `efactors` and `exitScore` strings, create new features from the trading metrics, and prepare the target variable (`CurrentProfit`, the profit/performance metric).

In [None]:
# Feature Engineering


# Defined in this cell so the cell does not rely on helper definitions made
# elsewhere in the notebook.
def parse_kv_column(series, taken=(), prefix=""):
    """Expand 'name=value|name=value' strings into one numeric column per name.

    Parameters
    ----------
    series : pandas.Series
        Cells holding '|'-separated 'name=value' tokens. Non-string cells
        (e.g. NaN) and tokens without '=' contribute no values.
    taken : iterable of str, optional
        Column names that already exist in the destination frame.
    prefix : str, optional
        Prepended to a parsed name only when that name is in ``taken``, so
        concatenating the result never creates duplicate column labels.

    Returns
    -------
    pandas.DataFrame
        Indexed like ``series``; values converted with ``pd.to_numeric`` and
        unparseable or missing entries left as NaN. If a name repeats inside
        one cell, the last occurrence wins.
    """
    records = []
    for cell in series:
        pairs = {}
        if isinstance(cell, str):
            for token in cell.split("|"):
                key, sep, value = token.partition("=")
                key = key.strip()
                if sep and key:
                    pairs[key] = value.strip()
        records.append(pairs)

    parsed = pd.DataFrame(records, index=series.index)
    parsed = parsed.apply(pd.to_numeric, errors="coerce")

    taken = set(taken)
    return parsed.rename(columns=lambda name: f"{prefix}{name}" if name in taken else name)


# Everything except the two packed string columns is kept as-is
base_df = df.drop(columns=['efactors', 'exitScore'])

# Extract numerical values from 'efactors' and 'exitScore' columns.
# A parsed name that already exists in the frame is prefixed so that
# df[name] keeps returning a single column.
efactors_df = parse_kv_column(df['efactors'], taken=base_df.columns, prefix='efactors_')
exit_score_df = parse_kv_column(
    df['exitScore'],
    taken=list(base_df.columns) + list(efactors_df.columns),
    prefix='exitScore_',
)

# Replace the packed columns with their parsed counterparts. Missing values
# become 0, the same fill value the loading cell uses.
df = pd.concat([base_df, efactors_df.fillna(0), exit_score_df.fillna(0)], axis=1)

# Create new features from trading metrics: row-over-row differences.
# The first row has no predecessor, so its change is recorded as 0 instead of NaN.
df['PriceChange'] = df['Price'].diff().fillna(0)
df['VolumeChange'] = df['Volume'].diff().fillna(0)
df['ProfitChange'] = df['CurrentProfit'].diff().fillna(0)

# Prepare the target variable (CurrentProfit)
target = df['CurrentProfit']

# Display the first few rows of the dataframe with new features
df.head()

# Correlation Analysis
Calculate and visualize correlations between different factors and trading performance using heatmaps and correlation matrices.

In [None]:
# Correlation Analysis

import seaborn as sns
import matplotlib.pyplot as plt

# Correlation is only defined for numeric data. Selecting numeric columns
# explicitly keeps the datetime/text columns of df (e.g. 'Time', 'Action',
# 'Symbol') out of corr(), which otherwise raises on recent pandas versions.
numeric_df = df.select_dtypes(include='number')

# Calculate the correlation matrix
correlation_matrix = numeric_df.corr()

# Plot the heatmap
plt.figure(figsize=(16, 12))
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap='coolwarm')
plt.title('Correlation Matrix Heatmap')
plt.show()

# Display the top correlations with the target variable (CurrentProfit).
# The target's correlation with itself (always 1.0) is removed so the list
# only contains other columns.
target_correlations = (
    correlation_matrix['CurrentProfit']
    .drop(labels='CurrentProfit')
    .sort_values(ascending=False)
)
print("Top correlations with CurrentProfit:")
print(target_correlations.head(10))

# Feature Importance with Random Forest
Implement Random Forest to identify the most important factors affecting trading decisions and performance.

In [None]:
# Feature Importance with Random Forest

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import seaborn as sns

# The target itself plus identifier / timestamp / text columns
non_feature_columns = ['CurrentProfit', 'Time', 'Ticket', 'Action', 'Symbol', 'CloseReason']
# 'ProfitChange' is computed from CurrentProfit, so using it as a predictor
# would leak the target into the features.
leaky_columns = ['ProfitChange']

# Prepare the feature matrix (X) and target vector (y).
# Only numeric columns are kept and remaining gaps are filled with 0 (the
# notebook's fill value) so the estimator never sees text or NaN.
X = (
    df.drop(columns=non_feature_columns)
    .drop(columns=leaky_columns, errors='ignore')
    .select_dtypes(include='number')
    .fillna(0)
)
y = df['CurrentProfit']

# Split the data into training and testing sets
# NOTE(review): rows look time-ordered (features use row-over-row changes);
# a shuffled split may be optimistic - confirm whether a chronological
# split is more appropriate.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the Random Forest Regressor
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Train the model
rf.fit(X_train, y_train)

# Make predictions
y_pred = rf.predict(X_test)

# Calculate the mean squared error
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

# Get feature importances
feature_importances = rf.feature_importances_

# Create a DataFrame for feature importances, most important first
feature_importances_df = pd.DataFrame({
    'Feature': X.columns,
    'Importance': feature_importances
}).sort_values(by='Importance', ascending=False)

# Plot the feature importances
plt.figure(figsize=(12, 8))
sns.barplot(x='Importance', y='Feature', data=feature_importances_df)
plt.title('Feature Importances')
plt.show()

# Model Training and Evaluation
Train multiple ML models (Random Forest, XGBoost) to predict trading success based on factors, and evaluate model performance.

In [None]:
# Model Training and Evaluation

from xgboost import XGBRegressor
from sklearn.metrics import r2_score

# Fit a gradient-boosted regressor on the existing train split and predict
# on the existing test split.
xgb = XGBRegressor(n_estimators=100, random_state=42)
xgb.fit(X_train, y_train)
y_pred_xgb = xgb.predict(X_test)

# Error and goodness-of-fit of the XGBoost predictions
mse_xgb = mean_squared_error(y_test, y_pred_xgb)
print(f"XGBoost Mean Squared Error: {mse_xgb}")

r2_xgb = r2_score(y_test, y_pred_xgb)
print(f"XGBoost R-squared Score: {r2_xgb}")

# Side-by-side comparison with the Random Forest fitted earlier
print(f"Random Forest Mean Squared Error: {mse}")
print(f"XGBoost Mean Squared Error: {mse_xgb}")

print(f"Random Forest R-squared Score: {rf.score(X_test, y_test)}")
print(f"XGBoost R-squared Score: {r2_xgb}")

# Per-feature importances reported by the fitted XGBoost model,
# tabulated and ordered from most to least important
xgb_importances = xgb.feature_importances_
xgb_importances_df = pd.DataFrame(
    {'Feature': X.columns, 'Importance': xgb_importances}
).sort_values(by='Importance', ascending=False)

# Horizontal bar chart of the XGBoost importances
plt.figure(figsize=(12, 8))
sns.barplot(data=xgb_importances_df, x='Importance', y='Feature')
plt.title('XGBoost Feature Importances')
plt.show()

# Factor Impact Visualization
Create visualizations showing the impact of key factors on trading performance using plots and charts.

In [None]:
# Factor Impact Visualization

import matplotlib.pyplot as plt
import seaborn as sns

# The ten features ranked highest by the Random Forest importances
top_factors = feature_importances_df.head(10)['Feature']

# One shared figure: each top factor is drawn as its own labelled scatter
# series against CurrentProfit.
plt.figure(figsize=(14, 10))
for factor_name in top_factors:
    sns.scatterplot(x=df[factor_name], y=df['CurrentProfit'], label=factor_name)
plt.title('Impact of Top Factors on CurrentProfit')
plt.xlabel('Factor Value')
plt.ylabel('CurrentProfit')
plt.legend()
plt.show()

# Histogram (with KDE overlay) of the target
plt.figure(figsize=(12, 6))
sns.histplot(df['CurrentProfit'], kde=True, bins=30)
plt.title('Distribution of CurrentProfit')
plt.xlabel('CurrentProfit')
plt.ylabel('Frequency')
plt.show()

# One scatter figure per engineered change feature, each against CurrentProfit
for change_col in ('PriceChange', 'VolumeChange', 'ProfitChange'):
    plt.figure(figsize=(12, 6))
    sns.scatterplot(x=df[change_col], y=df['CurrentProfit'])
    plt.title(f'{change_col} vs CurrentProfit')
    plt.xlabel(change_col)
    plt.ylabel('CurrentProfit')
    plt.show()

# Optimization Analysis
Use the model insights to suggest optimal factor thresholds and strategy improvements.

In [None]:
# Optimization Analysis

from scipy.optimize import minimize
# Imported here so this cell does not depend on imports made by other cells
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split


def objective_function(params, df, top_factors):
    """Return the hold-out MSE of a Random Forest trained on rescaled factors.

    Parameters
    ----------
    params : sequence of float
        One multiplier per entry of ``top_factors``, in the same order.
    df : pandas.DataFrame
        Must contain every column in ``top_factors`` and 'CurrentProfit'.
        It is not modified.
    top_factors : sequence of str
        Names of the feature columns to rescale and train on.

    Returns
    -------
    float
        Mean squared error on the 20% test split (random_state=42) of a
        100-tree RandomForestRegressor fitted on the other 80%.
    """
    factor_cols = list(top_factors)
    # Scale only the needed columns; this builds a new frame, so the
    # caller's df is untouched and no full-frame copy is made per evaluation.
    # Gaps are filled with 0 (the notebook's fill value) before fitting.
    X = df[factor_cols].mul(np.asarray(params), axis='columns').fillna(0)
    y = df['CurrentProfit']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)
    return mean_squared_error(y_test, model.predict(X_test))


# Get the top factors from the feature importances
top_factors = feature_importances_df.head(10)['Feature'].values

# Initial parameters (all ones)
initial_params = np.ones(len(top_factors))

# Perform the optimization (every evaluation trains a new forest, so this is slow)
# NOTE(review): tree splits depend on the ordering of feature values, so a
# positive rescaling is not expected to change the fitted forest much -
# confirm this search is the intended way to derive factor thresholds.
result = minimize(objective_function, initial_params, args=(df, top_factors), method='Nelder-Mead')
if not result.success:
    print(f"Optimizer did not converge: {result.message}")

# Get the optimized parameters
optimized_params = result.x

# Display the optimized parameters
print("Optimized Parameters:")
for factor, param in zip(top_factors, optimized_params):
    print(f"{factor}: {param}")

# Apply the optimized parameters to a new feature frame. df itself is left
# unchanged so re-running this cell does not compound the scaling and other
# cells keep seeing the original factor values.
X_optimized = df[list(top_factors)].mul(optimized_params, axis='columns').fillna(0)
y = df['CurrentProfit']

# Re-evaluate the model with optimized factors
X_train_opt, X_test_opt, y_train_opt, y_test_opt = train_test_split(X_optimized, y, test_size=0.2, random_state=42)
model_opt = RandomForestRegressor(n_estimators=100, random_state=42)
model_opt.fit(X_train_opt, y_train_opt)
y_pred_opt = model_opt.predict(X_test_opt)
mse_opt = mean_squared_error(y_test_opt, y_pred_opt)
r2_opt = r2_score(y_test_opt, y_pred_opt)

# Display the optimized model performance
print(f"Optimized Mean Squared Error: {mse_opt}")
print(f"Optimized R-squared Score: {r2_opt}")