In [4]:
!pip install pandas numpy scikit-learn matplotlib seaborn chardet




[notice] A new release of pip available: 22.3.1 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [5]:
# Trading Data Analysis and Regression
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns
from typing import Dict, List, Union

class TradingFeatureExtractor:
    """Extract and process features from trading data.

    The extractor adds a numeric trade direction and a per-ticket time delta,
    and expands pipe-delimited ``key=value`` columns into one numeric column
    per key.
    """

    @staticmethod
    def parse_factor_string(factor_str: str) -> Dict[str, float]:
        """Parse a string of format 'key1=value1|key2=value2' into a dictionary.

        Args:
            factor_str: Pipe-separated ``key=value`` pairs. Anything that is
                not a string (``None``, ``NaN``, a number) is treated as
                "no factors".

        Returns:
            Mapping of stripped key to float value. Pairs that have no ``=``
            or whose value cannot be converted to ``float`` are skipped.
        """
        if not isinstance(factor_str, str) or factor_str == '':
            return {}

        result = {}
        for pair in factor_str.split('|'):
            # partition() splits on the first '=' only, so a pair such as
            # 'a=1=2' can never fail tuple unpacking; its value '1=2' is
            # rejected by float() below and the pair is skipped.
            key, separator, value = pair.partition('=')
            if not separator:
                continue
            try:
                result[key.strip()] = float(value.strip())
            except ValueError:
                continue

        return result

    def extract_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """Extract features from the complex columns.

        Args:
            df: Input frame. Must contain ``Time``, ``Action`` and ``Ticket``
                columns; any of ``factors``, ``score``, ``efactors`` and
                ``exitScore`` that are present are expanded.

        Returns:
            A new DataFrame (the input is not modified) with the same index as
            ``df``, plus ``TradeDirection``, ``TradeDuration`` and one
            ``<column>_<key>`` column per parsed factor key. A factor column is
            replaced by its expansion only when at least one row yields a
            parsed pair; otherwise it is left as is.

        Raises:
            KeyError: If ``Time``, ``Action`` or ``Ticket`` is missing.
        """
        # Work on a copy so the caller's frame is left untouched
        processed_df = df.copy()

        # Convert Time to datetime
        processed_df['Time'] = pd.to_datetime(processed_df['Time'])

        # Encode the action as a signed direction; unmapped actions become NaN
        processed_df['TradeDirection'] = processed_df['Action'].map({
            'BUY': 1, 'SELL': -1, 'CLOSE': 0
        })

        # Seconds since the previous row of the same Ticket, taken in the
        # frame's existing row order (first row of each ticket is NaN)
        processed_df['TradeDuration'] = (
            processed_df.groupby('Ticket')['Time']
            .diff()
            .dt.total_seconds()
        )

        # Process factor columns
        factor_columns = ['factors', 'score', 'efactors', 'exitScore']

        for col in factor_columns:
            if col not in processed_df.columns:
                continue

            # Parse every cell into a {key: float} dict
            records = processed_df[col].apply(self.parse_factor_string).tolist()

            # Nothing parsed anywhere in this column: leave it as it is
            if not any(records):
                continue

            # Reuse the source index so the axis=1 concat below lines the
            # parsed values up with the rows they came from, even when the
            # input index is not a default 0..n-1 range.
            features_df = pd.DataFrame(records, index=processed_df.index)

            # Add column prefix to avoid name conflicts
            features_df.columns = [f"{col}_{c}" for c in features_df.columns]

            # Replace the raw string column with its expanded numeric columns
            processed_df = pd.concat(
                [processed_df.drop(col, axis=1), features_df],
                axis=1
            )

        return processed_df

class TradingAnalysis:
    """Analysis of trading data with ML models.

    Intended call order: ``preprocess_data()`` -> ``train_models()`` ->
    ``plot_results()``.
    """

    def __init__(self, df: pd.DataFrame):
        """Store the raw frame and initialise empty analysis state.

        Args:
            df: Raw trading data; it is passed to the feature extractor by
                ``preprocess_data``.
        """
        self.df = df
        self.feature_extractor = TradingFeatureExtractor()
        self.processed_df = None
        self.X = None
        self.y = None
        # Fitted StandardScaler from the last train_models() call
        self.scaler = None
        # Fitted estimators from the last train_models() call, keyed by name
        self.models = {}
        self.results = {}

    def preprocess_data(self, target_col: str = 'CurrentProfit'):
        """Preprocess the data and prepare for modeling.

        Args:
            target_col: Name of the column to predict.

        Returns:
            Tuple ``(X, y)``: the numeric feature frame (missing values filled
            with 0) and the target series. Rows whose target is missing are
            left out of both; ``self.processed_df`` keeps every row.

        Raises:
            KeyError: If ``target_col`` is not a column of the processed data.
        """
        # Extract features
        self.processed_df = self.feature_extractor.extract_features(self.df)

        if target_col not in self.processed_df.columns:
            raise KeyError(
                f"Target column {target_col!r} not found in processed data"
            )

        # Select numeric columns only
        numeric_cols = self.processed_df.select_dtypes(
            include=[np.number]
        ).columns

        # Remove target and any unwanted columns
        excluded = {target_col, 'Ticket', 'TradeDuration'}
        feature_cols = [col for col in numeric_cols if col not in excluded]

        # A row without a target value cannot be used for fitting or scoring,
        # so drop it from X and y together to keep them aligned.
        has_target = self.processed_df[target_col].notna()

        # Handle missing feature values
        self.X = self.processed_df.loc[has_target, feature_cols].fillna(0)
        self.y = self.processed_df.loc[has_target, target_col]

        return self.X, self.y

    def train_models(self, test_size: float = 0.2):
        """Train multiple regression models.

        Fits linear, ridge and lasso regressions on standardised features and
        evaluates each on a held-out split (fixed ``random_state=42``).

        Args:
            test_size: Fraction of rows held out for evaluation.

        Returns:
            ``self.results``: per model name, a dict with keys ``model``,
            ``predictions``, ``true_values``, ``r2_score``, ``rmse`` and
            ``feature_importance`` (absolute coefficients, sorted descending).

        Raises:
            RuntimeError: If ``preprocess_data`` has not been called yet.
        """
        if self.X is None or self.y is None:
            raise RuntimeError(
                "preprocess_data() must be called before train_models()"
            )

        # Split data
        X_train, X_test, y_train, y_test = train_test_split(
            self.X, self.y, test_size=test_size, random_state=42
        )

        # Scale features; the scaler is fitted on the training split only
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        # Initialize models
        models = {
            'linear': LinearRegression(),
            'ridge': Ridge(alpha=1.0),
            'lasso': Lasso(alpha=1.0)
        }

        feature_names = self.X.columns

        # Train and evaluate each model
        for name, model in models.items():
            model.fit(X_train_scaled, y_train)
            y_pred = model.predict(X_test_scaled)

            importance = pd.DataFrame({
                'feature': feature_names,
                'importance': np.abs(model.coef_)
            }).sort_values('importance', ascending=False)

            self.results[name] = {
                'model': model,
                'predictions': y_pred,
                'true_values': y_test,
                'r2_score': r2_score(y_test, y_pred),
                'rmse': np.sqrt(mean_squared_error(y_test, y_pred)),
                'feature_importance': importance
            }

        # Keep the fitted scaler and estimators so they can be reused on new
        # data that has the same feature columns.
        self.scaler = scaler
        self.models = models

        return self.results

    def plot_results(self):
        """Plot analysis results.

        Draws one actual-vs-predicted scatter per trained model, followed by a
        grouped bar chart comparing the models on a shared set of features
        (the 10 with the highest mean importance across models).

        Raises:
            RuntimeError: If ``train_models`` has not produced any results.
        """
        if not self.results:
            raise RuntimeError(
                "train_models() must be called before plot_results()"
            )

        # Size the grid from the number of models; squeeze=False keeps `axes`
        # two-dimensional even when there is a single row.
        n_models = len(self.results)
        n_cols = 2
        n_rows = (n_models + n_cols - 1) // n_cols
        fig, axes = plt.subplots(
            n_rows, n_cols, figsize=(15, 6 * n_rows), squeeze=False
        )
        fig.suptitle('Regression Analysis Results', fontsize=16)
        flat_axes = axes.ravel()

        # Plot actual vs predicted for each model
        for ax, (name, results) in zip(flat_axes, self.results.items()):
            actual = results['true_values']
            lowest, highest = actual.min(), actual.max()
            ax.scatter(actual, results['predictions'], alpha=0.5)
            # Diagonal reference line: a perfect prediction lies on it
            ax.plot([lowest, highest], [lowest, highest], 'r--')
            ax.set_title(f'{name.capitalize()} Regression\nR² = {results["r2_score"]:.3f}')
            ax.set_xlabel('Actual Values')
            ax.set_ylabel('Predicted Values')

        # Hide grid cells that have no model to show
        for ax in flat_axes[n_models:]:
            ax.set_visible(False)

        plt.tight_layout()
        plt.show()

        # One importance column per model, indexed by feature name, so every
        # bar is drawn above the feature it actually belongs to.
        importance = pd.DataFrame({
            name: results['feature_importance'].set_index('feature')['importance']
            for name, results in self.results.items()
        })
        top_index = importance.mean(axis=1).nlargest(10).index
        top_features = importance.loc[top_index]

        fig, ax = plt.subplots(figsize=(12, 6))
        positions = np.arange(len(top_features))
        bar_width = 0.8 / len(top_features.columns)
        for offset, name in enumerate(top_features.columns):
            ax.bar(
                positions + offset * bar_width,
                top_features[name],
                width=bar_width,
                label=name
            )

        ax.set_title('Top 10 Feature Importance by Model')
        ax.set_xlabel('Features')
        ax.set_ylabel('Importance')
        # Centre each tick under its group of bars
        ax.set_xticks(positions + bar_width * (len(top_features.columns) - 1) / 2)
        ax.set_xticklabels(top_features.index, rotation=45, ha='right')
        ax.legend()
        plt.tight_layout()
        plt.show()

# Example usage in Jupyter notebook:
'''
# Load the data
df = pd.read_csv('your_trading_data.csv')

# Initialize analysis
analysis = TradingAnalysis(df)

# Preprocess data
X, y = analysis.preprocess_data(target_col='CurrentProfit')

# Train models
results = analysis.train_models()

# Plot results
analysis.plot_results()

# Access detailed results
for model_name, model_results in results.items():
    print(f"\n{model_name.capitalize()} Regression Results:")
    print(f"R² Score: {model_results['r2_score']:.3f}")
    print(f"RMSE: {model_results['rmse']:.3f}")
    print("\nTop 5 Important Features:")
    print(model_results['feature_importance'].head())
'''

ModuleNotFoundError: No module named 'pandas'

In [None]:
# Read the raw trade log (replace the placeholder with the real CSV path)
df = pd.read_csv(filepath_or_buffer='path_to_your_csv')

# Wrap the raw frame in the analysis helper
analysis = TradingAnalysis(df=df)

# Build the feature matrix and the target vector
X, y = analysis.preprocess_data(target_col='CurrentProfit')

# Fit the regression models on a train/test split
results = analysis.train_models(test_size=0.2)

# Draw the diagnostic plots
analysis.plot_results()