## Importing Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc, roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_classif

# Set random seed for reproducibility
np.random.seed(42)

## Visualization setup and saving

In [None]:
# Visualization Setup
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette("viridis")

def setup_figure(figsize=(10, 6)):
    """Set up a figure with the specified size"""
    return plt.figure(figsize=figsize)

def save_figure(filename, dpi=300, bbox_inches='tight'):
    """Save the current figure"""
    plt.savefig(filename, dpi=dpi, bbox_inches=bbox_inches)
    plt.close()

## Dataset loading

In [None]:
# Data Loading
def load_data(file_path="Data set/Stock Prices Data Set.csv"):
    """Load the stock price dataset and perform initial checks"""
    try:
        # Try to load the data
        df = pd.read_csv(file_path)
        print(f"Successfully loaded dataset with shape: {df.shape}")
        print(f"\nColumns: {df.columns.tolist()}")
        print(f"\nFirst 5 rows of the dataset:")
        print(df.head())
        return df
    except FileNotFoundError:
        print(f"Error: File '{file_path}' not found.")
        return None
    except Exception as e:
        print(f"An error occurred while loading the data: {e}")
        return None


## Data Exploration

In [4]:
# Data Exploration
def explore_data(df):
    """Perform comprehensive data exploration on the stock dataset"""
    if df is None:
        return
    
    print("\n=== Data Types ===")
    print(df.dtypes)
    
    print("\n=== DataFrame Information ===")
    print(df.info())
    
    print("\n=== Descriptive Statistics ===")
    print(df.describe())
    
    print("\n=== Missing Values ===")
    missing = df.isnull().sum()
    print(missing)
    
    # Check for duplicate rows
    print("\n=== Duplicate Rows ===")
    duplicates = df.duplicated().sum()
    print(f"Number of duplicate rows: {duplicates}")
    
    # Visualize stock price distributions
    numeric_columns = ['open', 'high', 'low', 'close', 'volume']
    if all(col in df.columns for col in numeric_columns):
        setup_figure(figsize=(14, 10))
        for i, column in enumerate(numeric_columns):
            plt.subplot(2, 3, i+1)
            sns.histplot(df[column], bins=30, kde=True)
            plt.title(f'Distribution of {column.capitalize()}')
        plt.tight_layout()
        save_figure("stock_price_distributions.png")
    
    # Check distribution of stock symbols
    if 'symbol' in df.columns:
        print("\n=== Stock Symbol Distribution ===")
        symbol_counts = df['symbol'].value_counts().head(10)
        print(symbol_counts)
        
        # Plot symbol distribution (top 10)
        setup_figure(figsize=(12, 6))
        symbol_counts.plot(kind='bar', color='skyblue')
        plt.title('Distribution of Top 10 Stock Symbols', fontsize=14)
        plt.xlabel('Stock Symbol', fontsize=12)
        plt.ylabel('Count', fontsize=12)
        plt.xticks(rotation=45)
        plt.tight_layout()
        save_figure("symbol_distribution.png")
    
    # Create correlation matrix
    corr_columns = [col for col in df.columns if df[col].dtype in ['int64', 'float64']]
    if len(corr_columns) > 1:
        setup_figure(figsize=(10, 8))
        correlation = df[corr_columns].corr()
        mask = np.triu(correlation)
        sns.heatmap(correlation, annot=True, fmt=".2f", cmap='coolwarm', mask=mask)
        plt.title('Feature Correlation Heatmap', fontsize=14)
        plt.tight_layout()
        save_figure("correlation_heatmap.png")


## Data preprocessing

In [5]:
# Data Preprocessing
def preprocess_data(df):
    """Preprocess and clean the stock data for modeling"""
    if df is None:
        return None, None, None, None
    
    print("\n=== Data Preprocessing ===")
    
    # Make a copy of the dataframe to avoid modifying the original
    df_processed = df.copy()
    
    # Create a binary classification task: price increased or not
    print("Creating binary target variable: price_increased")
    df_processed['price_increased'] = (df_processed['close'] > df_processed['open']).astype(int)
    
    # Check class distribution
    print(f"\nClass distribution:\n{df_processed['price_increased'].value_counts()}")
    print(f"Class distribution (%):\n{df_processed['price_increased'].value_counts(normalize=True) * 100}")
    
    # Visualize class distribution
    setup_figure(figsize=(8, 6))
    ax = sns.countplot(x='price_increased', data=df_processed, palette=['salmon', 'lightgreen'])
    plt.title('Distribution of Price Direction', fontsize=14)
    plt.xlabel('Price Increased (1) vs. Not Increased (0)', fontsize=12)
    plt.ylabel('Count', fontsize=12)
    
    # Add count labels on top of bars
    for p in ax.patches:
        ax.annotate(f'{p.get_height()}', 
                   (p.get_x() + p.get_width()/2., p.get_height()), 
                   ha='center', va='center', 
                   xytext=(0, 10), 
                   textcoords='offset points')
    
    save_figure("price_direction_distribution.png")
    
    # Check for missing values and handle them
    if df_processed.isnull().sum().sum() > 0:
        print("Handling missing values...")
        # For numeric columns, fill with median
        numeric_cols = df_processed.select_dtypes(include=['float64', 'int64']).columns
        for col in numeric_cols:
            if df_processed[col].isnull().sum() > 0:
                df_processed[col].fillna(df_processed[col].median(), inplace=True)
        
        # For categorical columns, fill with mode
        cat_cols = df_processed.select_dtypes(include=['object']).columns
        for col in cat_cols:
            if df_processed[col].isnull().sum() > 0:
                df_processed[col].fillna(df_processed[col].mode()[0], inplace=True)
    
    # Handle duplicates
    if df_processed.duplicated().sum() > 0:
        print(f"Removing {df_processed.duplicated().sum()} duplicate rows...")
        df_processed = df_processed.drop_duplicates()
    
    # Split features and target
    y = df_processed['price_increased']
    
    # Define features to be used initially
    # We'll drop the target, date, symbol, and the raw price columns that would cause data leakage
    drop_columns = ['price_increased', 'open', 'high', 'low', 'close', 'adjusted_close', 'date', 'symbol']
    X = df_processed.drop(columns=[col for col in drop_columns if col in df_processed.columns])
    
    print(f"Feature matrix shape after initial selection: {X.shape}")
    
    # Split into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=42, stratify=y
    )
    
    print(f"Training set shape: {X_train.shape}, Test set shape: {X_test.shape}")
    
    return X_train, X_test, y_train, y_test


## Feature Engineering

In [6]:
# Feature Engineering
def engineer_features(df):
    """Perform feature engineering on the stock dataset"""
    print("\n=== Feature Engineering ===")
    
    if df is None:
        return None
    
    # Make a copy to avoid modifying the original
    df_eng = df.copy()
    
    # 1. Calculate daily price change percentage
    if all(col in df_eng.columns for col in ['close', 'open']):
        print("Creating price change percentage feature...")
        df_eng['price_change_pct'] = ((df_eng['close'] - df_eng['open']) / df_eng['open']) * 100
    
    # 2. Calculate high-low range as percentage of opening price
    if all(col in df_eng.columns for col in ['high', 'low', 'open']):
        print("Creating high-low range percentage feature...")
        df_eng['hl_range_pct'] = ((df_eng['high'] - df_eng['low']) / df_eng['open']) * 100
    
    # 3. Calculate volume relative to the stock's average volume
    if all(col in df_eng.columns for col in ['volume', 'symbol']):
        print("Creating relative volume feature...")
        # First, group by symbol to get each stock's average volume
        avg_volumes = df_eng.groupby('symbol')['volume'].mean()
        df_eng['rel_volume'] = df_eng.apply(lambda row: row['volume'] / avg_volumes[row['symbol']], axis=1)
    
    # 4. Calculate moving averages (if date information is available)
    if 'date' in df_eng.columns:
        try:
            # Convert date to datetime if it's not already
            if df_eng['date'].dtype == 'object':
                df_eng['date'] = pd.to_datetime(df_eng['date'])
            
            # Sort by symbol and date
            df_eng = df_eng.sort_values(['symbol', 'date'])
            
            # Group by symbol and calculate moving averages
            print("Creating moving average features...")
            for symbol in df_eng['symbol'].unique():
                mask = df_eng['symbol'] == symbol
                # 5-day moving average of closing price
                df_eng.loc[mask, 'ma5_close'] = df_eng.loc[mask, 'close'].rolling(window=5).mean()
                # 10-day moving average of closing price
                df_eng.loc[mask, 'ma10_close'] = df_eng.loc[mask, 'close'].rolling(window=10).mean()
                # 5-day moving average of volume
                df_eng.loc[mask, 'ma5_volume'] = df_eng.loc[mask, 'volume'].rolling(window=5).mean()
        except Exception as e:
            print(f"Error creating moving averages: {e}")
    
    # 5. Calculate price volatility (rolling standard deviation)
    if 'date' in df_eng.columns and 'close' in df_eng.columns:
        try:
            print("Creating volatility features...")
            for symbol in df_eng['symbol'].unique():
                mask = df_eng['symbol'] == symbol
                # 5-day volatility
                df_eng.loc[mask, 'volatility_5d'] = df_eng.loc[mask, 'close'].rolling(window=5).std()
                # 10-day volatility
                df_eng.loc[mask, 'volatility_10d'] = df_eng.loc[mask, 'close'].rolling(window=10).std()
        except Exception as e:
            print(f"Error creating volatility features: {e}")
    
    # 6. Create normalized features (normalize each numeric feature by its mean)
    print("Creating normalized features...")
    numeric_cols = df_eng.select_dtypes(include=['float64', 'int64']).columns
    for col in numeric_cols:
        if col not in ['price_increased']:  # Don't normalize the target
            mean_val = df_eng[col].mean()
            df_eng[f'norm_{col}'] = df_eng[col] / mean_val
    
    # Drop rows with missing values after feature engineering
    missing_count = df_eng.isnull().sum().sum()
    if missing_count > 0:
        print(f"Dropping {missing_count} rows with missing values after feature engineering...")
        df_eng = df_eng.dropna()
    
    print(f"Original feature count: {df.shape[1]}")
    print(f"Engineered feature count: {df_eng.shape[1]}")
    
    # Print new features
    new_features = set(df_eng.columns) - set(df.columns)
    print(f"New features created: {new_features}")
    
    return df_eng


## Feature Selection

In [7]:
# Feature Selection
def select_features(X_train, X_test, y_train, k=None):
    """Select the most important features using SelectKBest"""
    print("\n=== Feature Selection ===")
    
    # If k is not specified, use half of the features or 10, whichever is smaller
    if k is None:
        k = min(10, X_train.shape[1] // 2)
    
    # Select top k features
    selector = SelectKBest(f_classif, k=k)
    X_train_selected = selector.fit_transform(X_train, y_train)
    X_test_selected = selector.transform(X_test)
    
    # Get selected feature names
    selected_indices = selector.get_support(indices=True)
    selected_features = X_train.columns[selected_indices]
    print(f"Selected {k} features: {selected_features.tolist()}")
    
    # Create dataframes with selected features
    X_train_selected_df = pd.DataFrame(X_train_selected, columns=selected_features)
    X_test_selected_df = pd.DataFrame(X_test_selected, columns=selected_features)
    
    # Plot feature importance scores
    setup_figure(figsize=(12, 6))
    feature_scores = pd.Series(selector.scores_, index=X_train.columns)
    feature_scores = feature_scores.sort_values(ascending=False)
    sns.barplot(x=feature_scores.values, y=feature_scores.index)
    plt.title('Feature Importance Scores (ANOVA F-value)', fontsize=14)
    plt.xlabel('F-value', fontsize=12)
    plt.tight_layout()
    save_figure("feature_importance_scores.png")
    
    return X_train_selected_df, X_test_selected_df, selected_features


## Model Training
Using 3 models, SVM(Linear kernel), SVM(RBF kernel), SVM(Polynomial kernel). Uncomment the polynomial kernel if needed.

In [16]:
# Model Training
def train_svm_models(X_train, y_train, X_test, y_test):
    """Train different SVM models and return the results"""
    print("\n=== Training SVM Models ===")
    
    # Standardize features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    # Define models with different kernels
    models = {
        'SVM (Linear Kernel)': SVC(kernel='linear', C=1.0, random_state=42, probability=True),
        'SVM (RBF Kernel)': SVC(kernel='rbf', C=1.0, gamma='scale', random_state=42, probability=True),
        # 'SVM (Polynomial Kernel)': SVC(kernel='poly', degree=3, C=1.0, random_state=42, probability=True)
    }
    
    # Dictionary to store results
    results = {}
    
    # Train and evaluate each model
    for name, model in models.items():
        print(f"\nTraining {name}...")
        
        # Train the model
        model.fit(X_train_scaled, y_train)
        
        # Make predictions
        y_pred = model.predict(X_test_scaled)
        y_prob = model.predict_proba(X_test_scaled)[:, 1]
        
        # Calculate metrics
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        try:
            auc_score = roc_auc_score(y_test, y_prob)
        except:
            auc_score = 0.5  # Default value if AUC calculation fails
        
        print(f"Performance of {name}:")
        print(f"  Accuracy: {accuracy:.4f}")
        print(f"  Precision: {precision:.4f}")
        print(f"  Recall: {recall:.4f}")
        print(f"  F1 Score: {f1:.4f}")
        print(f"  AUC: {auc_score:.4f}")
        
        # Store results
        results[name] = {
            'model': model,
            'scaler': scaler,
            'y_pred': y_pred,
            'y_prob': y_prob,
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1': f1,
            'auc': auc_score
        }
    
    return results


## Model Optimization


In [17]:
# Model Optimization
def optimize_svm_hyperparameters(X_train, y_train, kernel='rbf'):
    """Find optimal hyperparameters for SVM using grid search"""
    print(f"\n=== Optimizing SVM Hyperparameters for {kernel} kernel ===")
    
    # Define the pipeline with scaling
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('svm', SVC(kernel=kernel, probability=True, random_state=42))
    ])
    
    # Define parameter grid based on kernel
    if kernel == 'linear':
        param_grid = {
            'svm__C': [0.1, 1, 10, 100]
        }
    elif kernel == 'rbf':
        param_grid = {
            'svm__C': [0.1, 1, 10, 100],
            'svm__gamma': ['scale', 'auto', 0.1, 0.01]
        }
    elif kernel == 'poly':
        param_grid = {
            'svm__C': [0.1, 1, 10],
            'svm__degree': [2, 3, 4],
            'svm__gamma': ['scale', 'auto', 0.1]
        }
    else:
        print(f"Unsupported kernel: {kernel}")
        return None
    
    # Create and run grid search
    grid_search = GridSearchCV(
        pipeline, param_grid, cv=2, scoring='accuracy', verbose=1, n_jobs=-1
    )
    
    grid_search.fit(X_train, y_train)
    
    # Print results
    print(f"Best parameters: {grid_search.best_params_}")
    print(f"Best cross-validation score: {grid_search.best_score_:.4f}")
    
    # Get the best estimator
    best_model = grid_search.best_estimator_
    
    # Store CV results in a DataFrame for visualization
    cv_results = pd.DataFrame(grid_search.cv_results_)
    
    # Plot CV results
    if kernel == 'linear':
        setup_figure(figsize=(10, 6))
        plt.plot(cv_results['param_svm__C'], cv_results['mean_test_score'], marker='o')
        plt.xscale('log')
        plt.title(f'Grid Search Results for {kernel} SVM', fontsize=14)
        plt.xlabel('C parameter', fontsize=12)
        plt.ylabel('Mean CV Accuracy', fontsize=12)
        plt.grid(True, alpha=0.3)
        save_figure(f"grid_search_{kernel}_svm.png")
    elif kernel == 'rbf':
        setup_figure(figsize=(12, 8))
        # Group by gamma
        gamma_values = cv_results['param_svm__gamma'].unique()
        for gamma in gamma_values:
            subset = cv_results[cv_results['param_svm__gamma'] == gamma]
            plt.plot(subset['param_svm__C'], subset['mean_test_score'], 
                     marker='o', label=f'gamma = {gamma}')
        plt.xscale('log')
        plt.title(f'Grid Search Results for {kernel} SVM', fontsize=14)
        plt.xlabel('C parameter', fontsize=12)
        plt.ylabel('Mean CV Accuracy', fontsize=12)
        plt.legend()
        plt.grid(True, alpha=0.3)
        save_figure(f"grid_search_{kernel}_svm.png")
    
    return best_model


## Cross-Validation


In [18]:
# Cross-Validation
def perform_cross_validation(X, y, model):
    """Perform cross-validation to evaluate model performance"""
    print("\n=== Cross-Validation Evaluation ===")
    
    # Define the pipeline with scaling
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('model', model)
    ])
    
    # Perform 2-fold cross-validation
    cv_scores = cross_val_score(pipeline, X, y, cv=2, scoring='accuracy')
    
    # Print results
    print(f"Cross-validation accuracy scores: {cv_scores}")
    print(f"Mean CV accuracy: {cv_scores.mean():.4f}")
    print(f"Standard deviation of CV accuracy: {cv_scores.std():.4f}")
    
    return cv_scores


## Evaluate Models


In [19]:
# Evaluate Models
def evaluate_model(model_results, X_test, y_test, model_name):
    """Evaluate model performance using various metrics"""
    print(f"\n=== Evaluating {model_name} ===")
    
    # Extract model and predictions
    y_pred = model_results['y_pred']
    y_prob = model_results['y_prob']
    
    # Calculate metrics
    accuracy = model_results['accuracy']
    precision = model_results['precision']
    recall = model_results['recall']
    f1 = model_results['f1']
    auc_score = model_results['auc']
    
    # Print metrics
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"AUC: {auc_score:.4f}")
    
    # Classification report
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred, target_names=['Price Decreased/Same', 'Price Increased']))
    
    # Confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    setup_figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=['Price Decreased/Same', 'Price Increased'],
                yticklabels=['Price Decreased/Same', 'Price Increased'])
    plt.xlabel('Predicted', fontsize=12)
    plt.ylabel('Actual', fontsize=12)
    plt.title(f'Confusion Matrix - {model_name}', fontsize=14)
    save_figure(f"confusion_matrix_{model_name.lower().replace(' ', '_')}.png")
    
    # ROC curve
    fpr, tpr, _ = roc_curve(y_test, y_prob)
    roc_auc = auc(fpr, tpr)
    
    setup_figure(figsize=(8, 6))
    plt.plot(fpr, tpr, lw=2, label=f'ROC curve (AUC = {roc_auc:.4f})')
    plt.plot([0, 1], [0, 1], 'k--', lw=2)
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate', fontsize=12)
    plt.ylabel('True Positive Rate', fontsize=12)
    plt.title(f'ROC Curve - {model_name}', fontsize=14)
    plt.legend(loc='lower right')
    plt.grid(True, alpha=0.3)
    save_figure(f"roc_curve_{model_name.lower().replace(' ', '_')}.png")
    
    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'auc': auc_score
    }


## # Compare Models

In [20]:
# Compare Models
def compare_models(results):
    """Compare the performance of different SVM models"""
    print("\n=== Model Comparison ===")
    
    # Create a comparison table
    comparison_data = {
        name: {
            'Accuracy': results[name]['accuracy'],
            'Precision': results[name]['precision'],
            'Recall': results[name]['recall'],
            'F1 Score': results[name]['f1'],
            'AUC': results[name]['auc']
        }
        for name in results.keys()
    }
    
    comparison_df = pd.DataFrame(comparison_data)
    
    # Print the comparison table
    print(comparison_df)
    
    # Plot comparison
    setup_figure(figsize=(12, 8))
    comparison_df.plot(kind='bar', figsize=(12, 8))
    plt.title('SVM Model Performance Comparison', fontsize=14)
    plt.ylabel('Score', fontsize=12)
    plt.xlabel('Metric', fontsize=12)
    plt.ylim(0, 1.1)
    plt.grid(axis='y', alpha=0.3)
    plt.legend(title='Model')
    plt.tight_layout()
    save_figure("model_comparison.png")
    
    # Find the best model based on accuracy
    best_model_name = max(results.keys(), key=lambda k: results[k]['accuracy'])
    print(f"\nBest model based on accuracy: {best_model_name}")
    print(f"Accuracy: {results[best_model_name]['accuracy']:.4f}")
    
    return best_model_name, results[best_model_name]


## Model Visualization


In [21]:
# Model Visualization
def visualize_decision_boundary(model_results, X_test, y_test, feature_names, title):
    """Visualize decision boundary for SVM model"""
    print(f"\n=== Visualizing Decision Boundary for {title} ===")
    
    # Extract model and scaler
    model = model_results['model']
    scaler = model_results['scaler']
    
    # We can only visualize 2D decision boundaries
    # Let's select the two most important features
    if len(feature_names) > 2:
        print("Selecting the first two features for visualization...")
        feature_indices = [0, 1]
        feature_names_selected = feature_names[:2]
    else:
        feature_indices = range(len(feature_names))
        feature_names_selected = feature_names
    
    # Extract the selected features
    X_visual = X_test.iloc[:, feature_indices].values
    
    # Scale the features
    X_visual_scaled = scaler.transform(X_test)[:, feature_indices]
    
    # Create a mesh grid
    h = 0.02  # step size in the mesh
    x_min, x_max = X_visual_scaled[:, 0].min() - 1, X_visual_scaled[:, 0].max() + 1
    y_min, y_max = X_visual_scaled[:, 1].min() - 1, X_visual_scaled[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
    
    # Create full feature vector with zeros for non-visualized features
    if model.n_features_in_ > 2:
        mesh_points = np.zeros((xx.ravel().shape[0], model.n_features_in_))
        mesh_points[:, feature_indices] = np.c_[xx.ravel(), yy.ravel()]
    else:
        mesh_points = np.c_[xx.ravel(), yy.ravel()]
    
    # Predict labels for mesh grid points
    Z = model.predict(mesh_points)
    Z = Z.reshape(xx.shape)
    
    # Plot the decision boundary and training points
    setup_figure(figsize=(10, 8))
    plt.contourf(xx, yy, Z, alpha=0.8, cmap=plt.cm.RdBu)
    
    # Plot the training points
    scatter = plt.scatter(X_visual_scaled[:, 0], X_visual_scaled[:, 1], 
                c=y_test, edgecolors='k', cmap=plt.cm.RdBu)
    
    plt.xlabel(feature_names_selected[0], fontsize=12)
    plt.ylabel(feature_names_selected[1], fontsize=12)
    plt.title(f'Decision Boundary - {title}', fontsize=14)
    plt.legend(*scatter.legend_elements(), title='Price Direction')
    plt.tight_layout()
    save_figure(f"decision_boundary_{title.lower().replace(' ', '_')}.png")


## Save Model


In [22]:
# Save Model
def save_model(model, scaler, selected_features, filename='svm_stock_model.pkl'):
    """Save the trained model to a file"""
    import pickle
    
    # Create a dictionary with all components needed for prediction
    model_package = {
        'model': model,
        'scaler': scaler,
        'selected_features': selected_features
    }
    
    # Save to file
    try:
        with open(filename, 'wb') as f:
            pickle.dump(model_package, f)
        print(f"\nModel successfully saved to {filename}")
        return True
    except Exception as e:
        print(f"Error saving model: {e}")
        return False


## Main Function.

In [None]:
def main():
    """Main function to run the stock price SVM workflow"""
    print("=== SVM Classification for Stock Price Direction Prediction ===")
    
    # 1. Load data
    df = load_data("Data set/Stock Prices Data Set.csv")
    if df is None:
        print("Error loading data. Exiting...")
        return
    
    # 2. Explore data
    explore_data(df
                 )
    
    # 3. Engineer features
    df_eng = engineer_features(df)
    if df_eng is None:
        print("Error engineering features. Exiting...")
        return
    
    # 4. Preprocess data
    X_train, X_test, y_train, y_test = preprocess_data(df_eng)
    if X_train is None:
        print("Error preprocessing data. Exiting...")
        return
    
    # 5. Select important features
    X_train_selected, X_test_selected, selected_features = select_features(
        X_train, X_test, y_train, k=None
    )
    if X_train_selected is None:
        print("Error selecting features. Exiting...")
        return
    
    # 6. Train different SVM models
    model_results = train_svm_models(X_train_selected, y_train, X_test_selected, y_test)
    
    # 7. Compare all models
    best_model_name, best_model_results = compare_models(model_results)
    
    # 8. Optimize hyperparameters for the best kernel type
    best_kernel = best_model_name.split('(')[1].split(' ')[0].lower()
    optimized_model = optimize_svm_hyperparameters(X_train_selected, y_train, kernel=best_kernel)
    
    # 9. Perform cross-validation on the optimized model
    cv_scores = perform_cross_validation(X_train_selected, y_train, optimized_model.named_steps['svm'])
    
    # 10. Evaluate the optimized model
    # Predict with optimized model
    X_test_scaled = optimized_model.named_steps['scaler'].transform(X_test_selected)
    y_pred = optimized_model.named_steps['svm'].predict(X_test_scaled)
    y_prob = optimized_model.named_steps['svm'].predict_proba(X_test_scaled)[:, 1]
    
    # Create results dictionary for the optimized model
    optimized_results = {
        'model': optimized_model.named_steps['svm'],
        'scaler': optimized_model.named_steps['scaler'],
        'y_pred': y_pred,
        'y_prob': y_prob,
        'accuracy': accuracy_score(y_test, y_pred),
        'precision': precision_score(y_test, y_pred),
        'recall': recall_score(y_test, y_pred),
        'f1': f1_score(y_test, y_pred),
        'auc': roc_auc_score(y_test, y_prob)
    }
    
    optimized_metrics = evaluate_model(
        optimized_results, X_test_selected, y_test, 
        model_name=f"Optimized {best_kernel.upper()} SVM"
    )
    
    # 11. Visualize decision boundary for the best model
    visualize_decision_boundary(
        optimized_results, X_test_selected, y_test, 
        selected_features, f"Optimized {best_kernel.upper()} SVM"
    )
    
    # 12. Save the optimized model
    save_model(
        optimized_model.named_steps['svm'], 
        optimized_model.named_steps['scaler'], 
        selected_features, 
        filename=f'optimized_{best_kernel}_svm_stock_model.pkl'
    )
    
    # Summary
    print("\n=== Summary ===")
    print(f"1. Dataset shape: {df.shape}")
    print(f"2. Number of engineered features: {df_eng.shape[1] - df.shape[1]}")
    print(f"3. Number of selected features: {len(selected_features)}")
    print(f"4. Best basic model: {best_model_name}")
    print(f"5. Best model accuracy: {best_model_results['accuracy']:.4f}")
    print(f"6. Optimized model accuracy: {optimized_metrics['accuracy']:.4f}")
    print(f"7. Cross-validation accuracy: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")
    
    print("\nConclusion:")
    if optimized_metrics['accuracy'] >= best_model_results['accuracy']:
        improvement = optimized_metrics['accuracy'] - best_model_results['accuracy']
        print(f"Hyperparameter optimization improved model accuracy by {improvement:.4f}!")
    else:
        diff = best_model_results['accuracy'] - optimized_metrics['accuracy']
        print(f"The basic model outperformed the optimized model by {diff:.4f}.")
        print("This suggests possible overfitting during optimization or that the basic model was already well-tuned.")

if __name__ == "__main__":
    main()

=== SVM Classification for Stock Price Direction Prediction ===
Successfully loaded dataset with shape: (497472, 7)

Columns: ['symbol', 'date', 'open', 'high', 'low', 'close', 'volume']

First 5 rows of the dataset:
  symbol        date      open      high       low     close    volume
0    AAL  2014-01-02   25.0700   25.8200   25.0600   25.3600   8998943
1   AAPL  2014-01-02   79.3828   79.5756   78.8601   79.0185  58791957
2    AAP  2014-01-02  110.3600  111.8800  109.2900  109.7400    542711
3   ABBV  2014-01-02   52.1200   52.3300   51.5200   51.9800   4569061
4    ABC  2014-01-02   70.1100   70.2300   69.4800   69.8900   1148391

=== Data Types ===
symbol     object
date       object
open      float64
high      float64
low       float64
close     float64
volume      int64
dtype: object

=== DataFrame Information ===
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 497472 entries, 0 to 497471
Data columns (total 7 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  -----


Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  ax = sns.countplot(x='price_increased', data=df_processed, palette=['salmon', 'lightgreen'])


Feature matrix shape after initial selection: (492920, 22)
Training set shape: (369690, 22), Test set shape: (123230, 22)

=== Feature Selection ===
Selected 10 features: ['volume', 'price_change_pct', 'hl_range_pct', 'rel_volume', 'ma5_volume', 'norm_open', 'norm_volume', 'norm_price_change_pct', 'norm_rel_volume', 'norm_ma5_volume']

=== Training SVM Models ===

Training SVM (Linear Kernel)...
Performance of SVM (Linear Kernel):
  Accuracy: 0.9990
  Precision: 0.9998
  Recall: 0.9982
  F1 Score: 0.9990
  AUC: 1.0000

Training SVM (RBF Kernel)...
Performance of SVM (RBF Kernel):
  Accuracy: 0.9990
  Precision: 0.9994
  Recall: 0.9987
  F1 Score: 0.9991
  AUC: 1.0000

=== Model Comparison ===
           SVM (Linear Kernel)  SVM (RBF Kernel)
Accuracy              0.998978          0.999042
Precision             0.999842          0.999448
Recall                0.998172          0.998692
F1 Score              0.999006          0.999070
AUC                   0.999998          0.999994

Bes