In [3]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error

# Load the dataset
df = pd.read_csv('C:/Users/hp/Desktop/small projects/stock market using Linear/data/sp500.csv')

# Data preparation (as in your original code)
df['Date'] = pd.to_datetime(df['Date'])
df['Tommorow'] = df['Close'].shift(-1)
df.dropna(inplace=True)

# Define Target and Features
y = df['Tommorow']
X = df.drop(['Tommorow', 'Date', 'Dividends', 'Stock Splits'], axis=1)

# Define the Scikit-learn Pipeline
# This encapsulates the StandardScaler and the RandomForestRegressor.
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('rf', RandomForestRegressor(random_state=42))
])

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the entire pipeline on the training data
# This automatically fits the scaler on X_train and then trains the RF model.
pipeline.fit(X_train, y_train)

# Predict using the fitted pipeline on the test data
# This automatically transforms the test data and then makes predictions.
y_pred_rf = pipeline.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred_rf)
rmse=mse ** 0.5
r2 = pipeline.score(X_test, y_test)
print(f"Mean Squared Error: {mse}, Root Mean Squared Error: {rmse}, R^2 Score: {r2}")


  df['Date'] = pd.to_datetime(df['Date'])


Mean Squared Error: 294.08457495230584, Root Mean Squared Error: 17.148894277833364, R^2 Score: 0.9997947324731867


In [4]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import TimeSeriesSplit
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
import numpy as np

# ------------------------------------------------
# 0. Load Data (Placeholder for a working example)
# Assume 'df' is your loaded DataFrame with 'Close', 'Open', etc. data.
# e.g., df = pd.read_csv('your_stock_data.csv') 
# If you don't have a file, you must load data first for this code to run.
# ------------------------------------------------

# 1. Feature Engineering (Outside the sklearn pipeline)
df['Return'] = df['Close'].pct_change()

# Note: The second 'target' assignment overwrites the first in your original code.
# The code below uses the Moving Average Crossover as the target.
# df['target'] = (df['Return'].shift(-1) > 0.003).astype(int) 
df['MA5'] = df['Close'].rolling(5).mean()
df['MA10'] = df['Close'].rolling(10).mean()
df['target'] = (df['MA5'].shift(-1) > df['MA10'].shift(-1)).astype(int) # This is your target variable y
df['MA_Crossover'] = (df['MA5'] > df['MA10']).astype(int) # This is a feature X
df['Prev_Close'] = df['Close'].shift(1)

# Drop missing values created by rolling means and shifting
df = df.dropna()

# Define Target and Features DataFrames
y = df['target']
X = df[['Open', 'High', 'Low', 'Close', 'Volume',
        'Return',  'MA5', 'MA10',
        'MA_Crossover', 'Prev_Close']]

# 2. Define the Scikit-learn Pipeline
# This encapsulates the StandardScaler and the XGBoost model.
# The 'scaler' component will automatically use fit_transform correctly within the CV loop.
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('xgb', XGBClassifier(
        n_estimators=500,
        learning_rate=0.03,
        max_depth=3,
        subsample=0.8,
        colsample_bytree=0.7,
        gamma=0.1,
        reg_alpha=2,
        reg_lambda=4,
        objective='binary:logistic',
        random_state=42,
        use_label_encoder=False, # Suppress XGBoost deprecation warning
        eval_metric='logloss'    # Use a standard evaluation metric
    ))
])

# 3. Time Series Cross-Validation Loop
tscv = TimeSeriesSplit(n_splits=5)
fold = 1

# Iterate through the time series splits
for train_index, test_index in tscv.split(X):
    # Select data using .iloc to handle pandas DataFrames/Series correctly
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Fit the ENTIRE pipeline on the training data for the current fold
    # This automatically fits the scaler and then fits the XGBoost model
    pipeline.fit(X_train, y_train)
    
    # Predict using the fitted pipeline
    # This automatically transforms the test data using the *training set's* scaling parameters
    preds = pipeline.predict(X_test)

    print(f"Fold {fold} Accuracy:", accuracy_score(y_test, preds))
    fold += 1



Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Fold 1 Accuracy: 0.902855748108372


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Fold 2 Accuracy: 0.9089577739809617


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Fold 3 Accuracy: 0.891628020502807


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Fold 4 Accuracy: 0.8818647791066634


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Fold 5 Accuracy: 0.8948010739565536
