In [1]:
import pandas as pd
import numpy as np

import xgboost as xgb

from sklearn.preprocessing import MinMaxScaler
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn.datasets import make_blobs
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

In [2]:
# Build a synthetic OHLCV table for demonstration purposes
np.random.seed(42)  # fixed seed so the draws below are reproducible

rows = 10000
dates = pd.date_range('20230101', periods=rows)

# The random draws must stay in this exact order to reproduce the same numbers.
open_prices = np.random.uniform(100, 200, size=rows)
close_prices = open_prices + np.random.uniform(-10, 10, size=rows)
# High/Low are pushed 0-5 units beyond the open/close envelope.
high_prices = np.maximum(open_prices, close_prices) + np.random.uniform(0, 5, size=rows)
low_prices = np.minimum(open_prices, close_prices) - np.random.uniform(0, 5, size=rows)
volumes = np.random.randint(5000, 20000, size=rows)

# Label each day by comparing its close with the previous day's close:
# 1 when the close rose, 0 otherwise. The first day has no predecessor and gets 0.
price_changes = close_prices[1:] - close_prices[:-1]
targets = np.concatenate(([0], (price_changes > 0).astype(int)))

column_data = {
    'Date': dates,
    'Open': open_prices,
    'High': high_prices,
    'Low': low_prices,
    'Close': close_prices,
    'Volume': volumes,
    'Target': targets,
}
df = pd.DataFrame(column_data)

df.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Target
0,2023-01-01,137.454012,141.104003,131.736105,134.926828,12764,0
1,2023-01-02,195.071431,195.993991,189.43321,191.729673,19912,1
2,2023-01-03,173.199394,174.932593,161.89998,166.722472,6075,0
3,2023-01-04,159.865848,165.327585,158.770956,162.011182,10122,0
4,2023-01-05,115.601864,118.012311,112.195065,115.134347,19412,0


In [3]:
def train_and_evaluate(df):
    """
    Train an XGBoost classifier on the given DataFrame and print its classification report.

    Every column except 'Date' and 'Target' is used as a feature; 'Target' is the label.
    The data is split 80/20 into train and test sets with random_state=42.
    The caller's DataFrame is never modified: infinite values are clamped on a
    copy of the feature matrix only.

    Parameters:
    df (DataFrame): A pandas DataFrame with the feature columns and a 'Target' column.

    Returns:
    None: Prints the classification report of the model.
    """
    # Define features and target variable
    feature_columns = df.columns.difference(['Date', 'Target'])
    # Clamp +/-inf to large finite values; .replace returns a new frame, so the
    # input df stays untouched.
    X = df[feature_columns].replace({np.inf: 1e10, -np.inf: -1e10})
    y = df['Target']

    # Split the dataset
    # NOTE(review): train_test_split shuffles rows by default, so for date-ordered
    # data the test rows are interleaved in time with the training rows — confirm
    # this is intended.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Initialize and train XGBoost classifier
    # NOTE(review): use_label_encoder is reported as deprecated by recent XGBoost
    # releases — check the installed version before removing it.
    model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')
    model.fit(X_train, y_train)

    # Make predictions
    predictions = model.predict(X_test)

    # Evaluate the model
    print(classification_report(y_test, predictions))

In [4]:
# Baseline run: at this point df holds only Date, Open, High, Low, Close, Volume and Target.
train_and_evaluate(df)

              precision    recall  f1-score   support

           0       0.69      0.73      0.71       946
           1       0.74      0.71      0.72      1054

    accuracy                           0.72      2000
   macro avg       0.72      0.72      0.72      2000
weighted avg       0.72      0.72      0.72      2000



Normalization (Min-Max scaling)

In [5]:
def standarlization(df, columns_to_scale, feature_range=(0, 1)):
    """
    Scale selected columns in a DataFrame using Min-Max scaling.

    Despite the name, this performs Min-Max normalization (via MinMaxScaler),
    not z-score standardization. The input DataFrame is left unmodified.

    Parameters:
    - df (pd.DataFrame): The input DataFrame.
    - columns_to_scale (list-like or str): Column name(s) to scale. A single
      column name given as a string is treated as a one-element list.
    - feature_range (tuple, optional): The desired feature range for scaling. Default is (0, 1).

    Returns:
    - pd.DataFrame: A new DataFrame with scaled columns. When no columns are
      requested, an unchanged copy of the input is returned.
    """
    # Work on a copy so the documented "new DataFrame" contract actually holds.
    scaled_df = df.copy()

    if isinstance(columns_to_scale, str):
        columns = [columns_to_scale]
    else:
        columns = list(columns_to_scale)

    # Nothing to scale: skip the scaler, which rejects zero-feature input.
    if not columns:
        return scaled_df

    scaler = MinMaxScaler(feature_range=feature_range)
    scaled_df[columns] = scaler.fit_transform(scaled_df[columns])
    return scaled_df

Adding new features

In [6]:
# Feature engineering: intraday move from open to close, as a percentage of the open.
# The data is assumed to be clean (no missing values) at this stage.
df['Price_Change'] = df['Close'].sub(df['Open']).div(df['Open']).mul(100)

In [7]:
# Re-evaluate with the added Price_Change feature.
train_and_evaluate(df)

              precision    recall  f1-score   support

           0       0.68      0.72      0.70       946
           1       0.74      0.70      0.72      1054

    accuracy                           0.71      2000
   macro avg       0.71      0.71      0.71      2000
weighted avg       0.71      0.71      0.71      2000



Moving Averages

In [8]:
# Trend features computed from the closing price
close_col = df['Close']
df['SMA_5'] = close_col.rolling(window=5).mean()  # simple mean over a 5-row window
df['EMA_10'] = close_col.ewm(span=10, adjust=False).mean()  # exponential average, span 10

Relative Strength Index (RSI)

In [9]:
# RSI built from 14-row simple averages of upward and downward close-to-close moves
delta = df['Close'].diff()
# Keep only the positive moves (everything else, including the leading NaN, becomes 0)
gain = delta.where(delta.gt(0), 0).rolling(window=14).mean()
# Keep only the negative moves and flip their sign so losses are positive magnitudes
loss = (-delta.where(delta.lt(0), 0)).rolling(window=14).mean()
rs = gain.div(loss)
df['RSI'] = 100 - (100 / (1 + rs))

Moving Average Convergence Divergence (MACD)

In [10]:
# MACD: span-12 EMA of Close minus span-26 EMA of Close; the signal line is a span-9 EMA of MACD
exp1, exp2 = (df['Close'].ewm(span=span, adjust=False).mean() for span in (12, 26))
df['MACD'] = exp1 - exp2
df['Signal_Line'] = df['MACD'].ewm(span=9, adjust=False).mean()

Bollinger Bands

In [11]:
# Bollinger bands: 20-row rolling mean of Close, plus/minus two rolling standard deviations
rolling_close = df['Close'].rolling(window=20)
df['SMA_20'] = rolling_close.mean()
rstd = rolling_close.std()
band_offset = 2 * rstd
df['Upper_Band'] = df['SMA_20'] + band_offset
df['Lower_Band'] = df['SMA_20'] - band_offset

Volume-based Indicators

In [12]:
# On-balance volume: add the day's volume when Close rose, subtract it when Close fell
close_direction = np.sign(df['Close'].diff())  # +1 / 0 / -1, NaN on the first row
signed_volume = close_direction * df['Volume']
df['OBV'] = signed_volume.fillna(0).cumsum()  # first row contributes 0

In [13]:
# Inspect the frame after adding the indicator columns.
df

Unnamed: 0,Date,Open,High,Low,Close,Volume,Target,Price_Change,SMA_5,EMA_10,RSI,MACD,Signal_Line,SMA_20,Upper_Band,Lower_Band,OBV
0,2023-01-01,137.454012,141.104003,131.736105,134.926828,12764,0,-1.838567,,134.926828,,0.000000,0.000000,,,,0.0
1,2023-01-02,195.071431,195.993991,189.433210,191.729673,19912,1,-1.713095,,145.254618,,4.531281,0.906256,,,,19912.0
2,2023-01-03,173.199394,174.932593,161.899980,166.722472,6075,0,-3.739575,,149.157864,,6.034915,1.931988,,,,13837.0
3,2023-01-04,159.865848,165.327585,158.770956,162.011182,10122,0,1.341959,,151.494831,,6.768373,2.899265,,,,3715.0
4,2023-01-05,115.601864,118.012311,112.195065,115.134347,19412,0,-0.404420,154.104900,144.883834,,3.526424,3.024697,,,,-15697.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,2050-05-14,185.765599,198.067749,181.995429,193.306371,6878,1,4.059294,154.423352,155.630925,53.417070,1.686574,-0.043289,155.776153,209.532286,102.020020,98318.0
9996,2050-05-15,189.750884,193.490005,176.864528,180.687163,16483,0,-4.776642,163.020832,160.186605,54.397779,3.613317,0.688032,157.863626,212.106847,103.620405,81835.0
9997,2050-05-16,194.670792,194.768071,189.396914,190.744761,6852,1,-2.016754,173.389514,165.742633,50.362610,5.884012,1.727228,158.506471,214.038326,102.974616,88687.0
9998,2050-05-17,139.748799,141.753823,136.443601,138.615199,9755,0,-0.811170,164.767137,160.810373,45.407442,3.437509,2.069284,156.094164,210.622318,101.566011,78932.0


In [14]:
# Count missing values per column; the rolling-window indicators have NaN in their warm-up rows.
df.isna().sum()

Date             0
Open             0
High             0
Low              0
Close            0
Volume           0
Target           0
Price_Change     0
SMA_5            4
EMA_10           0
RSI             13
MACD             0
Signal_Line      0
SMA_20          19
Upper_Band      19
Lower_Band      19
OBV              0
dtype: int64

Handling NaN values

In [15]:
# Fill each column's missing values with that column's mean.
# The result is assigned back to the column rather than calling
# fillna(..., inplace=True) on the df[column] selection: that chained in-place
# form warns in pandas 2.x and no longer updates df under Copy-on-Write.
# NOTE(review): the mean is taken over the whole column, so the NaN warm-up rows
# of the rolling indicators receive a value computed from later rows.
for column in df.columns:
    df[column] = df[column].fillna(df[column].mean())

In [16]:
# Total number of missing values remaining across the whole frame.
df.isna().sum().sum()

0

In [17]:
# Safety net: if any missing value is still present, drop the rows that contain one.
if df.isna().to_numpy().any():
    df = df.dropna()

In [18]:
# Inspect the frame after the NaN handling steps.
df

Unnamed: 0,Date,Open,High,Low,Close,Volume,Target,Price_Change,SMA_5,EMA_10,RSI,MACD,Signal_Line,SMA_20,Upper_Band,Lower_Band,OBV
0,2023-01-01,137.454012,141.104003,131.736105,134.926828,12764,0,-1.838567,149.505375,134.926828,49.980994,0.000000,0.000000,149.502307,207.930223,91.074392,0.0
1,2023-01-02,195.071431,195.993991,189.433210,191.729673,19912,1,-1.713095,149.505375,145.254618,49.980994,4.531281,0.906256,149.502307,207.930223,91.074392,19912.0
2,2023-01-03,173.199394,174.932593,161.899980,166.722472,6075,0,-3.739575,149.505375,149.157864,49.980994,6.034915,1.931988,149.502307,207.930223,91.074392,13837.0
3,2023-01-04,159.865848,165.327585,158.770956,162.011182,10122,0,1.341959,149.505375,151.494831,49.980994,6.768373,2.899265,149.502307,207.930223,91.074392,3715.0
4,2023-01-05,115.601864,118.012311,112.195065,115.134347,19412,0,-0.404420,154.104900,144.883834,49.980994,3.526424,3.024697,149.502307,207.930223,91.074392,-15697.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,2050-05-14,185.765599,198.067749,181.995429,193.306371,6878,1,4.059294,154.423352,155.630925,53.417070,1.686574,-0.043289,155.776153,209.532286,102.020020,98318.0
9996,2050-05-15,189.750884,193.490005,176.864528,180.687163,16483,0,-4.776642,163.020832,160.186605,54.397779,3.613317,0.688032,157.863626,212.106847,103.620405,81835.0
9997,2050-05-16,194.670792,194.768071,189.396914,190.744761,6852,1,-2.016754,173.389514,165.742633,50.362610,5.884012,1.727228,158.506471,214.038326,102.974616,88687.0
9998,2050-05-17,139.748799,141.753823,136.443601,138.615199,9755,0,-0.811170,164.767137,160.810373,45.407442,3.437509,2.069284,156.094164,210.622318,101.566011,78932.0


In [19]:
# Check the column dtypes before training.
df.dtypes

Date            datetime64[ns]
Open                   float64
High                   float64
Low                    float64
Close                  float64
Volume                   int32
Target                   int32
Price_Change           float64
SMA_5                  float64
EMA_10                 float64
RSI                    float64
MACD                   float64
Signal_Line            float64
SMA_20                 float64
Upper_Band             float64
Lower_Band             float64
OBV                    float64
dtype: object

In [20]:
# Re-evaluate with the engineered indicator columns included as features.
train_and_evaluate(df)

              precision    recall  f1-score   support

           0       0.84      0.84      0.84       946
           1       0.86      0.85      0.86      1054

    accuracy                           0.85      2000
   macro avg       0.85      0.85      0.85      2000
weighted avg       0.85      0.85      0.85      2000



In [21]:
# Scale every feature column (all columns except 'Date' and 'Target') into the range [0, 1].
# NOTE(review): the scaler is fitted on the full frame here, before train_and_evaluate
# performs its train/test split, so test rows influence the scaling — confirm this is acceptable.
columns_to_scale = df.columns.difference(['Date', 'Target'])
df = standarlization(df, columns_to_scale, feature_range=(0, 1))

In [22]:
# Evaluate once more, now on the Min-Max scaled feature columns.
train_and_evaluate(df)

              precision    recall  f1-score   support

           0       0.84      0.85      0.85       946
           1       0.86      0.86      0.86      1054

    accuracy                           0.85      2000
   macro avg       0.85      0.85      0.85      2000
weighted avg       0.85      0.85      0.85      2000

