In [13]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE
from catboost import CatBoostClassifier
import ta
import joblib


In [None]:
#Load the dataset
df = pd.read_csv("E:\\ETF_Prediction_Model\\dataset.txt")
df = df.sort_values("Date").reset_index(drop=True)


In [None]:
#Basic EDA
print(df.info())
print(df.describe())
print(df.isnull().sum())
df.head()# Checking for missing values

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 983 entries, 0 to 982
Data columns (total 7 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Date     983 non-null    object 
 1   Open     983 non-null    float64
 2   High     983 non-null    float64
 3   Low      983 non-null    float64
 4   Close    983 non-null    float64
 5   Volume   983 non-null    int64  
 6   OpenInt  983 non-null    int64  
dtypes: float64(4), int64(2), object(1)
memory usage: 53.9+ KB
None
             Open        High         Low       Close        Volume  OpenInt
count  983.000000  983.000000  983.000000  983.000000  9.830000e+02    983.0
mean    39.419284   39.450038   39.392560   39.417884  1.748969e+04      0.0
std      0.375645    0.395290    0.379753    0.374739  2.008278e+05      0.0
min     38.938000   38.938000   38.454000   38.938000  1.000000e+02      0.0
25%     39.147000   39.177500   39.119500   39.151000  1.015000e+03      0.0
50%     39.213000   39.2

Unnamed: 0,Date,Open,High,Low,Close,Volume,OpenInt
0,2013-10-17,38.938,38.938,38.938,38.938,20858,0
1,2013-10-18,39.003,39.003,38.994,38.994,8717,0
2,2013-10-21,38.938,38.938,38.938,38.938,7732,0
3,2013-10-22,38.938,38.938,38.938,38.938,1362,0
4,2013-10-23,38.985,38.994,38.985,38.994,16785,0


In [None]:
#Feature Engineering
from ta.momentum import RSIIndicator
from ta.trend import MACD
from ta.volatility import BollingerBands

df['daily_return'] = df['Close'].pct_change()
df['sma_10'] = df['Close'].rolling(10).mean()
df['sma_30'] = df['Close'].rolling(30).mean()
df['volatility'] = df['daily_return'].rolling(10).std()

#RSI
df['rsi'] = RSIIndicator(close=df['Close']).rsi()

#MACD
macd = MACD(close=df['Close'])
df['macd_diff'] = macd.macd_diff()

#Bollinger Bands
bb = BollingerBands(close=df['Close'])
df['bb_bbm'] = bb.bollinger_mavg()
df['bb_bbh'] = bb.bollinger_hband()
df['bb_bbl'] = bb.bollinger_lband()


In [None]:
#Target Variable Creation
df['target'] = (df['Close'].rolling(3).mean().shift(-3) > df['Close']).astype(int)


In [None]:
#Clean Data
df = df[df['Volume'] > 1000]
df = df[df['Close'] != df['Open']]
df.dropna(inplace=True)


In [None]:
#Features & Scaling
features = [
    'sma_10', 'sma_30', 'daily_return', 'volatility',
    'rsi', 'macd_diff', 'bb_bbm', 'bb_bbh', 'bb_bbl'
]
X = df[features]
y = df['target']

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


In [None]:
#Apply SMOTE
sm = SMOTE(random_state=42)
X_resampled, y_resampled = sm.fit_resample(X_scaled, y)


In [None]:
#Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, y_resampled, test_size=0.2, shuffle=True, random_state=42
)


In [None]:
#Train CatBoost Model
model = CatBoostClassifier(
    iterations=500,
    learning_rate=0.05,
    depth=6,
    verbose=100,
    random_seed=42
)
model.fit(X_train, y_train)


0:	learn: 0.6705427	total: 4.77ms	remaining: 2.38s
100:	learn: 0.2656639	total: 271ms	remaining: 1.07s
200:	learn: 0.1682234	total: 517ms	remaining: 769ms
300:	learn: 0.1110154	total: 800ms	remaining: 529ms
400:	learn: 0.0742894	total: 1.02s	remaining: 252ms
499:	learn: 0.0555980	total: 1.27s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x1b06364f2b0>

In [None]:
#Evaluation
y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.8114754098360656
              precision    recall  f1-score   support

           0       0.88      0.79      0.83        71
           1       0.74      0.84      0.79        51

    accuracy                           0.81       122
   macro avg       0.81      0.82      0.81       122
weighted avg       0.82      0.81      0.81       122



In [None]:
#Save Model
joblib.dump(model, "catboost_model.pkl")


['catboost_model.pkl']