Imports

In [1]:
import yfinance as yf
import datetime
from sklearn.preprocessing import StandardScaler
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report


Data Gathering

In [2]:

ticker_symbols = ['NTDOY', 'TTWO', 'RIOT', 'EA', 'TCEHY']

# Get the current year
current_year = datetime.datetime.now().year

# Set end_date to March 1st of the current year
end_date = datetime.date(current_year, 3, 1)

# Calculate start_date as March 1st, 5 years ago from end_date
start_date = datetime.date(current_year - 5, 3, 1)

# Download stock data for each ticker
for ticker in ticker_symbols:
    data = yf.download(ticker, start=start_date, end=end_date)
    print(f"Data for {ticker}:")
    print(data)

[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed


Data for NTDOY:
              Open    High     Low   Close  Adj Close   Volume
Date                                                          
2019-03-01   6.926   6.950   6.866   6.934      6.934  2462500
2019-03-04   6.810   6.828   6.704   6.742      6.742  3426500
2019-03-05   6.796   6.826   6.750   6.810      6.810  1389000
2019-03-06   6.910   6.958   6.910   6.930      6.930   744500
2019-03-07   6.834   6.834   6.744   6.762      6.762  1333000
...            ...     ...     ...     ...        ...      ...
2024-02-23  14.000  14.030  13.920  13.960     13.960   589500
2024-02-26  14.320  14.450  14.240  14.240     14.240  1128300
2024-02-27  14.180  14.180  14.020  14.040     14.040   887600
2024-02-28  13.780  13.830  13.730  13.770     13.770   880400
2024-02-29  13.910  14.000  13.890  13.910     13.910  1060600

[1259 rows x 6 columns]
Data for TTWO:
                  Open        High         Low       Close   Adj Close  \
Date                                               

[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed


Data for RIOT:
                 Open       High        Low      Close  Adj Close    Volume
Date                                                                       
2019-03-01   3.200000   3.480000   3.170000   3.260000   3.260000   1034500
2019-03-04   3.170000   3.240000   2.850000   3.000000   3.000000   1139600
2019-03-05   3.320000   3.441000   3.160000   3.390000   3.390000   1460700
2019-03-06   3.350000   3.430000   3.220000   3.230000   3.230000    593900
2019-03-07   3.230000   3.620000   3.110000   3.490000   3.490000   1090700
...               ...        ...        ...        ...        ...       ...
2024-02-23  15.140000  15.310000  14.330000  14.850000  14.850000  18732400
2024-02-26  14.900000  17.450001  14.890000  17.370001  17.370001  39162500
2024-02-27  18.100000  18.360001  16.219999  16.799999  16.799999  43619600
2024-02-28  17.440001  17.590000  15.230000  15.650000  15.650000  60473200
2024-02-29  15.680000  15.825000  13.710000  14.120000  14.120000  394777

[*********************100%%**********************]  1 of 1 completed

Data for TCEHY:
                 Open       High        Low      Close  Adj Close   Volume
Date                                                                      
2019-03-01  42.950001  43.029999  42.430000  42.639999  39.685432  2850300
2019-03-04  44.180000  44.240002  43.259998  43.799999  40.765049  2941500
2019-03-05  45.709999  46.790001  45.320000  46.570000  43.343124  7615900
2019-03-06  46.509998  46.509998  45.849998  45.930000  42.747463  2347400
2019-03-07  45.139999  45.139999  44.259998  44.450001  41.370018  4581000
...               ...        ...        ...        ...        ...      ...
2024-02-23  37.060001  37.189999  36.730000  36.950001  36.950001  1665400
2024-02-26  36.630001  36.840000  36.509998  36.529999  36.529999  2692700
2024-02-27  36.430000  36.509998  36.349998  36.419998  36.419998  1958500
2024-02-28  35.490002  35.490002  34.919998  34.980000  34.980000  3087100
2024-02-29  35.250000  35.299999  34.860001  34.939999  34.939999  5149600

[1259 ro




In [6]:
print(data.isnull().sum())
duplicates = data.duplicated().sum()
print(f'duplicate rows: {duplicates}')

Open         0
High         0
Low          0
Close        0
Adj Close    0
Volume       0
dtype: int64
duplicate rows: 0


In [8]:
for ticker in ticker_symbols:
    data = yf.download(ticker, start=start_date, end=end_date)
    filename = f"{ticker}_stock_data.csv"
    data.to_csv(filename)
    print(f"Saved data for {ticker} to {filename}")

[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed

Saved data for NTDOY to NTDOY_stock_data.csv
Saved data for TTWO to TTWO_stock_data.csv
Saved data for RIOT to RIOT_stock_data.csv
Saved data for EA to EA_stock_data.csv
Saved data for TCEHY to TCEHY_stock_data.csv





In [2]:
import pandas as pd
file_path = 'NTDOY_stock_data.csv'
ntdoy_df = pd.read_csv(file_path)
file_path = 'EA_stock_data.csv'
ea_df = pd.read_csv(file_path)
file_path = 'RIOT_stock_data.csv'
riot_df = pd.read_csv(file_path)
file_path = 'TTWO_stock_data.csv'
ttwo_df = pd.read_csv(file_path)
file_path = 'TCEHY_stock_data.csv'
tchey_df = pd.read_csv(file_path)


# Display the first few rows of the DataFrame
print(ntdoy_df.head())
print(ea_df.head())
print(riot_df.head())
print(ttwo_df.head())
print(tchey_df.head())

         Date   Open   High    Low  Close  Adj Close   Volume
0  2019-03-01  6.926  6.950  6.866  6.934      6.934  2462500
1  2019-03-04  6.810  6.828  6.704  6.742      6.742  3426500
2  2019-03-05  6.796  6.826  6.750  6.810      6.810  1389000
3  2019-03-06  6.910  6.958  6.910  6.930      6.930   744500
4  2019-03-07  6.834  6.834  6.744  6.762      6.762  1333000
         Date       Open       High        Low      Close  Adj Close   Volume
0  2019-03-01  96.830002  97.940002  95.309998  97.410004  95.533249  4443600
1  2019-03-04  98.309998  99.430000  95.570000  97.290001  95.415581  7187200
2  2019-03-05  96.260002  97.059998  95.150002  95.720001  93.875832  6041900
3  2019-03-06  95.320000  96.379997  94.129997  94.769997  92.944115  3987500
4  2019-03-07  95.000000  99.559998  94.470001  99.360001  97.445694  8866800
         Date  Open   High   Low  Close  Adj Close   Volume
0  2019-03-01  3.20  3.480  3.17   3.26       3.26  1034500
1  2019-03-04  3.17  3.240  2.85   3.00 

In [17]:
tchey_df.describe()

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume
count,1257.0,1257.0,1257.0,1257.0,1257.0,1257.0
mean,52.750525,53.2686,52.209562,52.774352,50.011055,3812732.0
std,14.564704,14.700079,14.348422,14.56098,13.236191,2328249.0
min,25.719999,25.879999,24.75,25.68,24.208784,572600.0
25%,42.150002,42.450001,41.740002,42.130001,40.120689,2350400.0
50%,47.779999,48.290001,47.209999,47.830002,45.287487,3259800.0
75%,61.580002,62.549999,60.860001,61.849998,58.031876,4612200.0
max,99.010002,99.400002,98.43,99.099998,92.734238,26054200.0


In [18]:
ea_df.describe()

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume
count,1257.0,1257.0,1257.0,1257.0,1257.0,1257.0
mean,123.431551,124.815632,122.004845,123.432506,121.823489,2628729.0
std,15.753103,15.775715,15.717186,15.739958,15.749835,1465153.0
min,87.93,88.949997,85.690002,86.940002,85.264969,583900.0
25%,113.230003,114.389999,111.830002,113.339996,112.353157,1726300.0
50%,126.779999,128.070007,125.449997,126.68,125.378502,2251400.0
75%,136.389999,137.830002,135.110001,136.460007,135.082062,3031500.0
max,148.919998,150.300003,146.149994,148.970001,146.294571,17468500.0


In [19]:
ntdoy_df.describe()

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume
count,1257.0,1257.0,1257.0,1257.0,1257.0,1257.0
mean,11.328558,11.422229,11.240959,11.332516,11.332516,1784015.0
std,1.997902,2.010442,1.978169,1.996392,1.996392,1558097.0
min,6.656,6.686,6.612,6.68,6.68,141500.0
25%,10.066,10.18,9.99,10.07,10.07,887800.0
50%,10.906,10.98,10.84,10.906,10.906,1330000.0
75%,12.6,12.7,12.482,12.61,12.61,2193500.0
max,16.309999,16.51,16.309999,16.43,16.43,19199500.0


In [28]:
riot_df.describe()

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume
count,1257.0,1257.0,1257.0,1257.0,1257.0,1257.0
mean,12.576244,13.191185,11.949423,12.540561,12.540561,13282100.0
std,12.993244,13.735684,12.230499,12.93418,12.93418,12282550.0
min,0.61,0.7,0.511,0.65,0.65,175700.0
25%,3.11,3.24,2.95,3.1,3.1,3369000.0
50%,7.39,7.77,7.06,7.34,7.34,10625200.0
75%,17.523001,18.120001,16.43,17.35,17.35,18765600.0
max,72.760002,79.5,67.419998,77.900002,77.900002,86796100.0


In [21]:
ttwo_df.describe()

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume
count,1257.0,1257.0,1257.0,1257.0,1257.0,1257.0
mean,141.53424,143.450859,139.579745,141.560366,141.560366,1743539.0
std,27.46836,27.631334,27.138041,27.358017,27.358017,1368869.0
min,86.949997,87.57,85.830002,87.040001,87.040001,211600.0
25%,120.940002,122.57,118.93,121.040001,121.040001,1074600.0
50%,137.869995,139.740005,136.679993,138.080002,138.080002,1407400.0
75%,163.919998,165.690002,161.179993,163.720001,163.720001,1926900.0
max,210.479996,214.910004,209.440002,213.339996,213.339996,19700700.0


SVM model without ada boost and pca and technical indicators

In [3]:
file_path = 'EA_stock_data.csv'
ea_df = pd.read_csv(file_path)
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

ea_df.dropna(inplace=True)
ea_df['Target'] = (ea_df['Adj Close'].shift(-80) > ea_df['Adj Close']).astype(int)
ea_df['Target'] = ea_df['Target'].astype('category')
ea_df['Smoothed_Close'] = ea_df['Close'].ewm(alpha=0.1).mean()
# Define your target variable: 1 if the price goes up next week, 0 otherwise
features = ['Open', 'High', 'Low', 'Close', 'Volume','Smoothed_Close']
X = ea_df[features]
ea_df.dropna(inplace=True)

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
y = ea_df['Target'].values

# Split the data into training and testing sets


# Initialize and train the SVM model

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, shuffle=False)
svm = SVC(C=9.699920698680053,class_weight='balanced',degree=5,coef0=6.080765165859437 ,kernel='linear', gamma=0.02997773203425469, probability=True, random_state=42)
svm.fit(X_train, y_train)

# Make predictions and evaluate the model
y_pred = svm.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred, zero_division=0))

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, shuffle=False)
svm = SVC(C=9.699920698680053,class_weight='balanced',degree=5,coef0=6.080765165859437 ,kernel='linear', gamma=0.02997773203425469, probability=True, random_state=42)
svm.fit(X_train, y_train)

# Make predictions and evaluate the model
y_pred = svm.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred, zero_division=0))

Accuracy: 0.8531746031746031
              precision    recall  f1-score   support

           0       0.82      0.96      0.89       152
           1       0.92      0.69      0.79       100

    accuracy                           0.85       252
   macro avg       0.87      0.83      0.84       252
weighted avg       0.86      0.85      0.85       252



Cross validation for svm without ta and adaboost

In [4]:
from sklearn.model_selection import TimeSeriesSplit


tscv = TimeSeriesSplit(n_splits=10)

# Initialize the SVM model
svm = SVC(C=9.699920698680053, kernel='linear', degree=5, coef0=6.080765165859437, gamma=0.02997773203425469, class_weight='balanced', probability=True, random_state=42)

# List to store scores for each fold
accuracy_scores = []

# Perform the time series cross-validation
for train_index, test_index in tscv.split(X_scaled):
    X_train, X_test = X_scaled[train_index], X_scaled[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    # Fit the model
    svm.fit(X_train, y_train)
    
    # Make predictions
    y_pred = svm.predict(X_test)
    
    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)
    accuracy_scores.append(accuracy)
    
    # Print the classification report for each fold
    print(classification_report(y_test, y_pred, zero_division=0))

# Average accuracy across all folds
average_accuracy = sum(accuracy_scores) / len(accuracy_scores)
print(f'Average Accuracy across folds: {average_accuracy}')

              precision    recall  f1-score   support

           0       0.17      1.00      0.29        16
           1       1.00      0.21      0.35        98

    accuracy                           0.32       114
   macro avg       0.59      0.61      0.32       114
weighted avg       0.88      0.32      0.34       114

              precision    recall  f1-score   support

           0       0.15      1.00      0.27        16
           1       1.00      0.10      0.19        98

    accuracy                           0.23       114
   macro avg       0.58      0.55      0.23       114
weighted avg       0.88      0.23      0.20       114

              precision    recall  f1-score   support

           0       0.53      0.83      0.65        36
           1       0.89      0.65      0.76        78

    accuracy                           0.71       114
   macro avg       0.71      0.74      0.70       114
weighted avg       0.78      0.71      0.72       114

              preci

another svm with ta and removed featured

svm with ta

In [3]:
import ta  # Import the technical analysis library
file_path = 'EA_stock_data.csv'
ea_df = pd.read_csv(file_path)
# Assuming ea_df is your DataFrame loaded with stock data including 'Open', 'High', 'Low', 'Close', 'Volume'

# Calculate Moving Average
ea_df['SMA_10'] = ea_df['Close'].rolling(window=10).mean()

# Calculate MACD
ea_df['MACD'] = ta.trend.MACD(ea_df['Close']).macd()

# Calculate ADX
ea_df['ADX'] = ta.trend.ADXIndicator(ea_df['High'], ea_df['Low'], ea_df['Close'], window=14).adx()
ea_df['Smoothed_Close'] = ea_df['Close'].ewm(alpha=0.2).mean()
# Drop any rows with NaN values that were introduced by the indicator calculations
ea_df.dropna(inplace=True)

# Define the target variable: 1 if the price goes up next week, 0 otherwise
ea_df['Target'] = (ea_df['Adj Close'].shift(-80) > ea_df['Adj Close']).astype(int)
ea_df['Target'] = ea_df['Target'].astype('category')

# Update features list to include the new technical indicators
features = [ 'High', 'Low', 'Close', 'Volume', 'SMA_10', 'MACD', 'ADX','Smoothed_Close']
X = ea_df[features]

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
y = ea_df['Target'].values

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, shuffle=False)

# Initialize and train the SVM model
svm = SVC(C=0.7400230211761833, kernel='rbf',degree=4,coef0=0.09111040120352687, gamma=0.2002723939261901, class_weight='balanced', probability=True, random_state=42)
svm.fit(X_train, y_train)

# Make predictions and evaluate the model
y_pred = svm.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred, zero_division=0))



Accuracy: 0.8866396761133604
              precision    recall  f1-score   support

           0       0.85      0.99      0.92       152
           1       0.99      0.72      0.83        95

    accuracy                           0.89       247
   macro avg       0.92      0.85      0.87       247
weighted avg       0.90      0.89      0.88       247



svm without ada boost and with ta with feature reduction

In [28]:
import ta  # Import the technical analysis library
file_path = 'EA_stock_data.csv'
ea_df = pd.read_csv(file_path)
# Assuming ea_df is your DataFrame loaded with stock data including 'Open', 'High', 'Low', 'Close', 'Volume'

# Calculate Moving Average
ea_df['SMA_10'] = ea_df['Close'].rolling(window=20).mean()

# Calculate MACD
ea_df['MACD'] = ta.trend.MACD(ea_df['Close']).macd()

# Calculate ADX
ea_df['ADX'] = ta.trend.ADXIndicator(ea_df['High'], ea_df['Low'], ea_df['Close'], window=14).adx()  
ea_df['Smoothed_Close'] = ea_df['Close'].ewm(alpha=0.2).mean()
# Drop any rows with NaN values that were introduced by the indicator calculations
ea_df.dropna(inplace=True)

# Define the target variable: 1 if the price goes up next week, 0 otherwise
ea_df['Target'] = (ea_df['Adj Close'].shift(-80) > ea_df['Adj Close']).astype(int)
ea_df['Target'] = ea_df['Target'].astype('category')

# Update features list to include the new technical indicators
features = ['Close', 'SMA_10', 'MACD', 'ADX','Smoothed_Close']
X = ea_df[features]

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
y = ea_df['Target'].values

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, shuffle=False)

# Initialize and train the SVM model
svm = SVC(C=0.7400230211761833, kernel='rbf',degree=4,coef0=0.09111040120352687, gamma=0.2002723939261901, class_weight='balanced', probability=True, random_state=42)
svm.fit(X_train, y_train)

# Make predictions and evaluate the model
y_pred = svm.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred, zero_division=0))

Accuracy: 0.8947368421052632
              precision    recall  f1-score   support

           0       0.87      0.98      0.92       152
           1       0.96      0.76      0.85        95

    accuracy                           0.89       247
   macro avg       0.91      0.87      0.88       247
weighted avg       0.90      0.89      0.89       247



CV for with ta and without adaboost

In [31]:
from sklearn.model_selection import TimeSeriesSplit


tscv = TimeSeriesSplit(n_splits=10)

# Initialize the SVM model
svm = SVC(C=0.7400230211761833, kernel='rbf', degree=4, coef0=0.09111040120352687, gamma=0.2002723939261901, class_weight='balanced', probability=True, random_state=42)

# List to store scores for each fold
accuracy_scores = []

# Perform the time series cross-validation
for train_index, test_index in tscv.split(X_scaled):
    X_train, X_test = X_scaled[train_index], X_scaled[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    # Fit the model
    svm.fit(X_train, y_train)
    
    # Make predictions
    y_pred = svm.predict(X_test)
    
    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)
    accuracy_scores.append(accuracy)
    
    # Print the classification report for each fold
    print(classification_report(y_test, y_pred, zero_division=0))

# Average accuracy across all folds
average_accuracy = sum(accuracy_scores) / len(accuracy_scores)
print(f'Average Accuracy across folds: {average_accuracy}')

              precision    recall  f1-score   support

           0       0.21      1.00      0.34        16
           1       1.00      0.36      0.53        96

    accuracy                           0.46       112
   macro avg       0.60      0.68      0.44       112
weighted avg       0.89      0.46      0.51       112

              precision    recall  f1-score   support

           0       0.33      0.25      0.29        32
           1       0.73      0.80      0.76        80

    accuracy                           0.64       112
   macro avg       0.53      0.53      0.52       112
weighted avg       0.61      0.64      0.63       112

              precision    recall  f1-score   support

           0       0.45      1.00      0.62        22
           1       1.00      0.70      0.82        90

    accuracy                           0.76       112
   macro avg       0.72      0.85      0.72       112
weighted avg       0.89      0.76      0.78       112

              preci

With adaboost 

In [4]:
from sklearn.ensemble import AdaBoostClassifier
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, shuffle=False)

# Initialize the SVM model with probability=True for AdaBoost compatibility
svm_base = SVC(C=8, kernel='rbf', gamma=0.01, class_weight=None, probability=True, random_state=42)

# Initialize and train the AdaBoost model with SVM as the base estimator
ada_boost_model = AdaBoostClassifier(base_estimator=svm_base, n_estimators=50, random_state=42)
ada_boost_model.fit(X_train, y_train)

# Make predictions and evaluate the AdaBoost model
y_pred = ada_boost_model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred, zero_division=0))

Accuracy: 0.8906882591093117
              precision    recall  f1-score   support

           0       0.85      1.00      0.92       152
           1       1.00      0.72      0.83        95

    accuracy                           0.89       247
   macro avg       0.92      0.86      0.88       247
weighted avg       0.91      0.89      0.89       247



In [None]:
from sklearn.model_selection import TimeSeriesSplit


tscv = TimeSeriesSplit(n_splits=10)

# Initialize the SVM model
svm = SVC(C=0.7400230211761833, kernel='rbf', degree=4, coef0=0.09111040120352687, gamma=0.2002723939261901, class_weight='balanced', probability=True, random_state=42)

# List to store scores for each fold
accuracy_scores = []

# Perform the time series cross-validation
for train_index, test_index in tscv.split(X_scaled):
    X_train, X_test = X_scaled[train_index], X_scaled[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    # Fit the model
    svm.fit(X_train, y_train)
    
    # Make predictions
    y_pred = svm.predict(X_test)
    
    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)
    accuracy_scores.append(accuracy)
    
    # Print the classification report for each fold
    print(classification_report(y_test, y_pred, zero_division=0))

# Average accuracy across all folds
average_accuracy = sum(accuracy_scores) / len(accuracy_scores)
print(f'Average Accuracy across folds: {average_accuracy}')

svm without ta and with adaboost

In [16]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
file_path = 'EA_stock_data.csv'
ea_df = pd.read_csv(file_path)

ea_df.dropna(inplace=True)
ea_df['Target'] = (ea_df['Adj Close'].shift(-80) > ea_df['Adj Close']).astype(int)
ea_df['Target'] = ea_df['Target'].astype('category')
ea_df['Smoothed_Close'] = ea_df['Close'].ewm(alpha=0.1).mean()
# Define your target variable: 1 if the price goes up next week, 0 otherwise
features = ['Open', 'High', 'Low', 'Close', 'Volume','Smoothed_Close']
X = ea_df[features]
ea_df.dropna(inplace=True)

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
y = ea_df['Target'].values

from sklearn.ensemble import AdaBoostClassifier
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, shuffle=False)

# Initialize the SVM model with probability=True for AdaBoost compatibility
svm_base = SVC(C=8, kernel='rbf', gamma=0.01, class_weight=None, probability=True, random_state=42)

# Initialize and train the AdaBoost model with SVM as the base estimator
ada_boost_model = AdaBoostClassifier(base_estimator=svm_base, n_estimators=50, random_state=42)
ada_boost_model.fit(X_train, y_train)

# Make predictions and evaluate the AdaBoost model
y_pred = ada_boost_model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred, zero_division=0))

Accuracy: 0.8690476190476191
              precision    recall  f1-score   support

           0       0.82      1.00      0.90       152
           1       1.00      0.67      0.80       100

    accuracy                           0.87       252
   macro avg       0.91      0.83      0.85       252
weighted avg       0.89      0.87      0.86       252



best hyper parameter search for svm without ta and adaboost

In [3]:
from hyperopt import hp, fmin, tpe, Trials, STATUS_OK
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
import pandas as pd

# Assuming ea_df is previously defined and loaded with data
# ea_df = pd.read_csv('path_to_your_data.csv')

ea_df.dropna(inplace=True)
ea_df['Target'] = (ea_df['Adj Close'].shift(-80) > ea_df['Adj Close']).astype(int)
ea_df['Smoothed_Close'] = ea_df['Close'].ewm(alpha=0.1).mean()
features = ['Open', 'High', 'Low', 'Close', 'Volume', 'Smoothed_Close']
X = ea_df[features]
y = ea_df['Target']

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

space = {
    'C': hp.uniform('C', 0.01, 100),
    'gamma': hp.uniform('gamma', 0.01, 5),
    'kernel': hp.choice('kernel', ['rbf']),
    'degree': hp.choice('degree', [2, 3, 4, 5]),
    'coef0': hp.uniform('coef0', 0.0, 10.0),
    'class_weight': hp.choice('class_weight', [None, 'balanced'])
}

def objective(params):
    clf = SVC(**params, probability=True, random_state=42)
    score = cross_val_score(clf, X_scaled, y, scoring='accuracy').mean()
    return {'loss': -score, 'status': STATUS_OK}

trials = Trials()
best = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=100, trials=trials)
best['kernel'] = ['linear', 'rbf', 'poly', 'sigmoid'][best['kernel']]
best['degree'] = [2, 3, 4, 5][best['degree']]
best['class_weight'] = [None, 'balanced'][best['class_weight']]

print("Best parameters:", best)


100%|██████████| 100/100 [01:47<00:00,  1.07s/trial, best loss: -0.7213495225447417]
Best parameters: {'C': 9.699920698680053, 'class_weight': 'balanced', 'coef0': 6.080765165859437, 'degree': 5, 'gamma': 0.02997773203425469, 'kernel': 'linear'}


Best hyper parameters for svm with ta:

In [12]:
from hyperopt import hp, fmin, tpe, Trials, STATUS_OK
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
import pandas as pd
import ta  # Import the technical analysis library

# Assuming ea_df is previously defined and loaded with data
# ea_df = pd.read_csv('path_to_your_data.csv')

ea_df.dropna(inplace=True)
ea_df['Target'] = (ea_df['Adj Close'].shift(-80) > ea_df['Adj Close']).astype(int)
ea_df['SMA_10'] = ea_df['Close'].rolling(window=10).mean()

# Calculate MACD
ea_df['MACD'] = ta.trend.MACD(ea_df['Close']).macd()

# Calculate ADX
ea_df['ADX'] = ta.trend.ADXIndicator(ea_df['High'], ea_df['Low'], ea_df['Close'], window=14).adx()
ea_df['Smoothed_Close'] = ea_df['Close'].ewm(alpha=0.2).mean()
# Drop any rows with NaN values that were introduced by the indicator calculations
ea_df.dropna(inplace=True)
features = ['Open', 'High', 'Low', 'Close', 'Volume', 'Smoothed_Close']
X = ea_df[features]
y = ea_df['Target']

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

space = {
    'C': hp.uniform('C', 0.01, 100),
    'gamma': hp.uniform('gamma', 0.01, 5),
    'kernel': hp.choice('kernel', ['rbf']),
    'degree': hp.choice('degree', [2, 3, 4, 5]),
    'coef0': hp.uniform('coef0', 0.0, 10.0),
    'class_weight': hp.choice('class_weight', [None, 'balanced'])
}

def objective(params):
    clf = SVC(**params, probability=True, random_state=42)
    score = cross_val_score(clf, X_scaled, y, scoring='accuracy').mean()
    return {'loss': -score, 'status': STATUS_OK}

trials = Trials()
best = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=100, trials=trials)
best['kernel'] = ['linear', 'rbf', 'poly', 'sigmoid'][best['kernel']]
best['degree'] = [2, 3, 4, 5][best['degree']]
best['class_weight'] = [None, 'balanced'][best['class_weight']]

print("Best parameters:", best)


  0%|          | 0/100 [00:00<?, ?trial/s, best loss=?]

100%|██████████| 100/100 [01:39<00:00,  1.01trial/s, best loss: -0.7436734693877551]
Best parameters: {'C': 0.7400230211761833, 'class_weight': 'balanced', 'coef0': 0.09111040120352687, 'degree': 4, 'gamma': 0.2002723939261901, 'kernel': 'linear'}


Best hyper parameters for svm with ta and adaboost:

In [25]:
from hyperopt import hp, fmin, tpe, Trials, STATUS_OK
from hyperopt.pyll.base import scope
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
import pandas as pd
from sklearn.datasets import make_classification

# Assuming X_scaled and y are defined, for example, by:
# X, y = make_classification(n_samples=1000, n_features=20, n_informative=2, n_redundant=0, random_state=42)
# scaler = StandardScaler()
# X_scaled = scaler.fit_transform(X)

space = {
    'C': hp.uniform('C', 0.01, 100),
    'gamma': hp.uniform('gamma', 0.01, 5),
    'kernel': hp.choice('kernel', ['rbf']),  # If you only intend to test 'rbf', this could be fixed instead of using hp.choice
    'degree': hp.choice('degree', [2, 3, 4, 5]),
    'coef0': hp.uniform('coef0', 0.0, 10.0),
    'class_weight': hp.choice('class_weight', [None, 'balanced']),
    'n_estimators_ab': scope.int(hp.quniform('n_estimators_ab', 50, 200, 1)),
    'learning_rate': hp.uniform('learning_rate', 0.01, 1.0),
    'algorithm': hp.choice('algorithm', ['SAMME', 'SAMME.R'])
}

def objective(params):
    # Adjust SVC instantiation to pass **params directly, ensuring all SVC-related hyperparams are used
    svc = SVC(probability=True, **{k: v for k, v in params.items() if k in ['C', 'gamma', 'kernel', 'degree', 'coef0', 'class_weight']})
    
    # Instantiate AdaBoost with the SVC base estimator
    clf = AdaBoostClassifier(
        base_estimator=svc,
        n_estimators=params['n_estimators_ab'],  # Make sure n_estimators is cast to int if needed
        learning_rate=params['learning_rate'],
        random_state=42
    )
    
    # Perform cross-validation
    score = cross_val_score(clf, X_scaled, y, scoring='accuracy').mean()
    return {'loss': -score, 'status': STATUS_OK}

trials = Trials()
best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=100,
            trials=trials)

# Since 'kernel' and 'class_weight' are the only hyperparameters chosen with hp.choice, map them back to their string representations
best['kernel'] = ['rbf'][best['kernel']]  # Adjust accordingly if more options are added later
best['class_weight'] = [None, 'balanced'][best['class_weight']]

print("Best parameters:", best)


  0%|          | 0/100 [00:00<?, ?trial/s, best loss=?]

100%|██████████| 100/100 [4:47:54<00:00, 172.74s/trial, best loss: -0.7715842138178466] 
Best parameters: {'C': 98.09738852903301, 'algorithm': 0, 'class_weight': None, 'coef0': 4.561076344128647, 'degree': 2, 'gamma': 0.06514441289951156, 'kernel': 'rbf', 'learning_rate': 0.671993194805663, 'n_estimators_ab': 171.0}


<h1> split cross validation with best hyper parameters

Without ta and adaboost

In [23]:
from hyperopt import hp, fmin, tpe, Trials, STATUS_OK
from sklearn.model_selection import TimeSeriesSplit, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
import pandas as pd

# Assuming ea_df is previously defined and loaded with data
# ea_df = pd.read_csv('path_to_your_data.csv')
file_path = 'EA_stock_data.csv'

ea_df = pd.read_csv(file_path)
ea_df.dropna(inplace=True)
ea_df['Target'] = (ea_df['Adj Close'].shift(-80) > ea_df['Adj Close']).astype(int)
ea_df['Smoothed_Close'] = ea_df['Close'].ewm(alpha=0.1).mean()
features = ['Open', 'High', 'Low', 'Close', 'Volume', 'Smoothed_Close']
X = ea_df[features]
y = ea_df['Target']

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Define TimeSeriesSplit cross-validator
tscv = TimeSeriesSplit(n_splits=5)

space = {
    'C': hp.uniform('C', 0.01, 100),
    'gamma': hp.uniform('gamma', 0.01, 5),
    'kernel': hp.choice('kernel', ['rbf']),
    'degree': hp.choice('degree', [2, 3, 4, 5]),
    'coef0': hp.uniform('coef0', 0.0, 10.0),
    'class_weight': hp.choice('class_weight', [None, 'balanced'])
}

def objective(params):
    clf = SVC(**params, probability=True, random_state=42)
    # Here we use TimeSeriesSplit in cross_val_score
    score = cross_val_score(clf, X_scaled, y, cv=tscv, scoring='accuracy').mean()
    return {'loss': -score, 'status': STATUS_OK}

trials = Trials()
best = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=100, trials=trials)

# Convert indexes back to parameter values
best['kernel'] = 'rbf'  # since you had hp.choice with only one option 'rbf'
best['degree'] = [2, 3, 4, 5][best['degree']]
best['class_weight'] = [None, 'balanced'][best['class_weight']]

print("Best parameters:", best)


  0%|          | 0/100 [00:00<?, ?trial/s, best loss=?]

100%|██████████| 100/100 [00:59<00:00,  1.67trial/s, best loss: -0.7119617224880382]
Best parameters: {'C': 30.615622222954407, 'class_weight': 'balanced', 'coef0': 6.588366669688766, 'degree': 2, 'gamma': 0.34292238995846713, 'kernel': 'rbf'}


In [24]:
file_path = 'EA_stock_data.csv'
ea_df = pd.read_csv(file_path)
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

ea_df.dropna(inplace=True)
ea_df['Target'] = (ea_df['Adj Close'].shift(-80) > ea_df['Adj Close']).astype(int)
ea_df['Target'] = ea_df['Target'].astype('category')
ea_df['Smoothed_Close'] = ea_df['Close'].ewm(alpha=0.1).mean()
# Define your target variable: 1 if the price goes up next week, 0 otherwise
features = ['Open', 'High', 'Low', 'Close', 'Volume','Smoothed_Close']
X = ea_df[features]
ea_df.dropna(inplace=True)

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
y = ea_df['Target'].values

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, shuffle=False)
svm = SVC(C=30.61562222295440,class_weight='balanced',degree=2,coef0= 6.588366669688766 ,kernel='rbf', gamma= 0.34292238995846713, probability=True, random_state=42)
svm.fit(X_train, y_train)

# Make predictions and evaluate the model
y_pred = svm.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred, zero_division=0))

Accuracy: 0.8531746031746031
              precision    recall  f1-score   support

           0       0.87      0.89      0.88       152
           1       0.83      0.79      0.81       100

    accuracy                           0.85       252
   macro avg       0.85      0.84      0.85       252
weighted avg       0.85      0.85      0.85       252



In [18]:
from sklearn.model_selection import TimeSeriesSplit


tscv = TimeSeriesSplit(n_splits=5)

# Initialize the SVM model
svm = SVC(C=30.61562222295440,class_weight='balanced',degree=2,coef0= 6.588366669688766 ,kernel='rbf', gamma= 0.34292238995846713, probability=True, random_state=42)

# List to store scores for each fold
accuracy_scores = []

# Perform the time series cross-validation
for train_index, test_index in tscv.split(X_scaled):
    X_train, X_test = X_scaled[train_index], X_scaled[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    # Fit the model
    svm.fit(X_train, y_train)
    
    # Make predictions
    y_pred = svm.predict(X_test)
    
    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)
    accuracy_scores.append(accuracy)
    
    # Print the classification report for each fold
    print(classification_report(y_test, y_pred, zero_division=0))

# Average accuracy across all folds
average_accuracy = sum(accuracy_scores) / len(accuracy_scores)
print(f'Average Accuracy across folds: {average_accuracy}')

              precision    recall  f1-score   support

           0       0.55      0.85      0.67        48
           1       0.95      0.79      0.86       161

    accuracy                           0.80       209
   macro avg       0.75      0.82      0.76       209
weighted avg       0.86      0.80      0.82       209

              precision    recall  f1-score   support

           0       0.63      0.84      0.72       103
           1       0.77      0.51      0.61       106

    accuracy                           0.67       209
   macro avg       0.70      0.68      0.67       209
weighted avg       0.70      0.67      0.67       209

              precision    recall  f1-score   support

           0       0.85      0.50      0.63       121
           1       0.56      0.88      0.68        88

    accuracy                           0.66       209
   macro avg       0.70      0.69      0.65       209
weighted avg       0.72      0.66      0.65       209

              preci

In [5]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import TimeSeriesSplit
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Load the data
file_path = 'EA_stock_data.csv'
ea_df = pd.read_csv(file_path)

# Preprocess the data
ea_df.dropna(inplace=True)
ea_df['Target'] = (ea_df['Adj Close'].shift(-80) > ea_df['Adj Close']).astype(int)
ea_df['Smoothed_Close'] = ea_df['Close'].ewm(alpha=0.1).mean()
ea_df.dropna(inplace=True)

# Define features and target variable
features = ['Open', 'High', 'Low', 'Close', 'Volume', 'Smoothed_Close']
X = ea_df[features]
y = ea_df['Target']

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Set up the TimeSeriesSplit cross-validator
tscv = TimeSeriesSplit(n_splits=10)

# Initialize the SVM model
svm = SVC(C=9.699920698680053, class_weight='balanced', degree=5, coef0=6.080765165859437,
          kernel='linear', gamma=0.02997773203425469, probability=True, random_state=42)

# List to store scores and create a loop to perform cross-validation
accuracy_scores = []

for train_index, test_index in tscv.split(X_scaled):
    X_train, X_test = X_scaled[train_index], X_scaled[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Train the SVM model
    svm.fit(X_train, y_train)

    # Make predictions
    y_pred = svm.predict(X_test)

    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    accuracy_scores.append(accuracy)
    print("Fold Accuracy:", accuracy)
    print(classification_report(y_test, y_pred, zero_division=0))

# Calculate and print the average accuracy
average_accuracy = sum(accuracy_scores) / len(accuracy_scores)
print("Average Accuracy across all folds:", average_accuracy)


Fold Accuracy: 0.32456140350877194
              precision    recall  f1-score   support

           0       0.17      1.00      0.29        16
           1       1.00      0.21      0.35        98

    accuracy                           0.32       114
   macro avg       0.59      0.61      0.32       114
weighted avg       0.88      0.32      0.34       114

Fold Accuracy: 0.22807017543859648
              precision    recall  f1-score   support

           0       0.15      1.00      0.27        16
           1       1.00      0.10      0.19        98

    accuracy                           0.23       114
   macro avg       0.58      0.55      0.23       114
weighted avg       0.88      0.23      0.20       114

Fold Accuracy: 0.7105263157894737
              precision    recall  f1-score   support

           0       0.53      0.83      0.65        36
           1       0.89      0.65      0.76        78

    accuracy                           0.71       114
   macro avg       0.71 

In [3]:
import pandas as pd
import ta  # Import the technical analysis library
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import TimeSeriesSplit
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Load your data
file_path = 'EA_stock_data.csv'
ea_df = pd.read_csv(file_path)

# Calculate technical indicators
ea_df['SMA_10'] = ea_df['Close'].rolling(window=20).mean()
ea_df['MACD'] = ta.trend.MACD(ea_df['Close']).macd()
ea_df['ADX'] = ta.trend.ADXIndicator(ea_df['High'], ea_df['Low'], ea_df['Close'], window=14).adx()
ea_df['Smoothed_Close'] = ea_df['Close'].ewm(alpha=0.2).mean()

# Ensure all technical indicators are calculated before dropping NaNs
ea_df.dropna(inplace=True)

# Define the target variable for future price increase over the next 80 days
ea_df['Target'] = (ea_df['Adj Close'].shift(-80) > ea_df['Adj Close']).astype(int)

# Feature selection
features = ['Close', 'SMA_10', 'MACD', 'ADX', 'Smoothed_Close']
X = ea_df[features].values
y = ea_df['Target'].values

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Initialize the SVM model
svm = SVC(C=0.7400230211761833, kernel='rbf', degree=4, coef0=0.09111040120352687, gamma=0.2002723939261901, class_weight='balanced', probability=True, random_state=42)

# Rolling window cross-validation
tscv = TimeSeriesSplit(n_splits=10)
accuracy_scores = []

for train_index, test_index in tscv.split(X_scaled):
    X_train, X_test = X_scaled[train_index], X_scaled[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Train the SVM model
    svm.fit(X_train, y_train)

    # Make predictions
    y_pred = svm.predict(X_test)

    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    accuracy_scores.append(accuracy)
    print("Fold accuracy:", accuracy)
    print(classification_report(y_test, y_pred, zero_division=0))

# Print the average accuracy
average_accuracy = sum(accuracy_scores) / len(accuracy_scores)
print("Average Accuracy across all folds:", average_accuracy)


Fold accuracy: 0.44642857142857145
              precision    recall  f1-score   support

           0       0.21      1.00      0.34        16
           1       1.00      0.35      0.52        96

    accuracy                           0.45       112
   macro avg       0.60      0.68      0.43       112
weighted avg       0.89      0.45      0.50       112

Fold accuracy: 0.5803571428571429
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        32
           1       0.67      0.81      0.73        80

    accuracy                           0.58       112
   macro avg       0.34      0.41      0.37       112
weighted avg       0.48      0.58      0.52       112

Fold accuracy: 0.7946428571428571
              precision    recall  f1-score   support

           0       0.49      1.00      0.66        22
           1       1.00      0.74      0.85        90

    accuracy                           0.79       112
   macro avg       0.74  

In [7]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import TimeSeriesSplit
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Load your data
file_path = 'EA_stock_data.csv'
ea_df = pd.read_csv(file_path)

# Preprocess the data
ea_df.dropna(inplace=True)
ea_df['Smoothed_Close'] = ea_df['Close'].ewm(alpha=0.1).mean()
ea_df['Target'] = (ea_df['Adj Close'].shift(-80) > ea_df['Adj Close']).astype(int)
ea_df.dropna(inplace=True)

# Define features and target variable
features = ['Open', 'High', 'Low', 'Close', 'Volume', 'Smoothed_Close']
X = ea_df[features]
y = ea_df['Target']

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Set up the TimeSeriesSplit cross-validator
tscv = TimeSeriesSplit(n_splits=10)

# Initialize the SVM model with probability=True for AdaBoost compatibility
svm_base = SVC(C=8, kernel='rbf', gamma=0.01, class_weight=None, probability=True, random_state=42)

# List to store scores and create a loop to perform cross-validation
accuracy_scores = []

for train_index, test_index in tscv.split(X_scaled):
    X_train, X_test = X_scaled[train_index], X_scaled[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Initialize and train the AdaBoost model with SVM as the base estimator
    ada_boost_model = AdaBoostClassifier(base_estimator=svm_base, n_estimators=50, random_state=42)
    ada_boost_model.fit(X_train, y_train)

    # Make predictions and evaluate the AdaBoost model
    y_pred = ada_boost_model.predict(X_test)

    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    accuracy_scores.append(accuracy)
    print("Fold Accuracy:", accuracy)
    print(classification_report(y_test, y_pred, zero_division=0))

# Calculate and print the average accuracy
average_accuracy = sum(accuracy_scores) / len(accuracy_scores)
print("Average Accuracy across all folds:", average_accuracy)


Fold Accuracy: 0.4298245614035088
              precision    recall  f1-score   support

           0       0.20      1.00      0.33        16
           1       1.00      0.34      0.50        98

    accuracy                           0.43       114
   macro avg       0.60      0.67      0.42       114
weighted avg       0.89      0.43      0.48       114

Fold Accuracy: 0.8596491228070176
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        16
           1       0.86      1.00      0.92        98

    accuracy                           0.86       114
   macro avg       0.43      0.50      0.46       114
weighted avg       0.74      0.86      0.79       114

Fold Accuracy: 0.6842105263157895
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        36
           1       0.68      1.00      0.81        78

    accuracy                           0.68       114
   macro avg       0.34   