In [2]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

In [3]:
# Load the dataset
file_path = 'data_with_indicator/iADBL.csv'
data = pd.read_csv(file_path)

In [4]:
# Step 1: Data Standardization (excluding non-numeric columns like 'date' and target 'Y_close')
features = data.drop(columns=['date', 'Y_close'])
scaler = StandardScaler()
standardized_data = scaler.fit_transform(features)

In [5]:
# Convert standardized data back to DataFrame for easier handling
standardized_df = pd.DataFrame(standardized_data, columns=features.columns)

In [6]:
standardized_df.head()

Unnamed: 0,high,low,close,noOfTransaction,volume,amount,open,change,chgPercent,ADX,...,BOLL_middle,BOLL_lower,BIAS1,BIAS2,BIAS3,WR,PSY,MTM,BR,AR
0,0.545055,0.506458,0.453172,0.064121,0.093126,0.085742,0.662489,-3.243381,-3.020617,0.156413,...,0.419071,0.029213,-1.956213,-1.316621,0.568564,-0.288241,0.561207,-1.071762,1.153074,-0.440212
1,0.506185,0.496407,0.502615,-0.522274,-0.417261,-0.358727,0.454663,0.751549,0.724549,0.109487,...,0.449473,0.115612,-1.082312,-1.043832,0.674573,-0.109396,0.561207,-1.282363,1.169499,-0.195717
2,0.409011,0.456202,0.453172,-0.503559,-0.31472,-0.278028,0.43487,0.290596,0.28,0.043724,...,0.474392,0.194902,-1.145128,-1.237482,0.285953,-0.776113,-0.002911,-1.029642,1.279534,0.004824
3,0.447881,0.4361,0.512504,-0.575299,-0.323756,-0.281946,0.454663,0.9052,0.876791,-0.001271,...,0.497816,0.263268,-0.194351,-0.676781,0.442621,-0.874193,-0.002911,-0.566319,1.52211,0.168439
4,0.642229,0.576816,0.641055,-0.453653,-0.361593,-0.302819,0.514041,1.980759,1.89986,0.02714,...,0.523234,0.320298,1.090178,0.22576,0.954246,-0.244001,0.561207,-0.102997,1.548747,1.086252


In [28]:
# Step 2: Calculate the Covariance Matrix
cov_matrix = np.cov(standardized_df.T)
np.isnan(cov_matrix).any(), np.isinf(cov_matrix).any()


(np.False_, np.False_)

In [8]:
# Step 3: Calculate Eigenvalues and Eigenvectors
eigenvalues, eigenvectors = np.linalg.eig(cov_matrix)
np.isnan(eigenvalues).any(), np.isinf(eigenvalues).any()
np.isnan(eigenvectors).any(), np.isinf(eigenvectors).any()

(np.False_, np.False_)

In [9]:
# Step 4: Sort Eigenvalues and Select Principal Components
# Create a list of (eigenvalue, eigenvector) tuples
eigen_pairs = [(np.abs(eigenvalues[i]), eigenvectors[:, i]) for i in range(len(eigenvalues))]
eigen_pairs

[(np.float64(18.859763595199503),
  array([ 0.12232269,  0.11845215,  0.12205313,  0.1588825 ,  0.17204282,
          0.17251925,  0.11737252,  0.07402887,  0.0764687 ,  0.13572067,
          0.12371911,  0.06462469,  0.16714772,  0.11779803,  0.18536478,
          0.15924888,  0.18501873,  0.12982155,  0.16713194,  0.17382458,
          0.16560715,  0.18162359,  0.20954904,  0.17629974,  0.19296216,
          0.16074083,  0.10112649,  0.04692012,  0.02404187,  0.1119261 ,
          0.11865439,  0.06573947,  0.1071183 ,  0.04222018,  0.09285726,
          0.0988879 ,  0.01326545,  0.01730746, -0.09936903,  0.10240039,
          0.10235969,  0.14207448,  0.14407563,  0.1003058 ,  0.09410953,
          0.08695433,  0.07650053,  0.13452859,  0.17646522,  0.20335286,
          0.17629974,  0.15684445,  0.17382458,  0.1465423 ,  0.14529264])),
 (np.float64(11.153599727323163),
  array([-0.24397608, -0.24555387, -0.24302708, -0.06988229, -0.0300605 ,
         -0.06305754, -0.24770131,  0.069

In [10]:
# Sort the (eigenvalue, eigenvector) tuples from high to low
eigen_pairs.sort(key=lambda x: x[0], reverse=True)

In [20]:
# Select the top k eigenvectors (for example, let's keep 2 principal components)
k = 2
selected_eigenvectors = np.column_stack([eigen_pairs[i][1] for i in range(k)])
selected_eigenvectors


array([[ 0.12232269, -0.24397608],
       [ 0.11845215, -0.24555387],
       [ 0.12205313, -0.24302708],
       [ 0.1588825 , -0.06988229],
       [ 0.17204282, -0.0300605 ],
       [ 0.17251925, -0.06305754],
       [ 0.11737252, -0.24770131],
       [ 0.07402887,  0.06971452],
       [ 0.0764687 ,  0.07228711],
       [ 0.13572067,  0.01499691],
       [ 0.12371911, -0.00980045],
       [ 0.06462469,  0.0608053 ],
       [ 0.16714772,  0.11164423],
       [ 0.11779803,  0.05639641],
       [ 0.18536478,  0.04538246],
       [ 0.15924888,  0.0448486 ],
       [ 0.18501873,  0.04486767],
       [ 0.12982155, -0.06572   ],
       [ 0.16713194,  0.06273712],
       [ 0.17382458,  0.12788726],
       [ 0.16560715,  0.04634254],
       [ 0.18162359,  0.12790542],
       [ 0.20954904,  0.08980265],
       [ 0.17629974,  0.10280046],
       [ 0.19296216,  0.12714746],
       [ 0.16074083,  0.014321  ],
       [ 0.10112649, -0.26076703],
       [ 0.04692012, -0.11739614],
       [ 0.02404187,

In [14]:
# Step 5: Transform the Data
# Transform the original standardized data
pca_transformed_data = standardized_df.dot(selected_eigenvectors)
pca_transformed_data

Unnamed: 0,0,1
0,0.935544,-3.004253
1,1.414613,-1.878644
2,0.400508,-2.314777
3,1.147256,-1.804921
4,3.309434,-0.763167
...,...,...
964,1.829420,-2.977889
965,0.680970,-3.745384
966,-0.495955,-3.732409
967,-1.034510,-3.365787


In [14]:
print(data.shape)
print(pca_transformed_data.shape)

(969, 57)
(969, 2)


In [20]:
pca_transformed_df = pd.DataFrame(np.array(pca_transformed_data), columns=[f'PC{i+1}' for i in range(2)])
# Example of combining PCA features with target variable
final_df = pd.concat([pca_transformed_df, standardized_df['close']], axis=1)  # Make sure 'original_df' contains your target variable

In [21]:
# Display the transformed data
print("PCA Transformed Data:")
print(final_df.head())

PCA Transformed Data:
        PC1       PC2     close
0  0.935544  3.004253  0.453172
1  1.414613  1.878644  0.502615
2  0.400508  2.314777  0.453172
3  1.147256  1.804921  0.512504
4  3.309434  0.763167  0.641055


In [22]:
# If you want to save the transformed data to a CSV file:
output_path = 'pca_transformed_iADBL.csv'
final_df.to_csv(output_path, index=False)

In [26]:
from sklearn.decomposition import PCA
# PCA
pca = PCA(n_components=2)
pca_transformed_data = pca.fit_transform(standardized_df)
pca_transformed_data

array([18.8597636 , 11.15359973])