In [70]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold

In [71]:
df = pd.read_csv("data/ASML_v1.csv")
df

Unnamed: 0.1,Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,obv,ma6,ma10,rsi6,rsi10,macd,dif,atr10,atr20,bias6,bias10,tapi
0,0,2016-11-17,101.239998,102.849998,101.190002,102.559998,97.645622,1344700,1344700,,,,,,,0.000000,0.000000,,,13111.349710
1,1,2016-11-18,103.580002,104.750000,103.379997,104.589996,99.578346,1914300,3259000,,,,,,,0.000000,0.000000,,,18302.897726
2,2,2016-11-21,104.089996,105.110001,104.019997,105.089996,100.054390,1408200,4667200,,,,,,,0.000000,0.000000,,,13399.943416
3,3,2016-11-22,105.449997,105.610001,104.800003,105.320000,100.273384,876300,5543500,,,,,,,0.000000,0.000000,,,8320.357007
4,4,2016-11-23,104.480003,105.349998,104.250000,105.099998,100.063911,432300,5111200,,,,,,,0.000000,0.000000,,,4113.225578
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1253,1253,2021-11-10,829.609985,835.440002,815.109985,817.849976,817.849976,737700,86884600,840.889994,829.088001,43.364854,51.307540,14.121893,4.518819,19.559911,20.038150,-3.840003,-1.123802,901.999171
1254,1254,2021-11-11,833.270020,839.039978,829.200012,833.390015,833.390015,416000,87300600,841.451660,831.108002,55.059322,57.428905,13.815968,3.370316,19.722920,20.095742,-1.343608,0.228201,499.166048
1255,1255,2021-11-12,840.000000,855.000000,835.099976,851.630005,851.630005,531200,87831800,841.641663,834.983002,65.184828,63.425384,14.873879,3.542582,19.911627,20.171455,1.664724,1.664700,623.745050
1256,1256,2021-11-15,855.000000,861.599976,852.989990,856.719971,856.719971,638000,88469800,843.261658,839.909998,67.627289,64.955921,15.939261,3.686371,18.917461,19.661380,2.243052,1.680997,744.700744


In [72]:
sel = VarianceThreshold(0)
sel.fit_transform(df.drop(["Date"],axis=1))
df.drop(['Unnamed: 0'], axis=1, inplace=True)

In [73]:
df.dropna(inplace=True)

In [74]:
scaler = StandardScaler()
scaler.fit(df.drop(["Date"],axis=1))
X=scaler.transform(df.drop(["Date"],axis=1))

In [75]:
model = PCA(n_components=18).fit(X)
X_pc = model.transform(X)

# number of components
n_pcs= model.components_.shape[0]

# get the index of the most important feature on EACH component
# most_important = [np.abs(model.components_[i]).argmax() for i in range(n_pcs)]

initial_feature_names = ['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume', 'obv', 'ma6', 'ma10', 'rsi6', 'rsi10', 'macd', 'dif', 'atr10', 'atr20', 'bias6', 'bias10', 'tapi']

In [82]:
# Rank the lists withn each pc

feat_rank = [np.abs(model.components_[i]).argsort() for i in range(n_pcs)]

dic_rank = {'PC{}'.format(i): [initial_feature_names[feat_rank[i][j]] for j in range(len(initial_feature_names))][::-1] for i in range(n_pcs)}

In [83]:
dic_rank

{'PC0': ['Adj Close',
  'Close',
  'Low',
  'High',
  'Open',
  'ma6',
  'ma10',
  'atr20',
  'obv',
  'atr10',
  'macd',
  'tapi',
  'bias10',
  'bias6',
  'rsi10',
  'rsi6',
  'dif',
  'Volume'],
 'PC1': ['rsi6',
  'bias10',
  'rsi10',
  'dif',
  'bias6',
  'macd',
  'atr10',
  'Volume',
  'atr20',
  'ma10',
  'ma6',
  'obv',
  'Open',
  'High',
  'Low',
  'Adj Close',
  'Close',
  'tapi'],
 'PC2': ['Volume',
  'tapi',
  'atr10',
  'bias6',
  'atr20',
  'bias10',
  'dif',
  'rsi6',
  'macd',
  'obv',
  'rsi10',
  'High',
  'Close',
  'Adj Close',
  'Open',
  'Low',
  'ma6',
  'ma10'],
 'PC3': ['macd',
  'bias6',
  'rsi10',
  'bias10',
  'atr10',
  'atr20',
  'rsi6',
  'dif',
  'tapi',
  'Volume',
  'obv',
  'ma6',
  'ma10',
  'Open',
  'Low',
  'High',
  'Close',
  'Adj Close'],
 'PC4': ['dif',
  'rsi6',
  'macd',
  'rsi10',
  'bias6',
  'bias10',
  'obv',
  'atr10',
  'tapi',
  'atr20',
  'Volume',
  'ma10',
  'Close',
  'Adj Close',
  'High',
  'Low',
  'Open',
  'ma6'],
 'PC5': ['

In [77]:
most_important = [np.abs(model.components_[i]).argmax() for i in range(n_pcs)]
most_important_names = [initial_feature_names[most_important[i]] for i in range(n_pcs)]
dic = {'PC{}'.format(i): most_important_names[i] for i in range(n_pcs)}


In [78]:
most_important_names

['Adj Close',
 'rsi6',
 'Volume',
 'macd',
 'dif',
 'bias6',
 'obv',
 'atr10',
 'tapi',
 'bias10',
 'rsi10',
 'atr20',
 'Open',
 'Low',
 'High',
 'Adj Close',
 'Close',
 'ma10']