# PCA runtime test

- https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html
- https://scikit-learn.org/stable/auto_examples/decomposition/plot_pca_iris.html

## Load data

In [1]:
import yaml
import os.path
import pickle

# Set data paths
# Read the configuration inside a context manager so the file handle is closed
# as soon as parsing finishes instead of being left open until garbage collection.
with open("config.yaml", "r") as config_handle:
    config = yaml.safe_load(config_handle)
bow_768_file = os.path.join(config["EMBEDDINGS_DIRECTORY"], "amazon_drift_bow_768.pickle")
print("bow_768_file", bow_768_file)

# Load data
# `data` maps a dataset name to the object unpickled from the matching file.
# NOTE(security): pickle.load can execute arbitrary code while deserializing;
# only point EMBEDDINGS_DIRECTORY at files from a trusted source.
data = {}
with open(bow_768_file, "rb") as handle:
    data["bow_768"] = pickle.load(handle)

bow_768_file /home/eml4u/EML4U/data/amazon/amazon_drift_bow_768.pickle


## Split data

In [2]:
# Number of leading samples that go into the training split; everything after
# that index becomes the test split.
train_data_size = 500

# Both splits are slices of the same sequence, so look it up once.
embeddings = data["bow_768"]['orig'][0]
train_data, test_data = embeddings[:train_data_size], embeddings[train_data_size:]

# Report the number of samples and the length of the first sample of each split.
for split_label, split in (("Train Data. Size:", train_data), ("Test Data. Size:", test_data)):
    print(split_label, len(split), "Dimension:", len(split[0]))

Train Data. Size: 500 Dimension: 768
Test Data. Size: 9500 Dimension: 768


## Reduce dimensions

In [3]:
from sklearn.decomposition import PCA
import numpy as np
import time

# Number of principal components to keep.
target_dimensions = 50

# Time model construction, fitting and projection together.
# perf_counter() is a monotonic, high-resolution clock, so the measured interval
# cannot be distorted by system clock adjustments the way a time.time()
# difference can.
begin = time.perf_counter()
# With the default svd_solver='auto', scikit-learn may select the randomized
# solver for an input of this size; a fixed random_state keeps the projection
# reproducible across runs.
pca = PCA(n_components=target_dimensions, random_state=0)
pca.fit(train_data)
reduced_data = pca.transform(test_data)
# Stop the clock before any printing so console I/O is not part of the result.
runtime_seconds = time.perf_counter() - begin

print("Runtime (seconds)", runtime_seconds)
# Wall-clock timestamp (seconds since the epoch) recording when the run finished.
print(time.time())
print("Reduced Data. Size:", len(reduced_data),"Dimension:", len(reduced_data[0]))

Runtime (seconds) 0.06365585327148438
1626107475.7857342
Reduced Data. Size: 9500 Dimension: 50


In [4]:
# Report what the fitted model recorded about its training input.
print("n_components_", pca.n_components_)
# PCA.n_features_ was deprecated in scikit-learn 1.2 and removed in 1.4 in
# favour of n_features_in_. Prefer the new attribute and fall back to the old
# one so the cell also runs on releases that predate n_features_in_.
n_features = getattr(pca, "n_features_in_", None)
if n_features is None:
    n_features = pca.n_features_
# The printed label is kept unchanged so the output stays comparable to earlier runs.
print("n_features_", n_features)
print("n_samples_", pca.n_samples_)

# Just for interest: Test inverse transform (did not produce usable data)
# Disabled by default; set the flag to True to compare the first ten values of
# one test sample with its reconstruction from the reduced representation.
run_inverse_transform_check = False
if run_inverse_transform_check:
    index = 100
    print(test_data[index][:10])
    print(pca.inverse_transform(reduced_data[index])[:10])

n_components_ 50
n_features_ 768
n_samples_ 500
