## Curse of Dimensionality

In [6]:
import math
import numpy as np
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from scipy.spatial import distance


def get_normalized_data(num_rows: int, num_features: int) -> np.ndarray:
    """Generate a normalized dataset of dim (num_rows, num_features)"""
    data = np.random.rand(num_rows, num_features)
    data = (data - data.mean(axis=0))/ data.std(axis=0)    

    return data

num_trials = 2  # You can run this more times for further validation
features_to_test = [2,10,25,50,75,100]  # The feature/dimension sizes we will test
charts_per_row = 3  # For the visualization
num_rows = 100  # The number of rows in the dataset
for trial in range(num_trials):
    print(f"Trial {trial}\n")
    
    fig = make_subplots(
        rows=math.ceil(len(features_to_test)/charts_per_row),  # We want max 3 charts per row 
        cols=charts_per_row,
        subplot_titles=([f"{i} features" for i in features_to_test])
    )


    for idx, num_features in enumerate(features_to_test):
        data = get_normalized_data(num_rows, num_features)
        dist = distance.cdist(data, data, metric="euclidean").flatten()

        fig_row = idx+1  # Plotly rows start at 1, not 0
        fig.add_trace(
            go.Histogram(x=dist),
            row=math.ceil(fig_row/charts_per_row), col=1+(idx%charts_per_row)
        )
        
    fig.show()

Trial 0



Trial 1



## Dimensionality reduction - Load wine data

In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split

wine_url = (
    'https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data'
)
df_wine = pd.read_csv(wine_url, header=None)

X, y = df_wine.iloc[:, 1:].values, df_wine.iloc[:, 0].values
X_train, X_test, y_train, y_test = train_test_split(  # Split into training and testing sets
    X, 
    y, 
    test_size=0.3,
    stratify=y,
    random_state=0
)

### Calculate eigenvector

In [59]:
import numpy as np

matrix = np.random.rand(4,4)
eigenvalues, eigenvectors = np.linalg.eig(matrix)

In [82]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()  # Scale the data with a mean of 0
X_train_scaled = sc.fit_transform(X_train)
X_test_scaled = sc.transform(X_test)


pca = PCA(n_components=2) # We want 2 components to represent our data

X_train_pca = pca.fit_transform(X_train_scaled) # Fit and transform the data
X_test_pca = pca.transform(X_test_scaled)

print(f"Eigenvectors:\n {pca.components_}\n\n")
print(f"Eigenvalues:\n {pca.explained_variance_}\n\n")
print(f"Covariance:\n {pca.get_covariance()}")

Eigenvectors:
 [[-0.13724218  0.24724326 -0.02545159  0.20694508 -0.15436582 -0.39376952
  -0.41735106  0.30572896 -0.30668347  0.07554066 -0.32613263 -0.36861022
  -0.29669651]
 [ 0.50303478  0.16487119  0.24456476 -0.11352904  0.28974518  0.05080104
  -0.02287338  0.09048885  0.00835233  0.54977581 -0.20716433 -0.24902536
   0.38022942]]


Eigenvalues:
 [4.84274532 2.41602459]


Covariance:
 [[ 1.08959959  0.01000289  0.24689732 -0.23006639  0.36600255  0.28114279
   0.22525494 -0.09511354  0.18937599  0.4764708  -0.00341797 -0.01796752
   0.53599232]
 [ 0.01000289  0.84630413  0.04885638  0.18531307 -0.07451803 -0.40394183
  -0.45196825  0.35399632 -0.32430405  0.25133403 -0.41199572 -0.47027942
  -0.1981175 ]
 [ 0.24689732  0.04885638  0.64704576 -0.07503063  0.15047556  0.06662031
   0.03525286  0.00815766  0.03750089  0.24509123 -0.05969209 -0.07432402
   0.20779538]
 [-0.23006639  0.18531307 -0.07503063  0.74046004 -0.1997118  -0.36218309
  -0.36745995  0.25340667 -0.27540474 -0

### See the variance per component amount

In [91]:
for nc in range(2, X_train_scaled.shape[1]):
    pca_ = PCA(n_components=nc)
    pca_.fit_transform(X_train_scaled)
    print(f"For {nc} components, variance={pca_.explained_variance_ratio_.sum()}")

For 2 components, variance=0.5538639565949182
For 3 components, variance=0.6720155475408881
For 4 components, variance=0.7453580651787426
For 5 components, variance=0.8095791433960593
For 6 components, variance=0.8600963882451358
For 7 components, variance=0.8996429271575503
For 8 components, variance=0.9260821103267703
For 9 components, variance=0.9499753029186233
For 10 components, variance=0.9662714406558743
For 11 components, variance=0.9800716518778227
For 12 components, variance=0.9917939143209088


0.5538639565949182

In [102]:
import plotly.express as px

px.scatter(X_train_pca, color=[f"Class_{i}" for i in y_train])

## t-SNE application and visualization

In [208]:
from sklearn.manifold import TSNE
import plotly.express as px

tsne = TSNE(n_components=2, perplexity=60, learning_rate="auto")

X_train_tsne = tsne.fit_transform(X_train_scaled)

px.scatter(X_train_tsne, color=[f"Class_{i}" for i in y_train])


The default initialization in TSNE will change from 'random' to 'pca' in 1.2.



In [133]:
help(TSNE)

Help on class TSNE in module sklearn.manifold._t_sne:

class TSNE(sklearn.base.BaseEstimator)
 |  TSNE(n_components=2, *, perplexity=30.0, early_exaggeration=12.0, learning_rate='warn', n_iter=1000, n_iter_without_progress=300, min_grad_norm=1e-07, metric='euclidean', metric_params=None, init='warn', verbose=0, random_state=None, method='barnes_hut', angle=0.5, n_jobs=None, square_distances='deprecated')
 |  
 |  T-distributed Stochastic Neighbor Embedding.
 |  
 |  t-SNE [1] is a tool to visualize high-dimensional data. It converts
 |  similarities between data points to joint probabilities and tries
 |  to minimize the Kullback-Leibler divergence between the joint
 |  probabilities of the low-dimensional embedding and the
 |  high-dimensional data. t-SNE has a cost function that is not convex,
 |  i.e. with different initializations we can get different results.
 |  
 |  It is highly recommended to use another dimensionality reduction
 |  method (e.g. PCA for dense data or TruncatedS

In [118]:
!pip install umap_learn

You should consider upgrading via the '/Users/benepstein/Documents/Github/dcai/.venv/bin/python -m pip install --upgrade pip' command.[0m


In [156]:
from umap import UMAP

umap = UMAP(n_neighbors=4, min_dist=0.05)
X_train_umap = umap.fit_transform(X_train_scaled)
px.scatter(X_train_umap, color=[f"Class_{i}" for i in y_train])