# Normalizing and Scaling Structured Data

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Location of the UCI wine data file (a CSV without a header row)
wine_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data'
# The file carries no column names, so generate generic ones: feature_0 ... feature_13
wine_columns = [f"feature_{i}" for i in range(14)]
df_wine = pd.read_csv(wine_url, header=None, names=wine_columns)
# Preview the first five rows
display(df_wine.head(5))

Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,feature_11,feature_12,feature_13
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


In [5]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler, Normalizer

# Scaler classes to compare and, position for position, the label printed for each
scalers = [StandardScaler, MinMaxScaler, RobustScaler, Normalizer]
scaler_names = ["Standard Scaler (Z-Score)", "Min-Max Scaler", "RobustScaler", "Unit Vector Scaler (Normalizer)"]

# Every scaler exposes the same fit_transform API, so one loop covers all of them.
# NOTE(review): the whole frame is scaled, including feature_0, which is used as the
# class label further down in this notebook - confirm that is intended.
separator = "-" * 100
for position, scaler_name in enumerate(scaler_names):
    Scaler = scalers[position]
    print(scaler_name)
    # Build a fresh scaler and fit + apply it in one step
    scaler = Scaler()
    wine_scaled = scaler.fit_transform(df_wine)
    # Wrap the returned array in a dataframe with the original column names (for viewing)
    df_wine_scaled = pd.DataFrame(wine_scaled, columns=df_wine.columns)
    # Show the first 5 scaled rows, then a divider before the next scaler
    display(df_wine_scaled.head(5))
    print(separator)

Standard Scaler (Z-Score)


Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,feature_11,feature_12,feature_13
0,-1.213944,1.518613,-0.56225,0.232053,-1.169593,1.913905,0.808997,1.034819,-0.659563,1.224884,0.251717,0.362177,1.84792,1.013009
1,-1.213944,0.24629,-0.499413,-0.827996,-2.490847,0.018145,0.568648,0.733629,-0.820719,-0.544721,-0.293321,0.406051,1.113449,0.965242
2,-1.213944,0.196879,0.021231,1.109334,-0.268738,0.088358,0.808997,1.215533,-0.498407,2.135968,0.26902,0.318304,0.788587,1.395148
3,-1.213944,1.69155,-0.346811,0.487926,-0.809251,0.930918,2.491446,1.466525,-0.981875,1.032155,1.186068,-0.427544,1.184071,2.334574
4,-1.213944,0.2957,0.227694,1.840403,0.451946,1.281985,0.808997,0.663351,0.226796,0.401404,-0.319276,0.362177,0.449601,-0.037874


----------------------------------------------------------------------------------------------------
Min-Max Scaler


Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,feature_11,feature_12,feature_13
0,0.0,0.842105,0.1917,0.572193,0.257732,0.619565,0.627586,0.57384,0.283019,0.59306,0.372014,0.455285,0.970696,0.561341
1,0.0,0.571053,0.205534,0.417112,0.030928,0.326087,0.575862,0.510549,0.245283,0.274448,0.264505,0.463415,0.78022,0.550642
2,0.0,0.560526,0.320158,0.700535,0.412371,0.336957,0.627586,0.611814,0.320755,0.757098,0.375427,0.447154,0.695971,0.646933
3,0.0,0.878947,0.23913,0.609626,0.319588,0.467391,0.989655,0.664557,0.207547,0.55836,0.556314,0.308943,0.798535,0.857347
4,0.0,0.581579,0.365613,0.807487,0.536082,0.521739,0.627586,0.495781,0.490566,0.444795,0.259386,0.455285,0.608059,0.325963


----------------------------------------------------------------------------------------------------
RobustScaler


Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,feature_11,feature_12,feature_13
0,-0.5,0.897338,-0.10473,0.201439,-0.906977,1.526316,0.420804,0.553892,-0.358209,1.05,0.318792,0.222222,0.924949,0.80805
1,-0.5,0.114068,-0.057432,-0.633094,-1.930233,0.105263,0.27896,0.374251,-0.477612,-0.392857,-0.104027,0.251852,0.503043,0.77709
2,-0.5,0.08365,0.334459,0.892086,-0.209302,0.157895,0.420804,0.661677,-0.238806,1.792857,0.332215,0.192593,0.31643,1.055728
3,-0.5,1.003802,0.057432,0.402878,-0.627907,0.789474,1.413712,0.811377,-0.597015,0.892857,1.043624,-0.311111,0.543611,1.664603
4,-0.5,0.144487,0.489865,1.467626,0.348837,1.052632,0.420804,0.332335,0.298507,0.378571,-0.124161,0.222222,0.121704,0.126935


----------------------------------------------------------------------------------------------------
Unit Vector Scaler (Normalizer)


Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,feature_11,feature_12,feature_13
0,0.000932,0.013264,0.001594,0.002265,0.014542,0.118383,0.00261,0.002852,0.000261,0.002135,0.005257,0.000969,0.003654,0.992738
1,0.000948,0.012513,0.001687,0.002029,0.010617,0.094794,0.002512,0.002616,0.000246,0.001213,0.004152,0.000995,0.003223,0.995336
2,0.000841,0.011063,0.001984,0.002245,0.015636,0.084906,0.002354,0.002724,0.000252,0.002362,0.004775,0.000866,0.002665,0.996175
3,0.000674,0.00968,0.001314,0.001684,0.011317,0.076119,0.002593,0.002351,0.000162,0.001468,0.005254,0.000579,0.002324,0.996961
4,0.001343,0.017775,0.003477,0.003853,0.028193,0.158417,0.003759,0.003611,0.000524,0.002443,0.0058,0.001396,0.003934,0.986752


----------------------------------------------------------------------------------------------------


In [6]:
from sklearn.preprocessing import KBinsDiscretizer

n_bins = 10

feature_name = "feature_1"
# Column that will hold the ordinal bin index of the feature
# (the "_scaled" suffix is historical; the values are bin indices, not scaled values)
scaled_name = f"{feature_name}_scaled"
# Create a KBinsDiscretizer object with 10 bins, each with the same width, encoded into a single column
bin_encoder = KBinsDiscretizer(n_bins=n_bins, encode='ordinal', strategy='uniform')

# Transform the feature in the original DataFrame to its binned representation.
# fit_transform returns one column per input feature; flatten it to assign as a single column
df_wine[scaled_name] = bin_encoder.fit_transform(df_wine[[feature_name]]).ravel()
print("Uniform Bins, Ordinal Encoding")
display(df_wine[[feature_name, scaled_name]].head(5))
print()


# Create a KBinsDiscretizer object with 10 bins, each with the same width, encoded as one 0/1 column per bin
bin_encoder = KBinsDiscretizer(n_bins=n_bins, encode='onehot-dense', strategy='uniform')
# Create the onehot column names
onehot_cols = [f"{feature_name}_bin_{i}" for i in range(n_bins)]
# Transform the feature in the original DataFrame to its binned representation, returning a numpy array
feature_onehot = bin_encoder.fit_transform(df_wine[[feature_name]])
# Create a dataframe from this output, aligned on the original row index
df_onehot = pd.DataFrame(feature_onehot, columns=onehot_cols, index=df_wine.index)
# Attach it to the original dataframe. Any one-hot columns left over from a previous run of
# this cell are dropped first, so re-running does not create suffixed duplicate columns
df_wine = df_wine.drop(columns=onehot_cols, errors="ignore").join(df_onehot)
print("Uniform Bins, One-Hot Encoding")
display(df_wine[[feature_name] + onehot_cols].head(5))

Uniform Bins, Ordinal Encoding


Unnamed: 0,feature_1,feature_1_scaled
0,14.23,8.0
1,13.2,5.0
2,13.16,5.0
3,14.37,8.0
4,13.24,5.0



Uniform Bins, Ordinal Encoding


Unnamed: 0,feature_1,feature_1_bin_0,feature_1_bin_1,feature_1_bin_2,feature_1_bin_3,feature_1_bin_4,feature_1_bin_5,feature_1_bin_6,feature_1_bin_7,feature_1_bin_8,feature_1_bin_9
0,14.23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,13.2,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,13.16,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,14.37,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,13.24,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [25]:
# Look at uniform vs quantile binning strategies
import plotly.express as px


# --- Uniform strategy: every bin spans the same value range, so the number of samples per bin varies ---
bin_encoder = KBinsDiscretizer(n_bins=n_bins, encode='onehot-dense', strategy='uniform')
# One-hot encode the feature: one row per sample, one 0/1 column per bin
uniform_bins = bin_encoder.fit_transform(df_wine[[feature_name]]).astype("int32")
# Summing each one-hot column gives the number of samples that fell into that bin
bin_count = uniform_bins.sum(axis=0)
# Bin boundaries learned for the feature, rounded for display
bins = bin_encoder.bin_edges_[0].round(3)
# Label each bin with its "lower-upper" pair of neighbouring edges
cols = [f"{lower}-{upper}" for lower, upper in zip(bins[:-1], bins[1:])]
display(px.bar(x=cols, y=bin_count, title="Samples per bin (Uniform)"))


# --- Quantile strategy: bin widths differ so that samples are spread evenly across the bins ---
bin_encoder = KBinsDiscretizer(n_bins=n_bins, encode='onehot-dense', strategy='quantile')
# One-hot encode the feature using the quantile bins
quantile_bins = bin_encoder.fit_transform(df_wine[[feature_name]]).astype("int32")
# Number of samples per bin
bin_count = quantile_bins.sum(axis=0)
# Bin boundaries learned for the feature, rounded for display
bins = bin_encoder.bin_edges_[0].round(3)
cols = [f"{lower}-{upper}" for lower, upper in zip(bins[:-1], bins[1:])]
display(px.bar(x=cols, y=bin_count, title="Samples per bin (Quantile)"))

# Unstructured Data Preprocessing

## Text

In [26]:
# Removing tags from an HTML document
import re

# Matches a single tag: "<", everything up to the next ">", then ">".
# [^>] (unlike ".") also matches newlines, so a tag whose attributes wrap onto
# another line is still removed. Compiled once instead of on every call.
_HTML_TAG_RE = re.compile(r'<[^>]*>')


def clean_html_tags(text):
    """Return ``text`` with every HTML tag removed.

    Only the tags themselves are dropped; the text between them is kept as is.
    This is a regex-based cleanup, not an HTML parser: entities such as
    ``&amp;`` are left untouched and a literal ``>`` inside an attribute value
    ends the match early.

    Args:
        text: The string to strip tags from.

    Returns:
        The input string without ``<...>`` sequences.
    """
    return _HTML_TAG_RE.sub('', text)

text = '<p>This is an <strong>example</strong> text with <a href="#">HTML tags</a>.</p>'
print(clean_html_tags(text))

This is an example text with HTML tags.


In [45]:
# Bag of Words and TF-IDF for text vectorization
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Fetch the training split of the 20 newsgroups dataset and pull out texts and targets
newsgroups = fetch_20newsgroups(subset='train')
documents, labels = newsgroups.data, newsgroups.target

# --- Bag of Words: vectorize the documents, ignoring English stop words ---
count_vectorizer = CountVectorizer(stop_words='english')
bow_matrix = count_vectorizer.fit_transform(documents)
# Vocabulary learned by the vectorizer (one entry per matrix column)
bow_features = count_vectorizer.get_feature_names_out()
print('Bag of Words Matrix Shape (num_samples, num_words):', bow_matrix.shape)

# --- TF-IDF: same documents and stop-word setting, TF-IDF vectorizer instead ---
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)
# Vocabulary learned by the vectorizer (one entry per matrix column)
tfidf_features = tfidf_vectorizer.get_feature_names_out()
print('TF-IDF Matrix Shape (num_samples, num_words):', tfidf_matrix.shape)



Bag of Words Matrix Shape (num_samples, num_words): (11314, 129796)
TF-IDF Matrix Shape (num_samples, num_words): (11314, 129796)


In [78]:
# Download the pre-trained GloVe 6B word-vector archive and unzip it into the working directory
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip
# Convert the 50-dimensional GloVe file to word2vec text format; the converted file is the
# one loaded with KeyedVectors.load_word2vec_format in the Doc2Vec / GloVe cell
!python -m gensim.scripts.glove2word2vec --input  glove.6B.50d.txt --output glove.6B.50d.word2vecformat.txt


2023-03-25 16:45:20,682 - glove2word2vec - INFO - running /Users/benepstein/Documents/GitHub/dcai/.venv/lib/python3.9/site-packages/gensim/scripts/glove2word2vec.py --input glove.6B.50d.txt --output glove.6B.50d.word2vecformat.txt
  num_lines, num_dims = glove2word2vec(args.input, args.output)
2023-03-25 16:45:20,682 - keyedvectors - INFO - loading projection weights from glove.6B.50d.txt
2023-03-25 16:45:32,866 - utils - INFO - KeyedVectors lifecycle event {'msg': 'loaded (400000, 50) matrix of type float32 from glove.6B.50d.txt', 'binary': False, 'encoding': 'utf8', 'datetime': '2023-03-25T16:45:32.824002', 'gensim': '4.3.1', 'python': '3.9.8 (main, Aug 24 2022, 16:09:18) \n[Clang 13.0.0 (clang-1300.0.29.3)]', 'platform': 'macOS-12.0.1-x86_64-i386-64bit', 'event': 'load_word2vec_format'}
2023-03-25 16:45:32,867 - glove2word2vec - INFO - converting 400000 vectors from glove.6B.50d.txt to glove.6B.50d.word2vecformat.txt
2023-03-25 16:45:33,175 - keyedvectors - INFO - storing 400000x50 

In [82]:
# Doc2Vec and GloVe
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument, KeyedVectors

sentences = ["The bat flew across the river", "He hit the ball with his bat"]
# Both models below look up individual word tokens, so split each sentence into words.
# Lowercasing matters: the glove.6B vectors are uncased, so "The" / "He" would otherwise
# be treated as unknown words and silently dropped
tokenized_sentences = [sentence.lower().split() for sentence in sentences]

# Tag every tokenized document of the toy corpus with its position
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(common_texts)]
# Create a Doc2Vec model that converts each document into a vector of size 100
model = Doc2Vec(documents, vector_size=100, window=2, min_count=1, workers=4)

# Get a vector for each sentence. infer_vector expects a list of word tokens;
# passing [sentence] would present the whole sentence as one (unknown) word
d2v_vectors = np.array([model.infer_vector(tokens) for tokens in tokenized_sentences])
print("Doc2Vec embedding shape (num_sentences, sentence_embedding_size)", d2v_vectors.shape)



# Load the pre-trained GloVe model
glove_model = KeyedVectors.load_word2vec_format('glove.6B.50d.word2vecformat.txt', binary=False)
glove_vectors = []
for tokens in tokenized_sentences:
    # Keep the embeddings of the words GloVe knows; unknown words are skipped
    word_embeddings = [glove_model[word] for word in tokens if word in glove_model]
    if word_embeddings:
        # Sentence embedding = average of its word embeddings
        glove_vectors.append(np.mean(word_embeddings, axis=0))
    else:
        # No known word at all: use a zero vector rather than the NaN that np.mean([]) yields
        glove_vectors.append(np.zeros(glove_model.vector_size, dtype=np.float32))

glove_vectors = np.array(glove_vectors)
print("GloVe embedding shape (num_sentences, sentence_embedding_size)", glove_vectors.shape)

Doc2Vec embedding shape (num_sentences, sentence_embedding_size) (2, 100)
GloVe embedding shape (num_sentences, sentence_embedding_size) (2, 50)


In [3]:
# Extract text embeddings using BERT
from sentence_transformers import SentenceTransformer

# Two sentences that both contain the word "bat"
sentences = ["The bat flew across the river", "He hit the ball with his bat"]
# "bert-base-uncased" is not a packaged sentence-transformers model: per the log output of
# this cell, SentenceTransformer builds a new model around it with MEAN pooling
model = SentenceTransformer("bert-base-uncased")
# Encode the sentences; the result has one row per sentence
embeddings = model.encode(sentences)
print("BERT embeddings shape (num_sentences, sentence_embedding_size)", embeddings.shape)


No sentence-transformers model found with name /Users/benepstein/.cache/torch/sentence_transformers/bert-base-uncased. Creating a new one with MEAN pooling.
Some weights of the model checkpoint at /Users/benepstein/.cache/torch/sentence_transformers/bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassificat

BERT embeddings shape (num_sentences, sentence_embedding_size) (2, 768)


## Images

In [107]:
# Extract Image embeddings using CLIP
from sentence_transformers import SentenceTransformer
from sklearn.datasets import fetch_openml
import plotly.express as px
from PIL import Image

# Load the MNIST dataset. This may take a while
mnist = fetch_openml('mnist_784')

# Extract the features and labels
X, y = mnist['data'].values, mnist['target']

model = SentenceTransformer("clip-ViT-B-32")
image_idx = 0
image_name = str(y[image_idx])
# Each row of X is one flattened image (784 values)
image = X[image_idx]
# Restore the 28x28 pixel grid once and use it for both the plot and the embedding
image_2d = image.reshape(28, 28)
print("Image label:", image_name)
# Create a figure with a single image trace
fig = px.imshow(image_2d, color_continuous_scale='gray')

# Set the figure layout
fig.update_layout(title='MNIST Image Example', width=300, height=300)

# Show the plot
fig.show()

# Get the embedding for the image.
# PIL must be given the 2-D pixel grid as 8-bit integers (MNIST intensities are 0-255) to build
# a 28x28 grayscale image; the flat float row is not interpreted as a 28x28 picture, so
# embedding it would not describe the digit shown above
image_emb = model.encode(Image.fromarray(image_2d.astype("uint8")))
print("Image embedding shape", image_emb.shape)

Image label: 5


Image embedding shape (512,)


## Audio

In [155]:
from datasets import load_dataset
from transformers import AutoProcessor, HubertModel

# Load our processor and model for HuBERT
processor = AutoProcessor.from_pretrained("facebook/hubert-large-ls960-ft")
model = HubertModel.from_pretrained("facebook/hubert-large-ls960-ft")

# Don't download the full 6.4GB of audio. Stream in each sample one at a time
dset_iter = load_dataset("librispeech_asr", 'clean', split="test", streaming=True)
# Get a single audio sample
audio_sample = next(iter(dset_iter))
sampling_rate = audio_sample["audio"]["sampling_rate"]
# HuBERT can only encode audio with 16khz sampling rate
assert sampling_rate == 16000
# Get our audio embeddings.
# Pass the sampling rate explicitly: the processor warns that omitting it
# "can result in silent errors that might be hard to debug"
input_values = processor(
    audio_sample["audio"]["array"],
    sampling_rate=sampling_rate,
    return_tensors="pt",
).input_values
audio_embeddings = model(input_values).last_hidden_state.detach().numpy()
print("Audio embeddings shape", audio_embeddings.shape)

Some weights of the model checkpoint at facebook/hubert-large-ls960-ft were not used when initializing HubertModel: ['lm_head.bias', 'lm_head.weight']
- This IS expected if you are initializing HubertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing HubertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
It is strongly recommended to pass the ``sampling_rate`` argument to this function. Failing to do so can result in silent errors that might be hard to debug.


Audio embeddings shape (175, 1024)


## Video

### TODO

## Curse of Dimensionality

In [1]:
import math
import numpy as np
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from scipy.spatial import distance


def get_normalized_data(num_rows: int, num_features: int, seed: "int | None" = None) -> np.ndarray:
    """Generate a normalized dataset of dim (num_rows, num_features).

    Values are drawn uniformly from [0, 1) and then standardized per column
    (column mean subtracted, divided by the column standard deviation).

    Args:
        num_rows: Number of rows (samples) to generate.
        num_features: Number of columns (features) to generate.
        seed: Optional seed for a reproducible result. When None (the default)
            numpy's global random state is used, exactly as before.

    Returns:
        Array of shape (num_rows, num_features). A column with zero variance
        (e.g. when num_rows == 1) comes back as all zeros instead of NaN.
    """
    if seed is None:
        data = np.random.rand(num_rows, num_features)
    else:
        data = np.random.default_rng(seed).random((num_rows, num_features))

    std = data.std(axis=0)
    # A constant column has std 0; dividing by 1 instead leaves its centered values (0) intact
    std[std == 0] = 1.0

    return (data - data.mean(axis=0)) / std

num_trials = 2  # You can run this more times for further validation
features_to_test = [2,10,25,50,75,100]  # The feature/dimension sizes we will test
charts_per_row = 3  # For the visualization
num_rows = 100  # The number of rows in the dataset
for trial in range(num_trials):
    print(f"Trial {trial}\n")

    # One subplot per feature count, laid out in a grid charts_per_row wide
    fig = make_subplots(
        rows=math.ceil(len(features_to_test)/charts_per_row),  # We want max 3 charts per row
        cols=charts_per_row,
        subplot_titles=([f"{i} features" for i in features_to_test])
    )

    for idx, num_features in enumerate(features_to_test):
        data = get_normalized_data(num_rows, num_features)
        # Distance between every pair of distinct rows. pdist yields each pair once and leaves
        # out the zero distance of a row to itself, which cdist(data, data) would add to the
        # histogram as a spurious spike at 0 (plus a duplicate of every pair)
        dist = distance.pdist(data, metric="euclidean")

        # Plotly rows/cols start at 1, not 0
        grid_row, grid_col = divmod(idx, charts_per_row)
        fig.add_trace(
            go.Histogram(x=dist),
            row=grid_row + 1, col=grid_col + 1
        )

    fig.show()

Trial 0



Trial 1



## Dimensionality reduction - Load the wine dataset

In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Re-read the raw wine CSV; no names are supplied, so columns get integer labels
wine_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data'
df_wine = pd.read_csv(wine_url, header=None)

# First column is the target; every remaining column is a feature
y = df_wine.iloc[:, 0].values
X = df_wine.iloc[:, 1:].values
# Hold out 30% for testing, stratified on the target, with a fixed random_state
split_options = dict(test_size=0.3, stratify=y, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

### Calculate eigenvalues and eigenvectors

In [59]:
import numpy as np

# Square random matrix to decompose
matrix_size = 4
matrix = np.random.rand(matrix_size, matrix_size)
# eig returns the eigenvalues and a matrix whose columns are the matching eigenvectors
eigenvalues, eigenvectors = np.linalg.eig(matrix)

In [82]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply them to both splits
sc = StandardScaler().fit(X_train)
X_train_scaled = sc.transform(X_train)
X_test_scaled = sc.transform(X_test)


# Keep 2 principal components to represent the data
pca = PCA(n_components=2)

# Fit on the scaled training data and project it; the test data is projected with the same fit
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

# Report the fitted components, their variances, and the covariance estimated by the model
print(f"Eigenvectors:\n {pca.components_}\n\n")
print(f"Eigenvalues:\n {pca.explained_variance_}\n\n")
print(f"Covariance:\n {pca.get_covariance()}")

Eigenvectors:
 [[-0.13724218  0.24724326 -0.02545159  0.20694508 -0.15436582 -0.39376952
  -0.41735106  0.30572896 -0.30668347  0.07554066 -0.32613263 -0.36861022
  -0.29669651]
 [ 0.50303478  0.16487119  0.24456476 -0.11352904  0.28974518  0.05080104
  -0.02287338  0.09048885  0.00835233  0.54977581 -0.20716433 -0.24902536
   0.38022942]]


Eigenvalues:
 [4.84274532 2.41602459]


Covariance:
 [[ 1.08959959  0.01000289  0.24689732 -0.23006639  0.36600255  0.28114279
   0.22525494 -0.09511354  0.18937599  0.4764708  -0.00341797 -0.01796752
   0.53599232]
 [ 0.01000289  0.84630413  0.04885638  0.18531307 -0.07451803 -0.40394183
  -0.45196825  0.35399632 -0.32430405  0.25133403 -0.41199572 -0.47027942
  -0.1981175 ]
 [ 0.24689732  0.04885638  0.64704576 -0.07503063  0.15047556  0.06662031
   0.03525286  0.00815766  0.03750089  0.24509123 -0.05969209 -0.07432402
   0.20779538]
 [-0.23006639  0.18531307 -0.07503063  0.74046004 -0.1997118  -0.36218309
  -0.36745995  0.25340667 -0.27540474 -0

### Cumulative explained variance by number of components

In [91]:
# Fit one PCA per component count (2 up to, but not including, the number of features)
# and print the share of variance those components explain together
n_features = X_train_scaled.shape[1]
for nc in range(2, n_features):
    pca_ = PCA(n_components=nc).fit(X_train_scaled)
    print(f"For {nc} components, variance={pca_.explained_variance_ratio_.sum()}")

For 2 components, variance=0.5538639565949182
For 3 components, variance=0.6720155475408881
For 4 components, variance=0.7453580651787426
For 5 components, variance=0.8095791433960593
For 6 components, variance=0.8600963882451358
For 7 components, variance=0.8996429271575503
For 8 components, variance=0.9260821103267703
For 9 components, variance=0.9499753029186233
For 10 components, variance=0.9662714406558743
For 11 components, variance=0.9800716518778227
For 12 components, variance=0.9917939143209088


0.5538639565949182

In [102]:
import plotly.express as px

# Plot the first principal component against the second, colored by class.
# x=0 / y=1 select the two columns of the array; without them plotly treats the array as
# wide-form data and plots row index vs value instead of component vs component
px.scatter(
    X_train_pca,
    x=0,
    y=1,
    color=[f"Class_{i}" for i in y_train],
    labels={"0": "PC 1", "1": "PC 2"},
)

## t-SNE application and visualization

In [208]:
from sklearn.manifold import TSNE
import plotly.express as px

# random_state pins the otherwise random embedding so the plot is reproducible on re-run
tsne = TSNE(n_components=2, perplexity=60, learning_rate="auto", random_state=0)

X_train_tsne = tsne.fit_transform(X_train_scaled)

# x=0 / y=1 select the two embedding columns; without them plotly treats the array as
# wide-form data and plots row index vs value instead of dimension 1 vs dimension 2
px.scatter(
    X_train_tsne,
    x=0,
    y=1,
    color=[f"Class_{i}" for i in y_train],
    labels={"0": "t-SNE 1", "1": "t-SNE 2"},
)


The default initialization in TSNE will change from 'random' to 'pca' in 1.2.



In [133]:
# Exploratory call: prints the full TSNE docstring (long output, not used by later cells)
help(TSNE)

Help on class TSNE in module sklearn.manifold._t_sne:

class TSNE(sklearn.base.BaseEstimator)
 |  TSNE(n_components=2, *, perplexity=30.0, early_exaggeration=12.0, learning_rate='warn', n_iter=1000, n_iter_without_progress=300, min_grad_norm=1e-07, metric='euclidean', metric_params=None, init='warn', verbose=0, random_state=None, method='barnes_hut', angle=0.5, n_jobs=None, square_distances='deprecated')
 |  
 |  T-distributed Stochastic Neighbor Embedding.
 |  
 |  t-SNE [1] is a tool to visualize high-dimensional data. It converts
 |  similarities between data points to joint probabilities and tries
 |  to minimize the Kullback-Leibler divergence between the joint
 |  probabilities of the low-dimensional embedding and the
 |  high-dimensional data. t-SNE has a cost function that is not convex,
 |  i.e. with different initializations we can get different results.
 |  
 |  It is highly recommended to use another dimensionality reduction
 |  method (e.g. PCA for dense data or TruncatedS

In [118]:
# %pip (rather than !pip) installs into the environment of the running kernel
%pip install umap_learn

You should consider upgrading via the '/Users/benepstein/Documents/Github/dcai/.venv/bin/python -m pip install --upgrade pip' command.[0m


In [156]:
from umap import UMAP

# random_state pins the otherwise random embedding so the plot is reproducible on re-run
umap = UMAP(n_neighbors=4, min_dist=0.05, random_state=0)
X_train_umap = umap.fit_transform(X_train_scaled)
# x=0 / y=1 select the two embedding columns; without them plotly treats the array as
# wide-form data and plots row index vs value instead of dimension 1 vs dimension 2
px.scatter(
    X_train_umap,
    x=0,
    y=1,
    color=[f"Class_{i}" for i in y_train],
    labels={"0": "UMAP 1", "1": "UMAP 2"},
)