<a href="https://colab.research.google.com/github/Bryan-Az/Dimensionality_Reduction/blob/main/Dimensionality_Reduction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%load_ext autoreload
%autoreload 2
import pandas as pd
import os
import shutil
import sys
import zipfile
import gensim
import re

# Dimensionality Reduction
In this notebook assignment I will be using the following techniques:
1. PCA (Principal Component Analysis) - multivariate on distinct features
2. SVD (Singular Value Decomposition) - univariate on distinct feature
3. MDS (Multidimensional Scaling) - multivariate on distinct feature
4. ISOMap (Isometric Mapping)  - multivariate on distinct feature
5. LLE (Locally Linear Embedding) - multivariate on distinct feature

and seperately,

1. UMAP (Uniform Manifold Approximation and Projection)
2. t-SNE (t-Distributed Stochastic Neighbor Embedding)

A key difference between UMAP and t-SNE as compared to the other 5 techniques above is that they are more advanced, and they can capture the most highly dimensional, complex, non-linear relationships within the data. UMAP and t-SNE can preserve the local structure within the data and can reveal important clusters that PCA, SVD, and other techniques may miss. 

## Data Loading

In [2]:
# unpacking art_tables.zip using zipfile library and then load the two csv's (latinamerican_art.csv & non_latinamerican_art.csv)
# into a single dataframe
nga_art_sample = None
with zipfile.ZipFile('./data_samples/art_tables.zip', 'r') as zip_ref:
    zip_ref.extractall('./data_samples/')
    for file in zip_ref.namelist():
        if file.endswith('.csv'):
            # only sampling 628 rows from non_latinamerican.csv file & all from latinamerican, and selecting only title and nationality
            # latinamerican has very few rows so we are sampling all of them
            to_sample = True if 'non_latinamerican' in file else False
            if nga_art_sample is None:
                if to_sample:
                    nga_art_sample = pd.read_csv('./data_samples/' + file, on_bad_lines='skip').sample(628)
                else:
                    nga_art_sample = pd.read_csv('./data_samples/' + file, on_bad_lines='skip')
            else:
                if to_sample:
                    nga_art_sample = pd.concat([pd.read_csv('./data_samples/' + file, on_bad_lines='skip').sample(628), nga_art_sample])
                else:
                    nga_art_sample = pd.concat([pd.read_csv('./data_samples/' + file, on_bad_lines='skip'), nga_art_sample])
# delete the unzipped data directory
shutil.rmtree('./data_samples/art_tables')

print(nga_art_sample.shape)
nga_art_sample.head()

  nga_art_sample = pd.concat([pd.read_csv('./data_samples/' + file, on_bad_lines='skip').sample(628), nga_art_sample])


(1256, 40)


Unnamed: 0,iiifurl,iiifthumburl,accessioned,title,displayDate_created,roletype,role,forwarddisplayname,birthyear,deathyear,...,uuid,viewtype,sequence,width,height,maxpixels,assistivetext,depictstmsobjectid,objectid,constituentid
119131,https://api.nga.gov/iiif/975889bd-7fe1-4570-b1...,https://api.nga.gov/iiif/975889bd-7fe1-4570-b1...,1,"St. Johns, New Brunswick, Canada","American, 1870 - 1953",artist,artist,John Marin,1870.0,1953.0,...,975889bd-7fe1-4570-b1ea-d0af590580dc,primary,0.0,7075.0,5498.0,640.0,,72202.0,72202.0,2643.0
55223,https://api.nga.gov/iiif/463bd791-ef15-4f48-85...,https://api.nga.gov/iiif/463bd791-ef15-4f48-85...,1,"Lamp, Lower Portion",,artist,artist,Anonymous Artist,,,...,463bd791-ef15-4f48-85cf-7aaac4354eb8,alternate,4.0,7118.0,5694.0,,,130749.0,130749.0,13.0
31483,https://api.nga.gov/iiif/27f84928-9cf5-44fd-8c...,https://api.nga.gov/iiif/27f84928-9cf5-44fd-8c...,1,Tobias Frightened by the Fish,"Dutch, c. 1600 - 1655",artist,artist,Herman van Swanevelt,1600.0,1655.0,...,27f84928-9cf5-44fd-8c30-13834a959c29,primary,0.0,4000.0,2978.0,,,53932.0,53932.0,2769.0
121781,https://api.nga.gov/iiif/9af9c70c-661c-4a6c-93...,https://api.nga.gov/iiif/9af9c70c-661c-4a6c-93...,1,"La morgue, Paris (The Mortuary)","French, 1821 - 1868",artist,artist,Charles Meryon,1821.0,1868.0,...,9af9c70c-661c-4a6c-93e8-4e5ecb0b04ce,primary,0.0,3521.0,3956.0,,,35113.0,35113.0,2421.0
82932,https://api.nga.gov/iiif/69888422-2aa3-4e73-b1...,https://api.nga.gov/iiif/69888422-2aa3-4e73-b1...,1,"Rocken End in a Storm, Isle of Wight","British, 1819 - 1869",artist,artist,Roger Fenton,1819.0,1869.0,...,69888422-2aa3-4e73-b1d2-606a75b8e0e1,primary,0.0,5456.0,4445.0,,,212109.0,212109.0,13369.0


## Data Pre-Processing

In [3]:
nga_art_sample.columns

Index(['iiifurl', 'iiifthumburl', 'accessioned', 'title',
       'displayDate_created', 'roletype', 'role', 'forwarddisplayname',
       'birthyear', 'deathyear', 'ulanid', 'artistofngaobject', 'nationality',
       'constituenttype', 'beginyear_artistAssigned', 'endyear_artistAssigned',
       'country_artistAssigned', 'zipcode_artistAssigned', 'medium',
       'dimensions', 'inscription', 'markings', 'attribution',
       'visualBrowserClassification', 'parentID', 'isVirtual', 'portfolio',
       'series', 'volume', 'watermarks', 'uuid', 'viewtype', 'sequence',
       'width', 'height', 'maxpixels', 'assistivetext', 'depictstmsobjectid',
       'objectid', 'constituentid'],
      dtype='object')

In [4]:
# selecting numerical columns from the dataset
selected_art_sample = nga_art_sample.loc[:, ['accessioned', 'birthyear', 'deathyear', 'width', 'height', 'medium', 'viewtype']]

In [5]:
# checking for null values
selected_art_sample.isna().sum()

accessioned      1
birthyear        6
deathyear      199
width            2
height           4
medium           2
viewtype         2
dtype: int64

In [6]:
selected_art_sample.reset_index(drop=False, inplace=True)

In [7]:
# imputing null values with mean and also converting the columns to int
selected_art_sample['birthyear'] = pd.to_numeric(selected_art_sample['birthyear'], errors='coerce')
selected_art_sample['birthyear'].fillna(selected_art_sample['birthyear'].mean(), inplace=True)

selected_art_sample['deathyear'] = pd.to_numeric(selected_art_sample['deathyear'], errors='coerce')
selected_art_sample['deathyear'].fillna(selected_art_sample['deathyear'].mean(), inplace=True)

selected_art_sample['width'] = pd.to_numeric(selected_art_sample['width'], errors='coerce')
selected_art_sample['width'].fillna(selected_art_sample['width'].mean(), inplace=True)

selected_art_sample['height'] = pd.to_numeric(selected_art_sample['height'], errors='coerce')
selected_art_sample['height'].fillna(selected_art_sample['height'].mean(), inplace=True)

# imputing the categorical string column 'medium' with the most frequent value
selected_art_sample['medium'].fillna(selected_art_sample['medium'].value_counts().index[0], inplace=True)
# imputing the categorical string column 'viewtype' with the most frequent value
selected_art_sample['viewtype'].fillna(selected_art_sample['viewtype'].value_counts().index[0], inplace=True)


In [8]:
selected_art_sample.loc[:, ['medium', 'viewtype']].describe()

Unnamed: 0,medium,viewtype
count,1256,1256
unique,387,3
top,gelatin silver print,primary
freq,158,1234


In [9]:
selected_art_sample.describe()

Unnamed: 0,index,birthyear,deathyear,width,height
count,1256.0,1256.0,1256.0,1256.0,1256.0
mean,50386.377389,1864.7048,1933.715232,4101.064593,4325.584665
std,64483.981553,113.909302,112.144398,2119.12201,2326.304438
min,0.0,1200.0,1299.0,640.0,1042.0
25%,313.75,1855.0,1933.715232,3055.75,3233.75
50%,626.5,1902.0,1969.0,3561.5,4000.0
75%,99194.25,1922.0,2000.0,4000.0,4000.0
max,200025.0,1985.0,2021.0,23610.0,40461.0


## Scaling the selected art sample for use in Dimensionality Reduction

In [10]:
from sklearn.preprocessing import StandardScaler
# applying the standard scaler to the numerical data
scaler = StandardScaler()
selected_numerical_art_sample = selected_art_sample.loc[:, ['birthyear', 'deathyear', 'width', 'height']]
scaler.fit(selected_numerical_art_sample)
scaled_numerical_art_sample = scaler.transform(selected_numerical_art_sample)
scaled_numerical_art_sample = pd.DataFrame(scaler.transform(selected_numerical_art_sample), columns=selected_numerical_art_sample.columns)

In [11]:
scaled_numerical_art_sample.describe()

Unnamed: 0,birthyear,deathyear,width,height
count,1256.0,1256.0,1256.0,1256.0
mean,-1.765042e-15,1.335096e-15,2.772022e-16,7.354344e-17
std,1.000398,1.000398,1.000398,1.000398
min,-5.837711,-5.662058,-1.633905,-1.412065
25%,-0.08523154,2.028316e-15,-0.4934737,-0.4695299
50%,0.3275418,0.3147623,-0.2547185,-0.1400136
75%,0.50319,0.5913017,-0.04771073,-0.1400136
max,1.056482,0.7786348,9.209808,15.53959


In [12]:
scaled_numerical_art_sample.to_csv('./data_samples/scaled_numerical_art_sample.csv', index=False)

# Simpler Linear Methods

## PCA (Principal Component Analysis)

In [13]:
# applying PCA to the scaled data
from sklearn.decomposition import PCA
def apply_pca(scaled_df, n_components=2):
    pca = PCA(n_components=n_components)
    pca.fit(scaled_df)
    pca_df = pca.transform(scaled_df)
    pca_df = pd.DataFrame(pca_df, columns=['PC1', 'PC2'])
    return pca_df

In [14]:
pca_art = apply_pca(scaled_numerical_art_sample)

In [15]:
pca_art.head()

Unnamed: 0,PC1,PC2
0,0.067045,1.354249
1,0.233993,1.403144
2,3.25907,-1.028884
3,0.624216,-0.416424
4,0.755988,0.366136


## SVD (Singular Value Decomposition)

In [16]:
from sklearn.decomposition import TruncatedSVD

def apply_svd(scaled_df, n_components=2):
    svd = TruncatedSVD(n_components=n_components)
    svd.fit(scaled_df)
    svd_df = svd.transform(scaled_df)
    svd_df = pd.DataFrame(svd_df, columns=[f'SVD{i+1}' for i in range(n_components)])
    return svd_df


In [17]:
svd_art = apply_svd(scaled_numerical_art_sample)
svd_art.head()

Unnamed: 0,SVD1,SVD2
0,0.067045,1.354249
1,0.233993,1.403144
2,3.25907,-1.028884
3,0.624216,-0.416424
4,0.755988,0.366136


## MDS (Multidimensional Scaling)

In [18]:
from sklearn.manifold import MDS

def apply_mds(scaled_df, n_components=2):
    mds = MDS(n_components=n_components)
    mds_df = mds.fit_transform(scaled_df)
    mds_df = pd.DataFrame(mds_df, columns=[f'MDS{i+1}' for i in range(n_components)])
    return mds_df

In [19]:
mds_art = apply_mds(scaled_numerical_art_sample)
mds_art.head()



Unnamed: 0,MDS1,MDS2
0,0.079985,-1.431451
1,0.233935,-1.472768
2,3.393564,0.654666
3,0.696003,0.355472
4,0.78898,-0.478149


## ISOMap (Isometric Mapping) 

In [20]:
from sklearn.manifold import Isomap

def apply_isomap(scaled_df, n_components=2, n_neighbors=5):
    isomap = Isomap(n_components=n_components, n_neighbors=n_neighbors)
    isomap_df = isomap.fit_transform(scaled_df)
    isomap_df = pd.DataFrame(isomap_df, columns=[f'ISOMap{i+1}' for i in range(n_components)])
    return isomap_df

In [21]:
isomap_art = apply_isomap(scaled_numerical_art_sample)
isomap_art.head()

  self._fit_transform(X)
  self._set_intXint(row, col, x.flat[0])
  self._set_intXint(row, col, x.flat[0])
  self._set_intXint(row, col, x.flat[0])
  self._set_intXint(row, col, x.flat[0])
  self._set_intXint(row, col, x.flat[0])
  self._set_intXint(row, col, x.flat[0])
  self._set_intXint(row, col, x.flat[0])
  self._set_intXint(row, col, x.flat[0])
  self._set_intXint(row, col, x.flat[0])
  self._set_intXint(row, col, x.flat[0])
  self._set_intXint(row, col, x.flat[0])
  self._set_intXint(row, col, x.flat[0])
  self._set_intXint(row, col, x.flat[0])
  self._set_intXint(row, col, x.flat[0])
  self._set_intXint(row, col, x.flat[0])
  self._set_intXint(row, col, x.flat[0])
  self._set_intXint(row, col, x.flat[0])
  self._set_intXint(row, col, x.flat[0])
  self._set_intXint(row, col, x.flat[0])
  self._set_intXint(row, col, x.flat[0])
  self._set_intXint(row, col, x.flat[0])
  self._set_intXint(row, col, x.flat[0])
  self._set_intXint(row, col, x.flat[0])
  self._set_intXint(row, col, x.

Unnamed: 0,ISOMap1,ISOMap2
0,-0.077824,1.948536
1,-0.050592,1.936797
2,5.110158,-2.146284
3,1.061674,-0.553206
4,0.593286,0.665037


## LLE (Locally Linear Embedding)

In [22]:
from sklearn.manifold import LocallyLinearEmbedding

def apply_lle(scaled_df, n_components=2, n_neighbors=5):
    lle = LocallyLinearEmbedding(n_components=n_components, n_neighbors=n_neighbors)
    lle_df = lle.fit_transform(scaled_df)
    lle_df = pd.DataFrame(lle_df, columns=[f'LLE{i+1}' for i in range(n_components)])
    return lle_df


In [23]:
lle_art = apply_lle(scaled_numerical_art_sample)
lle_art.head()

Unnamed: 0,LLE1,LLE2
0,0.000864,0.000604
1,0.00086,0.001058
2,4.7e-05,0.00029
3,0.000513,-0.000301
4,-0.002656,0.001543


# Complex Multi-linear Methods

## UMAP (Uniform Manifold Approximation and Projection)

In [24]:
from umap import UMAP

def apply_umap(scaled_df, n_components=2, n_neighbors=5):
    umap = UMAP(n_components=n_components, n_neighbors=n_neighbors)
    umap_df = umap.fit_transform(scaled_df)
    umap_df = pd.DataFrame(umap_df, columns=[f'UMAP{i+1}' for i in range(n_components)])
    return umap_df


  from .autonotebook import tqdm as notebook_tqdm


In [25]:
umap_art = apply_umap(scaled_numerical_art_sample)
umap_art.head()

Unnamed: 0,UMAP1,UMAP2
0,7.819643,13.887558
1,7.742863,13.606033
2,-2.367343,-2.281517
3,-3.254508,8.82199
4,7.543347,13.265536


## t-SNE (t-Distributed Stochastic Neighbor Embedding)

In [27]:
from sklearn.manifold import TSNE

def apply_tsne(scaled_df, n_components=2, perplexity=15, learning_rate=200):
    tsne = TSNE(n_components=n_components, perplexity=perplexity, learning_rate=learning_rate)
    tsne_df = tsne.fit_transform(scaled_df)
    tsne_df = pd.DataFrame(tsne_df, columns=[f't-SNE{i+1}' for i in range(n_components)])
    return tsne_df


In [28]:
tsne_art = apply_tsne(scaled_numerical_art_sample)
tsne_art.head()

: 