In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Read the training split from its CSV file, report the table dimensions,
# and leave a preview of the first rows as the cell output.
train_csv_path = "../data/01_raw/sign_mnist_train.csv"
df_train = pd.read_csv(train_csv_path)
train_row_count, train_col_count = df_train.shape
print((train_row_count, train_col_count))
df_train.head()

In [None]:
# Read the test split from its CSV file, report the table dimensions,
# and leave a preview of the first rows as the cell output.
test_csv_path = "../data/01_raw/sign_mnist_test.csv"
df_test = pd.read_csv(test_csv_path)
test_row_count, test_col_count = df_test.shape
print((test_row_count, test_col_count))
df_test.head()

In [None]:
# Uppercase Latin alphabet; the notebook indexes it with the integer class
# label to obtain the letter to display.
alphabet = "".join(chr(ord("A") + offset) for offset in range(26))

In [None]:
# Display one training sample next to the letter its label maps to.
i = 0  # row of df_train to inspect
sample_label = df_train.loc[i, "label"]
# Every column after the first holds pixel values; fold them into a 28x28 image.
img = df_train.iloc[i, 1:].to_numpy().reshape(28, 28)
print(alphabet[sample_label])
plt.imshow(img, cmap="gray")
plt.show()

In [None]:
# Gallery of training images: one row of axes per letter of `alphabet`,
# showing up to `num_cols` samples of that letter.
num_rows = len(alphabet)  # one row per caption, so captions can never run out
num_cols = 10

fig, axs = plt.subplots(num_rows, num_cols, figsize=(10, 20))

# A dedicated loop-variable name keeps the global `i` (sample index chosen in
# an earlier cell and read again in a later one) from being overwritten here.
for letter_idx, ax_row in enumerate(axs):
    df_letter = df_train[df_train["label"] == letter_idx]
    available = len(df_letter)
    for j, ax in enumerate(ax_row):
        # Only draw when the letter really has a j-th sample; letters with no
        # samples, or with fewer than `num_cols`, leave the remaining cells
        # blank instead of raising IndexError.
        if j < available:
            ax.imshow(np.array(df_letter.iloc[j, 1:]).reshape(28, 28), cmap="gray")
        ax.axis('off')

    # Row caption, positioned in axes coordinates to the left of the first cell.
    ax_row[0].text(-0.5, 0.5, alphabet[letter_idx], transform=ax_row[0].transAxes, size=14, weight='bold', va='center')

plt.tight_layout()
plt.show()

In [None]:
# Check the dtype produced by casting a reshaped sample row to uint8 (the same
# cast is applied to images before feature extraction further down).
sample_as_uint8 = np.array(df_train.iloc[i, 1:]).reshape(28, 28).astype(np.uint8)
print(sample_as_uint8.dtype)

In [None]:
from keras.preprocessing.image import load_img 
from keras.preprocessing.image import img_to_array 
from keras.applications.vgg16 import preprocess_input 

# models 
from keras.applications.vgg16 import VGG16 
from keras.models import Model

# clustering and dimension reduction
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

# for everything else
import os
import numpy as np
import matplotlib.pyplot as plt
from random import randint
import pandas as pd
import pickle
import cv2

In [None]:
# Instantiate VGG16 with its default arguments, then build a second model over
# the same inputs whose output is taken from the second-to-last layer, so that
# `predict` returns that layer's activations instead of the final layer's.
vgg16_full = VGG16()
penultimate_output = vgg16_full.layers[-2].output
model = Model(inputs=vgg16_full.inputs, outputs=penultimate_output)

def extract_features(original_image, model):
    """Compute the model output for a single grayscale image.

    The image is resized to 224x224, expanded from one to three channels,
    given a leading batch axis, passed through the VGG16 ``preprocess_input``
    routine and finally fed to ``model.predict`` as a batch of one.

    Args:
        original_image: Single-channel 2-D image array accepted by
            ``cv2.resize`` / ``cv2.cvtColor`` (this notebook passes 28x28
            ``uint8`` arrays).
        model: Object exposing a Keras-style ``predict`` method.

    Returns:
        The value returned by ``model.predict`` for the one-image batch.
    """
    resized_image = cv2.resize(original_image, (224, 224))
    # Gray -> 3 channels; all channels are copies of the gray plane, so the
    # BGR/RGB channel order is irrelevant for the result.
    resized_image = cv2.cvtColor(resized_image, cv2.COLOR_GRAY2BGR)

    # Add the leading batch axis: (224, 224, 3) -> (1, 224, 224, 3).
    reshaped_img = resized_image.reshape(1, 224, 224, 3)
    # prepare image for model
    imgx = preprocess_input(reshaped_img)
    # get the feature vector
    # `use_multiprocessing` is intentionally not passed: it only applies to
    # generator / Sequence inputs (never to an in-memory array) and newer
    # Keras releases reject the keyword outright. `verbose=0` silences the
    # per-call progress bar, which otherwise prints once per image.
    features = model.predict(imgx, verbose=0)
    return features

In [None]:
# Run the first 1000 training images through the feature extractor, one image
# per call, collecting the raw outputs in a list.
features = []

for row_idx in range(1000):
    pixel_row = df_train.iloc[row_idx, 1:]
    # The extractor's OpenCV calls receive the image as a 28x28 uint8 array.
    img = np.array(pixel_row).reshape(28, 28).astype(np.uint8)
    features.append(extract_features(img, model))

In [None]:
import umap

In [None]:
# Inspect the shape of the collected extractor outputs when stacked into one array.
np.shape(features)

In [None]:
# Flatten the stacked extractor outputs into a 2-D matrix with one row per
# image (1000 rows) and 4096 values per row, then show the resulting shape.
features = np.reshape(np.array(features), (1000, 4096))
features.shape

In [None]:
# Embed the extracted feature matrix with UMAP (default settings otherwise).
# UMAP is stochastic; fixing `random_state` makes the embedding reproducible
# across kernel restarts. The seed matches the one used for PCA and KMeans in
# this notebook. Note that a fixed seed makes UMAP run single-threaded.
reducer = umap.UMAP(random_state=22)
embedding = reducer.fit_transform(features)
embedding.shape

In [None]:
# Labels of the first 1000 training rows (slice first, then convert, so the
# whole column is not turned into a Python list just to be truncated).
labels = df_train['label'].iloc[:1000].tolist()

# Palette of 40 RGBA colours built from two 20-entry qualitative colormaps.
# Calling `plt.cm.tab20(k)` with an integer k >= 20 is out of range and
# every such k resolves to the same colour, which made several letters
# indistinguishable once more than 20 distinct labels were present.
palette = np.vstack([plt.cm.tab20(np.arange(20)), plt.cm.tab20b(np.arange(20))])

# Create a color mapping dictionary for each label
# (the modulo only matters if there are ever more labels than palette entries)
label_colors = {
    label: tuple(palette[k % len(palette)])
    for k, label in enumerate(np.unique(labels))
}

# Assign colors to points based on their labels
point_colors = [label_colors[label] for label in labels]

In [None]:
# Scatter plot of the first two embedding columns, one point per image,
# coloured by the per-point colours computed from the labels.
embedding_x, embedding_y = embedding[:, 0], embedding[:, 1]
plt.scatter(embedding_x, embedding_y, c=point_colors)

In [None]:
# Labels of the first 1000 training rows and the distinct values among them.
labels = df_train['label'].iloc[:1000].tolist()
unique_labels = [*set(labels)]

# Compress each feature vector to 100 principal components.
pca = PCA(n_components=100, random_state=22)
x = pca.fit(features).transform(features)

# Cluster the compressed vectors, using one cluster per distinct label.
cluster_count = len(unique_labels)
kmeans = KMeans(n_clusters=cluster_count, random_state=22)
kmeans.fit(x)


In [None]:
# Labels of the first 1000 training rows (slice first, then convert, so the
# whole column is not turned into a Python list just to be truncated).
labels = df_train['label'].iloc[:1000].tolist()

# Palette of 40 RGBA colours built from two 20-entry qualitative colormaps.
# Calling `plt.cm.tab20(k)` with an integer k >= 20 is out of range and
# every such k resolves to the same colour, which made several letters
# indistinguishable once more than 20 distinct labels were present.
palette = np.vstack([plt.cm.tab20(np.arange(20)), plt.cm.tab20b(np.arange(20))])

# Create a color mapping dictionary for each label
# (the modulo only matters if there are ever more labels than palette entries)
label_colors = {
    label: tuple(palette[k % len(palette)])
    for k, label in enumerate(np.unique(labels))
}

# Assign colors to points based on their labels
point_colors = [label_colors[label] for label in labels]

In [None]:
# Raw pixel matrix: first 1000 training rows, every column except the first.
data = df_train.iloc[:1000, 1:].to_numpy(copy=True)

In [None]:
# Embed the raw pixel matrix with UMAP (default settings otherwise).
# UMAP is stochastic; fixing `random_state` makes the embedding reproducible
# across kernel restarts. The seed matches the one used for PCA and KMeans in
# this notebook. Note that a fixed seed makes UMAP run single-threaded.
reducer = umap.UMAP(random_state=22)
embedding = reducer.fit_transform(data)
embedding.shape

In [None]:
# Scatter plot of the first two embedding columns, coloured per point.
# `cmap` is not passed: `point_colors` already holds explicit colours, so a
# colormap would be ignored and matplotlib would emit a warning about it.
plt.scatter(embedding[:, 0], embedding[:, 1], c=point_colors, s=5)