In [None]:
%matplotlib inline
import torch
from torch.utils.data import DataLoader
import AECompare.util as u
import AECompare.autoencoder as ae
import numpy as np

for digit in range(10):
    util = u.Util()
    train_data = util.get_sep_indx_data(digit_filter=digit, train=True)
    print(train_data)
    AE = ae.AutoEncoder(latent_len=30, digit=digit, random_seed=0)
    AE.to(AE.device)
    AE.fit(train_data, num_epochs=60, lr=0.001, batch_size=128)
    #AE.load_state_dict(torch.load(f'AECompare/MNIST_digits_models/AE_models/{digit}_model_30_{AE.random_seed}_t.pth'))
    #test_data = util.get_sep_indx_data(digit_filter=digit, train=False)
    #print(train_data
    test_loss, latent = AE.test(train_data, batch_size=1)
    AE.store_latent(latent, train=True)
    print(digit, test_loss)
    AE.evaluate(train_data)

In [2]:
import pandas as pd
import os
latent_df = pd.DataFrame()
for digit in range(10):
    df = pd.read_csv(f'AECompare/MNIST_digits_latents/AE_latents/{digit}_30_0_train.csv',
     names=[a for a in range(0,30)])
    df['target'] = int(digit)
    latent_df = pd.concat([latent_df, df])
    
# Shuffle the new MNIST data    
latent_df = latent_df.sample(frac=1).reset_index(drop=True)
x, y = latent_df[[a for a in range(0,30)]].to_numpy(), latent_df['target'].to_numpy()
x.shape

(60000, 30)

In [3]:
orderedDF = latent_df.sort_values(
  by='target', 
  ascending=True)
orderedDF

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,21,22,23,24,25,26,27,28,29,target
0,0.036553,0.036253,0.003741,0.066184,0.005580,0.075897,0.003320,0.010957,0.033672,0.003816,...,0.009901,0.010883,0.005476,0.013772,0.110793,0.043746,0.050532,0.003452,0.004629,0
23597,0.013118,0.087671,0.001992,0.056542,0.002755,0.303403,0.001395,0.008328,0.090194,0.002002,...,0.007044,0.006399,0.002321,0.009361,0.072611,0.106439,0.012459,0.001624,0.002783,0
23605,0.037457,0.045796,0.010760,0.064146,0.012196,0.097986,0.009345,0.016851,0.050967,0.010555,...,0.017515,0.024970,0.012127,0.020150,0.058448,0.090370,0.060696,0.009710,0.012681,0
23609,0.076808,0.002586,0.000437,0.010482,0.001285,0.035123,0.000287,0.005454,0.014467,0.000423,...,0.003878,0.001467,0.000964,0.009738,0.431155,0.004702,0.010163,0.000338,0.000514,0
23611,0.032493,0.048621,0.002040,0.069437,0.004193,0.014118,0.001770,0.025873,0.062026,0.002279,...,0.017014,0.002865,0.003488,0.035295,0.133214,0.005265,0.026796,0.001883,0.002859,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49441,0.005155,0.154809,0.116804,0.005226,0.005266,0.005184,0.005225,0.005204,0.005498,0.005795,...,0.005209,0.006610,0.005184,0.126177,0.123393,0.005261,0.008106,0.005209,0.005555,9
3817,0.003348,0.104487,0.241295,0.003336,0.003341,0.003325,0.003289,0.003331,0.003334,0.003230,...,0.003311,0.003680,0.003294,0.028601,0.182964,0.003292,0.006856,0.003330,0.003464,9
41066,0.006627,0.083761,0.203158,0.006729,0.006713,0.006640,0.006524,0.006646,0.006722,0.006847,...,0.006684,0.008795,0.006567,0.028113,0.076942,0.006659,0.013313,0.006623,0.007243,9
13155,0.005927,0.067841,0.209516,0.005915,0.005916,0.005910,0.005789,0.005906,0.005805,0.005624,...,0.005892,0.006547,0.005833,0.021806,0.142110,0.005826,0.011402,0.005884,0.006117,9


In [4]:
from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters=10, random_state=1111).fit(x)
for digit in range(10):
    indices = latent_df.index[latent_df.target == digit].tolist()
    lst = [kmeans.labels_[i] for i in indices]
    mode = max(set(lst), key=lst.count)
    #print(lst)
    count = 0
    for i in lst:
        if i != mode:
            count += 1
    print(len(lst), digit, mode, count)

5923 0 9 1523
6742 1 0 3295
5958 2 7 95
6131 3 1 703
5842 4 8 750
5421 5 4 1418
5918 6 2 99
6265 7 4 1797
5851 8 3 383
5949 9 6 751


In [None]:
from sklearn.manifold import TSNE
import plotly.express as px
tsne = TSNE(n_components=2, random_state=1)
tsne_results = tsne.fit_transform(orderedDF.drop(['target'],axis=1))
fig = px.scatter(tsne_results, x=0, y=1,
                 color=orderedDF.target.astype(str),
                 labels={'0': 'tSNE component 1', '1': 'tSNE component 2'}, color_discrete_sequence=px.colors.qualitative.Plotly)
                 #category_order={"target":["0","1","2","3","4","5", "6", "7","8", "9"]})
fig.update_layout(
    autosize=False,
    width=800,
    height=800,)
fig.update_traces(marker=dict(size=4,
                              line=dict(width=0.5,
                                        color='DarkSlateGrey')),
                  selector=dict(mode='markers'))
fig.update_xaxes(showline=True, linewidth=1, linecolor='black', mirror=True)
fig.update_yaxes(showline=True, linewidth=1, linecolor='black', mirror=True)
fig.update_traces(marker=dict(size=4,
                              line=dict(width=0.3,
                                        color='DarkSlateGrey')),
                  selector=dict(mode='markers'))
fig.update_layout({ 'plot_bgcolor': 'rgba(256, 256, 256, 1)', 'paper_bgcolor': 'rgba(256, 256, 256, 1)', })
fig.update_layout(legend_traceorder="reversed")
fig.show()


/home/fatemeh.afrasiabi001/.conda/envs/tda-py/lib/python3.9/site-packages/sklearn/manifold/_t_sne.py:780: FutureWarning:

The default initialization in TSNE will change from 'random' to 'pca' in 1.2.

/home/fatemeh.afrasiabi001/.conda/envs/tda-py/lib/python3.9/site-packages/sklearn/manifold/_t_sne.py:790: FutureWarning:

The default learning rate in TSNE will change from 200.0 to 'auto' in 1.2.

In [None]:
from sklearn.manifold import TSNE
import plotly.express as px
tsne = TSNE(n_components=2, random_state=0)
tsne_results = tsne.fit_transform(latent_df.drop(['target'],axis=1))
px.scatter(tsne_results, x=0, y=1,
                 color=latent_df.target.astype(str),
                 labels={'0': 'tsne-2d-one', '1': 'tsne-2d-two'})


In [None]:
px.scatter(tsne_results, x=0, y=1,
                 color=latent_df.target.astype(str),
                 labels={'0': 'tsne-1', '1': 'tsne-2'}, opacity=1)

In [63]:
######## Read Chimera jobs results ########
import pandas as pd
import os
directory = 'chimera_jobs/'
cols = ['model', 'latent_len', 'random_seed', 'avg_accuracy']
AE_df = pd.DataFrame(columns=cols)
VAE_df = pd.DataFrame(columns=cols)
for f in os.listdir(directory):
    if f.endswith(".out"): 
        file = open(directory + f, 'r')
        lines = file.readlines()
        if lines[1].startswith('VAE'):
            for i, line in enumerate(lines):
                if i==0 or line.startswith('end'):
                    continue
                VAE_df.loc[len(VAE_df)] = line.split()
        elif lines[1].startswith('AE'):
            for i, line in enumerate(lines):
                if i==0 or line.startswith('end'):
                    continue
                AE_df.loc[len(AE_df)] = line.split()
        else:
            continue
sum_vae = VAE_df[['latent_len', 'avg_accuracy']].astype(float).groupby(VAE_df.latent_len).mean()
sum_ae = AE_df[['latent_len', 'avg_accuracy']].astype(float).groupby(AE_df.latent_len).mean()
sum_vae.to_csv(directory + 'summarized_results/vae_results.csv', index=False)
sum_ae.to_csv(directory + 'summarized_results/ae_results.csv', index=False)