In [None]:
import os
import time
import warnings
from itertools import cycle, islice

warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn import cluster, datasets, mixture
from sklearn.neighbors import kneighbors_graph
from sklearn.preprocessing import StandardScaler
from sklearn import metrics

from IPython.display import display

import utils

In [None]:
# Build every synthetic 2-D dataset used by the clustering experiments.
# Each dataset is an (X, y) pair; test_datasets pairs it with a parameter dict.
# NOTE(review): print(__doc__) looks like a leftover from a script-style example;
# in a notebook it prints the interactive namespace's docstring — consider removing.
print(__doc__)
seed=23
# Seed the global NumPy RNG: the generator calls below that do not receive
# random_state/seed draw from it, so their output depends on the order of the
# statements in this cell.
np.random.seed(seed)
n_samples = 1500
std = [1.5, 2.0, 2.5]
noise = 0.05 # shared noise level; the swiss roll below uses its own value (0.15)

# Project helpers; their exact geometry is defined in utils (not visible here).
squares       = utils.get_squares(n_samples, std=std, seed=seed)
blobs         = datasets.make_blobs(n_samples=n_samples, random_state=seed)
circles       = utils.get_circles(n_samples,cov=std, seed=seed)

varied           = datasets.make_blobs(n_samples=n_samples, cluster_std=std, random_state=seed)
noisy_circles    = datasets.make_circles(n_samples=n_samples, factor=.5, noise=noise, random_state=seed)
# Requests int(n_samples*5/3) = 2500 samples, i.e. more than the other datasets.
gaussian_circles = utils.get_gaussian_circles(cov=1 + noise, n_classes=5,inner_idx=2,n_samples=int(n_samples*5/3), seed=seed)

noisy_moons      = datasets.make_moons(n_samples=n_samples, noise=noise, random_state=seed)
# Two stacked copies of a half-size moons set: the second copy is shifted by
# +0.3 along the second coordinate and its labels are offset by 2.
# No random_state is passed here, so this draws from the global RNG seeded above.
X,y = datasets.make_moons(n_samples=int(n_samples/2), noise=noise)
doubled_noisy_moons = np.vstack((X, X+[0,0.3])), np.concatenate((y,y+2))

# Rotated s-curve manifolds (global RNG again; only columns 0 and 2 are kept).
X, y = datasets.make_s_curve(750, noise=noise)
X = X[:,[0,2]]
# s_curve_1: the curve plus a copy with its two coordinates swapped.
s_curve_1     = np.vstack((X, X.dot([[0,1],[1,0]]))), np.concatenate((y,y))
# s_curve_2: two sheared copies, (a, b) -> (a+b, b) and (a, b) -> (-a-b, b).
s_curve_2     = np.vstack((X.dot([[1,0],[1,1]]), X.dot([[-1,0],[-1,1]]))), np.concatenate((y,y))

# Rotated swiss-roll manifold (global RNG; noise is 0.15, not `noise`).
X, y = datasets.make_swiss_roll(1500, noise=0.15)
X = X[:,[0,2]]
# The roll plus a copy mirrored in the first coordinate.
swiss_roll    = np.vstack((X, X.dot([[-1,0],[0,1]]))), np.concatenate((y,y))

# ((X, y), params) pairs. 'dataset_name' is read by process(); the other keys
# ('eps', 'n_clusters') are presumably consumed by utils.clustering — confirm.
# Entries 8-11 are currently commented out, so the list holds 7 datasets.
test_datasets = [
    (squares, {'dataset_name':'squares'}),                                                      # 1
    (blobs, {'eps':0.7, 'dataset_name':'blobs'}),                                                          # 2
    (circles, {'dataset_name':'circles'}),                                                      # 3
    (varied, {'eps': 0.5, 'dataset_name':'blobs_varied'}),                                     # 4
    (noisy_circles, {'eps': 0.2, 'n_clusters': 2, 'dataset_name':'noisy_circles'}),                         # 5
    (gaussian_circles, {'eps': 0.25, 'dataset_name':'gaussian_circles'}),                        # 6
    (noisy_moons, {'n_clusters': 2, 'dataset_name':'noisy_moons'}),                             # 7
    #(doubled_noisy_moons, {'n_clusters': 4, 'dataset_name':'doubled_noisy_moons'}),             # 8
    #(s_curve_1, {'n_clusters': 4, 'dataset_name':'s_curve_1'}),                                 # 9
    #(s_curve_2, {'n_clusters': 4, 'dataset_name':'s_curve_2'}),                                 # 10
    #(swiss_roll, {'dataset_name':'swiss_roll'})                                                # 11
]

In [None]:
def process(X,y, params):
    """Cluster a dataset together with its modified and transformed variants.

    For every modified version of ``(X, y)`` produced by
    ``utils.modify_dataset`` a pipeline is assembled containing the original
    dataset, the modified dataset, and the compressed / stretched variants
    returned by ``utils.trfm_dataset``. Each pipeline is passed to
    ``utils.clustering`` and the per-pipeline results are concatenated.

    Reads the notebook-level globals ``n_samples`` and ``seed``.

    Parameters
    ----------
    X, y : the dataset and its labels, as stored in ``test_datasets``.
    params : dict
        Clustering parameters; must contain ``'dataset_name'``. The dict is
        not mutated — every pipeline entry receives its own copy.

    Returns
    -------
    The ``pd.concat`` of the results returned by ``utils.clustering``.

    Raises
    ------
    ValueError
        From ``pd.concat`` if no modified dataset was produced.
    """
    folder = './plots/005/'
    base_name = params['dataset_name']
    out_dir = folder + base_name
    # exist_ok replaces the isdir-then-makedirs check and is safe on re-runs.
    os.makedirs(out_dir, exist_ok=True)

    def tagged(suffix):
        """Return a copy of params named '<base_name>-<suffix>'."""
        entry = params.copy()
        entry['dataset_name'] = '{0}-{1}'.format(base_name, suffix)
        return entry

    frames = []
    # utils.fibs supplies the index used in names and plot files for each
    # modified dataset (presumably Fibonacci numbers up to n_samples — confirm).
    for i_m, (X_m, y_m) in zip(utils.fibs(n_samples), utils.modify_dataset(X, y, n_samples, scale=0.05, seed=seed)):
        compressed = utils.trfm_dataset(X_m, y_m, 'c', seed)
        stretched  = utils.trfm_dataset(X_m, y_m, 's', seed)

        # The original dataset is included in every pipeline as a reference.
        pipe = [
            ((X, y), tagged('original')),
            ((X_m, y_m), tagged('m{0}'.format(i_m))),
        ]

        # The first element of each transformed list is skipped
        # (presumably the untransformed dataset — confirm in utils).
        for i, ((Xtc, ytc), (Xts, yts)) in enumerate(zip(compressed[1:], stretched[1:])):
            pipe.append(((Xtc, ytc), tagged('m{0}-compress-{1}'.format(i_m, i))))
            pipe.append(((Xts, yts), tagged('m{0}-stretch-{1}'.format(i_m, i))))

        frames.append(utils.clustering(pipe, standardize=False, pic_name='{0}/m{1}'.format(out_dir, i_m)))
    return pd.concat(frames)

In [None]:
# Cluster the unmodified test datasets once as a baseline overview.
# pic_name points into 'test_datasets/' and the results CSV is written to the
# same folder further down, so make sure it exists first — the same thing
# process() does for its own plot folder before calling utils.clustering.
os.makedirs('test_datasets', exist_ok=True)
df_test = utils.clustering(test_datasets, standardize=False, pic_name='test_datasets/intro')

In [None]:
# Display the baseline results returned by utils.clustering for test_datasets.
df_test

In [None]:
# Save the baseline results as CSV; the './test_datasets' directory must already exist.
df_test.to_csv('./test_datasets/test_datasets.csv')

In [None]:
# Directory that receives one CSV of clustering results per dataset.
csv_folder = "./data_005_csv"
# DataFrame.to_csv does not create missing directories, so create it up front.
os.makedirs(csv_folder, exist_ok=True)

In [None]:
# Run the full modify / transform / cluster pipeline for every dataset that is
# registered in test_datasets and write one CSV per dataset into csv_folder.
# Iterating over the list instead of hard-coding indices 0-10 keeps this valid
# when entries of test_datasets are commented out: with fewer than 11 entries
# the fixed indices past the end of the list raised IndexError.
for (X, y), params in test_datasets:
    process(X, y, params).to_csv('{0}/{1}.csv'.format(csv_folder, params['dataset_name']))

In [None]:
# Illustrate how the rotated / sheared s-curve datasets are assembled.
# Top row: the curve, its coordinate-swapped copy (T1) and their union.
# Bottom row: two sheared copies (T2, T3) and their union.
# random_state makes the figure independent of how much of the global RNG
# stream earlier cells have consumed.
X, curve_color = datasets.make_s_curve(750, noise=0.05, random_state=seed)
X = X[:,[0,2]]  # keep columns 0 and 2 of the 3-D points

T1 = np.array([[0,1],[1,0]])    # (a, b) -> (b, a)
T2 = np.array([[1,0],[1,1]])    # (a, b) -> (a + b, b)
T3 = np.array([[-1,0],[-1,1]])  # (a, b) -> (-a - b, b), the mirror image of T2

# Apply each transform once instead of twice per scatter call.
X_t1, X_t2, X_t3 = X.dot(T1), X.dot(T2), X.dot(T3)
doubled_color = np.concatenate((curve_color, curve_color))
data = np.vstack((X_t2, X_t3))

panels = [
    ('original', X, curve_color),
    ('T1: coordinates swapped', X_t1, curve_color),
    ('original + T1', np.vstack((X, X_t1)), doubled_color),
    ('T2: sheared', X_t2, curve_color),
    ('T3: sheared and mirrored', X_t3, curve_color),
    ('T2 + T3', data, doubled_color),
]

fig, axes = plt.subplots(2, 3, figsize=(15,10))
for ax, (title, points, colors) in zip(axes.ravel(), panels):
    ax.scatter(points[:,0], points[:,1], s=3, c=colors)
    ax.set_title(title)
    # Same limits on every panel (the last one previously autoscaled) so the
    # six views can be compared directly.
    ax.set_xlim(-3,3)
    ax.set_ylim(-3,3)
plt.show()

In [None]:
# Illustrate how the doubled swiss-roll dataset is assembled:
# the roll, its copy mirrored in the first coordinate (T1), and their union.
# random_state makes the figure independent of how much of the global RNG
# stream earlier cells have consumed.
X, curve_color = datasets.make_swiss_roll(1500, noise=0.15, random_state=seed)
X = X[:,[0,2]]  # keep columns 0 and 2 of the 3-D points

T1 = np.array([[-1,0],[0,1]])  # (a, b) -> (-a, b)
X_mirrored = X.dot(T1)         # computed once instead of twice per scatter call

panels = [
    ('original', X, curve_color),
    ('T1: mirrored', X_mirrored, curve_color),
    ('original + T1', np.vstack((X, X_mirrored)), np.concatenate((curve_color, curve_color))),
]

fig, axes = plt.subplots(1, 3, figsize=(15,5))
for ax, (title, points, colors) in zip(axes, panels):
    ax.scatter(points[:,0], points[:,1], s=3, c=colors)
    ax.set_title(title)
    ax.set_xlim(-15,15)
    ax.set_ylim(-15,15)
plt.show()

In [None]:
# Doubled moons: each moon is duplicated and pushed away from the other one
# (label-0 points shifted by +0.3, label-1 points by -0.3 along the second
# coordinate); the copies get labels 2 and 3.
# random_state makes the figure independent of how much of the global RNG
# stream earlier cells have consumed.
moons_X, moons_y = datasets.make_moons(n_samples=500, noise=0.03, random_state=seed)
first_moon = moons_y == 0
second_moon = moons_y == 1

X = np.vstack((moons_X, moons_X[first_moon,:]+[0,0.3], moons_X[second_moon,:]-[0,0.3]))
y = np.concatenate((moons_y, moons_y[first_moon]+2, moons_y[second_moon]+2))

fig, ax = plt.subplots()
ax.scatter(X[:,0], X[:,1], s=3, c=y)
ax.set_title('doubled noisy moons')
plt.show()