In [None]:
#importing important packages necessary to notebook
import numpy as np
from sklearn.datasets import load_digits #ready available data set
from sklearn.model_selection import train_test_split  #function that divides up data
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt  #plotting
import seaborn as sns            #and visualization
import pandas as pd
import sklearn.cluster as cluster
%matplotlib inline

In [None]:
# Global seaborn look for every figure in this notebook:
# white background, "notebook" scaling, and a 14x10 default figure size.
sns.set(
    context='notebook',
    style='white',
    rc={'figure.figsize': (14, 10)},
)

In [None]:
#getting the data set
# Read the CSV from disk once, then derive both typed views from that single
# frame. `astype` returns a new DataFrame, so the int and float versions are
# independent of each other and of the raw frame, and re-running this cell
# always starts from the same input.
data_csv_raw = pd.read_csv("f444wnew.csv")

# History (8/9/22): an earlier float-only attempt hit
# "list indices must be integers or slices, not float" further down, so both
# an integer and a float version are kept and shown side by side.
# NOTE(review): the integer cast discards the fractional part of every value
# and will raise if any column holds NaN - confirm this is acceptable.
data_csv_int = data_csv_raw.astype('int')
data_csv_float = data_csv_raw.astype('float')

# Both results are pandas DataFrames; preview the integer one here
# (the float one is previewed in the next cell).
data_csv_int.head()

In [None]:
# Preview the first five rows of the float-typed table
# (swap in data_csv_int to preview the integer-typed one instead).
data_csv_float.head(n=5)

In [None]:
#seeing how many rows
#data_csv.xcentroid.value_counts()
#result: Name: xcentroid, Length: 17401, dtype: int64

#organize by area instead
# Limit how many rows pandas prints; pass None (the Python value, not the
# string "None") to print every row.
pd.set_option("display.max_rows", 10)

# Frequency of each distinct `area` value, for the int and float versions.
# Each count is computed exactly once and printed, so both appear in the output.
print(data_csv_int["area"].value_counts())
print(data_csv_float["area"].value_counts())

In [None]:
#print(data_csv.local_background.value_counts())

In [None]:
#beyond the 2d visualization, umap helps see internal structure of the data, intuition
#need umap object
import umap

In [None]:
# UMAP is stochastic: without a seed, every run gives a slightly different
# embedding. Fixing random_state makes the projections below reproducible on
# Restart & Run All.
# NOTE(review): umap-learn documents that a fixed seed turns off its parallel
# optimisation, so fitting may be slower - confirm this is acceptable.
reducer = umap.UMAP(random_state=42)

In [None]:
#cleaning up data

# Columns fed to UMAP. The list is defined once and shared by the int and
# float versions so the two selections cannot drift apart.
# Deliberately left out of the selection:
#   xcentroid, ycentroid  - coordinate position of the object
#   orientation           - might use in future
#   min_value, max_value, local_background
feature_columns = [
    "area",
    "semimajor_sigma",
    "semiminor_sigma",
    "eccentricity",
    "R0_R3",  #plotting values relative to R3 instead of R0
    "R1_R3",
    "R2_R3",
    "gini",  #relative distribution of flux values - how much of the flux is in the brightest pixels
    "fwhm",
    "cxx",
    "cxy",
    "cyy",
]

# Plain NumPy arrays, one row per object, one column per feature above.
# NOTE(review): in the int version, features whose values are fractional
# (presumably eccentricity and gini) lose that information - confirm intended.
data_int = data_csv_int[feature_columns].to_numpy()    #as integers
data_float = data_csv_float[feature_columns].to_numpy()  #as floats

#convert each feature into z-scores (number of standard deviations from the mean) for comparability
# A fresh scaler is fit for each version so neither borrows the other's statistics.
scaled_data_int = StandardScaler().fit_transform(data_int)
scaled_data_float = StandardScaler().fit_transform(data_float)

In [None]:
#training the reducer so it can learn about the manifold
# fit_transform returns the low-dimensional embedding as an array (2d is the
# default), one row per input row and in the same order.
# The same reducer object is fit twice, so after this cell it reflects the
# float fit only.
embedding_int = reducer.fit_transform(scaled_data_int)      #int version
embedding_float = reducer.fit_transform(scaled_data_float)  #float version

#running this changed the embedding slightly each time (UMAP is stochastic
#unless the reducer is created with a fixed random_state)

# Kept only so that any later cell using the bare names `asarray` / `savetxt`
# still works; this cell itself uses np.savetxt.
from numpy import asarray, savetxt

# save each embedding as a csv file (one row per object, comma separated)
np.savetxt('new_int.csv', embedding_int, delimiter=',')
np.savetxt('new_float.csv', embedding_float, delimiter=',')

# Last expression of the cell, so both shapes are actually displayed.
embedding_int.shape, embedding_float.shape

## Attempting to plot without `c`, then with `c` = local background, and then area

In [None]:
# Side-by-side comparison of the two embeddings:
# left panel = integer version, right panel = float version.
f, axarr = plt.subplots(1, 2)

panel_specs = (
    (axarr[0], embedding_int, 'UMAP Integer Projection'),
    (axarr[1], embedding_float, 'UMAP Float Projection'),
)
for panel_ax, panel_embedding, panel_title in panel_specs:
    panel_ax.scatter(panel_embedding[:, 0], panel_embedding[:, 1])
    panel_ax.set_title(panel_title)
    # same scale on both axes so distances in the embedding are not distorted
    panel_ax.set_aspect('equal')

#add more space between the figures
f.subplots_adjust(wspace=0.5)

In [None]:
#what if i just ignore c...
# No reference to the original table is needed for a plain scatter: each row
# of the embedding is one object, in the same order as the rows given to UMAP.

# Number of leading rows highlighted in red.
# Assumption (TODO confirm): the stars are the first 859 objects in the file.
n_stars = 859

# All objects, default colour.
plt.scatter(
    embedding_int[:, 0],
    embedding_int[:, 1]
   )

# Highlighted rows drawn on top with a single scatter call. Slicing replaces
# the per-point loop: one artist instead of 859, and no IndexError if the
# embedding has fewer than n_stars rows.
plt.scatter(
    embedding_int[:n_stars, 0],
    embedding_int[:n_stars, 1],
    color='r')

plt.gca().set_aspect('equal', 'datalim')
plt.title('UMAP projection of the dataset with stars highlighted', fontsize=24);

# Observations so far:
# - even the first 100 objects are not plotted together; they are scattered
#   across the projection
# - open question: did the data get flipped in the reordering?
# - goal: get all stars into one cluster so they can be identified automatically

In [None]:
# Float-version counterpart of the stars-highlight plot.

# Number of leading rows highlighted in red.
# Assumption (TODO confirm): the stars are the first 859 objects in the file.
n_stars = 859

# All objects, default colour.
plt.scatter(
    embedding_float[:, 0],
    embedding_float[:, 1]
   )

# Highlighted rows drawn on top with a single scatter call. Slicing replaces
# the per-point loop: one artist instead of 859, and no IndexError if the
# embedding has fewer than n_stars rows.
plt.scatter(
    embedding_float[:n_stars, 0],
    embedding_float[:n_stars, 1],
    color='r')

plt.gca().set_aspect('equal', 'datalim')
plt.title('UMAP Float projection of the dataset with stars highlighted', fontsize=24);

# Observation: even the first 100 objects are not plotted together; they are
# scattered across the projection.

# KMeans on UMAP projection

In [None]:
fig = plt.figure(figsize=(10,10))

# Cluster the 2-D UMAP embedding into 10 groups and colour each point by its
# cluster label.
# The float embedding is used explicitly: only `embedding_int` and
# `embedding_float` are created in this notebook, so the bare name `embedding`
# fails on a fresh kernel. Swap in embedding_int to cluster the integer version.
# random_state fixes the KMeans initialisation so labels/colours are repeatable.
# (Earlier note: "using 15 neighbors, changing this changed the coloring" -
# presumably this refers to UMAP's n_neighbors; KMeans is controlled by n_clusters.)
kmeans_labels = cluster.KMeans(n_clusters=10, random_state=42).fit_predict(embedding_float)
plt.scatter(embedding_float[:, 0], embedding_float[:, 1], c=kmeans_labels, s=0.1, cmap='Spectral');

# Other clustering ideas to try:
#friends of friends algorithm
#minimal spanning tree - Kruskal's method

In [None]:
#hopefully simpler before tackling area
# Colour every point by its local_background value.
# The previous attempt mapped the column through {"0": 0}; no numeric value
# matches that string key, so every entry became NaN and indexing the seaborn
# palette with it raised "list indices must be integers or slices, not float".
# The values are passed straight to `c` and matplotlib maps them onto a
# continuous colormap instead.
# Rows of embedding_float line up with rows of data_csv_float because the
# features were taken from that frame without filtering or reordering.
background_points = plt.scatter(
    embedding_float[:, 0],
    embedding_float[:, 1],
    c=data_csv_float["local_background"].to_numpy(),
    cmap='viridis')
plt.colorbar(background_points, label='local_background')
plt.gca().set_aspect('equal', 'datalim')
plt.title('UMAP projection of the dataset', fontsize=24);

In [None]:
#all the parameters that can be defined
# Reference only: this bare string is the cell's last expression, so it is
# displayed as the cell output; it does not configure the `reducer` used above.
# NOTE(review): this looks like the printed repr of a configured UMAP object
# (e.g. random_state=42), not necessarily the library defaults - confirm
# against the installed umap-learn version before relying on these values.
'''UMAP(a=None, angular_rp_forest=False, b=None,
     force_approximation_algorithm=False, init='spectral', learning_rate=1.0,
     local_connectivity=1.0, low_memory=False, metric='euclidean',
     metric_kwds=None, min_dist=0.1, n_components=2, n_epochs=None,
     n_neighbors=15, negative_sample_rate=5, output_metric='euclidean',
     output_metric_kwds=None, random_state=42, repulsion_strength=1.0,
     set_op_mix_ratio=1.0, spread=1.0, target_metric='categorical',
     target_metric_kwds=None, target_n_neighbors=-1, target_weight=0.5,
     transform_queue_size=4.0, transform_seed=42, unique=False, verbose=False)'''