In [1]:
from sklearn.cluster import KMeans
import pandas as pd
import numpy as np
import sys
sys.path.append("C:/Users/mattj/zoobot")
from zoobot import label_metadata, schemas
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support
from scipy.optimize import linear_sum_assignment as linear_assignment
import time

In [2]:
def findChoice(frac):
    choice = np.zeros_like(frac)
    choice[np.arange(len(frac)), frac.argmax(1)] = 1
    return choice

def getQuestionClasses(auto_f, volunteers, question):
    qcol_name = question.text+'_total-votes'
    fcol_names = [(cols.text+'_fraction') for cols in question.answers]
    anscol_names = [cols.text for cols in question.answers]
    valid_feats = []
    
    valid_vol = volunteers.query('`{}`/`smooth-or-featured_total-votes` >= 0.5'.format(qcol_name))
    valid_idx = valid_vol.index.tolist()
    vol_results = valid_vol[fcol_names].values
    
    auto_values = auto_f.values
    
    for i in valid_idx:
        valid_feats.append(auto_values[i])
        
    rounded_vol_results = findChoice(np.asarray(vol_results))
    support = len(rounded_vol_results)
    
    pred_results = KMeans(n_clusters=len(fcol_names), verbose=0).fit_predict(valid_feats)
    
    vol_classes = np.argmax(rounded_vol_results, axis=1)
    
    return valid_idx, support, anscol_names, np.array(pred_results), np.array(vol_classes)

In [3]:
def _make_cost_m(cm):
    s = np.max(cm)
    return (- cm + s)

def labelMap(vol, pred):
    cm = confusion_matrix(vol, pred)
    indexes = linear_assignment(_make_cost_m(cm))
    indexes = np.asarray(indexes)
    return indexes[1]
    
def convertLabels(lmap, pred):
    conv_preds = []
    for i in range(len(pred)):
        conv_preds.append(lmap[pred[i]])
    return np.array(conv_preds)

In [4]:
auto_features = pd.read_csv("../autoencoder/extracted_features_vaev2.csv")

In [5]:
auto_features = auto_features.drop('file_loc',axis=1)

In [6]:
decals_test = pd.read_csv('D:/Ilifu_data/decals_ilifu_test.csv')
schema = schemas.Schema(label_metadata.decals_pairs, label_metadata.get_gz2_and_decals_dependencies(label_metadata.decals_pairs))

{smooth-or-featured, indices 0 to 2, asked after None: (0, 2), disk-edge-on, indices 3 to 4, asked after smooth-or-featured_featured-or-disk, index 1: (3, 4), has-spiral-arms, indices 5 to 6, asked after disk-edge-on_no, index 4: (5, 6), bar, indices 7 to 9, asked after disk-edge-on_no, index 4: (7, 9), bulge-size, indices 10 to 14, asked after disk-edge-on_no, index 4: (10, 14), how-rounded, indices 15 to 17, asked after smooth-or-featured_smooth, index 0: (15, 17), edge-on-bulge, indices 18 to 20, asked after disk-edge-on_yes, index 3: (18, 20), spiral-winding, indices 21 to 23, asked after has-spiral-arms_yes, index 5: (21, 23), spiral-arm-count, indices 24 to 29, asked after has-spiral-arms_yes, index 5: (24, 29), merging, indices 30 to 33, asked after None: (30, 33)}


In [7]:
total_report = {}
runAmount = 10
total_time = {}
for question in label_metadata.decals_pairs:
        total_report[question] = {
            'precision': 0,
            'recall': 0,
            'f1': 0,
            'support': 0
        }
for question in label_metadata.decals_pairs:
    total_time[question] = {}
    print('Starting Clustering for ',question)
    start = time.time()
    for i in range(runAmount):
        idxs, support, anscols, valid_preds, valid_vol = getQuestionClasses(auto_features, decals_test, schema.get_question(question))
        lmap = labelMap(valid_vol, valid_preds)
        conv_preds = convertLabels(lmap, valid_preds)
        question_report = precision_recall_fscore_support(y_pred=conv_preds, y_true=valid_vol, average='weighted')
        total_report[question]['precision'] += question_report[0]
        total_report[question]['recall'] += question_report[1]
        total_report[question]['f1'] += question_report[2]
    end = time.time()
    total_report[question]['support'] = support
    total_report[question]['precision'] /= runAmount
    total_report[question]['recall'] /= runAmount
    total_report[question]['f1'] /= runAmount
    total_time[question]['total'] = end - start
    total_time[question]['avg'] = total_time[question]['total']/runAmount
    print('Question: ',question,' Completed ', runAmount, ' times and avearged.')
    print('--------------------------------------------------------------')

Starting Clustering for  smooth-or-featured




Question:  smooth-or-featured  Completed  10  times and avearged.
--------------------------------------------------------------
Starting Clustering for  disk-edge-on




Question:  disk-edge-on  Completed  10  times and avearged.
--------------------------------------------------------------
Starting Clustering for  has-spiral-arms




Question:  has-spiral-arms  Completed  10  times and avearged.
--------------------------------------------------------------
Starting Clustering for  bar




Question:  bar  Completed  10  times and avearged.
--------------------------------------------------------------
Starting Clustering for  bulge-size




Question:  bulge-size  Completed  10  times and avearged.
--------------------------------------------------------------
Starting Clustering for  how-rounded




Question:  how-rounded  Completed  10  times and avearged.
--------------------------------------------------------------
Starting Clustering for  edge-on-bulge




Question:  edge-on-bulge  Completed  10  times and avearged.
--------------------------------------------------------------
Starting Clustering for  spiral-winding




Question:  spiral-winding  Completed  10  times and avearged.
--------------------------------------------------------------
Starting Clustering for  spiral-arm-count




Question:  spiral-arm-count  Completed  10  times and avearged.
--------------------------------------------------------------
Starting Clustering for  merging




Question:  merging  Completed  10  times and avearged.
--------------------------------------------------------------


In [8]:
report_df = pd.DataFrame.from_dict(total_report, orient='index')
report_df

Unnamed: 0,precision,recall,f1,support
smooth-or-featured,0.564931,0.367714,0.424255,49917
disk-edge-on,0.652912,0.544808,0.581992,15417
has-spiral-arms,0.721528,0.665711,0.688582,11234
bar,0.449459,0.363379,0.378616,11234
bulge-size,0.392579,0.236087,0.268077,11234
how-rounded,0.465795,0.380126,0.401976,32465
edge-on-bulge,0.547169,0.340212,0.39186,2544
spiral-winding,0.427273,0.364726,0.382524,7456
spiral-arm-count,0.440397,0.183825,0.216079,7456
merging,0.750315,0.259775,0.35395,49268


In [9]:
time_df = pd.DataFrame.from_dict(total_time, orient='index')
time_df

Unnamed: 0,total,avg
smooth-or-featured,14.274148,1.427415
disk-edge-on,5.948299,0.59483
has-spiral-arms,2.902267,0.290227
bar,4.162944,0.416294
bulge-size,5.971362,0.597136
how-rounded,9.284432,0.928443
edge-on-bulge,1.727217,0.172722
spiral-winding,3.53334,0.353334
spiral-arm-count,4.23293,0.423293
merging,18.855416,1.885542
