In [5]:
import pandas as pd
import numpy as np
import sys
sys.path.append("C:/Users/mattj/zoobot")

from zoobot import label_metadata, schemas
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support
from scipy.optimize import linear_sum_assignment as linear_assignment
import skfuzzy
import time

In [6]:
def findChoice(frac):
    choice = np.zeros_like(frac)
    choice[np.arange(len(frac)), frac.argmax(1)] = 1
    return choice

def getQuestionClasses(auto_f, volunteers, question):
    qcol_name = question.text+'_total-votes'
    fcol_names = [(cols.text+'_fraction') for cols in question.answers]
    anscol_names = [cols.text for cols in question.answers]
    
    valid_vol = volunteers.query('`{}`/`smooth-or-featured_total-votes` >= 0.5'.format(qcol_name))
    valid_idx = valid_vol.index.tolist()
    vol_results = valid_vol[fcol_names].values
    
    auto_values = auto_f.values
    
    valid_feats = auto_values[valid_idx]
        
    rounded_vol_results = findChoice(np.asarray(vol_results))
    support = len(rounded_vol_results)
    
    _,u,_,_,_,_,_ = skfuzzy.cmeans(np.transpose(valid_feats), c=len(fcol_names), m=2, error=1e-9, maxiter=300)
    pred_results = np.argmax(u, axis=0)

    vol_classes = np.argmax(rounded_vol_results, axis=1)
    
    return valid_idx, support, anscol_names, np.array(pred_results), np.array(vol_classes)

In [7]:
def _make_cost_m(cm):
    s = np.max(cm)
    return (- cm + s)

def labelMap(vol, pred):
    cm = confusion_matrix(vol, pred)
    indexes = linear_assignment(_make_cost_m(cm))
    indexes = np.asarray(indexes)
    return indexes[1]
    
def convertLabels(lmap, pred):
    conv_preds = np.zeros(len(pred), dtype=int)
    for i in range(len(pred)):
        conv_preds[i] = lmap[pred[i]]
    return np.array(conv_preds)

In [8]:
auto_features = pd.read_csv("../autoencoder/extracted_features.csv")

In [9]:
auto_features = auto_features.drop('file_loc',axis=1)

In [10]:
decals_test = pd.read_csv('D:/Ilifu_data/decals_ilifu_test.csv')
schema = schemas.Schema(label_metadata.decals_pairs, label_metadata.get_gz2_and_decals_dependencies(label_metadata.decals_pairs))

{smooth-or-featured, indices 0 to 2, asked after None: (0, 2), disk-edge-on, indices 3 to 4, asked after smooth-or-featured_featured-or-disk, index 1: (3, 4), has-spiral-arms, indices 5 to 6, asked after disk-edge-on_no, index 4: (5, 6), bar, indices 7 to 9, asked after disk-edge-on_no, index 4: (7, 9), bulge-size, indices 10 to 14, asked after disk-edge-on_no, index 4: (10, 14), how-rounded, indices 15 to 17, asked after smooth-or-featured_smooth, index 0: (15, 17), edge-on-bulge, indices 18 to 20, asked after disk-edge-on_yes, index 3: (18, 20), spiral-winding, indices 21 to 23, asked after has-spiral-arms_yes, index 5: (21, 23), spiral-arm-count, indices 24 to 29, asked after has-spiral-arms_yes, index 5: (24, 29), merging, indices 30 to 33, asked after None: (30, 33)}


In [11]:
total_report = {}
# seeds = [6589,4598,2489,9434,7984,1238,6468,5165,3246,8646]
seeds = 1
total_time = {}
for question in label_metadata.decals_pairs:
        total_report[question] = {
            'precision': 0,
            'recall': 0,
            'f1': 0,
            'support': 0
        }
for question in label_metadata.decals_pairs:
    total_time[question] = {}
    print('Starting Clustering for ',question)
    start = time.time()
    for seed in range(seeds):
        idxs, support, anscols, valid_preds, valid_vol = getQuestionClasses(auto_features, decals_test, schema.get_question(question))
        lmap = labelMap(valid_vol, valid_preds)
        conv_preds = convertLabels(lmap, valid_preds)
        question_report = precision_recall_fscore_support(y_pred=conv_preds, y_true=valid_vol, average='weighted')
        total_report[question]['precision'] += question_report[0]
        total_report[question]['recall'] += question_report[1]
        total_report[question]['f1'] += question_report[2]
    end = time.time()
    total_report[question]['support'] = support
    total_report[question]['precision'] /= seeds
    total_report[question]['recall'] /= seeds
    total_report[question]['f1'] /= seeds
    total_time[question]['total'] = end - start
    total_time[question]['avg'] = total_time[question]['total']/seeds
    print('Question: ',question,' Completed ', seeds, ' times and avearged.')
    print('--------------------------------------------------------------')


Starting Clustering for  smooth-or-featured
Question:  smooth-or-featured  Completed  1  times and avearged.
--------------------------------------------------------------
Starting Clustering for  disk-edge-on
Question:  disk-edge-on  Completed  1  times and avearged.
--------------------------------------------------------------
Starting Clustering for  has-spiral-arms
Question:  has-spiral-arms  Completed  1  times and avearged.
--------------------------------------------------------------
Starting Clustering for  bar
Question:  bar  Completed  1  times and avearged.
--------------------------------------------------------------
Starting Clustering for  bulge-size
Question:  bulge-size  Completed  1  times and avearged.
--------------------------------------------------------------
Starting Clustering for  how-rounded
Question:  how-rounded  Completed  1  times and avearged.
--------------------------------------------------------------
Starting Clustering for  edge-on-bulge


  _warn_prf(average, modifier, msg_start, len(result))


Question:  edge-on-bulge  Completed  1  times and avearged.
--------------------------------------------------------------
Starting Clustering for  spiral-winding
Question:  spiral-winding  Completed  1  times and avearged.
--------------------------------------------------------------
Starting Clustering for  spiral-arm-count
Question:  spiral-arm-count  Completed  1  times and avearged.
--------------------------------------------------------------
Starting Clustering for  merging
Question:  merging  Completed  1  times and avearged.
--------------------------------------------------------------


  _warn_prf(average, modifier, msg_start, len(result))


In [12]:
report_df = pd.DataFrame.from_dict(total_report, orient='index')
time_df = pd.DataFrame.from_dict(total_time, orient='index')

In [13]:
report_df

Unnamed: 0,precision,recall,f1,support
smooth-or-featured,0.558775,0.525933,0.537451,49917
disk-edge-on,0.663747,0.554687,0.592729,15351
has-spiral-arms,0.683657,0.552646,0.598381,11340
bar,0.414554,0.333951,0.325809,11340
bulge-size,0.206103,0.037302,0.006379,11340
how-rounded,0.521948,0.447168,0.415513,32641
edge-on-bulge,0.059529,0.165383,0.086625,2467
spiral-winding,0.663658,0.242234,0.157326,7501
spiral-arm-count,0.354179,0.04666,0.015224,7501
merging,0.74781,0.536604,0.613393,49271


In [14]:
time_df

Unnamed: 0,total,avg
smooth-or-featured,410.759517,410.759517
disk-edge-on,94.221868,94.221868
has-spiral-arms,70.217895,70.217895
bar,93.268349,93.268349
bulge-size,96.125883,96.125883
how-rounded,277.656479,277.656479
edge-on-bulge,22.965458,22.965458
spiral-winding,64.123813,64.123813
spiral-arm-count,81.989435,81.989435
merging,314.983061,314.983061


In [None]:
report_df.to_csv("../clustering/fuzzy_cluster_accuracy.csv")
time_df.to_csv("../clustering/fuzzy_time.csv")