In [1]:
import plotly.express as px
import pandas as pd
from pylabel import importer

from get_dataset_stats import get_feature_classes, get_class_count
from get_dataset_stats import min_occurences


ModuleNotFoundError: No module named 'get_dataset_stats'

In [3]:
from pylabel import importer, dataset
import numpy as np
import pandas as pd
import os

def rm_min_classes(dataset, min_nb_occurrences):
    '''remove classes that does not reach the minimum number of occurences/class
    #Inputs :
    - dataset : a coco file read with importer.ImportCoco() from pylabel
    - min_nb_occurrences : an integer that is the minimum number of occurrences of a class to be kept in the dataset (remove under representated classes)
    # Output : 
    - the same dataset with classes that have less than the minimum number of occurrences removed.
    '''
    grouped_by_class = dataset.df.groupby(by='cat_name', axis=0).count()
    under_represented_classes = grouped_by_class[grouped_by_class['img_folder'] < min_nb_occurrences].index
    deleted_classes=list(under_represented_classes.values)
    index_to_remove = dataset.df[dataset.df.cat_name.isin(deleted_classes)].index
    dataset.df.drop(index=index_to_remove, inplace=True)
    return dataset

def rm_tiles_without_annot(dataset):
    '''remove tiles without annotations
    #Input:
    - dataset : a coco file read with importer.ImportCoco() from pylabel
    # Output:
    - the same dataset with rows that do not contains any info in the 'cat_name' column removed.
    '''
    ind_images_without_annot = dataset.df.loc[dataset.df['cat_name']==''].index
    dataset.df.drop(index=ind_images_without_annot, inplace=True)
    return dataset


class COCOStats():
    ''' A class to explore the basic stats of a coco file'''
    def __init__(self, coco_path, min_nb_occurrences=None):
        self.dataset = self.process_coco(coco_path, min_nb_occurrences)

    def process_coco(self, coco_path, min_nb_occurrences):
        ''' remove the samples without annotations and removed underrepresetned classes if needed'''        
        dataset = importer.ImportCoco(path=coco_path, name="dataset")  
        dataset = rm_tiles_without_annot(dataset)
        if min_nb_occurrences:
            dataset = rm_min_classes(dataset, min_nb_occurrences)
        return dataset
    
    def get_class_stats(self):
        '''print the number of occurrences per classes'''
        print(f"Classes:{self.dataset.analyze.classes}")
        print(f"Number of classes: {self.dataset.analyze.num_classes}")
        print(f"Class counts:\n{self.dataset.analyze.class_counts}")
        
    def get_nb_images(self):
        print(f"Number of images: {self.dataset.analyze.num_images}")
    
    def export_stats(self, export_path):
        ''' Export the stats in csv format at the export_path'''
        self.dataset.analyze.class_counts.to_csv(export_path)


class COCOSplitter(COCOStats):
    ''' Split a coco file into a train, test and (optional) validation coco files. Proportions of class annotations are kept into the splits.
    # Inputs:
    - coco_path : a path to a coco file
    - export_dir : a directory where the splits of the coco files will be exported
    - coco_train_name : the name of the file with the annotations for training
    - coco_test_name : the name of the file with the annotations for testing
    - coco_val_name : the name of the file with the annotations for validation
    - min_nb_occurrences : an integer that is the minimum number of occurrences of a class to be kept in the dataset (remove under representated classes)
    - train_pct : the fraction (float) of annotations that will go into the train coco file
    - val_pct : the fraction (float) of annotations that will go into the validation coco file
    - test_pct : the fraction (float) of annotations that will go into the test coco file
    - batch_size
    # Outputs:
    Splits of the coco files are exported in COCO format in the export_dir mentionned.
    '''
    def __init__(self, coco_path, export_dir, coco_train_name, coco_test_name, coco_val_name, min_nb_occurrences=None, train_pct=.8, val_pct=.1, test_pct=.1, batch_size=8):
        self.dataset = self.process_coco(coco_path, min_nb_occurrences)
        self.export_dir = export_dir
        self.coco_train_name = coco_train_name
        self.coco_test_name = coco_test_name
        self.coco_val_name = coco_val_name
        
        
    def create_train_test_val_datasets(self):
        dataset_train = importer.ImportCoco(path=coco_path, name="trainset")
        dataset_val = importer.ImportCoco(path=coco_path, name="valset")
        dataset_test = importer.ImportCoco(path=coco_path, name="testset")
        return dataset_train, dataset_val, dataset_test
    
    def split_coco(self):
        self.dataset.splitter.StratifiedGroupShuffleSplit(train_pct=.8, val_pct=.1, test_pct=.1, batch_size=2)
        
        self.dataset.analyze.ShowClassSplits()

        df_train = dataset.Dataset(self.dataset.df.query("split == 'train'"))
        df_val = dataset.Dataset(self.dataset.df.query("split == 'val'"))
        df_test = dataset.Dataset(self.dataset.df.query("split == 'test'"))
        
        df_train.export.ExportToCoco(output_path=os.path.join(self.export_dir, self.coco_train_name, '.json'))
        df_val.export.ExportToCoco(output_path=os.path.join(self.export_dir, self.coco_val_name, '.json'))
        df_test.export.ExportToCoco(output_path=os.path.join(self.export_dir, self.coco_test_name, '.json'))
        


In [4]:
stats = COCOStats('../../coco/no_overlap/coco_species.json')

In [5]:
stats.export_stats('../../stats.csv')

In [4]:
metrics = pd.read_csv('../../logs/models_metrics.csv', header=0)
features = ['species', 'genus', 'lht']
classes_list = []

for feature in features:
    coco_path = f'../../coco/no_overlap/coco_{feature}.json'
    classes_list.append(get_feature_classes(coco_path, 20))



In [1]:
from pylabel import importer, dataset
import numpy as np
import yaml
import pandas as pd

def rm_min_classes(dataset, min_nb_occurrences):
    '''remove classes that does not reach the minimum number of occurences/class
    #Inputs :
    - dataset : a coco file read with importer.ImportCoco() from pylabel
    - min_nb_occurrences : an integer that is the minimum number of occurrences of a class to be kept in the dataset (remove under representated classes)
    # Output : 
    - the same dataset with classes that have less than the minimum number of occurrences removed.
    '''
    grouped_by_class = dataset.df.groupby(by='cat_name', axis=0).count()
    under_represented_classes = grouped_by_class[grouped_by_class['img_folder'] < min_nb_occurrences].index
    deleted_classes=list(under_represented_classes.values)
    index_to_remove = dataset.df[dataset.df.cat_name.isin(deleted_classes)].index
    dataset.df.drop(index=index_to_remove, inplace=True)
    return dataset

def rm_tiles_without_annot(dataset):
    '''remove tiles without annotations
    #Input:
    - dataset : a coco file read with importer.ImportCoco() from pylabel
    # Output:
    - the same dataset with rows that do not contains any info in the 'cat_name' column removed.
    '''
    ind_images_without_annot = dataset.df.loc[dataset.df['cat_name']==''].index
    dataset.df.drop(index=ind_images_without_annot, inplace=True)
    return dataset


class COCOStats():
    ''' A class to explore the basic stats of a coco file'''
    def __init__(self, coco_path, min_nb_occurrences=None):
        self.dataset = self.process_coco(coco_path, min_nb_occurrences)

    def process_coco(self, coco_path, min_nb_occurrences):
        ''' remove the samples without annotations and removed underrepresetned classes if needed'''        
        dataset = importer.ImportCoco(path=coco_path, name="dataset")  
        dataset = rm_tiles_without_annot(dataset)
        if min_nb_occurrences:
            dataset = rm_min_classes(dataset, min_nb_occurrences)
        return dataset
    
    def get_class_stats(self):
        '''print the number of occurrences per classes'''
        print(f"Classes:{self.dataset.analyze.classes}")
        print(f"Number of classes: {self.dataset.analyze.num_classes}")
        print(f"Class counts:\n{self.dataset.analyze.class_counts}")
        
    def get_nb_images(self):
        print(f"Number of images: {self.dataset.analyze.num_images}")
    
    def export_stats(self, export_path):
        ''' Export the stats in csv format at the export_path'''
        self.dataset.analyze.class_counts.to_csv(export_path)


class COCOSplitter(COCOStats):
    def __init__(self, coco_path, min_nb_occurrences=None, train_pct=.8, val_pct=.1, test_pct=.1, batch_size=):
        self.dataset = self.process_coco(coco_path, min_nb_occurrences)
        
    def create_train_test_val_datasets(self):
        dataset_train = importer.ImportCoco(path=coco_path, name="trainset")
        dataset_val = importer.ImportCoco(path=coco_path, name="valset")
        dataset_test = importer.ImportCoco(path=coco_path, name="testset")
        return dataset_train, dataset_val, dataset_test
    
    def split_coco(self):
        ''' Split a coco file into a train, test and (optional) validation coco files.'''
        self.dataset.splitter.StratifiedGroupShuffleSplit(train_pct=.8, val_pct=.1, test_pct=.1, batch_size=2)
        
        self.dataset.analyze.ShowClassSplits()

        df_train.df = self.dataset.df.query("split == 'train'")
        df_val.df = self.dataset.df.query("split == 'val'")
        df_test.df = self.dataset.df.query("split == 'test'")
        
        df_train.name = f'coco_train80'
        df_train.export.ExportToCoco('./')
              
        



In [5]:
splitter = COCOSplitter(coco_path=coco_path, min_nb_occurrences=500)

In [6]:
splitter.split_coco()

  for _, group in subject_grouped_df_main:
  df_train = df_train.append(pd.DataFrame(group), ignore_index=True)
  df_val = df_val.append(pd.DataFrame(group), ignore_index=True)
  df_test = df_test.append(pd.DataFrame(group), ignore_index=True)
  batch_df = batch_df.append(group)
  batch_df = batch_df.append(group)
  mse_loss_diff_train = calc_mse_loss(df_train) - calc_mse_loss(df_train.append(batch_df, ignore_index=True))
  mse_loss_diff_val = calc_mse_loss(df_val) - calc_mse_loss(df_val.append(batch_df, ignore_index=True))
  mse_loss_diff_test = calc_mse_loss(df_test) - calc_mse_loss(df_test.append(batch_df, ignore_index=True))
  df_train = df_train.append(batch_df, ignore_index=True)
  batch_df = batch_df.append(group)
  batch_df = batch_df.append(group)
  mse_loss_diff_train = calc_mse_loss(df_train) - calc_mse_loss(df_train.append(batch_df, ignore_index=True))
  mse_loss_diff_val = calc_mse_loss(df_val) - calc_mse_loss(df_val.append(batch_df, ignore_index=True))
  mse_loss_diff_tes

AttributeError: 'DataFrame' object has no attribute 'export'

In [3]:
classes_list[0] = ['AP-' + s  for s in classes_list[0]]
classes_list[0]

['AP-tenuis',
 'AP-histrix',
 'AP-rus',
 'AP-hemprichi',
 'AP-Lutea',
 'AP-sp1',
 'AP-lobata',
 'AP-cylibdrica',
 'AP-digitifera',
 'AP-retiformis',
 'AP-divaricata',
 'AP-anae',
 'AP-hyacinthus',
 'AP-humilis',
 'AP-gemmacea',
 'AP-fungites',
 'AP-pectinata',
 'AP-daedalea',
 'AP-speciosa',
 'AP-muricata',
 'AP-fascicularis',
 'AP-exaesa',
 'AP-halicora',
 'AP-astreata',
 'AP-complanata',
 'AP-cylindrica',
 'AP-palifera',
 'AP-heliopora',
 'AP-Muricata 1',
 'AP-lutea',
 'AP-sp']

In [4]:
classes_list[0].extend(['model', 'AP', 'AP50','AP75', 'APs', 'APm', 'APl'])
classes_list[0]

['AP-tenuis',
 'AP-histrix',
 'AP-rus',
 'AP-hemprichi',
 'AP-Lutea',
 'AP-sp1',
 'AP-lobata',
 'AP-cylibdrica',
 'AP-digitifera',
 'AP-retiformis',
 'AP-divaricata',
 'AP-anae',
 'AP-hyacinthus',
 'AP-humilis',
 'AP-gemmacea',
 'AP-fungites',
 'AP-pectinata',
 'AP-daedalea',
 'AP-speciosa',
 'AP-muricata',
 'AP-fascicularis',
 'AP-exaesa',
 'AP-halicora',
 'AP-astreata',
 'AP-complanata',
 'AP-cylindrica',
 'AP-palifera',
 'AP-heliopora',
 'AP-Muricata 1',
 'AP-lutea',
 'AP-sp',
 'model',
 'AP',
 'AP50',
 'AP75',
 'APs',
 'APm',
 'APl']

In [5]:
metrics_filtered = metrics.filter(classes_list[0])

In [6]:
classes_list[0].remove('model')

In [7]:
metrics_melted = pd.melt(
    metrics_filtered, 
    id_vars='model', 
    value_vars=classes_list[0],
    value_name='Average precision')

In [8]:
metrics_melted.dropna(axis=0, how='any', inplace=True)

In [9]:
metrics_species = metrics_melted[metrics_melted['model'].isin(['20221229_15:09:14_ITER4000_X101_ae102021only', '20230202_13:33:19_X101_allsites_ITER20000', '20230202_11:07:44_X101_allsites_ITER10000'])]

In [10]:
fig = px.bar(metrics_species, x="model", y="Average precision", color="model", barmode="relative",
             facet_col="variable", facet_col_wrap=5, facet_row_spacing=0.04, facet_col_spacing=0.04,
             height=2000, width = 1500, title= 'AP : model species trained on aeroport site only and on all the 3 sites of acquisition')
fig.for_each_annotation(lambda a: a.update(text=a.text.split("=")[-1]))
fig.show()

In [11]:
fig.write_html('/home/justine/Documents/G2OI/collaborations/isabel/CR/AP_species.html')