In [7]:
import gc
import os
import cv2
import sys
import json
import tqdm
import time
import timm
import torch
import random
import sklearn.metrics

from PIL import Image
from pathlib import Path
from functools import partial
from contextlib import contextmanager

import numpy as np
import scipy as sp
import pandas as pd
import torch.nn as nn

from torch.optim import Adam, SGD
from scipy.special import softmax
from albumentations.pytorch import ToTensorV2
from torch.utils.data import DataLoader, Dataset
from albumentations import Compose, Normalize, Resize
from torch.optim.lr_scheduler import CosineAnnealingLR
from sklearn.metrics import f1_score, accuracy_score, top_k_accuracy_score


seed = 2022
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

os.environ["CUDA_VISIBLE_DEVICES"]="2"
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [8]:
#train_metadata = pd.read_csv("../../../resources/DF20/DanishFungi2020_train_metadata_DEV.csv")
#test_metadata = pd.read_csv("../../../resources/DF20/DanishFungi2020_test_metadata_DEV.csv")

train_metadata = pd.read_csv("/home/data2/CZP/SnakeCLEF/matadata/SnakeCLEF2022-TrainMetadata.csv")
print(len(train_metadata))
# vali_metadata = pd.read_csv("/home/data3/changhao/Datasets/FGVC2022_Fungi/DF20-val_metadata.csv")
# print(len(vali_metadata))
test_metadata = pd.read_csv("/home/data2/CZP/SnakeCLEF/matadata/SnakeCLEF2022-TestMetadata.csv")
print(len(test_metadata))

270251
48280


In [10]:
# del test_metadata['Unnamed: 0']

In [11]:
# vali_metadata.columns

In [12]:
test_metadata.columns

Index(['observation_id', 'endemic', 'country', 'code', 'file_path'], dtype='object')

In [15]:
#是否要命名相同
train_metadata = train_metadata.rename(columns={'observation_id':'ObservationId'})
test_metadata = test_metadata.rename(columns={'observation_id':'ObservationId'})
# vali_metadata = vali_metadata.rename(columns={'gbifID':'ObservationId'})

# test_metadata = test_metadata.rename(columns={'Location_lvl0':'locality', 'Location_lvl3':'level0Name','Location_lvl2':'level1Name','Location_lvl1':'level2Name'}) 

In [16]:
train_metadata.ObservationId.value_counts()

5156122      71
86508209     45
59455956     40
71574003     37
7009836      37
             ..
18285523      1
18285487      1
18282458      1
18281482      1
106435952     1
Name: ObservationId, Length: 158698, dtype: int64

In [17]:
metadata = pd.concat([train_metadata,test_metadata])
print(len(metadata))
all_data = pd.concat([train_metadata])  #不包含测试集
print(len(all_data))

318531
270251


In [18]:
#index_mapping = {int(item.split('.')[0].split('-')[1]):int(item.split('.')[0].split('-')[0]) for idx,item in enumerate(set(all_data['image_path']))}

In [19]:
all_data = all_data.drop_duplicates(subset=['ObservationId'])
print(len(all_data))
metadata =metadata.drop_duplicates(subset=['ObservationId'])
print(len(metadata))

158698
187129


In [20]:
# vali_metadata.class_id.value_counts()[-300:]

In [21]:
metadata = metadata.reset_index(drop=True)
all_data = all_data.reset_index(drop=True)

In [13]:
metadata.Habitat = metadata.Habitat.replace(np.nan, 'unknown', regex=True)
metadata.Substrate = metadata.Substrate.replace(np.nan, 'unknown', regex=True)
all_data.Habitat = metadata.Habitat.replace(np.nan, 'unknown', regex=True)
all_data.Substrate = metadata.Substrate.replace(np.nan, 'unknown', regex=True)

### 简单分析

In [23]:
metadata.columns
metadata

Unnamed: 0,ObservationId,endemic,binomial_name,country,code,class_id,file_path
0,2670823,True,Zamenis lineatus,Sicily,IT,1567.0,1990/Zamenis_lineatus/3001242.jpg
1,96294178,False,Liasis olivaceus,Queensland,AU,816.0,1990/Liasis_olivaceus/159953206.jpeg
2,70108926,False,Xenoxybelis argenteus,Amazonas,VE,1561.0,1990/Xenoxybelis_argenteus/113910655.jpg
3,117935,False,Aspidelaps lubricus,Northern Cape,ZA,81.0,1990/Aspidelaps_lubricus/168477.JPG
4,125266,False,Dipsina multimaculata,Karas,unknown,495.0,1990/Dipsina_multimaculata/177336.JPG
...,...,...,...,...,...,...,...
187124,106374002,False,,Masvingo,ZW,,178681579.jpeg
187125,106374202,False,,Masvingo,ZW,,178698418.jpeg
187126,106391807,False,,La Pampa,AR,,178714280.jpg
187127,106408578,False,,Western Australia,AU,,178756573.jpg


In [15]:
name = 'locality'

In [16]:
len(all_data[name].value_counts())

9002

In [17]:
len(metadata[name].value_counts())

10539

In [18]:
# for i in list(metadata[name].value_counts().reset_index()['index']):
#     if i not in list(all_data[name].value_counts().reset_index()['index']):
#         print(i)

In [19]:
#测试集countryCode特有的 RE BE IE CH，这个可以用，但是之前不存在的国家我们也不应该置0，而是不使用后处理
metadata[(metadata['countryCode']=='RE')|(metadata['countryCode']=='BE')|(metadata['countryCode']=='IE')|(metadata['countryCode']=='CH')]

Unnamed: 0,ObservationId,eventDate,year,month,day,countryCode,locality,taxonID,scientificName,kingdom,...,ImageUniqueID,Substrate,rightsHolder,Latitude,Longitude,CoorUncert,Habitat,image_path,class_id,filename
180977,3358359419,2021-09-24T00:00:00,,9.0,24.0,BE,Molenkouter,,,,...,,soil,,,,,park/churchyard,,,0-3358359419.JPG
181789,3399806382,2021-11-05T00:00:00,,11.0,5.0,IE,Dawnaknoelane River,,,,...,,mosses,,,,,Unmanaged deciduous woodland,,,0-3399806382.JPG
197805,3126946324,2021-06-02T00:00:00,,6.0,2.0,RE,Piton Tortue,,,,...,,wood chips or mulch,,,,,other habitat,,,0-3126946324.JPG
213627,3311359309,2021-06-20T00:00:00,,6.0,20.0,BE,Bw Casteau Resort S A,,,,...,,wood and roots of living trees,,,,,garden,,,0-3311359309.JPG
214459,3358349372,2021-09-22T00:00:00,,9.0,22.0,BE,Bos,,,,...,,soil,,,,,park/churchyard,,,0-3358349372.JPG
215317,3399805417,2021-11-07T00:00:00,,11.0,7.0,IE,Knockroe,,,,...,,leaf or needle litter,,,,,Deciduous woodland,,,0-3399805417.JPG
226080,3340157449,2021-08-23T00:00:00,,8.0,23.0,CH,Rezlibergli,,,,...,,soil,,,,,Mixed woodland (with coniferous and deciduous ...,,,0-3340157449.JPG
232370,3358363367,2021-09-22T00:00:00,,9.0,22.0,BE,Bos,,,,...,,soil,,,,,park/churchyard,,,0-3358363367.JPG


In [20]:
#测试集Substrate特有的,这种直接-1就完事了
metadata[metadata['Substrate']=='spiders']

Unnamed: 0,ObservationId,eventDate,year,month,day,countryCode,locality,taxonID,scientificName,kingdom,...,ImageUniqueID,Substrate,rightsHolder,Latitude,Longitude,CoorUncert,Habitat,image_path,class_id,filename
181165,3380898495,2021-10-03T00:00:00,,10.0,3.0,DK,Risgårde,,,,...,,spiders,,,,,natural grassland,,,0-3380898495.JPG
183799,3355967414,2021-09-18T00:00:00,,9.0,18.0,DK,Skansebakken,,,,...,,spiders,,,,,natural grassland,,,0-3355967414.JPG
199684,3410061329,2021-11-12T00:00:00,,11.0,12.0,DK,Hornbæk Havn,,,,...,,spiders,,,,,dune,,,0-3410061329.JPG
202272,3392601328,2021-10-17T00:00:00,,10.0,17.0,DK,Søholt Storskov,,,,...,,spiders,,,,,bog,,,0-3392601328.JPG
208714,3417635304,2021-12-05T00:00:00,,12.0,5.0,DK,Bistrup Hegn,,,,...,,spiders,,,,,natural grassland,,,0-3417635304.JPG
218883,3027625304,2021-01-24T00:00:00,,1.0,24.0,DK,Espergærde,,,,...,,spiders,,,,,gravel or clay pit,,,0-3027625304.JPG
232546,3380901444,2021-10-01T00:00:00,,10.0,1.0,DK,Almindingen,,,,...,,spiders,,,,,Forest bog,,,0-3380901444.JPG


In [21]:
all_data['class_id'].unique()

array([1273,  708,  535, ...,  436,  414,  331])

In [22]:
#观察二级分类是否正确，测试集看不了
class_mapping = {label:idx for idx,label in enumerate(set(all_data['genus']))}
all_data['genus_id'] = all_data['genus'].map(class_mapping)

class_to_genus = np.zeros(len(all_data['class_id'].unique()))
for species in all_data['class_id'].unique():
    #print(all_data[all_data['class_id'] == species]['genus_id'])
    class_to_genus[species] = all_data[all_data['class_id'] == species]['genus_id'].unique()[0]

In [23]:
class_to_genus[0]

58.0

### Extracting Species distribution

In [24]:
class_priors = np.zeros(len(all_data['class_id'].unique()))
for species in all_data['class_id'].unique():
    class_priors[species] = len(all_data[all_data['class_id'] == species])

class_priors = class_priors/sum(class_priors)

### Extracting species-month distribution

In [25]:
month_distributions = {}

for _, observation in tqdm.tqdm(all_data.iterrows(), total=len(all_data)):
    month = str(observation.month)
    class_id = observation.class_id
    if month not in month_distributions:        
        month_distributions[month] = np.zeros(len(all_data['class_id'].unique()))
    else:
        month_distributions[month][class_id] += 1

for key, value in month_distributions.items():
    month_distributions[key] = value / sum(value)

100%|██████████| 177170/177170 [00:14<00:00, 12229.80it/s]


### Extracting species-habitat distribution

In [26]:
habitat_distributions = {}

for _, observation in tqdm.tqdm(all_data.iterrows(), total=len(all_data)):
    habitat = observation.Habitat
    class_id = observation.class_id
    if habitat not in habitat_distributions:        
        habitat_distributions[habitat] = np.zeros(len(all_data['class_id'].unique()))
    else:
        habitat_distributions[habitat][class_id] += 1

for key, value in habitat_distributions.items():
    habitat_distributions[key] = value / sum(value)

100%|██████████| 177170/177170 [00:14<00:00, 12304.83it/s]


### Extracting species-substrate distribution

In [27]:
substrate_distributions = {}

for _, observation in tqdm.tqdm(all_data.iterrows(), total=len(all_data)):
    substrate = observation.Substrate
    class_id = observation.class_id
    if substrate not in  substrate_distributions:        
        substrate_distributions[substrate] = np.zeros(len(all_data['class_id'].unique()))
    else:
        substrate_distributions[substrate][class_id] += 1

for key, value in substrate_distributions.items():
    substrate_distributions[key] = value / sum(value)

100%|██████████| 177170/177170 [00:14<00:00, 12326.23it/s]


## Predicting with trained network

In [28]:
# Directly read our logits csv document
train_csv =pd.read_csv('/home/data3/changhao/WorkSpaceRecord/FGVC2022_Fungi/fungi_score_0421/trainscore_tf_efficientnet_b7_ns_freeze2_Crop10_acc.csv')
test_csv = pd.read_csv('/home/data3/changhao/WorkSpaceRecord/FGVC2022_Fungi/fungi_score_0421/testscore_tf_efficientnet_b7_ns_freeze2_Crop10_acc.csv')

In [29]:
scores = pd.DataFrame()
scores['ObservationId'] = train_csv['ObservationId']
preds_raw = softmax(train_csv.iloc[:,1:],axis=1)
scores = pd.concat([scores['ObservationId'], preds_raw], axis=1)
group_scores = scores.groupby(['ObservationId']).mean().reset_index()
train_csv = group_scores


In [30]:
scores = pd.DataFrame()
scores['ObservationId'] = test_csv['ObservationId']
preds_raw = softmax(test_csv.iloc[:,1:],axis=1)
scores = pd.concat([scores['ObservationId'], preds_raw], axis=1)
group_scores = scores.groupby(['ObservationId']).mean().reset_index()
test_csv = group_scores




In [31]:
train_csv = pd.merge( train_csv, all_data, on='ObservationId')
test_csv = pd.merge( test_csv, metadata, how='left',on='ObservationId')


In [32]:
train_csv['predId'] = np.argmax(np.array(train_csv.iloc[:, 1:1605]), axis=1)

In [33]:
train_csv_data = train_csv[train_csv.ObservationId.isin( train_metadata['ObservationId'])]
vali_csv_data = train_csv[train_csv.ObservationId.isin( vali_metadata['ObservationId'])]

In [34]:
vanilla_f1 = f1_score(train_csv_data['class_id'], train_csv_data['predId'], average='macro')
vanilla_accuracy = accuracy_score(train_csv_data['class_id'], train_csv_data['predId'])
vanilla_recall_3 = top_k_accuracy_score(train_csv_data['class_id'], np.array(train_csv_data.iloc[:, 1:1605]), k=3)
vanilla_recall_5 = top_k_accuracy_score(train_csv_data['class_id'], np.array(train_csv_data.iloc[:, 1:1605]), k=5)
vanilla_recall_10 = top_k_accuracy_score(train_csv_data['class_id'], np.array(train_csv_data.iloc[:, 1:1605]), k=10)

print('Vanilla:', vanilla_f1, vanilla_accuracy, vanilla_recall_3, vanilla_recall_5, vanilla_recall_10)

Vanilla: 0.986697949602212 0.9887209686626807 0.9997115644623381 0.9999399092629871 0.9999759637051948


In [35]:
vanilla_f1 = f1_score(vali_csv_data['class_id'], vali_csv_data['predId'], average='macro')
vanilla_accuracy = accuracy_score(vali_csv_data['class_id'], vali_csv_data['predId'])
vanilla_recall_3 = top_k_accuracy_score(vali_csv_data['class_id'], np.array(vali_csv_data.iloc[:, 1:1605]), k=3)
vanilla_recall_5 = top_k_accuracy_score(vali_csv_data['class_id'], np.array(vali_csv_data.iloc[:, 1:1605]), k=5)
vanilla_recall_10 = top_k_accuracy_score(vali_csv_data['class_id'], np.array(vali_csv_data.iloc[:, 1:1605]), k=10)

print('Vanilla:', vanilla_f1, vanilla_accuracy, vanilla_recall_3, vanilla_recall_5, vanilla_recall_10)

Vanilla: 0.908249052816525 0.9348529517509168 0.9860142374343855 0.9921622204645143 0.9956137197094989


In [36]:
GT_lbls = vali_csv_data['class_id']
preds_raw = list(np.array(vali_csv_data.iloc[:, 1:1605]))
#image_paths.extend(paths)
months = vali_csv_data['month']
subs = vali_csv_data['Substrate']
habitats = vali_csv_data['Habitat']

In [37]:
test_GT_lbls = test_csv['class_id']
test_preds_raw = list(np.array(test_csv.iloc[:, 1:1605]))
#image_paths.extend(paths)
test_months = test_csv['month']
test_subs = test_csv['Substrate']
test_habitats = test_csv['Habitat']

### Weighting by Habitat

In [38]:
wrong_predictions_H = []
weighted_predictions_H = []
weighted_predictions_raw_H = []
prior_ratio_H = []

for lbl, preds, hab in tqdm.tqdm(zip(GT_lbls, preds_raw, habitats), total=len(GT_lbls)):
    
    habitat_dist = habitat_distributions[hab]
    #preds = softmax(preds)
    
    p_habitat = (preds * habitat_dist) / sum(preds * habitat_dist)
    prior_ratio = p_habitat / class_priors
    max_index = np.argmax(prior_ratio * preds)        
    
    prior_ratio_H.append(prior_ratio)
    weighted_predictions_raw_H.append(prior_ratio * preds)
    weighted_predictions_H.append(max_index)
    
    if lbl != max_index:
        wrong_predictions_H.append([lbl, hab])

f1 = f1_score(vali_csv_data['class_id'], weighted_predictions_H, average='macro')
accuracy = accuracy_score(vali_csv_data['class_id'], weighted_predictions_H)
recall_3 = top_k_accuracy_score(vali_csv_data['class_id'], weighted_predictions_raw_H, k=3)
print('Habitat:', f1, accuracy, recall_3)
print('Habitat dif:', np.around(f1-vanilla_f1, 3), np.around((accuracy-vanilla_accuracy) * 100, 2), np.around((recall_3-vanilla_recall_3)*100, 2))

100%|██████████| 27814/27814 [00:09<00:00, 2832.84it/s]


Habitat: 0.9170229995081411 0.9383404041130367 0.9875242683540663
Habitat dif: 0.009 0.35 0.15


In [39]:
#test_wrong_predictions_H = []
test_weighted_predictions_H = []
test_weighted_predictions_raw_H = []
test_prior_ratio_H = []

for preds, hab in tqdm.tqdm(zip(test_preds_raw, test_habitats), total=len(test_habitats)):
    
    habitat_dist = habitat_distributions[hab]
    #preds = softmax(preds)
    
    p_habitat = (preds * habitat_dist) / sum(preds * habitat_dist)
    prior_ratio = p_habitat / class_priors
    max_index = np.argmax(prior_ratio * preds)        
    
    test_prior_ratio_H.append(prior_ratio)
    test_weighted_predictions_raw_H.append(prior_ratio * preds)
    test_weighted_predictions_H.append(max_index)
    

100%|██████████| 59420/59420 [00:19<00:00, 3047.00it/s]


### Weighting by Month

In [48]:
wrong_predictions_M = []
weighted_predictions_M = []
weighted_predictions_raw_M = []
prior_ratio_M = []

for lbl, preds, month in tqdm.tqdm(zip(GT_lbls, preds_raw, months), total=len(GT_lbls)):
    
    month_dist = month_distributions[str(float(month))]
    #preds = softmax(preds)
    
    p_month = (preds * month_dist) / sum(preds * month_dist)
    prior_ratio = p_month / class_priors        
    max_index = np.argmax(prior_ratio * preds)     
    
    prior_ratio_M.append(prior_ratio)
    weighted_predictions_raw_M.append(prior_ratio * preds)
    weighted_predictions_M.append(max_index)
    
    if lbl != max_index:
        wrong_predictions_M.append([lbl, month])

f1 = f1_score(vali_csv_data['class_id'], weighted_predictions_M, average='macro')
accuracy = accuracy_score(vali_csv_data['class_id'], weighted_predictions_M)
recall_3 = top_k_accuracy_score(vali_csv_data['class_id'], weighted_predictions_raw_M, k=3)
print('Month:', f1, accuracy, recall_3)
print('Month dif:', np.around(f1-vanilla_f1, 3), np.around((accuracy-vanilla_accuracy) * 100, 2), np.around((recall_3-vanilla_recall_3)*100, 2))

100%|██████████| 27814/27814 [00:09<00:00, 2879.28it/s]


Month: 0.9145712608940828 0.937117998130438 0.9871287840655785
Month dif: 0.006 0.23 0.11


In [43]:
test_weighted_predictions_M = []
test_weighted_predictions_raw_M = []
test_prior_ratio_M = []

for preds, month in tqdm.tqdm(zip(test_preds_raw, test_months), total=len(test_months)):
    
    month_dist = month_distributions[str(float(month))]
    #preds = softmax(preds)
    
    p_month = (preds * month_dist) / sum(preds * month_dist)
    prior_ratio = p_month / class_priors        
    max_index = np.argmax(prior_ratio * preds)     
    
    test_prior_ratio_M.append(prior_ratio)
    test_weighted_predictions_raw_M.append(prior_ratio * preds)
    test_weighted_predictions_M.append(max_index)


100%|██████████| 59420/59420 [00:20<00:00, 2903.56it/s]


### Weighting by Substrate

In [42]:
wrong_predictions_S = []
weighted_predictions_S = []
weighted_predictions_raw_S = []
prior_ratio_S = []

for lbl, preds, sub in tqdm.tqdm(zip(GT_lbls, preds_raw, subs), total=len(GT_lbls)):

    substrate_dist = substrate_distributions[sub]
    #preds = softmax(preds)
    
    p_substrate = (preds * substrate_dist) / sum(preds * substrate_dist)
    prior_ratio = p_substrate / class_priors
    max_index = np.argmax(prior_ratio * preds)     
    
    prior_ratio_S.append(prior_ratio)
    weighted_predictions_raw_S.append(prior_ratio * preds)
    weighted_predictions_S.append(max_index)
    
    if lbl != max_index:
        wrong_predictions_S.append([lbl, sub])
        
f1 = f1_score(vali_csv_data['class_id'], weighted_predictions_S, average='macro')
accuracy = accuracy_score(vali_csv_data['class_id'], weighted_predictions_S)
recall_3 = top_k_accuracy_score(vali_csv_data['class_id'], weighted_predictions_raw_S, k=3)
print('Substrate:', f1, accuracy, recall_3)
print('Substrate dif:', np.around(f1-vanilla_f1, 3), np.around((accuracy-vanilla_accuracy) * 100, 2), np.around((recall_3-vanilla_recall_3)*100, 2))

100%|██████████| 27814/27814 [00:09<00:00, 2839.29it/s]


Substrate: 0.9110449371879563 0.936003451499245 0.9867692528942259
Substrate dif: 0.003 0.12 0.08


In [46]:
test_weighted_predictions_S = []
test_weighted_predictions_raw_S = []
test_prior_ratio_S = []

for preds, sub in tqdm.tqdm(zip(test_preds_raw, test_subs), total=len(test_subs)):

    if sub == 'spiders':
        p_substrate = preds #直接跳过
    else:
        substrate_dist = substrate_distributions[sub]
        #preds = softmax(preds)
        
        p_substrate = (preds * substrate_dist) / sum(preds * substrate_dist)
    prior_ratio = p_substrate / class_priors
    max_index = np.argmax(prior_ratio * preds)     
    
    test_prior_ratio_S.append(prior_ratio)
    test_weighted_predictions_raw_S.append(prior_ratio * preds)
    test_weighted_predictions_S.append(max_index)

100%|██████████| 59420/59420 [00:20<00:00, 2886.73it/s]


### Weighting by Month, Substrate and Habitat

In [53]:
from scipy.special import softmax

wrong_predictions_all = []
merged_predictions = []
merged_predictions_raw = []

wrong_predictions_all_genus = []
merged_predictions_genus = []

for lbl, o, m, s, h in tqdm.tqdm(zip(GT_lbls, preds_raw, prior_ratio_M, prior_ratio_S, prior_ratio_H), total=len(prior_ratio_M)):
    
    #preds = softmax(preds)
 
    m_pred = (preds * m * s * h) / sum((preds * m * s * h))
    max_index = np.argmax(m_pred)
    
    merged_predictions_raw.append(m_pred)    
    merged_predictions.append(max_index)
    
    merged_predictions_genus.append(class_to_genus[max_index])
    
    if lbl != max_index:
        wrong_predictions_all.append([lbl, max_index])
    
        if class_to_genus[lbl] != class_to_genus[max_index]:
            wrong_predictions_all_genus.append([lbl, max_index])
            
f1 = f1_score(vali_csv_data['class_id'], merged_predictions, average='macro')
accuracy = accuracy_score(vali_csv_data['class_id'], merged_predictions)
recall_3 = top_k_accuracy_score(vali_csv_data['class_id'], merged_predictions_raw, k=3)
recall_5 = top_k_accuracy_score(vali_csv_data['class_id'], merged_predictions_raw, k=5)
recall_10 = top_k_accuracy_score(vali_csv_data['class_id'], merged_predictions_raw, k=10)

print('All:', f1, accuracy, recall_3, recall_5, recall_10)
print('All dif:', np.around(f1-vanilla_f1, 3), np.around((accuracy-vanilla_accuracy) * 100, 2), np.around((recall_3-vanilla_recall_3)*100, 2))

100%|██████████| 27814/27814 [00:09<00:00, 3019.61it/s]


All: 0.9258198795500877 0.9425469188178615 0.9890702523908823 0.9942475012583591 0.9972675630977206
All dif: 0.018 0.77 0.31


In [52]:
# f1 = f1_score(test_metadata['genus_id'], merged_predictions_genus, average='macro')
# accuracy = accuracy_score(test_metadata['genus_id'], merged_predictions_genus)
# print('Genera lvl performance:', np.around(f1*100, 2), np.around(accuracy*100), 2)

In [54]:
test_merged_predictions = []
test_merged_predictions_raw = []


for o, m, s, h in tqdm.tqdm(zip(test_preds_raw, test_prior_ratio_M, test_prior_ratio_S, test_prior_ratio_H), total=len(test_prior_ratio_M)):
    
    #preds = softmax(preds)
 
    m_pred = (preds * m * s * h) / sum((preds * m * s * h))
    max_index = np.argmax(m_pred)
    
    test_merged_predictions_raw.append(m_pred)    
    test_merged_predictions.append(max_index)
    


  m_pred = (preds * m * s * h) / sum((preds * m * s * h))
100%|██████████| 59420/59420 [00:20<00:00, 2895.11it/s]


In [55]:
test_merged_predictions

[889,
 1128,
 671,
 1401,
 1184,
 386,
 93,
 836,
 386,
 356,
 1213,
 492,
 1543,
 854,
 1437,
 522,
 1182,
 1420,
 1395,
 1484,
 1058,
 553,
 100,
 223,
 1184,
 1486,
 496,
 744,
 981,
 885,
 1274,
 213,
 1357,
 885,
 1482,
 99,
 836,
 608,
 703,
 471,
 553,
 1482,
 1014,
 223,
 1182,
 1491,
 195,
 196,
 469,
 671,
 957,
 88,
 857,
 489,
 1411,
 489,
 1491,
 1401,
 401,
 1500,
 1491,
 553,
 1039,
 1437,
 357,
 147,
 1401,
 1543,
 130,
 471,
 136,
 580,
 1059,
 1274,
 975,
 249,
 1595,
 1395,
 467,
 262,
 1020,
 672,
 1014,
 1184,
 739,
 1491,
 88,
 538,
 492,
 492,
 1128,
 489,
 1575,
 1440,
 147,
 1491,
 88,
 856,
 580,
 1175,
 374,
 88,
 1182,
 1136,
 356,
 267,
 1401,
 170,
 1595,
 185,
 591,
 1491,
 695,
 1128,
 1182,
 1491,
 1400,
 1182,
 1483,
 1437,
 75,
 1437,
 934,
 1483,
 124,
 1175,
 124,
 1491,
 1543,
 685,
 692,
 1121,
 1061,
 111,
 740,
 687,
 170,
 856,
 1296,
 973,
 170,
 223,
 1543,
 387,
 1491,
 1483,
 1596,
 74,
 1175,
 88,
 739,
 1438,
 1491,
 1014,
 745,
 538,
 12

In [59]:
submit = pd.DataFrame()
# ObservationId,ClassId
submit['ObservationId'] = test_csv['ObservationId']
submit['ClassId'] = np.argmax(np.array(test_csv.iloc[:, 1:1605]), axis=1)



In [82]:
LABELS= []
for i in list(metadata[metadata['Substrate']=='spiders']['ObservationId']):
    print(submit[submit["ObservationId"]==i].index[0])
    submit.loc[submit[submit["ObservationId"]==i].index[0],'ClassId'] = -1
 


38199
32371
54213
45533
56729
3833
38521


In [84]:
submit.to_csv('lala.csv', index=False, header=True)

In [85]:
submit.head(5)

Unnamed: 0,ObservationId,ClassId
0,3008822340,889
1,3008822343,1128
2,3008822344,671
3,3008822345,1401
4,3008822346,1184


In [86]:
submit[submit["ObservationId"]==3355967414]

Unnamed: 0,ObservationId,ClassId
32371,3355967414,-1


In [61]:
metadata[metadata['Substrate']=='spiders']

Unnamed: 0,ObservationId,eventDate,year,month,day,countryCode,locality,taxonID,scientificName,kingdom,...,ImageUniqueID,Substrate,rightsHolder,Latitude,Longitude,CoorUncert,Habitat,image_path,class_id,filename
181165,3380898495,2021-10-03T00:00:00,,10.0,3.0,DK,Risgårde,,,,...,,spiders,,,,,natural grassland,,,0-3380898495.JPG
183799,3355967414,2021-09-18T00:00:00,,9.0,18.0,DK,Skansebakken,,,,...,,spiders,,,,,natural grassland,,,0-3355967414.JPG
199684,3410061329,2021-11-12T00:00:00,,11.0,12.0,DK,Hornbæk Havn,,,,...,,spiders,,,,,dune,,,0-3410061329.JPG
202272,3392601328,2021-10-17T00:00:00,,10.0,17.0,DK,Søholt Storskov,,,,...,,spiders,,,,,bog,,,0-3392601328.JPG
208714,3417635304,2021-12-05T00:00:00,,12.0,5.0,DK,Bistrup Hegn,,,,...,,spiders,,,,,natural grassland,,,0-3417635304.JPG
218883,3027625304,2021-01-24T00:00:00,,1.0,24.0,DK,Espergærde,,,,...,,spiders,,,,,gravel or clay pit,,,0-3027625304.JPG
232546,3380901444,2021-10-01T00:00:00,,10.0,1.0,DK,Almindingen,,,,...,,spiders,,,,,Forest bog,,,0-3380901444.JPG


In [87]:
submit.ClassId.value_counts()

937     465
492     451
1486    438
1491    434
88      407
       ... 
353       1
197       1
154       1
930       1
1521      1
Name: ClassId, Length: 1561, dtype: int64

In [90]:
all_data.class_id.value_counts()

1486    1119
492     1012
687      970
88       921
1491     853
        ... 
1387       7
1055       7
530        6
498        6
1545       5
Name: class_id, Length: 1604, dtype: int64

In [93]:
submit.shape

(59420, 2)

In [92]:
all_data.shape

(177170, 35)