In [2]:
import sys 
import math 
import os 

import numpy as np  
import pandas as pd 
import csv
from numpy import save  

# Add the relative directory '../../src' to the module search path. This has to
# run before the two project imports below (presumably gdsc_utils and config
# live in that directory - confirm against the repository layout).
sys.path.append('../../src')

# Project-local names: the download helper, the project root path and the
# default bucket identifier used by the download cell.
from gdsc_utils import download_directory, PROJECT_DIR
from config import DEFAULT_BUCKET

# Switch the working directory to PROJECT_DIR so that the relative paths used in
# later cells ('data/...', 'class_weights/...') are resolved against it.
os.chdir(PROJECT_DIR)

In [3]:
# Fetch 'data/' through the project helper before the metadata CSVs are read.
# NOTE(review): presumably this downloads from the DEFAULT_BUCKET storage bucket
# into the working directory; the meaning of the `None` argument is not visible
# in this notebook - confirm in gdsc_utils.download_directory.
download_directory('data/', None, DEFAULT_BUCKET)

In [2]:
# Definition of function for calculating class weights
def calc_class_weights(df_sort, version):
    """
    Calculates and returns class weights for the classes found in the "label" column.

    Parameters
    ----------
    df_sort: DataFrame, metadata with a "label" column (species). Rows are counted
             per distinct label value, so the frame neither has to be sorted nor
             has to carry a 0..n-1 index (passing a sorted frame still works).
    version: int, version of classweights (this function has 3 different versions)
             1: total_number_of_files / number_of_files_per_class
             2: 1 - (number_of_files_per_class / total_number_of_files)
             3: total_number_of_files / ((number_of_classes) * number_of_files_per_class)

    Returns
    -------
    class_weights (np.ndarray): one weight per label present in df_sort,
                                ordered by ascending label value

    Raises
    ------
    ValueError: if version is not 1, 2 or 3
    """
    # Fail early and explicitly instead of printing a message and then crashing
    # on an unassigned result variable.
    if version not in (1, 2, 3):
        raise ValueError("This is not a given version for class weights for this function.")

    # np.unique returns the distinct labels in ascending order together with the
    # number of rows per label. Unlike a manual run-length scan this also counts
    # a class whose only row is the very last one and tolerates gaps in the
    # label numbering.
    _, class_total = np.unique(df_sort["label"].to_numpy(), return_counts=True)
    total = len(df_sort)
    n_classes = len(class_total)

    # Calculation of class weights
    if version == 1:
        return total / class_total
    if version == 2:
        return 1 - (class_total / total)
    return total / (n_classes * class_total)

In [None]:
# Load the metadata of both crop lengths and sort each table by label with a
# fresh 0..n-1 index (calc_class_weights documents its input as sorted by label).
df_3_5 = pd.read_csv('data/production_data/3-5s_crop/metadata.csv')                 # 3.5 sec cropped files
df_35_sort = df_3_5.sort_values(by=['label'], ascending=True, ignore_index=True)
df_6 = pd.read_csv('data/production_data/6s_crop/metadata.csv')                     # 6 sec. cropped files
df_6_sort = df_6.sort_values(by=['label'], ascending=True, ignore_index=True)

# Calculate class weights (all three versions for each crop length)
cw1_3_5 = calc_class_weights(df_35_sort,1)
cw2_3_5 = calc_class_weights(df_35_sort,2)
cw3_3_5 = calc_class_weights(df_35_sort,3)
cw1_6 = calc_class_weights(df_6_sort,1)
cw2_6 = calc_class_weights(df_6_sort,2)
cw3_6 = calc_class_weights(df_6_sort,3)

# Save class weights
# numpy.save does not create missing parent directories, so make sure the
# output folder exists before writing (no-op when it is already there).
os.makedirs('class_weights', exist_ok=True)
for out_path, weights in (
    ('class_weights/class_weights_1.npy', cw1_3_5),
    ('class_weights/class_weights_2.npy', cw2_3_5),
    ('class_weights/class_weights_3.npy', cw3_3_5),
    ('class_weights/class_weights6_1.npy', cw1_6),
    ('class_weights/class_weights6_2.npy', cw2_6),
    ('class_weights/class_weights6_3.npy', cw3_6),
):
    save(out_path, weights)