In [123]:
from glob import glob
import numpy as np
import os
import pandas as pd
import tabulate

In [124]:
target_file = '.txt'
target_folder = 'E:/Fragmentos/'
csv_file = 'assets/class_ana.csv'

In [125]:
# Look for files in the target folder with the target extension
# Recursive find in subfolders
files = glob(target_folder + '/*/*' + target_file, recursive=True)
print('Found {} files'.format(len(files)))

Found 1052 files


In [126]:
# read csv to link names and class
df = pd.read_csv(csv_file, sep=',', names=['Id', 'Name', 'Class'], header=0)
# Third column to int
df['Class'] = df['Class'].astype(int)
# Create a dictionary with first column as key and second column as value
class_dict = dict(zip(df['Name'], df['Class']))

In [127]:
class_dict.keys()

dict_keys(['AL_03A', 'AL_03B', 'AL_03C', 'AL_03D', 'AL_03E', 'AL_03F', 'AL_03H', 'AL_03I', 'AL_03J', 'AL_03K', 'AL_03L', 'AL_05A', 'AL_05B', 'AL_05F', 'AL_05G', 'AL_05H', 'AL_05I', 'AL_05J', 'AL_05K', 'AL_05L', 'AL_06B', 'AL_06C', 'AL_06D', 'AL_06E', 'AL_06F', 'AL_06G', 'AL_06H', 'AL_06I', 'AL_06J', 'AL_06K', 'AL_06L', 'AL_07A', 'AL_07B', 'AL_07C', 'AL_07F', 'AL_07H', 'AL_07I', 'AL_07J', 'AL_07K', 'AL_08A', 'AL_08B', 'AL_09A', 'AL_10A', 'AL_10B', 'AL_10C', 'AL_11C', 'AL_11D', 'AL_11E', 'AL_12A', 'AL_12B', 'AL_12D', 'AL_13B', 'AL_14B', 'AL_16A', 'AL_16B', 'AL_16C', 'AL_16E', 'AT_001', 'AT_002', 'AT_003', 'AT_004', 'AT_005', 'AT_006', 'AT_007', 'AT_009', 'AT_011', 'AT_012', 'AT_013', 'AT_014', 'AT_015', 'AT_016', 'AT_017', 'AT_018', 'AT_019', 'AT_020', 'AT_022', 'AT_023', 'AT_024', 'AT_025', 'AT_026', 'AT_027', 'AT_028', 'AT_029', 'AT_030', 'AT_031', 'AT_032', 'AT_033', 'BA_1_1', 'BA_1_2', 'BA_101_1', 'BA_101_2', 'BA_101_3', 'BA_101_4', 'BA_102_1', 'BA_102_2', 'BA_102_3', 'BA_102_4', 'BA

In [145]:
# Classify files per class
classes = {}
not_id_folders = 0

for file in files:
    root_folder = os.path.dirname(file)
    root_folder = os.path.basename(root_folder)
    if root_folder not in class_dict.keys():
        not_id_folders += 1
        continue
        
    class_id = class_dict[root_folder]
    if class_id not in classes:
            classes[class_id] = []
    classes[class_id].append(file)
    
print('Not identified folders: {}'.format(not_id_folders))

Not identified folders: 51


In [146]:
# Print classes and number of files per class
for class_id in classes.keys():
    print('Class: {} - {} files'.format(class_id, len(classes[class_id])))

Class: 11 - 96 files
Class: 10 - 321 files
Class: 6 - 50 files
Class: 7 - 48 files
Class: 5 - 54 files
Class: 8 - 269 files
Class: 9 - 19 files
Class: 1 - 26 files
Class: 3 - 66 files
Class: 4 - 10 files
Class: 2 - 42 files


In [147]:
print(classes.keys())
print('Found {} classes'.format(len(classes.keys())))

dict_keys([11, 10, 6, 7, 5, 8, 9, 1, 3, 4, 2])
Found 11 classes


In [148]:
occupancy_intervals = [[15, 20], [20, 30], [30, 100]]
matrix = np.zeros((len(classes.keys()), len(occupancy_intervals)))

In [150]:
for i, class_id in enumerate(classes.keys()):
    print('Class: {}'.format(class_id))
    for file in classes[class_id]:
        print('File: {}'.format(file))
        df = pd.read_csv(file, sep='\t')
        percentages = df['Percentage'].tolist()
        
        for percentage in percentages:
            for j, interval in enumerate(occupancy_intervals):
                if interval[0] <= percentage * 100 < interval[1]:
                    matrix[class_id - 1, j] += 1
                    break

Class: 11
File: E:/Fragmentos\AL_03A\AL_03A_metadata.txt
File: E:/Fragmentos\AL_03B\AL_03B_metadata.txt
File: E:/Fragmentos\AL_03C\AL_03C_metadata.txt
File: E:/Fragmentos\AL_03D\AL_03D_metadata.txt
File: E:/Fragmentos\AL_03E\AL_03E_metadata.txt
File: E:/Fragmentos\AL_03F\AL_03F_metadata.txt
File: E:/Fragmentos\AL_03H\AL_03H_metadata.txt
File: E:/Fragmentos\AL_03I\AL_03I_metadata.txt
File: E:/Fragmentos\AL_03J\AL_03J_metadata.txt
File: E:/Fragmentos\AL_03K\AL_03K_metadata.txt
File: E:/Fragmentos\AL_03L\AL_03L_metadata.txt
File: E:/Fragmentos\AL_09A\AL_09A_metadata.txt
File: E:/Fragmentos\BA_102_4\BA_102_4_metadata.txt
File: E:/Fragmentos\BA_107_1\BA_107_1_metadata.txt
File: E:/Fragmentos\BA_122_1\BA_122_1_metadata.txt
File: E:/Fragmentos\BA_138_1\BA_138_1_metadata.txt
File: E:/Fragmentos\BA_139_1\BA_139_1_metadata.txt
File: E:/Fragmentos\BA_141_1\BA_141_1_metadata.txt
File: E:/Fragmentos\BA_142_3\BA_142_3_metadata.txt
File: E:/Fragmentos\BA_144_3\BA_144_3_metadata.txt
File: E:/Fragmento

In [151]:
matrix_relative = matrix / np.sum(matrix, axis=0)
matrix_relative = np.round(matrix_relative, 4)

In [152]:
headers = [str(interval[0]) + '-' + str(interval[1]) for interval in occupancy_intervals]
print(tabulate.tabulate(matrix_relative, headers=headers))

  15-20    20-30    30-100
-------  -------  --------
 0.0267   0.027     0.0253
 0.0422   0.0435    0.0414
 0.0687   0.0677    0.0655
 0.0106   0.0098    0.0098
 0.0563   0.0549    0.0534
 0.0528   0.052     0.0488
 0.0482   0.0484    0.0477
 0.2836   0.276     0.2655
 0.0194   0.0195    0.0187
 0.3015   0.3079    0.3252
 0.09     0.0932    0.0986


In [155]:
# Create a LaTeX table from matrices
latex = ''
for header in headers:
    latex += header
    if header != headers[-1]:
        latex += ' & '
    else:
        latex += ' \\\\\n'

sorted_keys = sorted(classes.keys())
for i, class_id in enumerate(sorted_keys):
    if i % 2 == 1:
        latex += '\\rowcolor[HTML]{EFEFEF}\n'
    latex += 'Class ' + str(class_id) + ' & '
    for j, interval in enumerate(occupancy_intervals):
        latex += str(int(matrix[class_id - 1, j]))
        rounded_percentage = np.round(matrix_relative[class_id - 1, j] * 100, 2)
        latex += ' (' + str(rounded_percentage) + '\%)'
        if j < len(occupancy_intervals) - 1:
            latex += ' & '
    latex += ' \\\\\n'

latex += '\\\\\midrule\n'
latex += 'Total & '
for j, interval in enumerate(occupancy_intervals):
    latex += str(int(np.sum(matrix[:, j])))
    if j < len(occupancy_intervals) - 1:
        latex += ' & '

latex += ' \\\\\n'
print(latex)

15-20 & 20-30 & 30-100 \\
Class 1 & 2437 (2.67\%) & 2436 (2.7\%) & 2174 (2.53\%) \\
\rowcolor[HTML]{EFEFEF}
Class 2 & 3843 (4.22\%) & 3930 (4.35\%) & 3556 (4.14\%) \\
Class 3 & 6259 (6.87\%) & 6115 (6.77\%) & 5629 (6.55\%) \\
\rowcolor[HTML]{EFEFEF}
Class 4 & 970 (1.06\%) & 889 (0.98\%) & 843 (0.98\%) \\
Class 5 & 5137 (5.63\%) & 4963 (5.49\%) & 4586 (5.34\%) \\
\rowcolor[HTML]{EFEFEF}
Class 6 & 4812 (5.28\%) & 4701 (5.2\%) & 4195 (4.88\%) \\
Class 7 & 4392 (4.82\%) & 4376 (4.84\%) & 4099 (4.77\%) \\
\rowcolor[HTML]{EFEFEF}
Class 8 & 25859 (28.36\%) & 24939 (27.6\%) & 22808 (26.55\%) \\
Class 9 & 1767 (1.94\%) & 1761 (1.95\%) & 1603 (1.87\%) \\
\rowcolor[HTML]{EFEFEF}
Class 10 & 27485 (30.15\%) & 27817 (30.79\%) & 27939 (32.52\%) \\
Class 11 & 8205 (9.0\%) & 8420 (9.32\%) & 8473 (9.86\%) \\
\\\midrule
Total & 91166 & 90347 & 85905 \\
