# Cleanlab - Procesamiento de etiquetas

In [None]:
# pandas is needed by the data-loading cell right below (pd.read_csv); without
# this import a fresh kernel fails there with NameError.
import pandas as pd
from cleanlab import Datalab

SEED = 456  # fixed seed, for reproducibility

In [None]:
# Load the embeddings together with their labels.
# In principle any CSV of embeddings works here, whatever its origin or size.
df = pd.read_csv('output_file.csv')

# Every column other than the label and the file name is used as a feature.
non_feature_columns = ['label', 'file name']
feature_frame = df.drop(columns=non_feature_columns)
embeddings_array = feature_frame.values
y = df['label'].values

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_predict

# Number of cross-validation folds: lower it to cut the run time, raise it
# for better results.
num_crossval_folds = 15

# Base model used to obtain the predicted probabilities.
model = RandomForestClassifier(random_state=SEED)

# Class probabilities for every row, produced through cross-validation.
pred_probs = cross_val_predict(
    model,
    embeddings_array,
    y,
    cv=num_crossval_folds,
    method="predict_proba",
)

In [None]:
from sklearn.metrics import accuracy_score

# Predicted class = column index of the highest probability in each row.
# NOTE(review): comparing these indices with the raw labels assumes the labels
# are the integers 0..K-1 — confirm for other datasets.
predicted_labels = pred_probs.argmax(axis=1)
given_labels = df.label.values
cv_accuracy = accuracy_score(given_labels, predicted_labels)
print(f"Cross-validated estimate of accuracy on held-out data: {cv_accuracy}")

Cross-validated estimate of accuracy on held-out data: 0.7181690140845071


In [None]:
# Audit the dataset for label issues only, driven by the cross-validated
# probabilities computed above.
lab = Datalab(df, label_name="label")
label_check_only = {"label": {}}
lab.find_issues(pred_probs=pred_probs, issue_types=label_check_only)

Finding label issues ...

Audit complete. 1117 issues found in the dataset.


In [None]:
# Print the summary report for the issues found so far.
lab.report()

Dataset Information: num_examples: 7100, num_classes: 4

Here is a summary of various issues found in your data:

issue_type  num_issues
     label        1117

Learn about each issue: https://docs.cleanlab.ai/stable/cleanlab/datalab/guide/issue_type_description.html
See which examples in your dataset exhibit each issue via: `datalab.get_issues(<ISSUE_NAME>)`

Data indices corresponding to top examples of each issue are shown below.


----------------------- label issues -----------------------

About this issue:
	Examples whose given label is estimated to be potentially incorrect
    (e.g. due to annotation error) are flagged as having label issues.
    

Number of examples with this issue: 1117
Overall dataset quality in terms of this issue: 0.8785

Examples representing most severe instances of this issue:
      is_label_issue  label_score  given_label  predicted_label
1636            True          0.0            3                2
1642            True          0.0            3     

In [None]:
# Per-example results of the label check: issue flag, label score, given label
# and predicted label. The trailing expression displays the first rows.
label_issues = lab.get_issues("label")
label_issues.head()

Unnamed: 0,is_label_issue,label_score,given_label,predicted_label
0,True,0.05,0,2
1,True,0.14,0,2
2,True,0.11,0,2
3,True,0.1,0,2
4,True,0.09,0,1


In [None]:
# Keep only the rows flagged as label issues, then order them from the lowest
# label score upwards; the resulting index identifies the most suspicious rows.
flagged_mask = label_issues["is_label_issue"].eq(True)
identified_label_issues = label_issues[flagged_mask]
ranked_issues = identified_label_issues.sort_values("label_score")
lowest_quality_labels = ranked_issues.index

print(f"Here are indices of the most likely errors: \n {lowest_quality_labels.values}")

Here are indices of the most likely errors: 
 [1634 1661 1601 ...  706  807  450]


In [None]:
# File names of the flagged rows, in the order given by lowest_quality_labels.
# NOTE(review): lowest_quality_labels holds index values of label_issues and is
# used positionally here — this assumes those values are row positions in df.
df['file name'].iloc[lowest_quality_labels]

1634    2022-06-14--05-57-50_1__segment1.wav
1661    2022-06-14--09-56-21_1__segment4.wav
1601    2022-06-14--00-02-46_1__segment4.wav
1642    2022-06-14--06-57-25_1__segment3.wav
1605    2022-06-14--01-01-54_1__segment2.wav
                        ...                 
673     2022-06-09--14-05-39_2__segment5.wav
96      2022-06-06--09-28-32_2__segment0.wav
706     2022-06-09--17-03-21_2__segment2.wav
807     2022-06-10--01-43-29_2__segment1.wav
450     2022-06-08--19-20-13_2__segment3.wav
Name: file name, Length: 1117, dtype: object

In [None]:
# Rows of the source data and predicted labels for the flagged examples.
flagged_rows = df.iloc[lowest_quality_labels]
flagged_predictions = label_issues['predicted_label'].iloc[lowest_quality_labels]

# One row per flagged example: file name, label as given, label predicted.
results = pd.DataFrame()
results['file name'] = flagged_rows['file name']
results['label original'] = flagged_rows['label']
results['label predicted'] = flagged_predictions

# Order by file name and write the table to disk (without the index).
results = results.sort_values(by='file name')
results.to_csv('resultado_posibles_errores_sort.csv', index=False)

In [None]:
# Run the audit again, this time also passing the embeddings as features and
# without restricting issue_types (the earlier call requested only "label").
lab.find_issues(pred_probs=pred_probs, features=embeddings_array)

Finding null issues ...
Finding label issues ...




Finding outlier issues ...
Finding near_duplicate issues ...
Finding non_iid issues ...
Finding class_imbalance issues ...
Finding underperforming_group issues ...

Audit complete. 1682 issues found in the dataset.




In [None]:
# Print the summary report again, now covering the results of the wider audit.
lab.report()

Dataset Information: num_examples: 7100, num_classes: 4

Here is a summary of various issues found in your data:

    issue_type  num_issues
         label        1117
       outlier         530
near_duplicate          34
       non_iid           1

Learn about each issue: https://docs.cleanlab.ai/stable/cleanlab/datalab/guide/issue_type_description.html
See which examples in your dataset exhibit each issue via: `datalab.get_issues(<ISSUE_NAME>)`

Data indices corresponding to top examples of each issue are shown below.


----------------------- label issues -----------------------

About this issue:
	Examples whose given label is estimated to be potentially incorrect
    (e.g. due to annotation error) are flagged as having label issues.
    

Number of examples with this issue: 1117
Overall dataset quality in terms of this issue: 0.8785

Examples representing most severe instances of this issue:
      is_label_issue  label_score  given_label  predicted_label
1636            True      

# Procesar etiquetas erróneas

In [None]:
import pandas as pd

# First CSV: the detections of possibly wrong labels.
detections_df = pd.read_csv('resultado_posibles_errores_sort.csv')

# Second CSV: the descriptors of the files.
descriptors_df = pd.read_csv('output_file_50mfcc_4000Hz.csv')

# Base file name = the part of the file name before the '__segment' marker.
detections_df['file_base'] = detections_df['file name'].map(
    lambda segment_name: segment_name.split('__segment')[0]
)

# Minimum number of flagged segments of one base file that must share the same
# predicted label before that label is adopted for the file.
MIN_AGREEING_SEGMENTS = 3

# Count, per base file, how many flagged segments share each predicted label.
predicted_labels = (
    detections_df.groupby(['file_base', 'label predicted'])
    .size()
    .reset_index(name='count')
)

# Keep only the labels backed by enough segments of the same base file.
predicted_labels = predicted_labels[predicted_labels['count'] >= MIN_AGREEING_SEGMENTS]

# A base file with enough flagged segments can pass the threshold for more than
# one label. Turning such rows straight into a dict would silently keep
# whichever label happens to come last, so keep only the best-supported label
# per base file...
best_count = predicted_labels.groupby('file_base')['count'].transform('max')
predicted_labels = predicted_labels[predicted_labels['count'] == best_count]
# ...and when two labels are exactly tied, drop the file from the mapping so its
# existing label is kept instead of an arbitrary pick.
predicted_labels = predicted_labels.drop_duplicates(subset='file_base', keep=False)

# Mapping from base file name to the new label (one entry per base file).
file_base_to_new_label = predicted_labels.set_index('file_base')['label predicted'].to_dict()

def assign_new_label(row):
    """Return the label a descriptor row should carry after the correction.

    The base file name is the part of ``row['file name']`` before
    ``'__segment'``. If ``file_base_to_new_label`` has an entry for it, that
    entry is returned; otherwise the row's current ``'label'`` is returned.
    """
    base_name = row['file name'].partition('__segment')[0]
    current_label = row['label']
    return file_base_to_new_label.get(base_name, current_label)

# Compute the updated label row by row and store it in a new 'label_updated'
# column of the descriptors table.
updated_labels = descriptors_df.apply(assign_new_label, axis=1)
descriptors_df['label_updated'] = updated_labels

# Write the updated table to a new CSV file (without the index).
descriptors_df.to_csv('output_file_50mfcc_4000Hz_CLEAN.csv', index=False)

print("El proceso ha finalizado y el archivo ha sido guardado.")

El proceso ha finalizado y el archivo ha sido guardado.
