In [1]:
import librosa
import os
import pandas as pd
import numpy as np

# Set the current working directory to the parent of the folder the kernel
# started in. The target is resolved only once and remembered in PROJECT_ROOT,
# so re-running this cell does not climb one more level up on every execution.
if "PROJECT_ROOT" not in globals():
    PROJECT_ROOT = os.path.abspath("..")
os.chdir(PROJECT_ROOT)

In [20]:
# Read the file that describes how each audio is split into conversation parts
df_conv = pd.read_csv(
    "data/MSPCORPUS/Time_Labels/conversation_parts.txt",
    delimiter=";",
    header=None,
    names=['Conversation_Part', 'start_time', 'end_time'],
)

# Add two columns: the podcast number and the part number within that podcast.
# Both are parsed from the "<prefix>_<podcast>_<part>" identifier with a regex
# instead of fixed character positions, so a part number with more than one
# digit (10, 11, ...) is no longer truncated to its first digit.
conv_ids = df_conv['Conversation_Part'].str.extract(r'_(\d+)_(\d+)')
df_conv['PC_Num'] = conv_ids[0].astype(int)
df_conv['Part_Num'] = conv_ids[1].astype(int)

In [21]:
# Start time of part 1 of each podcast, keyed by podcast number (PC_Num).
mem = {}


def add_sync_time_columns(row):
    """Add 'm_start_time' and 'm_end_time' entries to a single row.

    For a row whose Part_Num is 1, its start_time is stored in the
    module-level ``mem`` dict under the row's PC_Num and 'm_start_time'
    is set to 0. For any other row, 'm_start_time' is start_time minus the
    value stored for that PC_Num. In both cases 'm_end_time' is end_time
    minus the stored value.

    Parameters
    ----------
    row : pandas.Series
        Must expose PC_Num, Part_Num, start_time and end_time.

    Returns
    -------
    pandas.Series
        The same row object, with the two new entries set.

    Raises
    ------
    KeyError
        If the row is not a part 1 and no part 1 with the same PC_Num has
        been processed before it.
    """
    podcast = row.PC_Num
    is_first_part = row.Part_Num == 1

    if is_first_part:
        # Remember where this podcast begins so later parts can be re-based on it.
        mem[podcast] = row.start_time

    offset = mem[podcast]
    row['m_start_time'] = 0 if is_first_part else row.start_time - offset
    row['m_end_time'] = row.end_time - offset

    return row

# One row per conversation part, with times re-based on the start of part 1:
# start becomes (start - first start) and end becomes (end - first start).
df_conv = (
    df_conv
    .apply(add_sync_time_columns, axis=1)
    .loc[:, ['Conversation_Part', 'm_start_time', 'm_end_time', 'PC_Num', 'Part_Num']]
    # The audio file name is the first 21 characters of the part id plus ".wav"
    .assign(Audio_Name=lambda parts: parts['Conversation_Part'].str[0:21] + ".wav")
    .rename(columns={'m_start_time': 'start_time', 'm_end_time': 'end_time'})
)

In [38]:
# Collect the conversation part, emotion and annotator of every annotation file
emociones = ['Arousal','Dominance','Valence']

# Each file name is sliced by fixed position: the last 8 characters are dropped
# to get the conversation-part id, and characters [-7:-4] give the annotator id
# (presumably "<part>_<annotator>.<ext>" with a 3-character extension -- confirm).
# os.listdir returns entries in arbitrary order, so the listing is sorted to
# make the row order of df_expand reproducible across runs and machines.
X = [
    [file[:-8], emocion, file[-7:-4]]
    for emocion in emociones
    for file in sorted(os.listdir(f'data/MSPCORPUS/Annotations/{emocion}'))
]

# Store the result in a dataframe
df_expand = pd.DataFrame(X, columns = ['Conversation_Part','Emotion','Annotator'])

In [41]:
# Join the base dataframe with the expanded one: a left join on the
# conversation part keeps every row of df_conv.
df_annotations = df_conv.merge(df_expand, on='Conversation_Part', how='left')

In [48]:
# Preview the first rows of df_annotations
df_annotations.head()

Unnamed: 0,Conversation_Part,start_time,end_time,PC_Num,Part_Num,Audio_Name,Emotion,Annotator
0,MSP-Conversation_0021_1,0.0,306.0304,21,1,MSP-Conversation_0021.wav,Arousal,5
1,MSP-Conversation_0021_1,0.0,306.0304,21,1,MSP-Conversation_0021.wav,Arousal,6
2,MSP-Conversation_0021_1,0.0,306.0304,21,1,MSP-Conversation_0021.wav,Arousal,7
3,MSP-Conversation_0021_1,0.0,306.0304,21,1,MSP-Conversation_0021.wav,Arousal,8
4,MSP-Conversation_0021_1,0.0,306.0304,21,1,MSP-Conversation_0021.wav,Arousal,9


#### Agregar tipos de los archivos

In [50]:
# Load the text file with the type of each audio.
# It is parsed with pd.read_csv (';'-separated, no header) rather than by
# splitting lines by hand: the parser removes the line endings itself and
# skips blank lines instead of turning them into malformed rows.
# dtype=str / keep_default_na=False keep every field as the literal file text.
df_types = pd.read_csv(
    'data/MSPCORPUS/partitions.txt',
    delimiter=';',
    header=None,
    names=['Audio_Name', 'Type'],
    dtype=str,
    keep_default_na=False,
)

# Format for the merge: append the '.wav' suffix to Audio_Name
df_types['Audio_Name'] = df_types['Audio_Name'] + '.wav'

In [58]:
# Preview the first rows of df_types
df_types.head()

Unnamed: 0,Audio_Name,Type
0,MSP-Conversation_0021.wav,Train
1,MSP-Conversation_0023.wav,Train
2,MSP-Conversation_0035.wav,Train
3,MSP-Conversation_0047.wav,Test
4,MSP-Conversation_0061.wav,Train


In [61]:
# Attach the type of each audio with a left join on Audio_Name.
# A 'Type' column left over from a previous run of this cell is dropped first;
# otherwise re-running would yield 'Type_x'/'Type_y' instead of a single 'Type'.
df_annotations = (
    df_annotations
    .drop(columns='Type', errors='ignore')
    .merge(df_types, how='left', on='Audio_Name')
)

#### Resultado final

In [65]:
# Keep only the final columns, in the order they should appear in the output
final_columns = [
    'Audio_Name',
    'Conversation_Part',
    'Emotion',
    'Annotator',
    'PC_Num',
    'Part_Num',
    'Type',
    'start_time',
    'end_time',
]
df_annotations = df_annotations.loc[:, final_columns]

In [66]:
# Preview the first rows of the final df_annotations table
df_annotations.head()

Unnamed: 0,Audio_Name,Conversation_Part,Emotion,Annotator,PC_Num,Part_Num,Type,start_time,end_time
0,MSP-Conversation_0021.wav,MSP-Conversation_0021_1,Arousal,5,21,1,Train,0.0,306.0304
1,MSP-Conversation_0021.wav,MSP-Conversation_0021_1,Arousal,6,21,1,Train,0.0,306.0304
2,MSP-Conversation_0021.wav,MSP-Conversation_0021_1,Arousal,7,21,1,Train,0.0,306.0304
3,MSP-Conversation_0021.wav,MSP-Conversation_0021_1,Arousal,8,21,1,Train,0.0,306.0304
4,MSP-Conversation_0021.wav,MSP-Conversation_0021_1,Arousal,9,21,1,Train,0.0,306.0304


In [71]:
# Write df_annotations to 'df_annotations.xlsx' (relative to the current
# working directory), leaving the dataframe index out of the file
df_annotations.to_excel('df_annotations.xlsx', index = False)