In [4]:
import json
import os
import shutil
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk import pos_tag

# Fetch the NLTK data packages needed by the tokenizer, lemmatizer,
# stop-word list and POS tagger used in this cell.
for resource in ('punkt', 'wordnet', 'stopwords', 'omw-1.4', 'averaged_perceptron_tagger'):
    nltk.download(resource)

def get_wordnet_pos(word):
    """Return the WordNet part-of-speech constant to use when lemmatizing ``word``.

    The word is tagged on its own with ``pos_tag`` and the first letter of the
    resulting tag selects the WordNet category (J -> adjective, N -> noun,
    V -> verb, R -> adverb). Any other tag falls back to noun.
    """
    _, tag_name = pos_tag([word])[0]
    initial = tag_name[0].upper()

    wordnet_by_initial = {
        "J": wordnet.ADJ,
        "N": wordnet.NOUN,
        "V": wordnet.VERB,
        "R": wordnet.ADV,
    }
    if initial in wordnet_by_initial:
        return wordnet_by_initial[initial]
    return wordnet.NOUN

def process_prompt(prompt):
    """Reduce a text prompt to its lemmatized content words.

    The prompt is tokenized with ``word_tokenize``. Tokens that are not purely
    alphabetic, or that are English stop words (compared case-insensitively),
    are dropped. Each remaining token is lemmatized with the WordNet POS
    returned by ``get_wordnet_pos``.

    Args:
        prompt: The text to process.

    Returns:
        The surviving lemmas joined by single spaces, in their original order.
        An empty string is returned when no token survives.
    """
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    processed_tokens = []
    for token in word_tokenize(prompt):
        # The NLTK stop-word list is all lowercase, so the token is lowercased
        # for the membership test only; otherwise capitalized stop words such
        # as a sentence-initial "The" would slip through. The token itself is
        # lemmatized with its original casing.
        if token.isalpha() and token.lower() not in stop_words:
            pos = get_wordnet_pos(token)
            processed_tokens.append(lemmatizer.lemmatize(token, pos))

    return ' '.join(processed_tokens)

def load_and_process_prompts(prompt_file):
    """Load prompts from a JSON file and preprocess each of them.

    The file is expected to hold a JSON array of objects, each providing an
    'original_class' key and a 'prompt' key.

    Args:
        prompt_file: Path of the JSON file to read.

    Returns:
        A dict mapping each 'original_class' value to the result of
        ``process_prompt`` applied to the matching 'prompt'. If a class occurs
        more than once, the last entry wins.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
        KeyError: If an entry lacks 'original_class' or 'prompt'.
    """
    # Decode explicitly as UTF-8 (the encoding the rest of this code uses for
    # its JSON files) instead of relying on the platform's locale encoding.
    with open(prompt_file, 'r', encoding='utf-8') as file:
        prompts = json.load(file)

    return {entry['original_class']: process_prompt(entry['prompt']) for entry in prompts}

def purge_directory(path):
    """Leave ``path`` as an empty directory.

    An existing directory tree at ``path`` is removed first; the directory is
    then (re)created.
    """
    if not os.path.exists(path):
        # Nothing to delete: just create the directory.
        os.makedirs(path, exist_ok=True)
        return

    shutil.rmtree(path)
    os.makedirs(path, exist_ok=True)

def convert_tsv_to_jsonl(input_file, base_output_dir, prompts,
                         audio_root='.', metadata_root='metadata',
                         max_file_size=0.5 * 1024 * 1024, sample_rate=22050):
    """Convert rows of a TSV table into per-split ``data.jsonl`` manifests.

    Each TSV row must have 6 or 7 tab-separated fields, of which the first six
    are read as: file path, event class, start, end, file duration, split.
    A row is kept when all of the following hold:

    * its event class starts with "D" and its split is 'train', 'valid' or
      'test_internal';
    * its file path has not already been written by this call;
    * the audio file exists under ``audio_root`` and is at most
      ``max_file_size`` bytes.

    For every kept row, one JSON record is appended to
    ``<base_output_dir>/<split dir>/data.jsonl`` ('valid' is stored under
    'validation'), and a JSON file holding the class description is written to
    ``<metadata_root>/<file path with its extension replaced by .json>``.
    Records are appended, so pre-existing ``data.jsonl`` files are extended
    rather than overwritten.

    Args:
        input_file: Path of the TSV table.
        base_output_dir: Directory under which the split directories are created.
        prompts: Mapping from event class to description; classes not present
            get "No description available".
        audio_root: Directory the TSV file paths are relative to.
        metadata_root: Directory receiving the per-file metadata JSON files.
        max_file_size: Largest accepted audio file size in bytes
            (default 0.5 MiB).
        sample_rate: Value stored in every record's "sample_rate" field.

    Returns:
        None.

    Raises:
        ValueError: If the duration field of a kept row is not a number.
    """
    split_dirs = {'train': 'train', 'valid': 'validation', 'test_internal': 'test_internal'}
    processed_filepaths = set()

    with open(input_file, 'r', encoding='utf-8') as tsv:
        for line in tsv:
            parts = line.strip().split('\t')
            if len(parts) not in (6, 7):
                continue
            filepath, event_class, _start, _end, file_duration, split = parts[:6]

            # Cheap, in-memory filters come first so the filesystem is only
            # queried for rows that could actually be written.
            if not event_class.startswith("D") or split not in split_dirs:
                continue
            if filepath in processed_filepaths:
                continue

            full_file_path = os.path.join(audio_root, filepath)
            try:
                file_size = os.path.getsize(full_file_path)
            except OSError as exc:
                # A missing/unreadable file must not abort the whole conversion.
                print(f"Skipping {filepath}: cannot read file size ({exc})")
                continue
            if file_size > max_file_size:
                continue  # Skip files larger than max_file_size (0.5 MiB by default)

            data = {
                "path": filepath,
                "duration": float(file_duration),
                "sample_rate": sample_rate,
                "amplitude": None,
                "weight": None,
                "info_path": None
            }

            output_dir = os.path.join(base_output_dir, split_dirs[split])
            os.makedirs(output_dir, exist_ok=True)
            output_file = os.path.join(output_dir, 'data.jsonl')
            with open(output_file, 'a', encoding='utf-8') as jsonl:
                jsonl.write(json.dumps(data, ensure_ascii=False) + '\n')
            processed_filepaths.add(filepath)

            # Create metadata file. splitext swaps only the final extension,
            # whatever it is, instead of rewriting every ".wav" substring.
            metadata_path = os.path.join(metadata_root, os.path.splitext(filepath)[0] + '.json')
            metadata_dir = os.path.dirname(metadata_path)
            if metadata_dir:
                os.makedirs(metadata_dir, exist_ok=True)
            description = prompts.get(event_class, "No description available")
            with open(metadata_path, 'w', encoding='utf-8') as metadata_file:
                json.dump({"description": description}, metadata_file, ensure_ascii=False)

# Start from empty output directories: the conversion appends to data.jsonl,
# so leftovers from a previous run must be removed first.
for stale_dir in ('metadata', './egs/cochldb'):
    purge_directory(stale_dir)

# Load prompts and convert TSV to JSONL
prompts = load_and_process_prompts('prompts.json')
convert_tsv_to_jsonl('data_table.tsv', './egs/cochldb', prompts)


[nltk_data] Downloading package punkt to /home/afournier/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/afournier/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/afournier/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /home/afournier/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/afournier/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [3]:
import json
import os
import shutil
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk import pos_tag

# Fetch the NLTK data packages needed by the tokenizer, lemmatizer,
# stop-word list and POS tagger used in this cell.
for resource in ('punkt', 'wordnet', 'stopwords', 'omw-1.4', 'averaged_perceptron_tagger'):
    nltk.download(resource)

def get_wordnet_pos(word):
    """Return the WordNet part-of-speech constant to use when lemmatizing ``word``.

    The word is tagged on its own with ``pos_tag`` and the first letter of the
    resulting tag selects the WordNet category (J -> adjective, N -> noun,
    V -> verb, R -> adverb). Any other tag falls back to noun.
    """
    _, tag_name = pos_tag([word])[0]
    initial = tag_name[0].upper()

    wordnet_by_initial = {
        "J": wordnet.ADJ,
        "N": wordnet.NOUN,
        "V": wordnet.VERB,
        "R": wordnet.ADV,
    }
    if initial in wordnet_by_initial:
        return wordnet_by_initial[initial]
    return wordnet.NOUN

def process_prompt(prompt):
    """Reduce a text prompt to its lemmatized content words.

    The prompt is tokenized with ``word_tokenize``. Tokens that are not purely
    alphabetic, or that are English stop words (compared case-insensitively),
    are dropped. Each remaining token is lemmatized with the WordNet POS
    returned by ``get_wordnet_pos``.

    Args:
        prompt: The text to process.

    Returns:
        The surviving lemmas joined by single spaces, in their original order.
        An empty string is returned when no token survives.
    """
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    processed_tokens = []
    for token in word_tokenize(prompt):
        # The NLTK stop-word list is all lowercase, so the token is lowercased
        # for the membership test only; otherwise capitalized stop words such
        # as a sentence-initial "The" would slip through. The token itself is
        # lemmatized with its original casing.
        if token.isalpha() and token.lower() not in stop_words:
            pos = get_wordnet_pos(token)
            processed_tokens.append(lemmatizer.lemmatize(token, pos))

    return ' '.join(processed_tokens)

def load_and_process_prompts(prompt_file):
    """Load prompts from a JSON file and preprocess each of them.

    The file is expected to hold a JSON array of objects, each providing an
    'original_class' key and a 'prompt' key.

    Args:
        prompt_file: Path of the JSON file to read.

    Returns:
        A dict mapping each 'original_class' value to the result of
        ``process_prompt`` applied to the matching 'prompt'. If a class occurs
        more than once, the last entry wins.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
        KeyError: If an entry lacks 'original_class' or 'prompt'.
    """
    # Decode explicitly as UTF-8 rather than relying on the platform's locale
    # encoding, so the same file loads identically on every machine.
    with open(prompt_file, 'r', encoding='utf-8') as file:
        prompts = json.load(file)

    return {entry['original_class']: process_prompt(entry['prompt']) for entry in prompts}

unique_events = set()
def convert_tsv_to_jsonl(input_file, base_output_dir, prompts):
    """Collect the training event classes starting with "D" into ``unique_events``.

    Despite its name, this version writes no files. It scans the TSV table
    (6 or 7 tab-separated fields per row, the first six read as: file path,
    event class, start, end, file duration, split) and adds an event class to
    the module-level ``unique_events`` set when the class starts with "D", the
    split is 'train', and the audio file (resolved relative to the current
    directory) exists and is at most 0.5 MiB.

    Args:
        input_file: Path of the TSV table.
        base_output_dir: Unused; kept so the call signature is unchanged.
        prompts: Unused; kept so the call signature is unchanged.

    Returns:
        None. Results are accumulated in ``unique_events``.
    """
    max_file_size = 0.5 * 1024 * 1024  # 0.5 MiB

    with open(input_file, 'r', encoding='utf-8') as tsv:
        for line in tsv:
            parts = line.strip().split('\t')
            if len(parts) not in (6, 7):
                continue
            filepath, event_class, _start, _end, _file_duration, split = parts[:6]

            # Cheap, in-memory filters first: the filesystem is only queried
            # for rows that could still add a new class.
            if not event_class.startswith("D") or split != 'train':
                continue
            if event_class in unique_events:
                continue

            # Construct the full path to the audio file
            full_file_path = os.path.join('.', filepath)  # Adjust this if the base directory is different
            try:
                file_size = os.path.getsize(full_file_path)
            except OSError as exc:
                # A missing/unreadable file must not abort the whole scan.
                print(f"Skipping {filepath}: cannot read file size ({exc})")
                continue
            if file_size > max_file_size:
                continue  # Skip files larger than 0.5 MiB

            unique_events.add(event_class)

# Load the prompts, then scan the TSV. Note: the convert_tsv_to_jsonl defined
# in this cell only fills `unique_events`; it writes no JSONL and does not use
# the output directory or the prompts passed here.
prompts = load_and_process_prompts('prompts.json')
convert_tsv_to_jsonl('data_table.tsv', './egs/cochldb', prompts)


[nltk_data] Downloading package punkt to /home/afournier/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/afournier/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/afournier/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /home/afournier/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/afournier/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [4]:
# Display the event classes collected in `unique_events`.
unique_events

{'Dash_camera_sensor',
 'Dishes_and_pots_and_pans',
 'Dog',
 'Dog_bark',
 'Dog_growl',
 'Dog_howl',
 'Dog_whine',
 'Door',
 'Door_close',
 'Door_open',
 'Door_open_or_close',
 'Door_password',
 'Door_slam',
 'Doorbell',
 'Doorlock_alarm',
 'Doorlock_close',
 'Doorlock_open',
 'Doorlock_open_or_close',
 'Double_bass',
 'Double_clap',
 'Drawer',
 'Drawer_open_or_close',
 'Drill',
 'Drinking',
 'Drinking_straw',
 'Drip',
 'Driving',
 'Drum',
 'Duck'}