In [None]:
!pip install openai-whisper pydub pandas

import csv
import os
import random
import shutil
import string
import unicodedata
from collections import Counter

import matplotlib.pyplot as plt
import pandas as pd
import whisper
from google.colab import drive
from pydub import AudioSegment


In [None]:
root_folder = '/content/drive/MyDrive/Phonetics_Lab/c' # e d f b c
destination_folder = '/content/drive/MyDrive/Phonetics_Lab/SWEAR9'

# Keywords for the search
search_words = ['–ø–∏–∑–¥–µ—Ü', '–±–ª—è', '—Ö—É–π', '—Å—É–∫–∞', '–µ–±–∞—Ç—å', '–ø–∏–∑–¥–∞', '–±–ª—è–¥—å', '—Ö—É–π–Ω—è', '–ø–æ—Ö—É–π']   #'–±–ª—è—Ç—å', '–Ω–∞—Ö—É–π',

# Create the destination folder if needed (same exist_ok idiom as the other cells)
os.makedirs(destination_folder, exist_ok=True)

# Function to iterate over every file in every (sub)folder
def process_folder(folder_path, keyword_list=None, target_folder=None):
    """Copy transcript/audio pairs whose transcript mentions a search keyword.

    Walks ``folder_path`` recursively. For every ``.txt`` transcript the
    ``.wav`` file with the same stem in the same directory is treated as its
    recording. When the transcript text contains at least one keyword (plain
    substring match) both files are copied into the target folder.

    Args:
        folder_path: Root directory to scan.
        keyword_list: Substrings to look for. Defaults to the notebook-level
            ``search_words``.
        target_folder: Directory that receives the copies. Defaults to the
            notebook-level ``destination_folder``. Created when missing.

    Returns:
        None. Transcripts whose recording is missing are reported and skipped
        instead of aborting the whole walk with ``FileNotFoundError``.

    Note:
        A later cell of this notebook defines another function that is also
        named ``process_folder``; once that cell has run it shadows this one.
    """
    words = search_words if keyword_list is None else keyword_list
    target = destination_folder if target_folder is None else target_folder
    os.makedirs(target, exist_ok=True)

    for foldername, _subfolders, filenames in os.walk(folder_path):
        for filename in filenames:
            if not filename.endswith('.txt'):  # only transcripts are inspected
                continue

            txt_file_path = os.path.join(foldername, filename)
            # Swap only the extension; str.replace would also rewrite a '.txt'
            # that happens to occur in the middle of the name.
            wav_file_path = os.path.splitext(txt_file_path)[0] + '.wav'

            # transcript reading
            with open(txt_file_path, 'r', encoding='utf-8') as txt_file:
                transcription = txt_file.read()

            # checking whether one of the keywords occurs in the transcript
            if not any(word in transcription for word in words):
                continue

            # A transcript without its recording is useless for the dataset
            if not os.path.isfile(wav_file_path):
                print(f"Skipping {txt_file_path}: audio file not found ({wav_file_path})")
                continue

            # copy the .wav file and the .txt file to the target folder
            shutil.copy(wav_file_path, target)
            shutil.copy(txt_file_path, target)

# Collect the matching transcript/audio pairs below the selected root folder
process_folder(root_folder)

# Error message observed on an earlier run:
# No such file or directory: '/content/drive/MyDrive/Phonetics_Lab/d/b/8d746c3f-f435-423b-8ad0-56e572e77da3.wav'

# Sanity check: number of entries currently in the chunk folder
folder_path = '/content/drive/MyDrive/Phonetics_Lab/SWEAR9/audio_chunks'
elements = os.listdir(folder_path)
element_total = len(elements)
print(f"Total number of elements: {element_total}")

In [None]:
# Everything produced by the chunking step lives below the input folder
input_audio_dir = '/content/drive/MyDrive/Phonetics_Lab/SWEAR'
output_audio_dir = os.path.join(input_audio_dir, 'audio_chunks')
output_transcripts_file = os.path.join(output_audio_dir, 'chunks_transcripts.txt')
os.makedirs(output_audio_dir, exist_ok=True)

# Whisper checkpoint used for transcription with word timestamps
model = whisper.load_model("small")  # use "tiny" for faster inference

# function to process a single audio file
def process_audio(audio_path, transcript_path, output_audio_dir, output_transcripts_file):
    """Cut one recording into per-word WAV chunks using Whisper word timestamps.

    The recording is transcribed with the notebook-level ``model``
    (``word_timestamps=True``). Every recognised word is sliced out of the
    audio, exported as ``<stem>_chunk<i>.wav`` into ``output_audio_dir`` and
    logged as one tab-separated line (file name, word, start, end in seconds)
    appended to ``output_transcripts_file``.

    Args:
        audio_path: Path of the source ``.wav`` file.
        transcript_path: Path of the matching transcript. Not read by this
            function; kept so existing calls stay valid.
        output_audio_dir: Directory receiving the chunk files.
        output_transcripts_file: TSV file the chunk rows are appended to.
            Because it is opened in append mode, running the function twice on
            the same recording adds its rows twice.

    Returns:
        None.
    """
    base_name = os.path.splitext(os.path.basename(audio_path))[0]

    # audio loading using pydub; len() of a segment is its duration in ms
    audio = AudioSegment.from_wav(audio_path)
    duration_ms = len(audio)

    # transcribe using Whisper
    result = model.transcribe(audio_path, language="russian", word_timestamps=True, fp16=False)

    with open(output_transcripts_file, 'a', encoding='utf-8') as out_f:
        word_idx = 0
        for segment in result["segments"]:
            # .get(): tolerate a segment that carries no word-level entries
            for word_info in segment.get("words", []):
                word = word_info["word"].strip()
                start_sec = word_info["start"]
                end_sec = word_info["end"]

                # seconds to milliseconds, clamped to the audio boundaries
                start_ms = max(0, min(int(start_sec * 1000), duration_ms))
                end_ms = max(0, min(int(end_sec * 1000), duration_ms))

                # A slice without duration (or a blank token) would produce an
                # empty WAV plus a useless row, so it is left out entirely.
                if end_ms <= start_ms or not word:
                    continue

                # extract and save the word's audio chunk
                chunk_filename = f"{base_name}_chunk{word_idx}.wav"
                chunk_path = os.path.join(output_audio_dir, chunk_filename)
                audio[start_ms:end_ms].export(chunk_path, format="wav")

                # writing the transcript row
                out_f.write(f"{chunk_filename}\t{word}\t{start_sec:.2f}\t{end_sec:.2f}\n")

                word_idx += 1

# Run the chunking for every .wav recording found directly in the input folder
wav_names = [name for name in os.listdir(input_audio_dir) if name.endswith('.wav')]
for filename in wav_names:
    base = filename[:-4]  # file name without the '.wav' suffix
    audio_path = os.path.join(input_audio_dir, filename)
    transcript_path = os.path.join(input_audio_dir, base + '.txt')
    process_audio(audio_path, transcript_path, output_audio_dir, output_transcripts_file)


txt_path = '/content/drive/MyDrive/Phonetics_Lab/SWEAR/audio_chunks/chunks_transcripts.txt'
csv_path = '/content/drive/MyDrive/Phonetics_Lab/SWEAR/audio_chunks/chunks_transcripts.csv'

# Read the tab-separated chunk log. Quote handling is switched off because a
# recognised word may contain a stray '"', which the default parser would treat
# as the start of a quoted field spanning several lines.
df = pd.read_csv(
    txt_path,
    sep='\t',
    header=None,
    names=["filename", "text", "start", "end"],
    quoting=csv.QUOTE_NONE,
)

# Characters to strip: ASCII punctuation plus common typographic marks
# (guillemets, curly quotes, dashes, ellipsis) that string.punctuation lacks.
punctuation_chars = string.punctuation + "\u00ab\u00bb\u201e\u201c\u201d\u2018\u2019\u2014\u2013\u2026"

# to lowercase and remove punctuation
df["text"] = df["text"].str.lower().str.translate(str.maketrans('', '', punctuation_chars))

# saving to CSV (utf-8-sig writes a BOM at the start of the file)
df.to_csv(csv_path, index=False, encoding='utf-8-sig')

print("CSV is saved:", csv_path)


In [None]:
# === Settings ===
audio_dir = '/content/drive/MyDrive/Phonetics_Lab/SWEAR/audio_chunks'  # folder with audio files
csv_path = os.path.join(audio_dir, "chunks_transcripts.csv")  # path to CSV
output_dir = os.path.join(os.path.dirname(audio_dir), "balanced_dataset")  # where to save the new dataset

# Keywords
keywords = ['–ø–∏–∑–¥–µ—Ü', '–±–ª—è', '—Ö—É–π', '—Å—É–∫–∞', '–µ–±–∞—Ç—å', '–ø–∏–∑–¥–∞', '–±–ª—è–¥—å', '—Ö—É–π–Ω—è', '–ø–æ—Ö—É–π']  # '–±–ª—è—Ç—å', '–Ω–∞—Ö—É–π',

# Create the output folder if it doesn't exist
os.makedirs(output_dir, exist_ok=True)

# Load the CSV file; missing words become empty strings so .lower() is safe
df = pd.read_csv(csv_path)
df['text'] = df['text'].fillna('').astype(str)

# Separate chunks whose text contains a keyword (substring match) from the rest
has_keyword = df['text'].apply(lambda text: any(k in text.lower() for k in keywords))
contains_keyword = df[has_keyword]
no_keyword = df[~has_keyword]

# Sample size: limited by the smaller of the two groups
n_target = min(len(contains_keyword), len(no_keyword))
selected_parts = [
    contains_keyword.sample(n=n_target, random_state=42),
    # Sampling of non-keyword chunks is currently disabled in this notebook:
    # no_keyword.sample(n=n_target, random_state=42),
]
balanced_df = pd.concat(selected_parts).reset_index(drop=True)

# Report the number of rows that were really selected; the previous message
# announced 2 * n_target although only the keyword half is sampled.
print(f'{len(balanced_df)} audio files will be selected.')

# Copy files to the new folder with renaming: "<text>_<original file name>"
for _, row in balanced_df.iterrows():
    src_path = os.path.join(audio_dir, row['filename'])
    if not os.path.isfile(src_path):
        # One missing chunk should not abort the copy of all the others
        print(f"File not found, skipped: {src_path}")
        continue
    safe_text = row['text'].replace(' ', '_').replace('/', '_')[:40]  # prevent long/invalid filenames
    new_name = f"{safe_text}_{row['filename']}"
    dst_path = os.path.join(output_dir, new_name)
    shutil.copy(src_path, dst_path)

print('The balanced dataset has been saved to:', output_dir)


In [None]:
folder_path = "/content/drive/MyDrive/Phonetics_Lab/SWEAR9/balanced_dataset"

swear_words = ['–ø–∏–∑–¥–µ—Ü', '–±–ª—è', '—Ö—É–π', '—Å—É–∫–∞', '–µ–±–∞—Ç—å', '–ø–∏–∑–¥–∞', '–±–ª—è–¥—å', '—Ö—É–π–Ω—è', '–ø–æ—Ö—É–π']
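# NOTE: words containing "й" (U+0439) sometimes get label 0 instead of 1. A likely cause is Unicode normalization: a file name may store the letter as "и" plus a combining breve (NFD) while the list above uses the single precomposed character (NFC), so the two strings compare as different. Check such rows manually, or bring both sides to the same normalization form before comparing.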
keyword_set = set(keywords)

# Compare words in one Unicode normalization form (NFC). A file name can hold a
# letter in decomposed form (base letter + combining mark); such a string looks
# identical to the keyword but fails a plain equality/membership test.
swear_lookup = {unicodedata.normalize('NFC', w).lower() for w in swear_words}

data = []

# iterate all files in the folder
for filename in os.listdir(folder_path):
    if filename.endswith('.wav'):  # if it's an audio file
        # word before the first underscore, normalized and lowercased
        base_word = unicodedata.normalize('NFC', filename.split('_')[0]).lower()
        label = 1 if base_word in swear_lookup else 0
        # the file name itself is stored unchanged so it still matches the disk
        data.append([filename, base_word, label])

df = pd.DataFrame(data, columns=['filename', 'word', 'label'])

# saved to CSV
csv_path = os.path.join(folder_path, 'audio_labels.csv')
df.to_csv(csv_path, index=False, encoding='utf-8-sig')

print("CSV saved to:", csv_path)


In [None]:
# STATISTICS

def process_folder(folder_path, keyword_list=None):
    """Compute label and word statistics for one dataset folder.

    Reads ``audio_labels.csv`` inside ``folder_path`` (required columns:
    ``filename``, ``word``, ``label``) and summarises it.

    Args:
        folder_path: Folder that contains ``audio_labels.csv``.
        keyword_list: Keywords to report on. Defaults to the notebook-level
            ``keywords``. The lookup set is derived from this same list, so the
            per-keyword counters can never be asked for an unknown key.

    Returns:
        ``None`` when the CSV is missing or lacks a required column, otherwise
        a dict with:

        * ``folder``, ``total_audio``
        * ``label_1_count`` / ``label_1_percent`` and the ``label_0`` pair
        * ``keyword_<kw>_count`` / ``keyword_<kw>_percent`` for every keyword,
          counting only rows with ``label == 1``
        * ``top_<i>_word`` / ``top_<i>_count`` / ``top_<i>_percent`` for up to
          20 most frequent non-keyword words

        Percentages are relative to the number of rows and are ``0.0`` for an
        empty CSV (previously this case raised ``ZeroDivisionError``).

    Note:
        This definition reuses the name of the copy helper defined in an
        earlier cell and replaces it once this cell has run.
    """
    csv_path = os.path.join(folder_path, "audio_labels.csv")
    if not os.path.exists(csv_path):
        return None

    df = pd.read_csv(csv_path)
    if not {'filename', 'word', 'label'}.issubset(df.columns):
        return None

    kw_list = list(keywords if keyword_list is None else keyword_list)
    kw_set = set(kw_list)

    total = len(df)

    def _percent(count):
        # Share of all rows in percent; guarded against an empty table
        return round(count / total * 100, 2) if total else 0.0

    is_positive = df['label'] == 1
    count_1 = int(is_positive.sum())
    count_0 = total - count_1

    stats = {
        "folder": os.path.basename(folder_path),
        "total_audio": total,
        "label_1_count": count_1,
        "label_1_percent": _percent(count_1),
        "label_0_count": count_0,
        "label_0_percent": _percent(count_0),
    }

    # Normalise once for the whole column instead of row by row
    words = df['word'].astype(str).str.lower().str.strip()
    word_counts = Counter(words)
    positive_counts = Counter(words[is_positive])

    for kw in kw_list:
        hits = positive_counts.get(kw, 0)
        stats[f"keyword_{kw}_count"] = hits
        stats[f"keyword_{kw}_percent"] = _percent(hits)

    # top-20 non-keyword words
    for kw in kw_set:
        word_counts.pop(kw, None)
    for i, (word, count) in enumerate(word_counts.most_common(20), 1):
        stats[f"top_{i}_word"] = word
        stats[f"top_{i}_count"] = count
        stats[f"top_{i}_percent"] = _percent(count)

    return stats

# Gather the statistics of every sub-folder that yields a result
base_dir = "/content/drive/MyDrive/Phonetics_Lab/DATASET"
candidate_paths = (os.path.join(base_dir, entry) for entry in os.listdir(base_dir))
folder_results = (process_folder(path) for path in candidate_paths if os.path.isdir(path))
all_stats = [stat for stat in folder_results if stat]  # drops None results

# Show the table without column truncation
df_stats = pd.DataFrame(all_stats)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 200)
print(df_stats)


In [None]:
# VISUALISATION OF STATISTICS

folder_names = df_stats["folder"]

# 1. Total number of audio files per folder
plt.figure(figsize=(10, 5))
plt.bar(folder_names, df_stats["total_audio"])
plt.title("Total Number of Audio Files per Folder")
plt.ylabel("Number of Audio Files")
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

# 2. Count and percentage of label == 1 (two stacked panels)
fig, ax = plt.subplots(2, 1, figsize=(10, 8))
panel_specs = [
    (ax[0], "label_1_count", 'orange', "Count of label == 1", "Count"),
    (ax[1], "label_1_percent", 'red', "Percentage of label == 1", "%"),
]
for panel, column, colour, title, y_label in panel_specs:
    panel.bar(folder_names, df_stats[column], color=colour)
    panel.set_title(title)
    panel.set_ylabel(y_label)
    # one tick per folder, labelled with the folder name
    panel.set_xticks(range(len(folder_names)))
    panel.set_xticklabels(folder_names, rotation=45, ha='right')
plt.tight_layout()
plt.show()

# 3. Bar chart for each keyword (aggregated across all folders)
keywords = ['–ø–∏–∑–¥–µ—Ü', '–±–ª—è', '—Ö—É–π', '—Å—É–∫–∞', '–µ–±–∞—Ç—å', '–ø–∏–∑–¥–∞', '—Ö—É–π–Ω—è', '–±–ª—è—Ç—å', '–Ω–∞—Ö—É–π', '–ø–æ—Ö—É–π']  # '–±–ª—è–¥—å'
total_audio = df_stats["total_audio"].sum()

# Sum each keyword's count over all folders. A keyword in the list above that
# was not part of the list used when the statistics were computed has no
# column in df_stats; it is plotted as 0 instead of raising KeyError.
keyword_sums = {}
keywords_without_stats = []
for kw in keywords:
    count_column = f"keyword_{kw}_count"
    if count_column in df_stats.columns:
        keyword_sums[kw] = df_stats[count_column].sum()
    else:
        keyword_sums[kw] = 0
        keywords_without_stats.append(kw)

if keywords_without_stats:
    print(f"No statistics column for keyword(s): {keywords_without_stats}")

plt.figure(figsize=(12, 6))
plt.bar(list(keyword_sums.keys()), list(keyword_sums.values()), color='purple')
plt.title("Total Count of Keywords (label == 1) Across All Folders")
plt.ylabel("Frequency")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# 4. Top-10 most frequent non-keyword words across all folders
# Each folder contributes its own top_<i>_word / top_<i>_count pairs; the counts
# of identical words are added up over all folders.
word_counts = {}
for i in range(1, 21):
    word_col = f"top_{i}_word"
    count_col = f"top_{i}_count"
    for _, row in df_stats.iterrows():
        word = row.get(word_col)
        count = row.get(count_col, 0)
        if pd.notna(word):
            word_counts[word] = word_counts.get(word, 0) + count

top10 = sorted(word_counts.items(), key=lambda x: x[1], reverse=True)[:10]

if top10:
    words, counts = zip(*top10)

    plt.figure(figsize=(10, 5))
    plt.bar(words, counts, color='green')
    plt.title("Top-10 Most Frequent Non-Keyword Words")
    plt.ylabel("Frequency")
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()
else:
    # zip(*[]) yields nothing to unpack, so the chart is skipped explicitly
    print("No non-keyword words available for the top-10 chart.")


In [None]:
#CREATING 2 FOLDERS FOR LABEL 1 AND FOR LABEL 0

# new directories (base_dir is expected to be set by an earlier cell)
label_1_dir = os.path.join(base_dir, "label_1")
label_0_dir = os.path.join(base_dir, "label_0")
os.makedirs(label_1_dir, exist_ok=True)
os.makedirs(label_0_dir, exist_ok=True)

rows_1 = []
rows_0 = []

audio_exts = ['.wav']
csv_columns = ["filename", "word", "label"]

# The output folders live inside base_dir as well. Without this filter a second
# run would read their CSVs and try to copy every file onto itself.
output_folder_names = {os.path.basename(label_1_dir), os.path.basename(label_0_dir)}

# File names already placed in each target folder during this run. A second
# file with the same name is skipped so the CSV keeps one row per file.
copied_names = {label_1_dir: set(), label_0_dir: set()}

for folder in sorted(os.listdir(base_dir)):
    if folder in output_folder_names:
        continue

    folder_path = os.path.join(base_dir, folder)
    csv_path = os.path.join(folder_path, "audio_labels.csv")
    if not (os.path.isdir(folder_path) and os.path.exists(csv_path)):
        continue

    df = pd.read_csv(csv_path)

    for _, row in df.iterrows():
        fname = str(row['filename'])
        label = row['label']
        word = row['word']

        # The CSV entry may or may not carry the extension: try each known one
        src_path_full = None
        for ext in audio_exts:
            candidate_name = fname if fname.lower().endswith(ext) else fname + ext
            candidate_path = os.path.join(folder_path, candidate_name)
            if os.path.exists(candidate_path):
                src_path_full = candidate_path
                break

        if src_path_full is None:
            print(f"File not found: {fname} (in folder {folder})")
            continue

        if label == 1:
            dst_folder, target_rows = label_1_dir, rows_1
        else:
            dst_folder, target_rows = label_0_dir, rows_0

        out_name = os.path.basename(src_path_full)
        if out_name in copied_names[dst_folder]:
            print(f"Duplicate file name skipped: {out_name} (in folder {folder})")
            continue
        copied_names[dst_folder].add(out_name)

        shutil.copy2(src_path_full, os.path.join(dst_folder, out_name))
        target_rows.append({"filename": out_name, "word": word, "label": label})

# save the new CSVs; explicit columns keep the header even when a list is empty
pd.DataFrame(rows_1, columns=csv_columns).to_csv(os.path.join(label_1_dir, "audio_labels.csv"), index=False)
pd.DataFrame(rows_0, columns=csv_columns).to_csv(os.path.join(label_0_dir, "audio_labels.csv"), index=False)

print("Audio files and CSVs have been separated into folders.")


In [None]:
base_dir = "/content/drive/MyDrive/Phonetics_Lab/DATASET"
folders = ["label_1", "label_0"]

# For each label folder compare the number of files with the number of CSV rows
for folder in folders:
    folder_path = os.path.join(base_dir, folder)
    csv_path = os.path.join(folder_path, "audio_labels.csv")

    # Every regular file that is not a .csv counts as an audio file
    audio_files = []
    for entry in os.listdir(folder_path):
        if entry.endswith('.csv'):
            continue
        if os.path.isfile(os.path.join(folder_path, entry)):
            audio_files.append(entry)
    audio_count = len(audio_files)

    # Rows in the CSV (0 when the CSV does not exist)
    csv_count = 0
    if os.path.exists(csv_path):
        df = pd.read_csv(csv_path)
        csv_count = len(df)

    print(f"{folder}: {audio_count} audio files, {csv_count} rows in CSV")


In [None]:
# REMOVE AUDIO FILES THAT HAVE NO ROW IN THE CSV (the CSV itself is not modified)

folder_path = "/content/drive/MyDrive/Phonetics_Lab/DATASET/label_0"
csv_path = os.path.join(folder_path, "audio_labels.csv")

# Load CSV
df = pd.read_csv(csv_path)
valid_filenames = set(df['filename'].astype(str))

# List all files in the folder
all_files = os.listdir(folder_path)
audio_exts = ['.wav']

# Filter audio files and delete the extra ones
deleted_count = 0

for file in all_files:
    if any(file.endswith(ext) for ext in audio_exts):
        if file not in valid_filenames:
            try:
                os.remove(os.path.join(folder_path, file))
                print(f"üóëÔ∏è Deleted: {file}")
                deleted_count += 1
            except Exception as e:
                print(f"‚ö†Ô∏è Failed to delete {file}: {e}")

print(f"\n‚úÖ Total deleted: {deleted_count} extra audio files.")


In [None]:
# Keyword statistics


keywords = ['–ø–∏–∑–¥–µ—Ü', '–±–ª—è', '—Ö—É–π', '—Å—É–∫–∞', '–µ–±–∞—Ç—å', '–ø–∏–∑–¥–∞', '–±–ª—è–¥—å', '—Ö—É–π–Ω—è', '–±–ª—è—Ç—å', '–Ω–∞—Ö—É–π', '–ø–æ—Ö—É–π']
keyword_set = set(keywords)

# Path to the file
csv_path = "/content/drive/MyDrive/Phonetics_Lab/DATASET/label_1/audio_labels.csv"

# Read the CSV
df = pd.read_csv(csv_path)

# Filter rows where label == 1. The explicit copy makes the result an
# independent frame, so the column assignment below neither triggers a
# SettingWithCopyWarning nor depends on pandas' view/copy rules.
df_label_1 = df[df['label'] == 1].copy()

# Convert to lowercase and remove surrounding spaces
df_label_1['word'] = df_label_1['word'].astype(str).str.strip().str.lower()

# Count all words
word_counts = df_label_1['word'].value_counts()

# Split into keywords and others
is_keyword = word_counts.index.isin(keyword_set)
keyword_stats = word_counts[is_keyword]
other_stats = word_counts[~is_keyword]

print("Keyword statistics:")
print(keyword_stats.sort_values(ascending=False))

print("\n Other words found among label==1:")
print(other_stats.sort_values(ascending=False))


In [None]:
# CHECK IF THERE ARE ANY KEYWORDS AMONG LABEL 0

# Path to the CSV with label 0
csv_path = "/content/drive/MyDrive/Phonetics_Lab/DATASET/label_0/audio_labels.csv"
# The audio files sit next to their CSV. Deriving the folder from csv_path
# avoids depending on whatever `folder_path` another cell left behind, which
# could point at a different folder while the rows are removed from this CSV.
folder_path = os.path.dirname(csv_path)

# Load and preprocess (once; the same frame serves the report and the clean-up)
df = pd.read_csv(csv_path)
df['word'] = df['word'].astype(str).str.strip().str.lower()

# Count keywords among label == 0
keyword_counts = df['word'].value_counts()
keyword_in_zeros = keyword_counts[keyword_counts.index.isin(keyword_set)]

if keyword_in_zeros.empty:
    print("No keywords found among files with label == 0.")
else:
    print("Keywords detected among label == 0:")
    print(keyword_in_zeros.sort_values(ascending=False))

# Mask rows with keywords; the same keyword_set as in the report above is used
# so the rows that get deleted are exactly the rows that were reported.
mask = df['word'].isin(keyword_set)
to_delete = df[mask]

# Delete corresponding audio files
for fname in to_delete['filename']:
    file_path = os.path.join(folder_path, fname)
    if os.path.exists(file_path):
        os.remove(file_path)
        print(f"Deleted file: {file_path}")
    else:
        print(f"‚ö†Ô∏è File not found: {file_path}")

# Update the CSV
df_clean = df[~mask]
df_clean.to_csv(csv_path, index=False)
print(f"Removed {len(to_delete)} rows from the CSV.")


In [None]:
# KEEP A MAXIMUM OF 600 OCCURRENCES

# Folder with label_1
folder_path = "/content/drive/MyDrive/Phonetics_Lab/DATASET/label_1"
csv_path = os.path.join(folder_path, "audio_labels.csv")

# Word to cap and the maximum number of rows to keep for it
target_word = '–±–ª—è—Ç—å'
max_keep = 600

# Load and filter
df = pd.read_csv(csv_path)
df['word'] = df['word'].astype(str).str.strip().str.lower()

# All rows with the target word
bword_df = df[df['word'] == target_word]

# Keep a random sample of at most max_keep rows. sample() raises ValueError
# when asked for more rows than exist, hence the min().
keep_bword = bword_df.sample(n=min(max_keep, len(bword_df)), random_state=42)

# Rows to delete: target-word rows whose file name is not among the kept ones
is_kept_name = df['filename'].isin(keep_bword['filename'])
drop_bword = bword_df[~bword_df['filename'].isin(keep_bword['filename'])]

# Delete corresponding audio files
for fname in drop_bword['filename']:
    fpath = os.path.join(folder_path, fname)
    if os.path.exists(fpath):
        os.remove(fpath)
        print(f"Deleted file: {fpath}")
    else:
        print(f"File not found: {fpath}")

# Update the CSV
df_new = df[~((df['word'] == target_word) & (~is_kept_name))]
df_new.to_csv(csv_path, index=False)

# Report the real numbers instead of a hard-coded 600
print(f"Kept {len(keep_bword)} audio files with the word '{target_word}'. Deleted {len(drop_bword)} rows and files.")


In [None]:
# AUGMENTATION

import torchaudio
import torchaudio.transforms as T
import torch
from pathlib import Path

# Source folder of the label-1 audio; its CSV and the output folder hang off it
audio_folder = Path("/content/drive/MyDrive/Phonetics_Lab/DATASET/label_1")
csv_path = audio_folder / "audio_labels.csv"
base_folder = audio_folder
aug_folder = base_folder / "augmented"
aug_folder.mkdir(exist_ok=True)

target_sr = 16000
target_count = 458  # number of samples each keyword should reach

# Keywords
keywords = ['–ø–æ—Ö—É–π']  # can be extended
# Load the labels and normalise the word column (trimmed, lowercase)
df = pd.read_csv(csv_path)
df['word'] = df['word'].astype(str).str.strip().str.lower()

# Only label == 1 samples with keywords
is_positive = df['label'] == 1
is_keyword = df['word'].isin(keywords)
df_key = df[is_positive & is_keyword].copy()

# Augmentations: volume change and pitch shift
def augment_waveform(waveform, sr):
    """Return two augmented variants of ``waveform``.

    Args:
        waveform: Audio tensor as returned by ``torchaudio.load``.
        sr: Sample rate of ``waveform``; passed to the pitch-shift transform.

    Returns:
        A list ``[volume_variant, pitch_variant]`` in this order.
    """
    # NOTE(review): no noise is added here although the original comment said
    # "noise/amplification". With torchaudio's default gain_type a gain of 0.02
    # presumably scales the amplitude down to 2 % - TODO confirm this is intended.
    change_volume = T.Vol(gain=0.02)
    shift_pitch = T.PitchShift(sample_rate=sr, n_steps=2)
    return [change_volume(waveform), shift_pitch(waveform)]

# Generate augmentations
augmented_rows = []
generated_count = 0

for kw in keywords:
    df_kw = df_key[df_key['word'] == kw]
    count = len(df_kw)

    if count >= target_count:
        continue  # nothing to do

    needed = target_count - count
    print(f"üîÅ Augmenting '{kw}': {count} ‚Üí {target_count} (need to add {needed})")

    samples = df_kw.sample(n=needed, replace=True, random_state=42)

    # `samples` is drawn with replacement, so one source file can occur many
    # times. Each occurrence gets its own index in the output name; with a
    # single shared name ("aug_<file>") every repeat was skipped as "already
    # existing" and the target count could not be reached.
    for sample_idx, (_, row) in enumerate(samples.iterrows()):
        old_name = row['filename']
        word = row['word']
        src_path = audio_folder / old_name

        if not src_path.exists():
            print(f"‚ö†Ô∏è File not found: {src_path}")
            continue

        new_name = f"aug{sample_idx}_{old_name}"
        new_path = aug_folder / new_name
        new_row = {
            "filename": new_name,
            "word": word,
            "label": 1
        }

        if new_path.exists():
            # Produced by an earlier run: keep it listed, do not regenerate it
            augmented_rows.append(new_row)
            continue

        try:
            waveform, sr = torchaudio.load(str(src_path))
        except Exception as e:
            print(f"‚ö†Ô∏è Error loading {src_path}: {e}")
            continue

        # Alternate between the available variants so that every transform is
        # actually used (previously only the first one ever was).
        variants = augment_waveform(waveform, sr)
        aug_wav = variants[sample_idx % len(variants)]

        torchaudio.save(str(new_path), aug_wav, sr)
        augmented_rows.append(new_row)

        generated_count += 1
        print(f"‚úÖ Generated: {generated_count}/{needed}")

print(f"\n Total new audio files created: {generated_count}")

# Save new CSV. The columns are named explicitly so the file keeps its header
# even when no row was collected; a header-less empty file cannot be read back
# with pd.read_csv.
df_aug = pd.DataFrame(augmented_rows, columns=["filename", "word", "label"])
df_aug.to_csv(aug_folder / "audio_labels.csv", index=False)
print(f"Saved {len(augmented_rows)} new samples to {aug_folder}")


In [None]:
# FILTERING LABEL 0 AUDIOS CONTAINING BANNED WORDS

# the path to the root folder containing subfolders with audio and CSV files
root_dir = "/content/drive/MyDrive/Phonetics_Lab/SWEAR"
output_dir = '/content/drive/MyDrive/Phonetics_Lab/DATASET/selected_audio'
os.makedirs(output_dir, exist_ok=True)

# list of banned words (used for filtering)
banned_words = {'—è', '–Ω–∞', '–Ω–µ', '–≤', '—ç—Ç–æ', '–∫', '–∞', '–º–µ–Ω—è', '—Ç–∞–∫', '—Ç—ã',
                '–ø–∏–∑–¥–µ—Ü', '–±–ª—è', '—Ö—É–π', '—Å—É–∫–∞', '–µ–±–∞—Ç—å', '–ø–∏–∑–¥–∞', '–±–ª—è–¥—å',
                '—Ö—É–π–Ω—è', '–±–ª—è—Ç—å', '–Ω–∞—Ö—É–π', '–ø–æ—Ö—É–π'}

selected = []
seen_audio_paths = set()  # an audio file listed by several CSVs is taken once

# Scan folders for CSV files and the audio files they list, filtering out rows
# whose word is banned. Directories and files are visited in sorted order so
# the candidate list does not depend on the file system's listing order.
for dirpath, dirnames, filenames in os.walk(root_dir):
    dirnames.sort()
    for filename in sorted(filenames):
        if not filename.endswith('.csv'):
            continue
        csv_path = os.path.join(dirpath, filename)
        # utf-8-sig also reads plain UTF-8 and drops the BOM that some CSVs of
        # this notebook are written with
        with open(csv_path, newline='', encoding='utf-8-sig') as f:
            for row in csv.reader(f):
                if len(row) < 2:
                    continue
                audio_filename = row[0].strip()
                word = row[1].strip().lower()
                if word in banned_words:
                    continue  # skip audio with banned words
                audio_path = os.path.join(dirpath, audio_filename)
                if audio_path in seen_audio_paths:
                    continue
                # header rows and stale entries do not name an existing file
                if os.path.isfile(audio_path):
                    seen_audio_paths.add(audio_path)
                    selected.append((audio_path, word))

# Shuffle with a fixed seed (same value as the random_state used elsewhere in
# this notebook) and keep up to 1200 clean samples
random.Random(42).shuffle(selected)
selected = selected[:1200]

# Copy the selected audio to the output folder as "<word>_<original name>"
for src_path, word in selected:
    base = os.path.basename(src_path)
    safe_word = word.replace('/', '_')  # a path separator would break the target path
    new_name = f"{safe_word}_{base}"
    dest_path = os.path.join(output_dir, new_name)
    shutil.copy2(src_path, dest_path)

print(f"Copied {len(selected)} clean audio files to {output_dir}")

src_dir = '/content/drive/MyDrive/Phonetics_Lab/DATASET/selected_audio'
dst_dir = '/content/drive/MyDrive/Phonetics_Lab/DATASET/label_0'

# Make sure the destination folder exists
os.makedirs(dst_dir, exist_ok=True)

# Move every regular file (sub-folders stay where they are)
# NOTE(review): no CSV is written or updated here - confirm that the rows for
# the moved files are added to the destination folder's CSV elsewhere.
movable_names = [
    entry for entry in os.listdir(src_dir)
    if os.path.isfile(os.path.join(src_dir, entry))
]
for filename in movable_names:
    src_path = os.path.join(src_dir, filename)
    dst_path = os.path.join(dst_dir, filename)
    shutil.move(src_path, dst_path)

print(f"All files have been moved from {src_dir} to {dst_dir}")

