In [None]:
#filter tsv files
import pandas as pd
import os

# Import the drive module from Google Colab
from google.colab import drive

# Mount Google Drive at /content/drive (Colab-only helper) so that the
# "/content/drive/My Drive/..." paths used by this notebook can be opened.
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import os

# Input locations on the mounted Drive: the dev/train/test TSV splits and
# the folder that is scanned for .wav clips.
devtsv_path = "/content/drive/My Drive/ASR/dev.tsv"
traintsv_path = "/content/drive/My Drive/ASR/train.tsv"
testtsv_path = "/content/drive/My Drive/ASR/test.tsv"
audio_folder = "/content/drive/My Drive/clips_wav"

# Output locations: one filtered TSV per split, written next to the inputs.
filtered_dev_path = "/content/drive/My Drive/ASR/filtered_dev.tsv"
filtered_train_path = "/content/drive/My Drive/ASR/filtered_train.tsv"
filtered_test_path = "/content/drive/My Drive/ASR/filtered_test.tsv"

# Names of every .wav clip found in the audio folder, with '.wav' removed
# (every occurrence of '.wav' in the file name is dropped, not just the suffix).
audio_files = {
    clip_name.replace('.wav', '')
    for clip_name in os.listdir(audio_folder)
    if clip_name.endswith('.wav')
}

# Function to filter and modify paths in a TSV file
def filter_tsv(input_tsv_path, output_tsv_path, available_stems=None):
    """Copy a TSV, keeping only the rows whose audio clip is available.

    The 'path' column is cast to str and a trailing '.mp3' extension is
    rewritten to '.wav'. A row is kept when the basename of its path, with
    '.wav' removed, is one of the available clip names.

    Args:
        input_tsv_path: Tab-separated file to read; malformed lines are skipped.
        output_tsv_path: Where the filtered TSV is written (no index column).
        available_stems: Optional iterable of clip names without extension.
            When None, the module-level `audio_files` is read at call time.

    Returns:
        None. This is best-effort: any exception is caught and reported
        with print() instead of being raised.
    """
    try:
        # Resolved inside the try so that a missing `audio_files` global is
        # reported like any other failure instead of escaping.
        stems = set(audio_files if available_stems is None else available_stems)

        # Load the TSV file
        df = pd.read_csv(input_tsv_path, sep='\t', on_bad_lines='skip', engine='python')

        # Swap only the extension at the end of the value (trailing whitespace
        # is preserved); '.mp3' inside a directory or file name is left alone.
        df['path'] = df['path'].astype(str).str.replace(r'\.mp3(\s*)$', r'.wav\1', regex=True)

        # The mask lives outside the frame, so a TSV that already has a
        # 'file_exists' column keeps it untouched.
        keep = pd.Series(
            [os.path.basename(p).replace('.wav', '') in stems for p in df['path']],
            index=df.index,
            dtype=bool,
        )

        # Save the rows whose clip is available
        df.loc[keep].to_csv(output_tsv_path, sep='\t', index=False)
        print(f"Updated {input_tsv_path} and saved to {output_tsv_path}")
    except Exception as e:
        print(f"Error processing {input_tsv_path}: {e}")

# Run the filter over every split, in the order dev, train, test
for _split_in, _split_out in (
    (devtsv_path, filtered_dev_path),
    (traintsv_path, filtered_train_path),
    (testtsv_path, filtered_test_path),
):
    filter_tsv(_split_in, _split_out)


Updated /content/drive/My Drive/ASR/dev.tsv and saved to /content/drive/My Drive/ASR/filtered_dev.tsv
Updated /content/drive/My Drive/ASR/train.tsv and saved to /content/drive/My Drive/ASR/filtered_train.tsv
Updated /content/drive/My Drive/ASR/test.tsv and saved to /content/drive/My Drive/ASR/filtered_test.tsv


In [None]:
# Folder that is scanned for .wav clips
audio_folder = "/content/drive/My Drive/clips_wav"

# Step 4: Collect the clip names found in 'clips_wav'.
# Each name is the text before the FIRST '.' of a file ending in '.wav';
# this rebinds the global `audio_files`.
audio_files = {
    clip_name.split('.')[0]
    for clip_name in os.listdir(audio_folder)
    if clip_name.endswith('.wav')
}
print(f"Found {len(audio_files)} audio files.")


Found 12962 audio files.


In [None]:
import pandas as pd

# Paths to the dev/train/test TSV splits on the mounted Drive.
# These are the original split files, not the filtered_* copies.
devtsv_path = "/content/drive/My Drive/ASR/dev.tsv"
traintsv_path = "/content/drive/My Drive/ASR/train.tsv"
testtsv_path = "/content/drive/My Drive/ASR/test.tsv"

# Load one TSV into a DataFrame without letting a read failure stop the notebook
def read_tsv_resiliently(file_path):
    """Read a tab-separated file, tolerating malformed lines.

    Args:
        file_path: Location of the TSV file.

    Returns:
        The parsed DataFrame, or None when reading raised any exception
        (the error is printed rather than propagated).
    """
    # The 'python' engine is used together with on_bad_lines='skip' so that
    # malformed rows are dropped instead of aborting the parse.
    parse_options = {'sep': '\t', 'on_bad_lines': 'skip', 'engine': 'python'}
    try:
        loaded = pd.read_csv(file_path, **parse_options)
        print(f"Successfully loaded {file_path}")
    except Exception as err:
        print(f"Error reading {file_path}: {err}")
        loaded = None
    return loaded

# Load the three splits in the order train, test, dev
# (each result is None when its file could not be read)
train_df, test_df, dev_df = (
    read_tsv_resiliently(tsv_path)
    for tsv_path in (traintsv_path, testtsv_path, devtsv_path)
)

# Optional: preview the first rows of every split that loaded
for _loaded in (train_df, test_df, dev_df):
    if _loaded is not None:
        print(_loaded.head())


Successfully loaded /content/drive/My Drive/ASR/train.tsv
Successfully loaded /content/drive/My Drive/ASR/test.tsv
Successfully loaded /content/drive/My Drive/ASR/dev.tsv
                                           client_id  \
0  76220259d4c614b1876412e489524ede62b700927b202d...   
1  76220259d4c614b1876412e489524ede62b700927b202d...   
2  76220259d4c614b1876412e489524ede62b700927b202d...   
3  76220259d4c614b1876412e489524ede62b700927b202d...   
4  76220259d4c614b1876412e489524ede62b700927b202d...   

                            path  \
0  common_voice_luo_40228998.mp3   
1  common_voice_luo_40228999.mp3   
2  common_voice_luo_40229003.mp3   
3  common_voice_luo_40229004.mp3   
4  common_voice_luo_40229005.mp3   

                                         sentence_id  \
0  0b6d23f21850cf38f2da23b85e5b9d243e327f4e1a987c...   
1  0093b37219cd1409b042fc7c111546d1338a34dfa32d04...   
2  11417bdb395c6a8e9095f673662428f371c1ffea41a4d7...   
3  1199d8ab1254d0772d3efa7f1f6113881809fc6c80aff5..

In [None]:
# Change the .mp3 extension to .wav in the 'path' column for all DataFrames
def replace_mp3_with_wav(df):
    """Rewrite a trailing '.mp3' extension to '.wav' in df['path'].

    Only the extension at the end of each value is changed (any trailing
    whitespace is preserved); a '.mp3' that appears inside a directory or
    file name is left alone. Running the function again is a no-op.

    Args:
        df: DataFrame with a string 'path' column, or None.

    Returns:
        The same DataFrame object that was passed in (its 'path' column is
        reassigned on that object), or None when `df` is None.
    """
    if df is not None:
        # Anchored at the end of the value; the captured whitespace is put back
        df['path'] = df['path'].str.replace(r'\.mp3(\s*)$', r'.wav\1', regex=True)
        print("Replaced .mp3 with .wav in the 'path' column.")
    return df

# Convert the extension in every split (train, test, dev, in that order)
train_df, test_df, dev_df = (
    replace_mp3_with_wav(split_df)
    for split_df in (train_df, test_df, dev_df)
)

# Optional: preview the updated rows of every split that is not None
for _updated in (train_df, test_df, dev_df):
    if _updated is not None:
        print(_updated.head())


Replaced .mp3 with .wav in the 'path' column.
Replaced .mp3 with .wav in the 'path' column.
Replaced .mp3 with .wav in the 'path' column.
                                           client_id  \
0  76220259d4c614b1876412e489524ede62b700927b202d...   
1  76220259d4c614b1876412e489524ede62b700927b202d...   
2  76220259d4c614b1876412e489524ede62b700927b202d...   
3  76220259d4c614b1876412e489524ede62b700927b202d...   
4  76220259d4c614b1876412e489524ede62b700927b202d...   

                            path  \
0  common_voice_luo_40228998.wav   
1  common_voice_luo_40228999.wav   
2  common_voice_luo_40229003.wav   
3  common_voice_luo_40229004.wav   
4  common_voice_luo_40229005.wav   

                                         sentence_id  \
0  0b6d23f21850cf38f2da23b85e5b9d243e327f4e1a987c...   
1  0093b37219cd1409b042fc7c111546d1338a34dfa32d04...   
2  11417bdb395c6a8e9095f673662428f371c1ffea41a4d7...   
3  1199d8ab1254d0772d3efa7f1f6113881809fc6c80aff5...   
4  058ef183ec6c447b759aab31e

In [None]:
def clean_dataframe(df):
    """Return a new frame holding only 'path' and 'sentence'.

    Leading and trailing whitespace is stripped from 'path'; 'sentence' is
    carried over unchanged. The input frame is not modified.
    """
    return df[['path', 'sentence']].assign(
        path=lambda selected: selected['path'].str.strip()
    )

# Reduce every split to its 'path' and 'sentence' columns
df_train_clean, df_test_clean, df_dev_clean = map(
    clean_dataframe, (train_df, test_df, dev_df)
)

# Show a heading followed by the first rows of each cleaned split
for _heading, _cleaned in (
    ("Cleaned Train DataFrame:", df_train_clean),
    ("\nCleaned Test DataFrame:", df_test_clean),
    ("\nCleaned Dev DataFrame:", df_dev_clean),
):
    print(_heading)
    print(_cleaned.head())

Cleaned Train DataFrame:
                            path  \
0  common_voice_luo_40228998.wav   
1  common_voice_luo_40228999.wav   
2  common_voice_luo_40229003.wav   
3  common_voice_luo_40229004.wav   
4  common_voice_luo_40229005.wav   

                                            sentence  
0  ber ka inyiso pachi kaluwore gi weche miwacho ...  
1  Ne iparo ni gik ma kamago ne nyalo kelo chandr...  
2     A. mar Bedo ng'at maduong' en gima tek ahinya.  
3                         Nyaka bed ni nitie chik ma  
4    Sani weche mag ohala isomo embalariany ng’enyne  

Cleaned Test DataFrame:
                            path  \
0  common_voice_luo_40609150.wav   
1  common_voice_luo_40636306.wav   
2  common_voice_luo_40258074.wav   
3  common_voice_luo_40833291.wav   
4  common_voice_luo_40820771.wav   

                                            sentence  
0                               waduto wakalo e pek.  
1                                  Wayu nyinge ng’a?  
2                    

In [None]:
# Step 6: Collect the clip names referenced by each split.
# For every 'path' value: take the part after the last '/', then keep the
# text before the first '.' (which removes the extension).
train_paths, test_paths, dev_paths = (
    set(split_df['path'].str.split('/').str[-1].str.split('.').str[0])
    for split_df in (train_df, test_df, dev_df)
)


In [None]:
# Step 7: Keep only the rows whose clip name (part after the last '/', up to
# the first '.') is present in the global `audio_files` collection
train_filtered, test_filtered, dev_filtered = (
    split_df[split_df['path'].str.split('/').str[-1].str.split('.').str[0].isin(audio_files)]
    for split_df in (train_df, test_df, dev_df)
)

# Step 8: Report (rows, columns) of each filtered split
for _label, _kept in (
    ("Train", train_filtered),
    ("Test", test_filtered),
    ("Dev", dev_filtered),
):
    print(f"Filtered {_label} DataFrame shape: {_kept.shape}")

Filtered Train DataFrame shape: (2498, 13)
Filtered Test DataFrame shape: (734, 13)
Filtered Dev DataFrame shape: (1570, 13)


In [None]:
def verify_paths(df, audio_files):
    """Report whether every clip named in df['path'] is in `audio_files`.

    A clip name is the part of the path after the last '/', cut at its
    first '.'.

    Args:
        df: DataFrame with a string 'path' column.
        audio_files: Iterable of known clip names (without extension).

    Returns:
        True when no clip name is missing from `audio_files`; otherwise
        False, after printing the count and the set of missing names.
    """
    basenames = df['path'].str.split('/').str[-1]
    stems = set(basenames.str.split('.').str[0])

    # Names used by the DataFrame that have no counterpart in the folder
    unmatched = stems.difference(audio_files)

    if unmatched:
        print(f"Warning: There are {len(unmatched)} paths that do not correspond to any audio files in the folder: {unmatched}")
        return False

    print("All paths in the DataFrame correspond to existing audio files.")
    return True

# Check every split (train, test, dev) against the known clip names
train_verified, test_verified, dev_verified = (
    verify_paths(split_df, audio_files)
    for split_df in (train_df, test_df, dev_df)
)




In [None]:
# Step 10: Write each filtered split to its own TSV, whatever the
# verification step reported.
# NOTE(review): the targets are under /content, not under the /content/drive
# mount - confirm this location is intended.
for _filtered, _destination in (
    (train_filtered, '/content/filtered_train.tsv'),
    (test_filtered, '/content/filtered_test.tsv'),
    (dev_filtered, '/content/filtered_dev.tsv'),
):
    _filtered.to_csv(_destination, sep='\t', index=False)

print("Filtered data has been saved to new TSV files, regardless of path verification.")


Filtered data has been saved to new TSV files, regardless of path verification.
