# Exploratory Data Analysis

In this notebook, we explore the automated transcripts that we generated for the AccentsDB dataset using Wav2Vec 2.0 and perform some basic analysis on them.

In [2]:
import pandas as pd

In [3]:
# Load the CSV of automatically generated transcripts into a DataFrame.
df = pd.read_csv(
    filepath_or_buffer='../accentsDB/transcripts/labels/transcripts.csv',
)

In [6]:
# Order the rows by file name so that each accent/speaker is listed in
# sentence order; rebinding `df` replaces the in-place sort.
df = df.sort_values(by='file_path')

In [39]:
# Display the transcripts frame (pandas truncates long frames in the output).
df

Unnamed: 0,transcription,file_path
0,THE POB JERKED THE LISH AS HE SAW A FELINE SHAPE,indian_s02_722.txt
1,THE SHIP WAS TORN APART ON THE SHARP REEF,indian_s02_044.txt
2,HOP OVER THE FENCE AND PLUNGE IN,indian_s02_050.txt
3,A PINK SHELL WAS FOUND ON THE SANDY BEACH,indian_s02_736.txt
4,A TAM SQUIRREL MAKES A NICE PECT,indian_s02_078.txt
...,...,...
17308,A FRESH START WILL WORK SUCH WONDERS,american_s08_209.txt
17309,PEEP UNDER THE TINT AND SEE THE CLOWNS,american_s08_553.txt
17310,SLASH THE GOLD CLOTH INTO FINE RIBBONS,american_s08_235.txt
17311,THE LARGE HOUSE HAD HOT WATER TAPS,american_s08_221.txt


In [11]:
# Keep only the rows whose file name mentions the American accent.
is_american = df['file_path'].str.contains('american')
df_american = df.loc[is_american]
df_american

Unnamed: 0,transcription,file_path
15963,THE BRIDCEH CAN NOO SLADE ON THE SMOOTH PLANKS,american_s01_001.txt
15924,GLUE THE SHEET TO THE DARK BLUE BACKGROUND,american_s01_002.txt
15923,IT'S EASY TO TELL THE DEPTH OF A RAL,american_s01_003.txt
15849,THESE DAYS THE CHICKEN RAGIS A RARE GUSH,american_s01_004.txt
15846,RICE IS OFTEN SERVED IN RANBLLS,american_s01_005.txt
...,...,...
16844,THE GRASS AND BUSHES WERE WET WITH DEW,american_s08_738.txt
16819,THE BLIND MAN COUNTED HIS ALD COINS,american_s08_739.txt
17162,A SEVERE STORM TORE DOWN THE BARN,american_s08_740.txt
17130,SHE CALLED HIS NAME MANY TIMES,american_s08_741.txt


In [12]:
# Rows whose file name mentions the Australian accent.
df_australian = df.loc[df['file_path'].str.contains('australian')]

In [13]:
# Rows whose file name mentions the British accent.
df_british = df.loc[df['file_path'].str.contains('british')]

In [14]:
# Rows whose file name contains "bangla" (the Bangladeshi accent files).
df_bangladeshi = df.loc[df['file_path'].str.contains('bangla')]

In [15]:
# Rows whose file name mentions the Indian accent.
df_indian = df.loc[df['file_path'].str.contains('indian')]

In [16]:
# Rows whose file name mentions the Malayalam accent.
df_malayalam = df.loc[df['file_path'].str.contains('malayalam')]

In [17]:
# Rows whose file name mentions the Odiya accent.
df_odiya = df.loc[df['file_path'].str.contains('odiya')]

In [18]:
# Rows whose file name mentions the Telugu accent.
df_telugu = df.loc[df['file_path'].str.contains('telugu')]

In [19]:
# Rows whose file name mentions the Welsh accent.
df_welsh = df.loc[df['file_path'].str.contains('welsh')]

In [35]:
# Compare all the transcriptions side by side using the transcript column.
# The American frame is the base, so its column keeps the bare name
# "transcription"; every other accent is joined on the fresh positional index
# and receives a "transcription_<accent>" column through the merge suffix.
# NOTE(review): rows are paired purely by position after reset_index, and an
# index merge defaults to an inner join, so the result is only as long as the
# shortest frame -- confirm that row i is the same sentence in every accent.
accent_frames = [
    ('australian', df_australian),
    ('british', df_british),
    ('bangladeshi', df_bangladeshi),
    ('indian', df_indian),
    ('malayalam', df_malayalam),
    ('odiya', df_odiya),
    ('telugu', df_telugu),
    ('welsh', df_welsh),
]

df_all = df_american["transcription"].to_frame().reset_index(drop=True)
for accent, accent_df in accent_frames:
    df_all = df_all.merge(
        accent_df["transcription"].to_frame().reset_index(drop=True),
        left_index=True,
        right_index=True,
        # Only the incoming (right-hand) column is renamed.
        suffixes=('', f'_{accent}'),
    )

df_all.head()

Unnamed: 0,transcription,transcription_australian,transcription_british,transcription_bangladeshi,transcription_indian,transcription_malayalam,transcription_odiya,transcription_telugu,transcription_welsh
0,THE BRIDCEH CAN NOO SLADE ON THE SMOOTH PLANKS,THE BIRCH CANOE SLEET ON THE SMOOTH PLANKS,THE BIRCH CANOE SLID ON THE SMOOTH PLANKS,THE BIRD CANOS SLID ON THE SMOOTH PLANKS,THE BIRCH GANOOS LID ON THE SMOOTH PLANKS,THE BIDS CAN OW SLAY IT ON SMOTH BLANKS,THE BIRDS CANER SLIED ON DE SMOT BLANKS,THE BUD CANOU SLEAR DOWN THES MOT PLANKS,THE BIRCH CANOE SLID ON THE SMOOTH PLANKS
1,GLUE THE SHEET TO THE DARK BLUE BACKGROUND,GLEW THE SHEET TO THE DOCK BLUE BACKGROUND,GLEW THE SHEET TO THE DARK BLUE BACKGROUND,GLUE THE SHEET TO THE DARK BLUE BACKGROUND,GLUE THE SHEET TO THE DARK BLUE BACKGROUND,BLEW THE SHEET TO THE DARK BLUE BAGROUND,BLUE DE SET TO DA BLUE BACGOUN,GLUETY SEAT TO THE BARAC BLUE BACKGROUND,GLAVE E SHEET TO THE DARK BLUE BACK GRUND
2,IT'S EASY TO TELL THE DEPTH OF A RAL,IT'S EASY TO TELL THE DEPTH OF A WELL,IT'S EASY TO TELL THE DEPTH OF A WORL,IT'S EASY TO TELL THE DEPTH OF A VILLE,IT'S EASY TO TELL THE DEPTH OF A WELL,IT'S EASY TO TELL THAT APT OFABELL,IT'S EASY TO TELL THE DEFT OF A WILL,IT IS EASY TO TELL THEE DATA FEVELLI,IT'S EASY TO TELL THE DEPTH OF A WELL
3,THESE DAYS THE CHICKEN RAGIS A RARE GUSH,THESE DAYS OF CHICKEN LEG IS A RED DISH,THESE DAYS A CHICKEN LEG IS A RARE DISH,THESE DAYS A CHICKEN LEG IS A RARE DISH,THESE DAYS A CHICKEN LEG IS A RARE DISH,THES DEY SAY TIKAN LEGGI SA RABDISH,THER'S DEYS A TICKEN LEG IS A READIS,AN THES DAYS A CHICKENLYGIZY REDDISH,THESE DAYS ARE CHICKEN LEG IS A RAREDISH
4,RICE IS OFTEN SERVED IN RANBLLS,RICE IS OFTEN SERVED IN ROUND BALLS,RICE IS OFTEN SERVED IN ROUND BOWS,RICE IS OFTEN SERVED IN ROUND BOLTS,RICE IS OFTEN SERVED IN ROUND BULLS,RISE IS OFTEN SERVED IN ROWN BOWS,LACES OFTEN SERVED IN ROWN BALLS,RACES OFTEN SERUDE IN DRONDE BULLS,RICE IS OFTEN SERVED IN ROUND BOWLS


In [36]:
# Display the side-by-side comparison frame (head and tail are shown when
# pandas truncates the output).
df_all

Unnamed: 0,transcription,transcription_australian,transcription_british,transcription_bangladeshi,transcription_indian,transcription_malayalam,transcription_odiya,transcription_telugu,transcription_welsh
0,THE BRIDCEH CAN NOO SLADE ON THE SMOOTH PLANKS,THE BIRCH CANOE SLEET ON THE SMOOTH PLANKS,THE BIRCH CANOE SLID ON THE SMOOTH PLANKS,THE BIRD CANOS SLID ON THE SMOOTH PLANKS,THE BIRCH GANOOS LID ON THE SMOOTH PLANKS,THE BIDS CAN OW SLAY IT ON SMOTH BLANKS,THE BIRDS CANER SLIED ON DE SMOT BLANKS,THE BUD CANOU SLEAR DOWN THES MOT PLANKS,THE BIRCH CANOE SLID ON THE SMOOTH PLANKS
1,GLUE THE SHEET TO THE DARK BLUE BACKGROUND,GLEW THE SHEET TO THE DOCK BLUE BACKGROUND,GLEW THE SHEET TO THE DARK BLUE BACKGROUND,GLUE THE SHEET TO THE DARK BLUE BACKGROUND,GLUE THE SHEET TO THE DARK BLUE BACKGROUND,BLEW THE SHEET TO THE DARK BLUE BAGROUND,BLUE DE SET TO DA BLUE BACGOUN,GLUETY SEAT TO THE BARAC BLUE BACKGROUND,GLAVE E SHEET TO THE DARK BLUE BACK GRUND
2,IT'S EASY TO TELL THE DEPTH OF A RAL,IT'S EASY TO TELL THE DEPTH OF A WELL,IT'S EASY TO TELL THE DEPTH OF A WORL,IT'S EASY TO TELL THE DEPTH OF A VILLE,IT'S EASY TO TELL THE DEPTH OF A WELL,IT'S EASY TO TELL THAT APT OFABELL,IT'S EASY TO TELL THE DEFT OF A WILL,IT IS EASY TO TELL THEE DATA FEVELLI,IT'S EASY TO TELL THE DEPTH OF A WELL
3,THESE DAYS THE CHICKEN RAGIS A RARE GUSH,THESE DAYS OF CHICKEN LEG IS A RED DISH,THESE DAYS A CHICKEN LEG IS A RARE DISH,THESE DAYS A CHICKEN LEG IS A RARE DISH,THESE DAYS A CHICKEN LEG IS A RARE DISH,THES DEY SAY TIKAN LEGGI SA RABDISH,THER'S DEYS A TICKEN LEG IS A READIS,AN THES DAYS A CHICKENLYGIZY REDDISH,THESE DAYS ARE CHICKEN LEG IS A RAREDISH
4,RICE IS OFTEN SERVED IN RANBLLS,RICE IS OFTEN SERVED IN ROUND BALLS,RICE IS OFTEN SERVED IN ROUND BOWS,RICE IS OFTEN SERVED IN ROUND BOLTS,RICE IS OFTEN SERVED IN ROUND BULLS,RISE IS OFTEN SERVED IN ROWN BOWS,LACES OFTEN SERVED IN ROWN BALLS,RACES OFTEN SERUDE IN DRONDE BULLS,RICE IS OFTEN SERVED IN ROUND BOWLS
...,...,...,...,...,...,...,...,...,...
737,THE GRASS AND BUSHES WERE WET WITH DEW,THE GRASS AND BUSHES WERE WET WITH DEW,THE GRASS AND BUSHES WERE WET WITH DEW,ON THE ISLANDS THE SEA BREEZE IS SOFT AND MILD,THE GROSS AND BUSHES WERE RED WITH DEW,THE BOX IS HELD BY BY HIM BRIGHT RED,ON THE ISLANDS THE SEA BREED IS SOFT AND MILD,THIS WILL LEAD THE WORLD TO MORE SOUND AN FURY,THE GRASS AND BUSHES WERE WET WITH DEW
738,THE BLIND MAN COUNTED HIS OLD COINS,THE BLIND MAN COUNTED HIS OLD COINS,THE BLIND MAN COUNTED HIS OLD COINS,THE PLEA BEGAN AS SOON AS WE SAT DOWN,THE BLIND MAN COUNTED HIS OLD COINS,SNAPPER TO MAKE PURE ICE YOU FREESE WARTE,THE PLA BE DI NIT YO NIGHT BE SIDE DOWN,ARD SALL BEFORE YOU FRATE IGA,THE BLIND MAN COUNTED HIS OLD COINS
739,A SEVERE STORM TORE DOWN THE BARN,ISOF ESTORM TORE DOWN THE BARN,A SEVERE STORM TORED DOWN THE BARN,THIS WILL LEAD THE WORLD TO MORE SOUND AND FURRY,A SEVERE STORM TOR DOWN THE BARN,THE FIRST WANT WAM GID SNAP AND SNAPER SNAPPED...,THE SWILL LEAD THE WORLD TO MORE SOUND UN FUTY,THE RUSHFOR FUNDS REACHERE SPEAK TUSDAY,A SEVERE STORM TORE DOWN THE BARN
740,SHE CALLED HIS NAME MANY TIMES,SHE CALLED HIS NAME MANY TIMES,SHE CALLED HIS NAME MANY TIMES,I SAWL BEFORE YOU FRY THE EGG,SHE CALLED HIS NAME MANY DIMES,E JIM THE FINTE AND HURRY OF THE BANK,I'RD SALT BEFORE YOU FRET THE INK,THE BARTS LUCUSTARC WAED AND LONESOME,SHE CALLED HIS NAME MANY TIMES


In [37]:
# Persist the side-by-side comparison without the positional index.
df_all.to_csv(
    path_or_buf='../accentsDB/transcripts/labels/transcripts_comparison.csv',
    index=False,
)

# Clean labels

In [18]:
# Reload the comparison table; note that `df` now refers to the wide
# per-accent frame rather than the raw transcripts.
df = pd.read_csv(
    filepath_or_buffer='../accentsDB/transcripts/labels/transcripts_comparison.csv',
)

In [12]:
# Pull out the British column as a Series and show it.
df_british = df.loc[:, 'transcription_british']
df_british

0       THE BIRCH CANOE SLID ON THE SMOOTH PLANKS
1      GLEU THE SHEET TO THE DARK BLUE BACKGROUND
2           IT'S EASY TO TELL THE DEPTH OF A WELL
3         THESE DAYS A CHICKEN LEG IS A RARE DISH
4             RICE IS OFTEN SERVED IN ROUND BOWLS
                          ...                    
737        THE GRASS AND BUSHES WERE WET WITH DEW
738           THE BLIND MAN COUNTED HIS OLD COINS
739            A SEVERE STORM TORED DOWN THE BARN
740                SHE CALLED HIS NAME MANY TIMES
741           WHEN YOU HEAR THE BELL COME QUICKLY
Name: transcription_british, Length: 742, dtype: object

In [7]:
# Write the British transcripts to their own CSV (header only, no index).
df_british.to_csv(
    path_or_buf='../accentsDB/transcripts/labels/transcripts_british.csv',
    index=False,
)

In [24]:
# Read the British transcripts back; this yields a one-column DataFrame.
# NOTE(review): presumably the CSV was corrected by hand in between -- confirm.
df_british = pd.read_csv(
    filepath_or_buffer='../accentsDB/transcripts/labels/transcripts_british.csv',
)

In [25]:
# Use the British transcripts as the label text for the American
# ("transcription"), Australian, Indian and Welsh columns.
british_reference = df_british['transcription_british']
for column in (
    'transcription',
    'transcription_australian',
    'transcription_indian',
    'transcription_welsh',
):
    df[column] = british_reference
df

Unnamed: 0,transcription,transcription_australian,transcription_british,transcription_bangladeshi,transcription_indian,transcription_malayalam,transcription_odiya,transcription_telugu,transcription_welsh
0,THE BIRCH CANOE SLID ON THE SMOOTH PLANKS,THE BIRCH CANOE SLID ON THE SMOOTH PLANKS,THE BIRCH CANOE SLID ON THE SMOOTH PLANKS,THE BIRCH CANOE SLID ON THE SMOOTH PLANKS,THE BIRCH CANOE SLID ON THE SMOOTH PLANKS,THE BIDS CAN OW SLAY IT ON SMOTH BLANKS,THE BIRDS CANER SLIED ON DE SMOT BLANKS,THE BUD CANOU SLEAR DOWN THES MOT PLANKS,THE BIRCH CANOE SLID ON THE SMOOTH PLANKS
1,GLUE THE SHEET TO THE DARK BLUE BACKGROUND,GLUE THE SHEET TO THE DARK BLUE BACKGROUND,GLEU THE SHEET TO THE DARK BLUE BACKGROUND,GLUE THE SHEET TO THE DARK BLUE BACKGROUND,GLUE THE SHEET TO THE DARK BLUE BACKGROUND,BLEW THE SHEET TO THE DARK BLUE BAGROUND,BLUE DE SET TO DA BLUE BACGOUN,GLUETY SEAT TO THE BARAC BLUE BACKGROUND,GLUE THE SHEET TO THE DARK BLUE BACKGROUND
2,IT'S EASY TO TELL THE DEPTH OF A WELL,IT'S EASY TO TELL THE DEPTH OF A WELL,IT'S EASY TO TELL THE DEPTH OF A WELL,IT'S EASY TO TELL THE DEPTH OF A VILLE,IT'S EASY TO TELL THE DEPTH OF A WELL,IT'S EASY TO TELL THAT APT OFABELL,IT'S EASY TO TELL THE DEFT OF A WILL,IT IS EASY TO TELL THEE DATA FEVELLI,IT'S EASY TO TELL THE DEPTH OF A WELL
3,THESE DAYS A CHICKEN LEG IS A RARE DISH,THESE DAYS A CHICKEN LEG IS A RARE DISH,THESE DAYS A CHICKEN LEG IS A RARE DISH,THESE DAYS A CHICKEN LEG IS A RARE DISH,THESE DAYS A CHICKEN LEG IS A RARE DISH,THES DEY SAY TIKAN LEGGI SA RABDISH,THER'S DEYS A TICKEN LEG IS A READIS,AN THES DAYS A CHICKENLYGIZY REDDISH,THESE DAYS A CHICKEN LEG IS A RARE DISH
4,RICE IS OFTEN SERVED IN ROUND BOWLS,RICE IS OFTEN SERVED IN ROUND BOWLS,RICE IS OFTEN SERVED IN ROUND BOWLS,RICE IS OFTEN SERVED IN ROUND BOWLS,RICE IS OFTEN SERVED IN ROUND BOWLS,RISE IS OFTEN SERVED IN ROWN BOWS,LACES OFTEN SERVED IN ROWN BALLS,RACES OFTEN SERUDE IN DRONDE BULLS,RICE IS OFTEN SERVED IN ROUND BOWLS
...,...,...,...,...,...,...,...,...,...
737,THE GRASS AND BUSHES WERE WET WITH DEW,THE GRASS AND BUSHES WERE WET WITH DEW,THE GRASS AND BUSHES WERE WET WITH DEW,ON THE ISLANDS THE SEA BREEZE IS SOFT AND MILD,THE GRASS AND BUSHES WERE WET WITH DEW,THE BOX IS HELD BY BY HIM BRIGHT RED,ON THE ISLANDS THE SEA BREED IS SOFT AND MILD,THIS WILL LEAD THE WORLD TO MORE SOUND AN FURY,THE GRASS AND BUSHES WERE WET WITH DEW
738,THE BLIND MAN COUNTED HIS OLD COINS,THE BLIND MAN COUNTED HIS OLD COINS,THE BLIND MAN COUNTED HIS OLD COINS,THE PLEA BEGAN AS SOON AS WE SAT DOWN,THE BLIND MAN COUNTED HIS OLD COINS,SNAPPER TO MAKE PURE ICE YOU FREESE WARTE,THE PLA BE DI NIT YO NIGHT BE SIDE DOWN,ARD SALL BEFORE YOU FRATE IGA,THE BLIND MAN COUNTED HIS OLD COINS
739,A SEVERE STORM TORED DOWN THE BARN,A SEVERE STORM TORED DOWN THE BARN,A SEVERE STORM TORED DOWN THE BARN,THIS WILL LEAD THE WORLD TO MORE SOUND AND FURRY,A SEVERE STORM TORED DOWN THE BARN,THE FIRST WANT WAM GID SNAP AND SNAPER SNAPPED...,THE SWILL LEAD THE WORLD TO MORE SOUND UN FUTY,THE RUSHFOR FUNDS REACHERE SPEAK TUSDAY,A SEVERE STORM TORED DOWN THE BARN
740,SHE CALLED HIS NAME MANY TIMES,SHE CALLED HIS NAME MANY TIMES,SHE CALLED HIS NAME MANY TIMES,I SAWL BEFORE YOU FRY THE EGG,SHE CALLED HIS NAME MANY TIMES,E JIM THE FINTE AND HURRY OF THE BANK,I'RD SALT BEFORE YOU FRET THE INK,THE BARTS LUCUSTARC WAED AND LONESOME,SHE CALLED HIS NAME MANY TIMES


In [10]:
# Pull out the Bangladeshi column as a Series and show it.
df_bangladeshi = df.loc[:, 'transcription_bangladeshi']
df_bangladeshi

0             THE BIRCH CANOE SLID ON THE SMOOTH PLANKS
1            GLUE THE SHEET TO THE DARK BLUE BACKGROUND
2                IT'S EASY TO TELL THE DEPTH OF A VILLE
3               THESE DAYS A CHICKEN LEG IS A RARE DISH
4                   RICE IS OFTEN SERVED IN ROUND BOWLS
                             ...                       
737      ON THE ISLANDS THE SEA BREEZE IS SOFT AND MILD
738               THE PLEA BEGAN AS SOON AS WE SAT DOWN
739    THIS WILL LEAD THE WORLD TO MORE SOUND AND FURRY
740                       I SAWL BEFORE YOU FRY THE EGG
741         THE RUSH FOR FANTS REACHED ITS PEAK TUESDAY
Name: transcription_bangladeshi, Length: 742, dtype: object

In [11]:
# Write the Bangladeshi transcripts to their own CSV (header only, no index).
df_bangladeshi.to_csv(
    path_or_buf='../accentsDB/transcripts/labels/transcripts_bangladeshi.csv',
    index=False,
)

In [26]:
# Read the Bangladeshi transcripts back; this yields a one-column DataFrame.
# NOTE(review): presumably the CSV was corrected by hand in between -- confirm.
df_bangladeshi = pd.read_csv(
    filepath_or_buffer='../accentsDB/transcripts/labels/transcripts_bangladeshi.csv',
)

In [27]:
# replace odiya column with bangladeshi
df['transcription_odiya'] = df_bangladeshi['transcription_bangladeshi']

# drop malayalam and telugu columns
# errors='ignore' keeps this cell re-runnable: the save cell below overwrites
# transcripts_comparison.csv without these two columns, so after one full pass
# the reloaded frame no longer contains them and a plain drop() would raise
# KeyError. Rebinding `df` also avoids mutating the frame in place.
df = df.drop(
    columns=['transcription_malayalam', 'transcription_telugu'],
    errors='ignore',
)
df

Unnamed: 0,transcription,transcription_australian,transcription_british,transcription_bangladeshi,transcription_indian,transcription_odiya,transcription_welsh
0,THE BIRCH CANOE SLID ON THE SMOOTH PLANKS,THE BIRCH CANOE SLID ON THE SMOOTH PLANKS,THE BIRCH CANOE SLID ON THE SMOOTH PLANKS,THE BIRCH CANOE SLID ON THE SMOOTH PLANKS,THE BIRCH CANOE SLID ON THE SMOOTH PLANKS,THE BIRCH CANOE SLID ON THE SMOOTH PLANKS,THE BIRCH CANOE SLID ON THE SMOOTH PLANKS
1,GLUE THE SHEET TO THE DARK BLUE BACKGROUND,GLUE THE SHEET TO THE DARK BLUE BACKGROUND,GLEU THE SHEET TO THE DARK BLUE BACKGROUND,GLUE THE SHEET TO THE DARK BLUE BACKGROUND,GLUE THE SHEET TO THE DARK BLUE BACKGROUND,GLUE THE SHEET TO THE DARK BLUE BACKGROUND,GLUE THE SHEET TO THE DARK BLUE BACKGROUND
2,IT'S EASY TO TELL THE DEPTH OF A WELL,IT'S EASY TO TELL THE DEPTH OF A WELL,IT'S EASY TO TELL THE DEPTH OF A WELL,IT'S EASY TO TELL THE DEPTH OF A VILLE,IT'S EASY TO TELL THE DEPTH OF A WELL,IT'S EASY TO TELL THE DEPTH OF A VILLE,IT'S EASY TO TELL THE DEPTH OF A WELL
3,THESE DAYS A CHICKEN LEG IS A RARE DISH,THESE DAYS A CHICKEN LEG IS A RARE DISH,THESE DAYS A CHICKEN LEG IS A RARE DISH,THESE DAYS A CHICKEN LEG IS A RARE DISH,THESE DAYS A CHICKEN LEG IS A RARE DISH,THESE DAYS A CHICKEN LEG IS A RARE DISH,THESE DAYS A CHICKEN LEG IS A RARE DISH
4,RICE IS OFTEN SERVED IN ROUND BOWLS,RICE IS OFTEN SERVED IN ROUND BOWLS,RICE IS OFTEN SERVED IN ROUND BOWLS,RICE IS OFTEN SERVED IN ROUND BOWLS,RICE IS OFTEN SERVED IN ROUND BOWLS,RICE IS OFTEN SERVED IN ROUND BOWLS,RICE IS OFTEN SERVED IN ROUND BOWLS
...,...,...,...,...,...,...,...
737,THE GRASS AND BUSHES WERE WET WITH DEW,THE GRASS AND BUSHES WERE WET WITH DEW,THE GRASS AND BUSHES WERE WET WITH DEW,ON THE ISLANDS THE SEA BREEZE IS SOFT AND MILD,THE GRASS AND BUSHES WERE WET WITH DEW,ON THE ISLANDS THE SEA BREEZE IS SOFT AND MILD,THE GRASS AND BUSHES WERE WET WITH DEW
738,THE BLIND MAN COUNTED HIS OLD COINS,THE BLIND MAN COUNTED HIS OLD COINS,THE BLIND MAN COUNTED HIS OLD COINS,THE PLEA BEGAN AS SOON AS WE SAT DOWN,THE BLIND MAN COUNTED HIS OLD COINS,THE PLAY BEGAN AS SOON AS WE SAT DOWN,THE BLIND MAN COUNTED HIS OLD COINS
739,A SEVERE STORM TORED DOWN THE BARN,A SEVERE STORM TORED DOWN THE BARN,A SEVERE STORM TORED DOWN THE BARN,THIS WILL LEAD THE WORLD TO MORE SOUND AND FURRY,A SEVERE STORM TORED DOWN THE BARN,THIS WILL LEAD THE WORLD TO MORE SOUND AND FURY,A SEVERE STORM TORED DOWN THE BARN
740,SHE CALLED HIS NAME MANY TIMES,SHE CALLED HIS NAME MANY TIMES,SHE CALLED HIS NAME MANY TIMES,I SAWL BEFORE YOU FRY THE EGG,SHE CALLED HIS NAME MANY TIMES,ADD SALT BEFORE YOU FRY THE EGG,SHE CALLED HIS NAME MANY TIMES


In [28]:
# Save the cleaned comparison table. This overwrites the file that the
# "Clean labels" section started from.
df.to_csv(
    path_or_buf='../accentsDB/transcripts/labels/transcripts_comparison.csv',
    index=False,
)

In [80]:
# Reload the cleaned comparison table.
# The cell that writes this file stores the American labels under the bare
# name "transcription", but the per-accent lookup further down selects label
# columns by accent name, so normalise it to "transcription_american".
# rename() is a no-op when the column has already been renamed in the file.
df = pd.read_csv('../accentsDB/transcripts/labels/transcripts_comparison.csv').rename(
    columns={'transcription': 'transcription_american'}
)
df

Unnamed: 0,transcription_american,transcription_australian,transcription_british,transcription_bangladeshi,transcription_indian,transcription_odiya,transcription_welsh
0,THE BIRCH CANOE SLID ON THE SMOOTH PLANKS,THE BIRCH CANOE SLID ON THE SMOOTH PLANKS,THE BIRCH CANOE SLID ON THE SMOOTH PLANKS,THE BIRCH CANOE SLID ON THE SMOOTH PLANKS,THE BIRCH CANOE SLID ON THE SMOOTH PLANKS,THE BIRCH CANOE SLID ON THE SMOOTH PLANKS,THE BIRCH CANOE SLID ON THE SMOOTH PLANKS
1,GLUE THE SHEET TO THE DARK BLUE BACKGROUND,GLUE THE SHEET TO THE DARK BLUE BACKGROUND,GLEU THE SHEET TO THE DARK BLUE BACKGROUND,GLUE THE SHEET TO THE DARK BLUE BACKGROUND,GLUE THE SHEET TO THE DARK BLUE BACKGROUND,GLUE THE SHEET TO THE DARK BLUE BACKGROUND,GLUE THE SHEET TO THE DARK BLUE BACKGROUND
2,IT'S EASY TO TELL THE DEPTH OF A WELL,IT'S EASY TO TELL THE DEPTH OF A WELL,IT'S EASY TO TELL THE DEPTH OF A WELL,IT'S EASY TO TELL THE DEPTH OF A VILLE,IT'S EASY TO TELL THE DEPTH OF A WELL,IT'S EASY TO TELL THE DEPTH OF A VILLE,IT'S EASY TO TELL THE DEPTH OF A WELL
3,THESE DAYS A CHICKEN LEG IS A RARE DISH,THESE DAYS A CHICKEN LEG IS A RARE DISH,THESE DAYS A CHICKEN LEG IS A RARE DISH,THESE DAYS A CHICKEN LEG IS A RARE DISH,THESE DAYS A CHICKEN LEG IS A RARE DISH,THESE DAYS A CHICKEN LEG IS A RARE DISH,THESE DAYS A CHICKEN LEG IS A RARE DISH
4,RICE IS OFTEN SERVED IN ROUND BOWLS,RICE IS OFTEN SERVED IN ROUND BOWLS,RICE IS OFTEN SERVED IN ROUND BOWLS,RICE IS OFTEN SERVED IN ROUND BOWLS,RICE IS OFTEN SERVED IN ROUND BOWLS,RICE IS OFTEN SERVED IN ROUND BOWLS,RICE IS OFTEN SERVED IN ROUND BOWLS
...,...,...,...,...,...,...,...
737,THE GRASS AND BUSHES WERE WET WITH DEW,THE GRASS AND BUSHES WERE WET WITH DEW,THE GRASS AND BUSHES WERE WET WITH DEW,ON THE ISLANDS THE SEA BREEZE IS SOFT AND MILD,THE GRASS AND BUSHES WERE WET WITH DEW,ON THE ISLANDS THE SEA BREEZE IS SOFT AND MILD,THE GRASS AND BUSHES WERE WET WITH DEW
738,THE BLIND MAN COUNTED HIS OLD COINS,THE BLIND MAN COUNTED HIS OLD COINS,THE BLIND MAN COUNTED HIS OLD COINS,THE PLEA BEGAN AS SOON AS WE SAT DOWN,THE BLIND MAN COUNTED HIS OLD COINS,THE PLAY BEGAN AS SOON AS WE SAT DOWN,THE BLIND MAN COUNTED HIS OLD COINS
739,A SEVERE STORM TORED DOWN THE BARN,A SEVERE STORM TORED DOWN THE BARN,A SEVERE STORM TORED DOWN THE BARN,THIS WILL LEAD THE WORLD TO MORE SOUND AND FURRY,A SEVERE STORM TORED DOWN THE BARN,THIS WILL LEAD THE WORLD TO MORE SOUND AND FURY,A SEVERE STORM TORED DOWN THE BARN
740,SHE CALLED HIS NAME MANY TIMES,SHE CALLED HIS NAME MANY TIMES,SHE CALLED HIS NAME MANY TIMES,I SAWL BEFORE YOU FRY THE EGG,SHE CALLED HIS NAME MANY TIMES,ADD SALT BEFORE YOU FRY THE EGG,SHE CALLED HIS NAME MANY TIMES


In [82]:
import pandas as pd
import os
from tqdm import tqdm
import time

# Set the directory containing the per-speaker transcript (.txt) files
audio_dir = '../accentsDB/transcripts/data'

# Create an empty list to store the transcriptions and file paths
transcripts = []

# Time the execution
start = time.time()

# Labels are looked up by position, so at most len(df) files per directory
# can receive one (this replaces the hard-coded 741 upper index).
n_labels = len(df)

# Loop through all the directories below audio_dir
for root, _dirs, files in tqdm(os.walk(audio_dir)):
    # Keep only the transcript files *before* numbering them: the position of
    # a file in this sorted list is the row of its label, so a stray non-.txt
    # file must not shift the index.
    txt_files = sorted(name for name in files if name.endswith('.txt'))
    for i, file in enumerate(txt_files[:n_labels]):
        # file name before the first _ (the accent, e.g. "indian")
        file_name = file.split('_')[0]

        # Get the file path
        file_path = os.path.join(root, file)

        # Find the label column for this accent (plain substring match, so
        # "bangla" selects "transcription_bangladeshi")
        label_columns = df.columns[df.columns.str.contains(file_name, regex=False)]
        if len(label_columns) == 0:
            # Fail loudly instead of with an opaque IndexError
            raise KeyError(
                f"No label column matches accent '{file_name}' (file: {file_path})"
            )

        # Get the transcription: row i of the first matching column
        transcription = df[label_columns[0]].iloc[i]
        # Add the transcription and file path to the list
        transcripts.append({'transcription': transcription, 'file_path': file_path})

# Create a dataframe from the list of transcriptions
df_ = pd.DataFrame(transcripts)

# Save the dataframe to a csv file
df_.to_csv('../accentsDB/transcripts/labels/transcripts_clean_labels.csv', index=False)

# Print the time taken to execute the script
print(f"Time taken: {time.time() - start} seconds")

25it [00:02, 10.31it/s]

Time taken: 2.462200880050659 seconds





In [84]:
# Load the per-file clean labels written by the previous cell and show them.
df = pd.read_csv(
    filepath_or_buffer="../accentsDB/transcripts/labels/transcripts_clean_labels.csv",
)
df

Unnamed: 0,transcription,file_path
0,THE BIRCH CANOE SLID ON THE SMOOTH PLANKS,../transcripts/data/indian/speaker_02/indian_s...
1,GLUE THE SHEET TO THE DARK BLUE BACKGROUND,../transcripts/data/indian/speaker_02/indian_s...
2,IT'S EASY TO TELL THE DEPTH OF A WELL,../transcripts/data/indian/speaker_02/indian_s...
3,THESE DAYS A CHICKEN LEG IS A RARE DISH,../transcripts/data/indian/speaker_02/indian_s...
4,RICE IS OFTEN SERVED IN ROUND BOWLS,../transcripts/data/indian/speaker_02/indian_s...
...,...,...
12609,THE GRASS AND BUSHES WERE WET WITH DEW,../transcripts/data/american/speaker_08/americ...
12610,THE BLIND MAN COUNTED HIS OLD COINS,../transcripts/data/american/speaker_08/americ...
12611,A SEVERE STORM TORED DOWN THE BARN,../transcripts/data/american/speaker_08/americ...
12612,SHE CALLED HIS NAME MANY TIMES,../transcripts/data/american/speaker_08/americ...


In [85]:
# Make the paths relative to the dataset root: strip everything up to and
# including the first "transcripts/" component so they start at "data/...".
# A literal "../transcripts/" prefix is not enough, because paths built from
# '../accentsDB/transcripts/data' never contain that exact substring.
df['file_path'] = df['file_path'].str.replace(r'^.*?transcripts/', '', regex=True)
# swap the .txt suffix for .wav (anchored, so only the extension is touched)
df['file_path'] = df['file_path'].str.replace(r'\.txt$', '.wav', regex=True)
# rename file_path to file_name; the transcription column keeps its name
df = df.rename(columns={"file_path": "file_name"})
df

Unnamed: 0,transcription,file_name
0,THE BIRCH CANOE SLID ON THE SMOOTH PLANKS,data/indian/speaker_02/indian_s02_001.wav
1,GLUE THE SHEET TO THE DARK BLUE BACKGROUND,data/indian/speaker_02/indian_s02_002.wav
2,IT'S EASY TO TELL THE DEPTH OF A WELL,data/indian/speaker_02/indian_s02_003.wav
3,THESE DAYS A CHICKEN LEG IS A RARE DISH,data/indian/speaker_02/indian_s02_004.wav
4,RICE IS OFTEN SERVED IN ROUND BOWLS,data/indian/speaker_02/indian_s02_005.wav
...,...,...
12609,THE GRASS AND BUSHES WERE WET WITH DEW,data/american/speaker_08/american_s08_738.wav
12610,THE BLIND MAN COUNTED HIS OLD COINS,data/american/speaker_08/american_s08_739.wav
12611,A SEVERE STORM TORED DOWN THE BARN,data/american/speaker_08/american_s08_740.wav
12612,SHE CALLED HIS NAME MANY TIMES,data/american/speaker_08/american_s08_741.wav


In [None]:
# Write the final (transcription, file_name) table as the dataset metadata.
df.to_csv(
    path_or_buf="../accentsDB/metadata.csv",
    index=False,
)