# Download Dataset

The custom dataset collected by the "[speech2text](https://github.com/shenasa-ai/speech2text)" repository comprises audio recordings sourced from radio archives, gathered using a dedicated data crawler script. This dataset was likely curated to diversify training data for speech recognition models, potentially offering unique linguistic patterns and contexts not found in other publicly available datasets like Common Voice.

In [1]:
# Download all_wav.zip from Google Drive and extract it
# (the CSV's wav_filename paths point into ./all_wav/).
# NOTE: the saved output of this cell shows the download failing on Google
# Drive's "too many users" quota, after which unzip cannot find all_wav.zip.
# If that happens, retry later or fetch the file from the browser link.
!gdown 1jyvhdZHn0s5Owkr21k5Ff-c96sIQLtEu
!unzip -qq all_wav.zip
# Download Hamtech_VoiceDataset_Slice1.csv (the transcript table loaded below)
!gdown 1vqvn0F0YYhEFbzLgP9wJ36vyInUnO5b5

Failed to retrieve file url:

	Too many users have viewed or downloaded this file recently. Please
	try accessing the file again later. If the file you are trying to
	access is particularly large or is shared with many people, it may
	take up to 24 hours to be able to view or download the file. If you
	still can't access a file after 24 hours, contact your domain
	administrator.

You may still be able to access the file from the browser:

	https://drive.google.com/uc?id=1jyvhdZHn0s5Owkr21k5Ff-c96sIQLtEu

but Gdown can't. Please check connections and permissions.
unzip:  cannot find or open all_wav.zip, all_wav.zip.zip or all_wav.zip.ZIP.
Downloading...
From: https://drive.google.com/uc?id=1vqvn0F0YYhEFbzLgP9wJ36vyInUnO5b5
To: /content/Hamtech_VoiceDataset_Slice1.csv
100% 2.87M/2.87M [00:00<00:00, 88.0MB/s]


# Preprocessing

In [2]:
# Download nessecary libraries
!pip install datasets==2.10.0 --quiet
!pip install transformers --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m469.0/469.0 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.3/134.3 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
# Import necessary libraries
import pandas as pd
from datasets import Dataset
from functools import reduce
import re
import string
from transformers import Wav2Vec2CTCTokenizer,Wav2Vec2FeatureExtractor,Wav2Vec2Processor
import json
import librosa
import warnings
from os import path,system,mkdir


In [4]:
# Read the transcript table and order it from the highest to the lowest
# confidence level, then wrap it in a Dataset for the preprocessing steps.
transcripts = (
    pd.read_csv('Hamtech_VoiceDataset_Slice1.csv')
    .sort_values(by='confidence_level', ascending=False)
)
ds = Dataset.from_pandas(transcripts)
transcripts.head()

Unnamed: 0,wav_filename,wav_filesize,transcript,confidence_level
6979,./all_wav/Varzesh_VarzeshDarRoosta9_175.wav,140804,انسان با همه تعاریفی که در انسانیت,0.927557
22556,./all_wav/Tehran_Namaesh2_101.wav,141892,فرخنده مادر نوشین برای شرکت در دادگاه,0.927557
18372,./all_wav/Varzesh_SakooyeMann10_116.wav,147940,و در کنار تخصص چند وجهی بودن,0.927557
13222,./all_wav/Tehran_YekTehranDoa9_497.wav,167812,پخش همزمان رادیو ترتیل از ایران صدا,0.927557
18348,./all_wav/Varzesh_SakooyeMann0_74.wav,146468,بسیار زیادی در ورزش کشور خواهیم بود,0.927557


In [5]:
# Print the column dtypes and non-null counts of the transcript table.
transcripts.info()

<class 'pandas.core.frame.DataFrame'>
Index: 24366 entries, 6979 to 20919
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   wav_filename      24366 non-null  object 
 1   wav_filesize      24366 non-null  int64  
 2   transcript        24366 non-null  object 
 3   confidence_level  24366 non-null  float64
dtypes: float64(1), int64(1), object(2)
memory usage: 951.8+ KB


## Q1

In [18]:
# Characters stripped from every transcript: Latin letters, ASCII digits,
# Persian/Arabic digits, a few punctuation and diacritic marks, and the
# invisible right-to-left mark / zero-width non-joiner.
char_removals = [
    *string.ascii_letters,
    *string.digits,
    '۱', '۲', '۳', '۴', '۵', '۶', '۷', '۸', '٨', '۹', '۰',
    '*', 'ء', 'ً', ':',
    '\u200f', '\u200c',
]

def _strip_characters(text, removals):
  """Return `text` with every entry of `removals` deleted and surrounding whitespace trimmed."""
  for token in removals:
    # Literal substring replacement: entries are NOT interpreted as regex patterns.
    text = text.replace(token, '')
  return text.strip()


# Function to remove special characters from a single transcript
def remove_special_characters(batch):
  """
  Removes the characters listed in the module-level `char_removals` from the
  'transcript' field of a single record and trims surrounding whitespace.

  Args:
    batch (dict): A dictionary containing a 'transcript' key with the text to clean.
      The dictionary is modified in place.

  Returns:
      dict: The same dictionary with its 'transcript' field cleaned.
  """
  ################################################Start Code##############################################
  batch['transcript'] = _strip_characters(batch['transcript'], char_removals)
  #################################################End Code###############################################
  return batch

# Apply the cleaning in batches
def batchwise_remove_special_chars(ds, char_removals, batch_size=1000):
  """
  Batchwise removes specified special characters from the 'transcript' column in a given Dataset.

  The `char_removals` and `batch_size` arguments are honoured: cleaning uses the
  list passed in (not the module-level one) and `ds.map` is run in batched mode.

  Args:
    ds (datasets.Dataset): Dataset containing the 'transcript' column from which characters are to be removed.
    char_removals (list): List of literal strings to remove from each transcript.
    batch_size (int, optional): Number of records handed to each map call, defaults to 1000.

  Returns:
      datasets.Dataset: Processed Dataset with characters removed from 'transcript' column.
  """
  ################################################Start Code##############################################
  def _clean_batch(batch):
    # In batched mode batch['transcript'] is a list of strings; return only the
    # column being rewritten so the other columns are left as they are.
    return {
      'transcript': [_strip_characters(text, char_removals) for text in batch['transcript']]
    }

  ds = ds.map(_clean_batch, batched=True, batch_size=batch_size)
  #################################################End Code###############################################
  return ds

# Clean every transcript of the dataset.
# NOTE(review): the saved output of this cell reports only 5 examples, while
# `transcripts` has 24366 rows — re-run the notebook top to bottom on a fresh
# kernel so that `ds` is the full dataset when this line executes.
ds_cleaned = batchwise_remove_special_chars(ds, char_removals)

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

In [20]:
# Sample data to test
# Sample transcripts for the Q1 check:
#   1) contains an escaped zero-width non-joiner (\u200c), which is in char_removals;
#   2) ends with Latin letters, so removal leaves a trailing space for strip() to trim;
#   3) presumably contains a literal zero-width non-joiner (expected output differs only there).
sample_transcripts = [
  {"transcript": "البته رحمتی با پست\u200cهای مختلف در شبکه اجتماعی"},
  {"transcript": "به تارنمای بیمه البرز به نشانی www"},
  {"transcript": "اما نکته قابل توجه این رقابت‌ها بعد از برگزاری دور اول گروه اینکه تیم های کم نام و نشان جام اونقدرا هم ضعیف ظاهر نشدن"}
]

# Convert to dataset
sample_ds = Dataset.from_pandas(pd.DataFrame(sample_transcripts))

# Apply preprocessing
sample_cleaned = batchwise_remove_special_chars(sample_ds, char_removals)



# Define expected output
expected_output = [
  {"transcript": "البته رحمتی با پستهای مختلف در شبکه اجتماعی"},
  {"transcript": "به تارنمای بیمه البرز به نشانی"},
  {"transcript": "اما نکته قابل توجه این رقابتها بعد از برگزاری دور اول گروه اینکه تیم های کم نام و نشان جام اونقدرا هم ضعیف ظاهر نشدن"}
]


# Test the results
# Each cleaned record is compared positionally with its expected transcript.
for i, record in enumerate(sample_cleaned):
  assert record['transcript'] == expected_output[i]['transcript'], f"Test failed at index {i}: {record['transcript']} != {expected_output[i]['transcript']}"

print("All tests passed!")

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

All tests passed!


## Q2

In [16]:
def generate_vocab_with_frequencies(ds):
  """
  Counts how many times each character occurs across all 'transcript' entries.

  Args:
    ds (datasets.Dataset): Dataset (or any mapping) whose 'transcript' entry
      yields the transcript strings to scan.

  Returns:
    dict: Maps each distinct character to its number of occurrences, with keys
      in order of first appearance.
  """
  vocab_dict = {}
  ################################################Start Code##############################################
  for text in ds['transcript']:
    for symbol in text:
      # A character seen for the first time starts from a count of zero.
      vocab_dict[symbol] = vocab_dict.get(symbol, 0) + 1
  #################################################End Code###############################################
  return vocab_dict

In [17]:
# Mock transcripts for the Q2 check. The dataset built from them gets its own
# name (`mock_ds`) so that running this cell does not replace the full dataset
# bound to `ds`.
mock_data = {
  'transcript': [
      'خواهش می کنم به هر حال امشب',
      'ماه ها پشت سر هم در این تیم',
      'تعداد پیست های دو و میدانی که',
      'البته رحمتی با پست\u200cهای مختلف در شبکه اجتماعی',
      'هر اونس پلاتین ۸۱۰ و هر اونس پالادیوم'
  ]
}

mock_ds = Dataset.from_dict(mock_data)

def test_generate_vocab_with_frequencies():
  """Checks the frequency counted for the character 'م' over the mock transcripts."""
  vocab_dict = generate_vocab_with_frequencies(mock_ds)

  assert vocab_dict['م'] == 11

  print("Test passed!")

test_generate_vocab_with_frequencies()

Test passed!
