In [None]:
# Install the transformers library to access pre-trained models
!pip install transformers

# Import the necessary libraries and set up the automatic speech recognition (ASR) pipeline
from glob import glob

import numpy as np  # Manages your Array operations
import pandas as pd
import torch
from google.colab import drive
from IPython.display import Audio
from tqdm import tqdm
from transformers import pipeline

# Set up the ASR pipeline using a Swahili model.
# Run on the first CUDA GPU when the runtime has one and fall back to the CPU (-1)
# otherwise; a hard-coded device=0 fails on a runtime without a GPU.
asr_device = 0 if torch.cuda.is_available() else -1
pipe = pipeline("automatic-speech-recognition", model="Akashpb13/Swahili_xlsr", device=asr_device)

# Mount Google Drive to access your files
drive.mount('/content/drive')

# Read the CSV file containing information about the audio files
test = pd.read_csv('/content/drive/MyDrive/Models/SampleSubmission.csv')

# Preview the first few rows of the DataFrame
test.head()


Collecting transformers
  Downloading transformers-4.35.2-py3-none-any.whl (7.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m20.3 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.19.3-py3-none-any.whl (311 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m311.2/311.2 kB[0m [31m39.7 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.19,>=0.14 (from transformers)
  Downloading tokenizers-0.15.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m50.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m58.0 MB/s[0m eta [36m0:00:00[0m
Ins

(…)13/Swahili_xlsr/resolve/main/config.json:   0%|          | 0.00/2.04k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.26G [00:00<?, ?B/s]

Some weights of the model checkpoint at Akashpb13/Swahili_xlsr were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at Akashpb13/Swahili_xlsr and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-

(…)_xlsr/resolve/main/tokenizer_config.json:   0%|          | 0.00/221 [00:00<?, ?B/s]

(…)b13/Swahili_xlsr/resolve/main/vocab.json:   0%|          | 0.00/408 [00:00<?, ?B/s]

(…)lsr/resolve/main/special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

(…)sr/resolve/main/preprocessor_config.json:   0%|          | 0.00/256 [00:00<?, ?B/s]

RuntimeError: ignored

In [None]:
# Extract the audio files from the compressed archive
!tar xf "/content/drive/MyDrive/ASR/test0.tar.gz"

# Display an audio file using IPython's Audio widget
Audio("/content/test/common_voice_sw_27729935.mp3")


In [None]:
# Silence all warnings so the transcription output stays readable
import warnings
warnings.simplefilter('ignore')

# Transcribe one sample clip with the ASR pipeline
sample_clip = "/content/test/common_voice_sw_27729935.mp3"
pipe(sample_clip)


In [None]:
# Add a column holding each clip's file name prefixed with the local audio folder
clip_dir = "/content/test/"
test["my_path"] = [clip_dir + file_name for file_name in test["path"]]

# Show the DataFrame with the new column
test


In [None]:
# Transcribe several clips with a single pipeline call
sample_clips = [
    "/content/test/common_voice_sw_27729935.mp3",
    "/content/test/common_voice_sw_35780884.mp3",
    "/content/test/common_voice_sw_36450168.mp3",
]
pipe(sample_clips)


In [None]:
# Run the ASR pipeline over every clip path listed in the test DataFrame
results = pipe(test["my_path"].tolist())

# Keep only the transcribed text of each result
result_list = []
for prediction in results:
    result_list.append(prediction["text"])

# Show the first 3 transcriptions
result_list[:3]


In [None]:
# Assemble the submission table: one row per clip, pairing its path with its transcription
sub = pd.DataFrame({"path": test["path"].tolist(), "sentence": result_list})

# Write the submission to a CSV file, leaving out the index column
sub.to_csv("Bill's Submission no 3.csv", index=False)

# Preview the first few rows of the submission
sub.head()


In [None]:
# Perform ASR predictions on the entire dataset, one clip per call so tqdm can show progress.
# NOTE(review): the clips are read from the Drive folder below, whereas the earlier cells
# use the copies extracted under /content/test/ — confirm which location is intended.

# Resolve the ID column before the slow loop. The rest of the notebook only relies on a
# `path` column, so fall back to it when `audio_ID` is absent instead of hitting a
# KeyError after every clip has already been transcribed.
id_column = 'audio_ID' if 'audio_ID' in test.columns else 'path'

res = []
for path in tqdm(test.path):
    res.append(pipe(f'/content/drive/MyDrive/asr/test_audios/{path}')['text'])

# Update the test DataFrame with the transcribed sentences
test['sentence'] = res

# Save the IDs and transcriptions to a CSV file for further analysis
test[[id_column, 'sentence']].to_csv('/content/drive/MyDrive/asr/res.csv', index=False)

# Preview the updated test DataFrame (head only, to keep the cell output short)
test.head()
