----------------------------------------------------------

In [1]:
# Install required libraries
!pip install transformers datasets torch soundfile -q

# Import necessary libraries
import zipfile
import os
import pandas as pd
from datasets import Dataset
import soundfile as sf
from transformers import WhisperProcessor, WhisperForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer
#from google.colab import drive, files


[notice] A new release of pip is available: 24.0 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip
  from .autonotebook import tqdm as notebook_tqdm


In [2]:
!pip install evaluate -q


[notice] A new release of pip is available: 24.0 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [17]:
# Extract uploaded ZIP files



extract_dir = "tunisian_data"
os.makedirs(extract_dir, exist_ok=True)

"""zip_files = [
    "/content/language_annotation.zip",
    "/content/test_wavs.zip",
    "/content/TunSwitchTO.zip",
    "/content/TunSwitchCS.zip"
]"""
zip_files = [
    "../data/language_annotation.zip",
    "../data/test_wavs.zip",
    "../data/TunSwitchTO.zip",
    "../data/TunSwitchCS.zip"
]

for zip_path in zip_files:
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_dir)
        print(f"Extracted {zip_path} to {extract_dir}")

Extracted ../data/language_annotation.zip to tunisian_data
Extracted ../data/test_wavs.zip to tunisian_data
Extracted ../data/TunSwitchTO.zip to tunisian_data
Extracted ../data/TunSwitchCS.zip to tunisian_data


In [None]:
# Prepare the dataset
audio_dirs = ["tunisian_data/TunSwitchCS", "tunisian_data/TunSwitchTO"]
#annotation_file = "tunisian_data/language_annotation.csv"  # Adjust based on extracted content
annotation_file = "tunisian_data/language_annotation/traincs.csv"

"""# Check if annotation file exists and adjust path if needed
if not os.path.exists(annotation_file):
    # Search for a likely annotation file
    for root, dirs, files in os.walk(extract_dir):
        for file in files:
            if file.endswith(".csv") or file.endswith(".txt"):
                annotation_file = os.path.join(root, file)
                print(f"Found annotation file: {annotation_file}")
                break
    if not os.path.exists(annotation_file):
        raise FileNotFoundError("No annotation file found. Please specify the correct path.")
"""

Found annotation file: tunisian_data\language_annotation\dev_cs.csv
Found annotation file: tunisian_data\TunSwitchCS\dev.csv
Found annotation file: tunisian_data\TunSwitchTO\dev.csv


In [None]:
"""
# Load annotations (assuming CSV with file_name and transcription columns)
annotations = pd.read_csv(annotation_file)
if "file_name" not in annotations.columns or "transcription" not in annotations.columns:
    print("Column names in annotation file:", annotations.columns)
    raise KeyError("Annotation file must have 'file_name' and 'transcription' columns. Adjust column names in code.")
"""

import pandas as pd
import os

# Define the default annotation file path
default_annotation_file = "tunisian_data/language_annotation/traincs.csv"
# Search for the annotation file if default doesn't exist
annotation_file = default_annotation_file
if not os.path.exists(annotation_file):
    print(f"Default annotation file not found: {annotation_file}")
    annotation_file = None
    for root, dirs, files in os.walk("tunisian_data"):
        for file in files:
            if file.endswith((".csv", ".txt")):
                annotation_file = os.path.join(root, file)
                print(f"Found potential annotation file: {annotation_file}")
                break
    if annotation_file is None:
        raise FileNotFoundError("No annotation file (.csv or .txt) found in /tunisian_data. Please upload or specify the correct path.")

# Load annotations
try:
    annotations = pd.read_csv(annotation_file)
except Exception as e:
    raise ValueError(f"Failed to load {annotation_file}: {str(e)}")

# Define expected column roles and possible names
column_mapping = {
    "file_name": ["file_name", "filename", "wav", "audio"],  # Possible names for audio file column
    "transcription": ["transcription", "text", "wrd", "transcript"]  # Possible names for transcription column
}

# Find matching columns
file_name_col = next((col for col in column_mapping["file_name"] if col in annotations.columns), None)
transcription_col = next((col for col in column_mapping["transcription"] if col in annotations.columns), None)

if file_name_col is None or transcription_col is None:
    print("Column names in annotation file:", annotations.columns)
    raise KeyError(f"Annotation file must have columns for file name (e.g., {', '.join(column_mapping['file_name'])}) "
                   f"and transcription (e.g., {', '.join(column_mapping['transcription'])}). "
                   f"Adjust column names in code or file.")

# Rename columns for consistency
annotations = annotations.rename(columns={file_name_col: "file_name", transcription_col: "transcription"})
print(f"Loaded annotations from {annotation_file} with columns: file_name, transcription")

Default annotation file not found: tunisian_data/language_annotation.csv
Found potential annotation file: tunisian_data\language_annotation\dev_cs.csv
Found potential annotation file: tunisian_data\TunSwitchCS\dev.csv
Found potential annotation file: tunisian_data\TunSwitchTO\dev.csv
Loaded annotations from tunisian_data\TunSwitchTO\dev.csv with columns: file_name, transcription


In [8]:
# Prepare data
data = []
for idx, row in annotations.iterrows():
    for audio_dir in audio_dirs:
        audio_path = os.path.join(audio_dir, row["file_name"])
        if os.path.exists(audio_path):
            audio, sr = sf.read(audio_path)
            duration = len(audio) / sr
            data.append({
                "ID": idx + 1,
                "wav": audio_path,
                "wrd": row["transcription"],
                "duration": duration
            })
            break  # Use the first matching audio file


In [None]:
# Create DataFrame and save as CSV
df = pd.DataFrame(data)
train_csv_path = "tunisian_data/train.csv"
df.to_csv(train_csv_path, index=False)
print(f"Created training CSV at: {train_csv_path}")


Created training CSV at: /content/tunisian_data/train.csv


In [10]:

# Load dataset
dataset = Dataset.from_pandas(df)
dataset = dataset.train_test_split(test_size=0.2)
train_dataset = dataset["train"]
val_dataset = dataset["test"]


In [24]:

# Preprocess the Data
processor = WhisperProcessor.from_pretrained("openai/whisper-small")
processor.tokenizer.set_prefix_tokens(language="ar")

import torchaudio
#best if not the one working
"""
def preprocess(batch):
    audio_path = batch["wav"]
    waveform, sample_rate = torchaudio.load(audio_path)

    # Convert to mono if needed
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    # Resample if needed
    if sample_rate != 16000:
        resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
        waveform = resampler(waveform)

    # Convert to numpy
    speech = waveform.squeeze().numpy()

    batch["input_features"] = processor(speech, sampling_rate=16000).input_features[0]
    batch["labels"] = processor.tokenizer(batch["wrd"]).input_ids
    return batch"""

def preprocess(batch):
    audio, sample_rate = sf.read(batch["wav"])
    if sample_rate != 16000:
        from scipy.signal import resample
        num_samples = int(len(audio) * 16000 / sample_rate)
        audio = resample(audio, num_samples)
    batch["input_features"] = processor(audio, sampling_rate=16000).input_features[0]
    batch["labels"] = processor.tokenizer(batch["wrd"]).input_ids
    return batch


"""def preprocess(batch):
    audio, sample_rate = sf.read(batch["wav"])
    batch["input_features"] = processor(audio, sampling_rate=16000).input_features[0]
    batch["labels"] = processor.tokenizer(batch["wrd"]).input_ids
    return batch"""


'def preprocess(batch):\n    audio, sample_rate = sf.read(batch["wav"])\n    batch["input_features"] = processor(audio, sampling_rate=16000).input_features[0]\n    batch["labels"] = processor.tokenizer(batch["wrd"]).input_ids\n    return batch'

In [14]:



train_dataset = train_dataset.map(preprocess)
val_dataset = val_dataset.map(preprocess)


In [12]:
!pip install jiwer -q


[notice] A new release of pip is available: 24.0 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [13]:
import evaluate

wer_metric = evaluate.load("wer")

def compute_metrics(pred):
    pred_ids = pred.predictions
    label_ids = pred.label_ids

    # Decode predictions and labels
    pred_str = processor.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = processor.tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    # Compute WER
    wer = wer_metric.compute(predictions=pred_str, references=label_str)
    return {"wer": wer}



Downloading builder script: 5.13kB [00:00, 2.41MB/s]


In [16]:
!pip install transformers[torch]

Collecting torch<2.7,>=2.1 (from transformers[torch])
  Using cached torch-2.6.0-cp312-cp312-win_amd64.whl.metadata (28 kB)
Collecting accelerate>=0.26.0 (from transformers[torch])
  Downloading accelerate-1.8.1-py3-none-any.whl.metadata (19 kB)
Collecting sympy==1.13.1 (from torch<2.7,>=2.1->transformers[torch])
  Using cached sympy-1.13.1-py3-none-any.whl.metadata (12 kB)
Downloading accelerate-1.8.1-py3-none-any.whl (365 kB)
   ---------------------------------------- 0.0/365.3 kB ? eta -:--:--
   - -------------------------------------- 10.2/365.3 kB ? eta -:--:--
   ------ --------------------------------- 61.4/365.3 kB 1.1 MB/s eta 0:00:01
   --------------- ------------------------ 143.4/365.3 kB 1.4 MB/s eta 0:00:01
   ----------------------------- ---------- 266.2/365.3 kB 1.8 MB/s eta 0:00:01
   ---------------------------------------- 365.3/365.3 kB 1.9 MB/s eta 0:00:00
Using cached torch-2.6.0-cp312-cp312-win_amd64.whl (204.1 MB)
Using cached sympy-1.13.1-py3-none-any.whl (

  You can safely remove it manually.
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torchaudio 2.7.1 requires torch==2.7.1, but you have torch 2.6.0 which is incompatible.

[notice] A new release of pip is available: 24.0 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [26]:
"""
# Set Up the Model and Training Arguments
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")

training_args = Seq2SeqTrainingArguments(
    output_dir="/content/whisper-tunisian",
    per_device_train_batch_size=8,
    learning_rate=1e-5,
    num_train_epochs=3,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_steps=10,

)

save_total_limit=1,  #these are to save the best model based on wer
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
"""


#)

# Set Up the Model and Training Arguments
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")

training_args = Seq2SeqTrainingArguments(
    output_dir="/content/whisper-tunisian",
    per_device_train_batch_size=8,
    learning_rate=1e-5,
    num_train_epochs=3,
    # evaluation_strategy removed, use eval_dataset instead
    eval_steps=0,  # Set to 0 to evaluate only at the end of each epoch (default with eval_dataset)
    save_steps=0,  # Save only at epoch end with save_strategy="epoch"
    eval_strategy="epoch",  # Evaluate at the end of each epoch
    save_strategy="epoch",  # Evaluate and save at the end of each epoch
    save_total_limit=1,  # Keep only the best model
    load_best_model_at_end=False,
    metric_for_best_model="wer",
    greater_is_better=False,
    logging_steps=10,
)

ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.26.0`: Please run `pip install transformers[torch]` or `pip install 'accelerate>=0.26.0'`

In [None]:

# Fine-Tune the Model
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=processor.tokenizer,
    compute_metrics=compute_metrics,

)

  trainer = Seq2SeqTrainer(


In [None]:
from inspect import signature
print(signature(model.forward))
print(train_dataset.column_names)


NameError: name 'model' is not defined

In [None]:
#dont use it unless taarf win o wakteh khtr belk tbadddalk el data o trassilk data khayba
def prepare(batch):
    # audio → input_values
    batch["input_values"] = processor(
        batch["audio"]["array"], sampling_rate=16_000
    ).input_values[0]

    # text → labels
    with processor.as_target_processor():
        batch["labels"] = processor(batch["transcript"]).input_ids
    return batch

train_dataset = train_dataset.map(prepare, remove_columns=train_dataset.column_names)

In [27]:
print(dataset.column_names)

{'train': [], 'test': []}


In [None]:


trainer.train()


ValueError: No columns in the dataset match the model's forward method signature: ({', '.join(signature_columns)}). The following columns have been ignored: []. Please check the dataset and model. You may need to set `remove_unused_columns=False` in `TrainingArguments`.

In [None]:

# Save the Model
model.save_pretrained("/content/whisper-tunisian")
processor.save_pretrained("/content/whisper-tunisian")

# Optional: Save to Google Drive
drive.mount('/content/drive')
!cp -r /content/whisper-tunisian /content/drive/MyDrive/whisper-tunisian


In [None]:

# Download the Model
!zip -r whisper-tunisian.zip /content/whisper-tunisian
files.download("whisper-tunisian.zip")


In [None]:

# --- Using the Fine-Tuned Model Locally ---
# Install dependencies locally:
# pip install transformers torch soundfile

# Load the model locally:
# from transformers import WhisperForConditionalGeneration, WhisperProcessor
# model = WhisperForConditionalGeneration.from_pretrained("./whisper-tunisian")
# processor = WhisperProcessor.from_pretrained("./whisper-tunisian")

# Transcribe audio locally:
# import soundfile as sf
# audio, sample_rate = sf.read("path_to_audio.wav")
# inputs = processor(audio, sampling_rate=16000, return_tensors="pt")
# outputs = model.generate(**inputs)
# transcription = processor.batch_decode(outputs, skip_special_tokens=True)[0]
# print(transcription)