In [1]:
!pip install -q transformers peft accelerate soundfile

In [None]:
from transformers import pipeline, AutoProcessor, AutoModelForSpeechSeq2Seq
from peft import PeftModel, PeftConfig
import torch

# Load adapter config
config = PeftConfig.from_pretrained("Bruno7/ksa-whisper")

# Initialize pipeline with base Whisper model
pipe = pipeline(
    "automatic-speech-recognition",
    model=config.base_model_name_or_path,
    device="cuda" if torch.cuda.is_available() else "cpu"
)

# Apply the Saudi-dialect adapter
model = PeftModel.from_pretrained(pipe.model, "Bruno7/ksa-whisper")
pipe.model = model

# Run inference
result = pipe("عازم1.mp3")
print(result["text"])

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

generation_config.json: 0.00B [00:00, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

normalizer.json: 0.00B [00:00, ?B/s]

added_tokens.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

preprocessor_config.json:   0%|          | 0.00/340 [00:00<?, ?B/s]

Device set to use cpu


adapter_model.safetensors:   0%|          | 0.00/231M [00:00<?, ?B/s]

`return_token_timestamps` is deprecated for WhisperFeatureExtractor and will be removed in Transformers v5. Use `return_attention_mask` instead, as the number of frames can be inferred from it.
Using custom `forced_decoder_ids` from the (generation) config. This is deprecated in favor of the `task` and `language` flags/config options.
Transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English. This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`. See https://github.com/huggingface/transformers/pull/28687 for more details.


In [1]:
!pip install -q transformers peft accelerate soundfile torchaudio

In [5]:
import torch, torchaudio
from transformers import pipeline
from peft import PeftModel, PeftConfig
import warnings

# Suppress unnecessary warnings
warnings.filterwarnings("ignore")

# 1. Load adapter config
config = PeftConfig.from_pretrained("Bruno7/ksa-whisper")

# 2. Initialize Whisper Large-v3 pipeline
pipe = pipeline(
    "automatic-speech-recognition",
    model=config.base_model_name_or_path,
    device=0 if torch.cuda.is_available() else -1
)

# 3. Apply Saudi-dialect adapter
model = PeftModel.from_pretrained(pipe.model, "Bruno7/ksa-whisper")
pipe.model = model

# 4. Convert MP3 to WAV (if needed)
input_mp3 = "عازم1.mp3"
wav_file = "azem1.wav"
waveform, sr = torchaudio.load(input_mp3)
torchaudio.save(wav_file, waveform, sr)

# 5. Run inference (force Arabic transcription)
result = pipe(
    wav_file,
    generate_kwargs={"language": "ar", "task": "transcribe"}
)

print("✅ Saudi Arabic Transcription:\n", result["text"])

In [2]:
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import torchaudio
import torch

# تحميل النموذج والمعالج
processor = Wav2Vec2Processor.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-arabic")
model = Wav2Vec2ForCTC.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-arabic")
model.eval()
model.to("cuda")

# تحميل وتغيير حجم الصوت
def load_and_resample_audio(audio_path, target_sr=16000):
    audio, orig_sr = torchaudio.load(audio_path)
    if orig_sr != target_sr:
        audio = torchaudio.functional.resample(audio, orig_sr, target_sr)
    return audio.squeeze(0).numpy()

# تحميل ملف الصوت
audio_path = "/content/بدر2.mp3"
audio = load_and_resample_audio(audio_path)

# معالجة الصوت
input_values = processor(audio, sampling_rate=16000, return_tensors="pt").input_values.to("cuda")

# الحصول على التنبؤات
with torch.no_grad():
    logits = model(input_values).logits

predicted_ids = torch.argmax(logits, dim=-1)

# فك التشفير إلى نص
transcription = processor.decode(predicted_ids[0])
print("النص المعترف به:", transcription)

النص المعترف به: ترى الباريح كان الجو حر مرة ما غدرتنامزين  وليوم ناو يا روح مع الربعلى المول نت غاهوا هو انسوَلف


In [4]:
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import torchaudio
import torch

# تحميل النموذج والمعالج
processor = Wav2Vec2Processor.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-arabic")
model = Wav2Vec2ForCTC.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-arabic")
model.eval()
model.to("cuda")

# تحميل وتغيير حجم الصوت
def load_and_resample_audio(audio_path, target_sr=16000):
    audio, orig_sr = torchaudio.load(audio_path)
    if orig_sr != target_sr:
        audio = torchaudio.functional.resample(audio, orig_sr, target_sr)
    return audio.squeeze(0).numpy()

# تحميل ملف الصوت
audio_path = "/content/عازم1.mp3"
audio = load_and_resample_audio(audio_path)

# معالجة الصوت
input_values = processor(audio, sampling_rate=16000, return_tensors="pt").input_values.to("cuda")

# الحصول على التنبؤات
with torch.no_grad():
    logits = model(input_values).logits

predicted_ids = torch.argmax(logits, dim=-1)

# فك التشفير إلى نص
transcription = processor.decode(predicted_ids[0])
print("النص المعترف به:", transcription)

النص المعترف به: ترى البارح كان الجو حر مرة ما غدرتنامزين  واليوم ناوي روح مع الربع ليلمول نتغهوا وانسولف


In [5]:
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import torchaudio
import torch

# تحميل النموذج والمعالج
processor = Wav2Vec2Processor.from_pretrained("othrif/wav2vec2-large-xlsr-arabic")
model = Wav2Vec2ForCTC.from_pretrained("othrif/wav2vec2-large-xlsr-arabic")
model.eval()
model.to("cuda")  # إذا عندك GPU

# دالة لتحميل وتغيير معدل الصوت
def load_and_resample_audio(audio_path, target_sr=16000):
    audio, orig_sr = torchaudio.load(audio_path)
    if orig_sr != target_sr:
        audio = torchaudio.functional.resample(audio, orig_sr, target_sr)
    return audio.squeeze(0).numpy()

# تحميل ملف الصوت
audio_path = "/content/عازم1.mp3"
audio = load_and_resample_audio(audio_path)

# معالجة الصوت
input_values = processor(audio, sampling_rate=16000, return_tensors="pt").input_values.to("cuda")

# الحصول على التنبؤات
with torch.no_grad():
    logits = model(input_values).logits

predicted_ids = torch.argmax(logits, dim=-1)

# فك التشفير إلى نص
transcription = processor.decode(predicted_ids[0])
print("النص المعترف به:", transcription)


preprocessor_config.json:   0%|          | 0.00/158 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/138 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

vocab.json:   0%|          | 0.00/495 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.26G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.26G [00:00<?, ?B/s]

النص المعترف به: ترى البارح كان الجو حر مرة  ما غدرتنامزين واليوم نا ويروح مع الربع للمول نتغهوة وانسولف


In [6]:
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import torchaudio
import torch

# قائمة النماذج
models = [
    "othrif/wav2vec2-large-xlsr-arabic",
    "elgeish/wav2vec2-large-xlsr-53-arabic",
    "mohammed/wav2vec2-large-xlsr-arabic"
]

# دالة لتحميل وتغيير معدل الصوت
def load_and_resample_audio(audio_path, target_sr=16000):
    audio, orig_sr = torchaudio.load(audio_path)
    if orig_sr != target_sr:
        audio = torchaudio.functional.resample(audio, orig_sr, target_sr)
    return audio.squeeze(0).numpy()

# دالة لتشغيل التعرف على الكلام لأي نموذج
def transcribe(audio, model_name):
    processor = Wav2Vec2Processor.from_pretrained(model_name)
    model = Wav2Vec2ForCTC.from_pretrained(model_name)
    model.eval()
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)

    input_values = processor(audio, sampling_rate=16000, return_tensors="pt").input_values.to(device)
    with torch.no_grad():
        logits = model(input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.decode(predicted_ids[0])
    return transcription

# تحميل ملف الصوت
audio_path = "/content/عازم1.mp3"
audio = load_and_resample_audio(audio_path)

# تجربة كل نموذج وطباعة النتائج
for model_name in models:
    text = transcribe(audio, model_name)
    print(f"النص المعترف به بواسطة {model_name}:\n{text}\n{'-'*50}")


النص المعترف به بواسطة othrif/wav2vec2-large-xlsr-arabic:
ترى البارح كان الجو حر مرة  ما غدرتنامزين واليوم نا ويروح مع الربع للمول نتغهوة وانسولف
--------------------------------------------------


preprocessor_config.json:   0%|          | 0.00/159 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/303 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

vocab.json:   0%|          | 0.00/507 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.26G [00:00<?, ?B/s]

النص المعترف به بواسطة elgeish/wav2vec2-large-xlsr-53-arabic:
trY AlbArH kAn Aljw Hr mrp mA gdrt nAmzyn wAlywm nAwyrwH mE Alrbh lA Almwl ntjhwA wns wlf
--------------------------------------------------


model.safetensors:   0%|          | 0.00/1.26G [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/158 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/138 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

vocab.json:   0%|          | 0.00/477 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.26G [00:00<?, ?B/s]

النص المعترف به بواسطة mohammed/wav2vec2-large-xlsr-arabic:
ترل بارح كان الجو حر ملة ما كدر تنامزين واليوم نعوروح مع ربع للمول نتغهواء والسولفف
--------------------------------------------------


In [9]:
# from transformers import Wav2Vec2Processor, NllbTokenizer, AutoModel, AutoModelForSeq2SeqLM
# import torchaudio
# import torch

# # دالة لتحميل الصوت وتغيير معدل العينة
# def load_and_resample_audio(audio_path, target_sr=16000):
#     audio, orig_sr = torchaudio.load(audio_path)
#     if orig_sr != target_sr:
#         audio = torchaudio.functional.resample(audio, orig_sr, target_sr)
#     return audio.squeeze(0).numpy()

# # تحميل المعالج للنموذج الصوتي
# processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h-lv60-self")

# # تحميل ZeroSwot encoder
# commit_hash = "9cd290f5eef6dc179819815e7c970ae73616f9fa"
# zeroswot_encoder = AutoModel.from_pretrained(
#     "johntsi/ZeroSwot-Large_asr-mustc_en-to-200",
#     trust_remote_code=True,
#     revision=commit_hash,
# )
# zeroswot_encoder.eval()
# device = "cuda" if torch.cuda.is_available() else "cpu"
# zeroswot_encoder.to(device)

# # تحميل نموذج NLLB للترجمة/تحويل embeddings إلى نص
# tokenizer = NllbTokenizer.from_pretrained("facebook/nllb-200-distilled-1.3B")
# nllb_model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-1.3B")
# nllb_model.eval()
# nllb_model.to(device)

# # تحميل ملف الصوت
# audio_path = "/content/عازم1.mp3"
# audio = load_and_resample_audio(audio_path)

# # تجهيز input_values و attention_mask
# inputs = processor(audio, sampling_rate=16000, return_tensors="pt")
# input_values = inputs.input_values.to(device)
# attention_mask = inputs.attention_mask.to(device)

# # الحصول على embeddings من ZeroSwot
# with torch.no_grad():
#     compressed_embeds, attention_mask_out = zeroswot_encoder(
#         input_values=input_values,
#         attention_mask=attention_mask
#     )

# # استخدام NLLB للحصول على النص النهائي بالعربية
# with torch.no_grad():
#     predicted_ids = nllb_model.generate(
#         inputs_embeds=compressed_embeds,
#         attention_mask=attention_mask_out,
#         forced_bos_token_id=tokenizer.lang_code_to_id["arb_Arab"],  # رمز اللغة العربية
#         num_beams=5,
#     )

# translation = tokenizer.decode(predicted_ids[0], skip_special_tokens=True)
# print("النص النهائي بالعربية:", translation)
