<a href="https://colab.research.google.com/github/Baba-tt/testrepo/blob/master/LibEnglisg.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [14]:
# --- Step 1: Install All Dependencies ---
!pip install yt-dlp
!pip install git+https://github.com/openai/whisper.git
!pip install librosa soundfile pandas gradio

import os
import yt_dlp
import whisper
import librosa
import soundfile as sf
import pandas as pd
import gradio as gr

# --- Step 2: Helper Functions ---

# 1. Download and convert YouTube audio to WAV
def download_youtube_audio(youtube_links):
    """Download the audio track of each YouTube URL and convert it to WAV.

    Args:
        youtube_links: Text containing one URL per line. Blank lines and
            surrounding whitespace are ignored; ``None`` or an empty string
            yields an empty result.

    Returns:
        A list with the expected path of the WAV file for every URL that
        downloaded without raising. URLs that fail are reported with
        ``print`` and skipped.
    """
    ydl_opts = {
        'format': 'bestaudio/best',
        'outtmpl': '%(title)s.%(ext)s',
        'noplaylist': True,
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'wav',
        }]
    }

    downloaded_files = []
    for raw_line in (youtube_links or "").splitlines():
        url = raw_line.strip()
        if not url:
            # A blank line is not a URL; do not hand it to yt-dlp.
            continue
        try:
            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                info_dict = ydl.extract_info(url, download=True)
                # Ask yt-dlp for the name it actually used: it sanitises
                # characters in the title that are not valid in file names,
                # so rebuilding the name from the raw title can point at a
                # file that does not exist.
                source_path = ydl.prepare_filename(info_dict)
        except Exception as e:
            print(f"Failed to download {url}: {e}")
            continue

        # The audio-extraction post-processor keeps the base name and swaps
        # the extension for the requested codec.
        downloaded_files.append(os.path.splitext(source_path)[0] + ".wav")
    return downloaded_files

# 2. Segment audio into 30-second clips
def segment_audio(input_file, output_dir='segmented_audio', segment_length=30):
    """Split an audio file into consecutive clips and write them as WAV files.

    Clips are named ``<input stem>_segment_<n>.wav`` (``n`` starts at 1) and
    are written to ``output_dir``, which is created if needed. The final clip
    holds whatever audio remains, so it may be shorter than
    ``segment_length``; a recording shorter than one segment still yields a
    single clip.

    Args:
        input_file: Path of the audio file to split.
        output_dir: Directory that receives the clips.
        segment_length: Clip duration in seconds; must be positive.

    Returns:
        None. Any failure is reported with ``print`` rather than raised.
    """
    import math

    try:
        if segment_length <= 0:
            raise ValueError("segment_length must be a positive number of seconds")

        # sr=None keeps the file's own sample rate instead of resampling.
        y, sr = librosa.load(input_file, sr=None)
        total_samples = len(y)
        # Round up so the trailing partial segment is kept rather than dropped.
        num_segments = math.ceil(total_samples / (segment_length * sr))

        os.makedirs(output_dir, exist_ok=True)
        stem = os.path.splitext(os.path.basename(input_file))[0]

        for i in range(num_segments):
            start_sample = int(i * segment_length * sr)
            end_sample = int(min((i + 1) * segment_length * sr, total_samples))
            segment = y[start_sample:end_sample]
            if len(segment) == 0:
                continue
            output_file = os.path.join(output_dir, f"{stem}_segment_{i+1}.wav")
            sf.write(output_file, segment, sr)
    except Exception as e:
        print(f"Error segmenting {input_file}: {e}")

# 3. Transcribe segmented audio files
def transcribe_audio():
    """Transcribe every WAV clip in ``segmented_audio`` and save a CSV.

    Clips are processed in natural filename order (``_segment_2`` before
    ``_segment_10``) so the CSV rows are reproducible. A clip that fails to
    transcribe is reported with ``print`` and left out of the CSV. If the
    directory is missing or holds no WAV files, a header-only CSV is written
    and the Whisper model is not loaded.

    Returns:
        The path of the CSV file, ``"liberian_transcriptions.csv"``, with
        columns ``filename`` and ``transcription``.
    """
    import re

    segmented_dir = 'segmented_audio'
    output_csv = "liberian_transcriptions.csv"

    def natural_key(name):
        # re.split with a capturing group alternates text / digit-run tokens,
        # so odd positions are always the numeric ones.
        tokens = re.split(r'(\d+)', name)
        return [int(tok) if pos % 2 else tok.lower() for pos, tok in enumerate(tokens)]

    try:
        wav_files = sorted(
            (name for name in os.listdir(segmented_dir) if name.endswith('.wav')),
            key=natural_key,
        )
    except FileNotFoundError:
        print(f"No segmented audio found: directory '{segmented_dir}' does not exist")
        wav_files = []

    data = []
    if wav_files:
        # Loading the model is expensive; only do it when there is work.
        model = whisper.load_model("base")
        for filename in wav_files:
            path = os.path.join(segmented_dir, filename)
            try:
                result = model.transcribe(path)
                data.append([filename, result['text']])
            except Exception as e:
                print(f"Error transcribing {filename}: {e}")

    df = pd.DataFrame(data, columns=["filename", "transcription"])
    df.to_csv(output_csv, index=False)
    return output_csv

# 4. Combined process for Gradio interface
def process_links(youtube_input):
    """Run the full pipeline: download, segment, then transcribe.

    Args:
        youtube_input: Text holding the YouTube URLs, as received from the
            Gradio textbox.

    Returns:
        The CSV path returned by ``transcribe_audio``.
    """
    wav_paths = [
        path
        for path in download_youtube_audio(youtube_input)
        if path.endswith(".wav")
    ]
    for wav_path in wav_paths:
        segment_audio(wav_path)

    # NOTE(review): transcribe_audio reads the whole segmented_audio directory,
    # so clips left over from earlier runs are transcribed again.
    return transcribe_audio()


Collecting git+https://github.com/openai/whisper.git
  Cloning https://github.com/openai/whisper.git to /tmp/pip-req-build-_qzkgopb
  Running command git clone --filter=blob:none --quiet https://github.com/openai/whisper.git /tmp/pip-req-build-_qzkgopb
  Resolved https://github.com/openai/whisper.git to commit 517a43ecd132a2089d85f4ebc044728a71d49f6e
  Installing build dependencies ... done
  Getting requirements to build wheel ... done
  Preparing metadata (pyproject.toml) ... done


In [21]:
# --- Step 3: Launch Gradio App ---
# Build the UI first, then start it: a multi-line textbox of URLs goes in,
# and the CSV produced by process_links comes back as a downloadable file.
transcriber_demo = gr.Interface(
    title="Liberian English Transcriber",
    description="Paste YouTube video links in Liberian English. This tool will download the audio, segment it, and transcribe using Whisper.",
    fn=process_links,
    inputs=gr.Textbox(lines=6, label="Paste YouTube URLs (one per line)"),
    outputs=gr.File(label="Download Transcription CSV"),
)
transcriber_demo.launch()


It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://f4ccaa6f1c5bfdcaef.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [23]:
# Install required libraries
!pip install gradio pandas

# Sample rule-based translation function (simplified)
def translate_to_liberian_english(text):
    """Rewrite Standard English text using a small table of Liberian English rules.

    Each rule phrase is replaced only where it appears as a whole word or
    phrase, so "very" is translated but "every" is left alone. Matching
    ignores case; when the matched text starts with a capital letter the
    replacement is capitalised too ("Don't" -> "Noh"). All rules are applied
    in a single pass, so the output of one rule is never rewritten by another.

    Args:
        text: The Standard English input. ``None`` or an empty string
            returns an empty string.

    Returns:
        The text with every rule applied.
    """
    import re

    rules = {
        "going to": "goin' go",
        "don't": "noh",
        "I'm": "Ah",
        "isn't it?": "enneh?",
        "you're": "yu",
        "friend": "padi",
        "very": "plenty",
        "child": "pikin"
    }

    if not text:
        return ""

    def bounded(phrase):
        # Only require a word boundary on a side that ends in a letter or
        # digit; "isn't it?" ends in punctuation, which is its own boundary.
        pattern = re.escape(phrase)
        if phrase[0].isalnum():
            pattern = r"(?<!\w)" + pattern
        if phrase[-1].isalnum():
            pattern = pattern + r"(?!\w)"
        return pattern

    # Longest phrases first so a longer rule wins over a shorter one that
    # starts at the same position.
    phrases = sorted(rules, key=len, reverse=True)
    replacements = [rules[phrase] for phrase in phrases]
    matcher = re.compile(
        "|".join(f"({bounded(phrase)})" for phrase in phrases),
        re.IGNORECASE,
    )

    def substitute(match):
        # One capture group per rule, so lastindex identifies which rule hit.
        replacement = replacements[match.lastindex - 1]
        if match.group(0)[0].isupper():
            replacement = replacement[0].upper() + replacement[1:]
        return replacement

    return matcher.sub(substitute, text)







In [24]:
import gradio as gr

def interface_fn(text):
    """Gradio callback: return the Liberian English rendering of ``text``."""
    translated = translate_to_liberian_english(text)
    return translated

# Text in, text out: a single textbox on each side of interface_fn.
translator_demo = gr.Interface(
    title="Standard English to Liberian English Translator",
    fn=interface_fn,
    inputs=gr.Textbox(label="Standard English"),
    outputs=gr.Textbox(label="Liberian English"),
)
translator_demo.launch()


It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://204a5e05bde5a7cb6a.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


