In [1]:
import os
# Walk the Kaggle input directory and print the full path of every file found
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 

/kaggle/input/newdataset/vk.mp3


## ⚙️ Install Required Libraries

This installs `whisper`, `google-generativeai`, and [langgraph](https://www.langgraph.dev/) — a library for building multi-step AI workflows.  
Run this cell only once per session.


In [2]:
# Install dependencies (run only once)
# -q keeps pip's output short.
# NOTE(review): the versions are unpinned, so a later re-run may resolve different releases.
!pip install -q openai-whisper google-generativeai langgraph

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m800.5/800.5 kB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.5/43.5 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m145.0/145.0 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.0/42.0 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.2/47.2 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.3 MB/s

In [3]:
!pip uninstall -qqy jupyterlab  # Remove unused conflicting packages
# Install the google-genai SDK (the `from google import genai` package) at a pinned version
!pip install -U -q "google-genai==1.7.0"

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m144.7/144.7 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m100.9/100.9 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
jupyterlab-lsp 3.10.2 requires jupyterlab<4.0.0a0,>=3.1.0, which is not installed.[0m[31m
[0m

In [4]:
!pip install TTS

Collecting TTS
  Downloading TTS-0.22.0-cp311-cp311-manylinux1_x86_64.whl.metadata (21 kB)
Collecting scikit-learn>=1.3.0 (from TTS)
  Downloading scikit_learn-1.6.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)
Collecting anyascii>=0.3.0 (from TTS)
  Downloading anyascii-0.3.2-py3-none-any.whl.metadata (1.5 kB)
Collecting pysbd>=0.3.4 (from TTS)
  Downloading pysbd-0.3.4-py3-none-any.whl.metadata (6.1 kB)
Collecting umap-learn>=0.5.1 (from TTS)
  Downloading umap_learn-0.5.7-py3-none-any.whl.metadata (21 kB)
Collecting pandas<2.0,>=1.4 (from TTS)
  Downloading pandas-1.5.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting trainer>=0.0.32 (from TTS)
  Downloading trainer-0.0.36-py3-none-any.whl.metadata (8.1 kB)
Collecting coqpit>=0.0.16 (from TTS)
  Downloading coqpit-0.0.17-py3-none-any.whl.metadata (11 kB)
Collecting pypinyin (from TTS)
  Downloading pypinyin-0.54.0-py2.py3-none-any.whl.metadata (12 

## 📦 Import Libraries & Setup Gemini Model

Imports `whisper` for speech-to-text, `genai` for Gemini, and `langgraph` for workflow logic.  
Initializes Gemini 1.5 Pro for AI content generation.


In [5]:
# Import necessary libraries
import whisper
import google.generativeai as genai
from langgraph.graph import StateGraph, END
import json
import os

# Gemini model handle used by the grammar-correction step.
# NOTE(review): this is created before genai.configure(api_key=...) runs in a later cell;
# the recorded run succeeded, so the key is presumably only needed at request time — confirm.
gemini_model = genai.GenerativeModel('gemini-1.5-pro')

In [6]:
# NOTE: this rebinds the name `genai` to the google-genai SDK (it previously referred to
# google.generativeai); a later cell re-imports google.generativeai under the same name.
from google import genai
from google.genai import types

# Display the installed google-genai SDK version
genai.__version__

'1.7.0'

In [7]:
from kaggle_secrets import UserSecretsClient
import google.generativeai as genai

# Fetch the Gemini API key from the Kaggle secret store instead of hard-coding it
secrets_client = UserSecretsClient()
GOOGLE_API_KEY = secrets_client.get_secret("GOOGLE_API_KEY")

# Hand the key to the google.generativeai SDK
genai.configure(api_key=GOOGLE_API_KEY)


## 🧠 Load Whisper ASR Model

This loads the `"small"` Whisper model by OpenAI for converting audio to text.  
🔗 [Whisper GitHub](https://github.com/openai/whisper) | Great for speech recognition tasks.


In [8]:
# Whisper "small" ASR checkpoint, loaded once and reused by every transcription call
whisper_model = whisper.load_model("small")

def transcribe_audio(audio_file_path):
    """Transcribe an audio file with Whisper, print the transcript, and return it.

    Args:
        audio_file_path: Path of the audio file handed to ``whisper_model.transcribe``.

    Returns:
        The value stored under the ``"text"`` key of the transcription result.
    """
    transcript = whisper_model.transcribe(audio_file_path)["text"]
    print(transcript)  # echoed so the raw transcript appears in the cell output
    return transcript

100%|███████████████████████████████████████| 461M/461M [00:14<00:00, 33.1MiB/s]
  checkpoint = torch.load(fp, map_location=device)


## Few-Shot Grammar Correction Examples

Provides sample input-output pairs to guide Gemini in correcting grammar.  
Helps the model understand the expected correction format for similar sentences.


In [9]:
# Few-shot examples for better corrections: each one pairs an ungrammatical sentence with
# the JSON reply ({"original": ..., "corrected": ...}) the model is expected to produce.
# The string is interpolated verbatim into the correction prompt.
few_shot_examples = """
Example 1:
User: i am working in Google
Response: {"original": "i am working in Google", "corrected": "I am working at Google."}

Example 2:
User: she go to school every day
Response: {"original": "she go to school every day", "corrected": "She goes to school every day."}

Example 3:
User: they is playing outside
Response: {"original": "they is playing outside", "corrected": "They are playing outside."}
"""

## 🤖 Correct Grammar Using Gemini with JSON Output

This function sends a prompt to [Gemini 1.5 Pro](https://ai.google.dev/) to fix grammar mistakes in a sentence.  
It uses few-shot examples to help Gemini understand the correction style.  
The model's response is expected in structured JSON like `{"original": ..., "corrected": ...}`.  
If the output isn't valid JSON, it falls back to `{"original": <input text>, "corrected": <raw model response>}`, so the caller always receives a dictionary.  
This ensures consistent and clean correction results that can be easily parsed and reused.


In [10]:
# Grammar correction with Gemini, returning a structured dict
def correct_text_with_gemini(text):
    """Ask Gemini to correct the grammar of ``text`` and return the result as a dict.

    The prompt combines a fixed instruction, the few-shot examples, and the
    sentence to correct.

    Args:
        text: Sentence(s) to correct.

    Returns:
        The parsed JSON object when the model's reply is valid JSON; otherwise
        ``{"original": text, "corrected": <stripped raw reply>}``.
    """
    prompt = f"""
You are an expert English grammar corrector. Always respond with a valid JSON object.

{few_shot_examples}

Now correct this sentence:
User: {text}
Response:
"""
    reply = gemini_model.generate_content(prompt).text

    # NOTE(review): a reply wrapped in Markdown code fences is not valid JSON, so it
    # takes the fallback path and the fenced text is stored under "corrected".
    try:
        return json.loads(reply)
    except json.JSONDecodeError:
        return {"original": text, "corrected": reply.strip()}

## 🔄 Define LangGraph State and Nodes

This code defines a custom state `AudioCorrectionState` using Python's `@dataclass`, which holds:
- `audio_path`: the location of the audio file.
- `text`: transcribed text from the audio.
- `corrected`: dictionary storing both original and corrected sentences.

Two **LangGraph nodes** (functions) are defined:
- `node_transcribe`: uses Whisper to convert audio into text.
- `node_correct`: uses Gemini to fix grammar in the transcribed text.

🧠 Learn more about LangGraph stateful workflows:  
🔗 [LangGraph GitHub](https://github.com/langchain-ai/langgraph)


In [11]:
# Define the LangGraph State
from dataclasses import dataclass

@dataclass
class AudioCorrectionState:
    """State object passed between the nodes of the LangGraph workflow."""

    audio_path: str = ""  # path of the audio file read by node_transcribe
    text: str = ""  # transcript written by node_transcribe
    # correction dict written by node_correct; stays None until that node has run
    corrected: dict = None

def node_transcribe(state):
    """LangGraph node: transcribe ``state.audio_path`` and store the result in ``state.text``.

    Returns the same state object after updating it.
    """
    state.text = transcribe_audio(state.audio_path)
    return state

def node_correct(state):
    """LangGraph node: grammar-correct ``state.text`` and store the dict in ``state.corrected``.

    Returns the same state object after updating it.
    """
    state.corrected = correct_text_with_gemini(state.text)
    return state

## 🧩 Create LangGraph Workflow

This builds a **LangGraph workflow** using the `AudioCorrectionState` as the state container.  
- `add_node()` adds processing steps (`transcribe` and `correct`) to the graph.  
- `set_entry_point("transcribe")` sets the workflow's starting point.  
- `add_edge()` defines the execution flow: transcribe ➝ correct ➝ END.  
Finally, `graph.compile()` builds the ready-to-run app from this logic.

🔗 Learn more at [LangGraph Documentation](https://docs.langgraph.dev/)


In [12]:
# Assemble the two-step workflow: transcribe -> correct -> END
graph = StateGraph(AudioCorrectionState)

# Register the processing steps
for step_name, step_fn in (("transcribe", node_transcribe), ("correct", node_correct)):
    graph.add_node(step_name, step_fn)

# Wire the execution order, starting at transcription
graph.set_entry_point("transcribe")
for source, target in (("transcribe", "correct"), ("correct", END)):
    graph.add_edge(source, target)

# Compile the graph into the runnable app
app = graph.compile()

## 🧠 Step-by-step Audio Correction & Speech Synthesis using Whisper + Gemini + TTS

1. **Audio Input**: Load an audio file (e.g., `vk.mp3`) to process.
2. **Invoke LangGraph App**: Automatically runs transcription (Whisper) and grammar correction (Gemini 1.5).
3. **Extract Corrected Text**: Parse the returned result to fetch only the final corrected sentence.
4. **Initialize TTS Model**: Use 🤖 [Coqui TTS](https://github.com/coqui-ai/TTS) to synthesize speech in the original speaker's voice.
5. **Speech Generation**: Output corrected audio as `corrected_audio.mp3` using the original speaker's tone (via `speaker_wav`).

✅ This creates a full speech-to-corrected-speech loop: **MP3 ➝ Clean Text ➝ Corrected MP3**

🔗 TTS Docs: [https://tts.readthedocs.io](https://tts.readthedocs.io)


In [13]:
from TTS.api import TTS
import json
import re

# Step 1: Audio file path
audio_file_path = "/kaggle/input/newdataset/vk.mp3"

# Step 2: Prepare input
inputs = {
    "audio_path": audio_file_path,
}

# Step 3: Run transcription and correction
result = app.invoke(inputs)

# Step 4: Extract the correction produced by the workflow
corrected_raw = result['corrected']

# Step 5: If corrected_raw is a dict, get its 'corrected' field
if isinstance(corrected_raw, dict):
    # `or ""` also covers an explicit None value, which has no .strip()
    corrected_raw = str(corrected_raw.get("corrected") or "").strip()
else:
    raise ValueError("Unexpected format: corrected_raw should be a dict!")

# Step 6: Clean up the triple backticks and JSON identifier
corrected_raw = corrected_raw.replace("```json", "").replace("```", "").strip()

# Step 7: Parse it into JSON when it is JSON. The field may instead already hold the
# plain corrected sentence (the model replied with bare JSON that was parsed upstream),
# so a parse failure is not an error.
try:
    corrected_dict = json.loads(corrected_raw)
except json.JSONDecodeError:
    corrected_dict = corrected_raw
if isinstance(corrected_dict, str):
    corrected_dict = {"corrected": corrected_dict}
elif not isinstance(corrected_dict, dict):
    # Non-object JSON (number, list, ...): keep the cleaned text as the sentence
    corrected_dict = {"corrected": corrected_raw}

# Step 8: Now extract ONLY the 'corrected' text
corrected_text = str(corrected_dict.get("corrected") or "").strip()

# Step 9: Print only corrected sentence
print("\nCorrected Text (final to synthesize):", corrected_text)

# Step 10: Prepend the spoken label, but only when there is something to say.
# The emptiness check must happen BEFORE the label is added; otherwise the text is
# never empty and the "no corrected text" branch below can never run.
has_corrected_text = bool(corrected_text)
if has_corrected_text:
    corrected_text = "corrected Text is " + corrected_text

# Step 11: Initialize TTS model
tts = TTS(model_name="tts_models/multilingual/multi-dataset/your_tts", progress_bar=True, gpu=False)

# Step 12: Generate corrected audio
if has_corrected_text:
    # NOTE(review): Coqui TTS appears to write WAV data regardless of the file
    # extension — confirm, and consider a .wav name if downstream tools need real MP3.
    tts.tts_to_file(
        text=corrected_text,
        file_path="corrected_audio.mp3",
        speaker_wav=audio_file_path,  # Speaker voice sample
        language="en"
    )
    print("\n✅ Corrected audio saved as corrected_audio.mp3")
else:
    print("\n⚠️ No corrected text found!")




 She go to school every day but not have books. Yesterday he ate apple and drink milk but he don't like it. They was playing in a park but then rains came. I no understand why people is not helps each other. Me very tired because walking too many kilometers.

Corrected Text (final to synthesize): She goes to school every day but doesn't have books. Yesterday he ate an apple and drank milk, but he didn't like it. They were playing in a park, but then rain came. I don't understand why people don't help each other. I'm very tired because I walked too many kilometers.
 > Downloading model to /root/.local/share/tts/tts_models--multilingual--multi-dataset--your_tts


 98%|█████████▊| 419M/425M [00:04<00:00, 85.1MiB/s]

 > Model's license - CC BY-NC-ND 4.0
 > Check https://creativecommons.org/licenses/by-nc-nd/4.0/ for more info.
 > Using model: vits
 > Setting up Audio Processor...
 | > sample_rate:16000
 | > resample:False
 | > num_mels:80
 | > log_func:np.log10
 | > min_level_db:0
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:None
 | > fft_size:1024
 | > power:None
 | > preemphasis:0.0
 | > griffin_lim_iters:None
 | > signal_norm:None
 | > symmetric_norm:None
 | > mel_fmin:0
 | > mel_fmax:None
 | > pitch_fmin:None
 | > pitch_fmax:None
 | > spec_gain:20.0
 | > stft_pad_mode:reflect
 | > max_norm:1.0
 | > clip_norm:True
 | > do_trim_silence:False
 | > trim_db:60
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:False
 | > db_level:None
 | > stats_path:None
 | > base:10
 | > hop_length:256
 | > win_length:1024


  return torch.load(f, map_location=map_location, **kwargs)


 > Model fully restored. 
 > Setting up Audio Processor...
 | > sample_rate:16000
 | > resample:False
 | > num_mels:64
 | > log_func:np.log10
 | > min_level_db:-100
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:20
 | > fft_size:512
 | > power:1.5
 | > preemphasis:0.97
 | > griffin_lim_iters:60
 | > signal_norm:False
 | > symmetric_norm:False
 | > mel_fmin:0
 | > mel_fmax:8000.0
 | > pitch_fmin:1.0
 | > pitch_fmax:640.0
 | > spec_gain:20.0
 | > stft_pad_mode:reflect
 | > max_norm:4.0
 | > clip_norm:False
 | > do_trim_silence:False
 | > trim_db:60
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:True
 | > db_level:-27.0
 | > stats_path:None
 | > base:10
 | > hop_length:160
 | > win_length:400
 > External Speaker Encoder Loaded !!
 > initialization of language-embedding layers.
 > Model fully restored. 
 > Setting up Audio Processor...
 | > sample_rate:16000
 | > resample:False
 | > num_mels:64
 | > log_func:np.log10