In [None]:
# Create a conda environment with the required packages (Python 3.9).
!pip install -r requirements.txt

In [None]:
from vosk import Model, KaldiRecognizer

# Sample rate (Hz) handed to the recognizer and used when resampling audio.
FRAME_RATE = 16000
# Channel count that loaded audio is converted to before recognition.
CHANNELS = 1

# Load the Vosk model identified by name.
model = Model(model_name='vosk-model-en-us-0.22')

# Recognizer bound to that model, configured with FRAME_RATE.
recognizer = KaldiRecognizer(model, FRAME_RATE)

In [None]:
from pydub import AudioSegment
import os

def split_audio(file_path, output_dir, num_splits):
    """Split an MP3 file into ``num_splits`` consecutive MP3 segments.

    The segments are written to ``output_dir`` as ``split_1.mp3`` through
    ``split_<num_splits>.mp3``. Every segment has the same length except the
    last one, which also receives the remainder left by the integer division.

    Args:
        file_path: Path of the MP3 file to split.
        output_dir: Directory that receives the segments; created if missing.
        num_splits: Number of segments to produce; must be at least 1.

    Raises:
        ValueError: If ``num_splits`` is less than 1.
    """
    # Reject bad counts before the (potentially slow) decode: 0 would divide
    # by zero below and a negative count would silently export nothing.
    if num_splits < 1:
        raise ValueError(f"num_splits must be at least 1, got {num_splits!r}")

    # Load the audio file.
    audio = AudioSegment.from_mp3(file_path)

    # pydub measures and slices audio in milliseconds.
    total_ms = len(audio)
    split_duration = total_ms // num_splits

    # Ensure the output directory exists.
    os.makedirs(output_dir, exist_ok=True)

    # Split and export the audio.
    for i in range(num_splits):
        start_time = i * split_duration
        # The final segment runs to the end so no trailing audio is dropped.
        end_time = total_ms if i == num_splits - 1 else start_time + split_duration
        # Named `segment` so the local does not shadow this function's name.
        segment = audio[start_time:end_time]
        file_name = f"split_{i + 1}.mp3"
        # os.path.join copes with output_dir given with or without a trailing slash.
        segment.export(os.path.join(output_dir, file_name), format="mp3")
        print(f"Exported: {file_name}")

# Input recording, destination directory, and number of segments to produce.
# NOTE(review): these are absolute, machine-specific paths; adjust them before
# running on another machine.
file_path = "/home/benyamain/Desktop/AudioSummarization/Karpathy_GPT2.mp3"
output_dir = "/home/benyamain/Desktop/AudioSummarization/"
num_splits = 16

# Write split_1.mp3 ... split_16.mp3 into output_dir.
split_audio(file_path, output_dir, num_splits)

In [None]:
from pydub import AudioSegment

# Decode the first split, then convert it to CHANNELS channels at FRAME_RATE,
# expressed as one chained pipeline instead of three reassignments.
mp3 = (
    AudioSegment.from_mp3('/home/benyamain/Desktop/AudioSummarization/split_1.mp3')
    .set_channels(CHANNELS)
    .set_frame_rate(FRAME_RATE)
)

In [None]:
# Displays the clip's entire raw byte string as the cell output.
# Do not run this cell if your audio file is too large!
mp3.raw_data

In [None]:
# Feed the whole clip's raw bytes to the recognizer, then fetch its result
# string, which the following cells parse as JSON.
recognizer.AcceptWaveform(mp3.raw_data)
result = recognizer.Result()
# Uncomment the next line to display the raw result string.
# result

In [None]:
import json

# Pull the 'text' field out of the JSON result and display it.
text = json.loads(result)['text']
text

In [None]:
# Display the full parsed result, not just its 'text' field.
json.loads(result)

In [None]:
import subprocess
# Pipe the recognized text through the recasepunc script (presumably restores
# casing and punctuation, judging by its name -- TODO confirm).
# Requires the 'vosk-recasepunc-en-0.22' model, available from
# https://alphacephei.com/vosk/models
# The text is passed on stdin and the command's stdout is captured as a str.
cased = subprocess.check_output("python3 recasepunc/recasepunc.py predict recasepunc/checkpoint", shell=True, text=True, input=text)
cased

In [None]:
def voice_recognition(filename):
    """Transcribe an MP3 file with Vosk and post-process it with recasepunc.

    The audio is converted to ``CHANNELS`` channels at ``FRAME_RATE`` and fed
    to the recognizer in fixed-length slices. The text recognized for each
    slice is joined with single spaces and piped through the recasepunc
    script.

    Args:
        filename: Path of the MP3 file to transcribe.

    Returns:
        The standard output of the recasepunc command, as a string.

    Raises:
        subprocess.CalledProcessError: If the recasepunc command exits with a
            non-zero status.
    """
    model = Model(model_name="vosk-model-en-us-0.22")

    recognizer = KaldiRecognizer(model, FRAME_RATE)
    recognizer.SetWords(True)

    mp3 = AudioSegment.from_mp3(filename)
    mp3 = mp3.set_channels(CHANNELS)
    mp3 = mp3.set_frame_rate(FRAME_RATE)

    # Slice length; pydub slices by milliseconds, so this is 45 seconds.
    step = 45000
    total = len(mp3)
    pieces = []

    for i in range(0, total, step):
        print(f"Progress: {i/total}")
        segment = mp3[i:i+step]

        recognizer.AcceptWaveform(segment.raw_data)
        result = recognizer.Result()

        text = json.loads(result)["text"]
        # Skip slices that produced no text so the join adds no stray spaces.
        if text:
            pieces.append(text)

    # Join with a space: plain concatenation would fuse the last word of one
    # slice with the first word of the next.
    transcript = " ".join(pieces)

    cased = subprocess.check_output('python3 recasepunc/recasepunc.py predict recasepunc/checkpoint', shell=True, text=True, input=transcript)

    return cased

In [None]:
# Transcribe the first split; the returned text is shown as the cell output.
voice_recognition('/home/benyamain/Desktop/AudioSummarization/split_1.mp3')

In [None]:
# Use a pipeline as a high-level helper
from transformers import pipeline

# Summarization pipeline; no model is named here, so the choice of model is
# left to the library.
pipe = pipeline("summarization")

In [None]:
# Load the saved transcript. The encoding is given explicitly so the text is
# decoded the same way regardless of the platform's default locale.
with open("transcript.txt", encoding="utf-8") as f:
    transcript = f.read()

In [None]:
# Stay under the model's token length (1024) by capping every chunk at 850
# space-separated words.
split_tokens = transcript.split(" ")
docs = [
    " ".join(split_tokens[start:start + 850])
    for start in range(0, len(split_tokens), 850)
]

In [None]:
# Summarize every chunk with the summarization pipeline, which this notebook
# binds to the name `pipe`; the result is displayed as the cell output.
summaries = pipe(docs)
summaries

In [None]:
# Stitch the per-chunk summaries into one text, separated by blank lines.
summary = "\n\n".join(
    [entry["summary_text"] for entry in summaries]
)
print(summary)