In [None]:
import os
import re
import textwrap
import warnings
import pyperclip
import soundfile as sf
from pynput import keyboard
from plyer import notification
from playsound import playsound
from num2words import num2words
from transformers import BigBirdPegasusForConditionalGeneration, AutoTokenizer

# Hide every GPU from CUDA-aware libraries so all models in this notebook run
# on the CPU.  Setting the variable in os.environ is sufficient: a shell
# `export` issued through os.system() only affects a short-lived child shell
# and never reaches this Python process, so it is intentionally not used.
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

** SUMMARIZATION models

In [None]:
# # facebook/bart-large-cnn
# from transformers import pipeline
# summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
# def facebook_bart_large_cnn(text):
#     summary = summarizer(text, max_length=20 if len(text.split())<=50 else 70, min_length=10, do_sample=False)
#     return summary[0]['summary_text']

In [None]:
# google/bigbird-pegasus-large-bigpatent
# Single source of truth for the checkpoint id, so the tokenizer and the model
# can never be loaded from two different repositories by accident.
SUMMARY_MODEL_ID = "google/bigbird-pegasus-large-bigpatent"

# Tokenizer and model are loaded once at cell execution time and reused by
# google_bigbird_pegasus_large_bigpatent() for every request.
summarytokenizer = AutoTokenizer.from_pretrained(SUMMARY_MODEL_ID)
# attention_type="original_full" is forwarded to the model configuration
# (NOTE(review): per the Transformers docs this selects full attention instead
# of BigBird's block-sparse attention -- confirm it is still wanted for long inputs).
summarymodel = BigBirdPegasusForConditionalGeneration.from_pretrained(
    SUMMARY_MODEL_ID,
    attention_type="original_full",
)
def google_bigbird_pegasus_large_bigpatent(text):
    """Summarize ``text`` with the module-level BigBird-Pegasus model.

    Args:
        text: Plain text to summarize.

    Returns:
        The generated summary as a string, with the tokenizer's special
        tokens removed and surrounding whitespace stripped.
    """
    # truncation=True keeps inputs within the tokenizer's maximum length
    # instead of letting an over-long selection fail inside the model.
    inputs = summarytokenizer(text, return_tensors='pt', truncation=True)
    summaryprediction = summarymodel.generate(**inputs)
    # Let the tokenizer drop its own special tokens rather than deleting the
    # literal strings '<s>' / '</s>' by hand after decoding.
    thesummary = summarytokenizer.batch_decode(summaryprediction, skip_special_tokens=True)
    return thesummary[0].strip()

** Text To Speech models

In [None]:
# # microsoft/speecht5_tts
# import torch
# from datasets import load_dataset
# from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
# processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
# model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
# vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
# embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
# speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)

# def microsoft_speecht5_tts(text):
#     inputs = processor(text=text, return_tensors="pt")
#     return model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder), 22000

In [None]:
# facebook/fastspeech2-en-ljspeech
# Text-to-speech backend used by facebook_fastspeech2_en_ljspeech().
# Everything in this cell runs once at cell execution time and leaves the
# module-level objects (task, model, generator) that the function reuses.
from fairseq.checkpoint_utils import load_model_ensemble_and_task_from_hf_hub
from fairseq.models.text_to_speech.hub_interface import TTSHubInterface
# Load the checkpoint by its hub id; the overrides request the "hifigan"
# vocoder and turn fp16 off.
fairseqmodels, fairseqcfg, fairseqtask = load_model_ensemble_and_task_from_hf_hub(
    "facebook/fastspeech2-en-ljspeech",
    arg_overrides={"vocoder": "hifigan", "fp16": False}
)
# Only the first returned model is used for prediction; it is placed on the CPU.
fairseqmodel = fairseqmodels[0].to('cpu')
# Update the loaded cfg with the task's data configuration before building the generator.
TTSHubInterface.update_cfg_with_data_cfg(fairseqcfg, fairseqtask.data_cfg)
# The generator is built from the full model list and the updated cfg.
generator = fairseqtask.build_generator(fairseqmodels, fairseqcfg)

def facebook_fastspeech2_en_ljspeech(text):
    """Synthesize speech for ``text`` with the module-level FastSpeech 2 setup.

    Args:
        text: The sentence (or fragment) to speak.

    Returns:
        Whatever ``TTSHubInterface.get_prediction`` returns; the caller in
        this notebook unpacks it as ``(wav, rate)``.
    """
    model_input = TTSHubInterface.get_model_input(fairseqtask, text)
    prediction = TTSHubInterface.get_prediction(
        fairseqtask, fairseqmodel, generator, model_input
    )
    return prediction

In [None]:
def show_notification(title, message):
    """Pop up a desktop notification through plyer.

    Args:
        title: Heading of the notification.
        message: Body text of the notification.

    The notification is requested with a fixed ``timeout`` of 10.
    """
    notification.notify(title=title, message=message, timeout=10)

In [None]:
def summerize(text):
    """Log the input and return its summary from the active summarization model.

    Prints the word count of ``text`` followed by the text wrapped at 80
    columns, then delegates to ``google_bigbird_pegasus_large_bigpatent``.

    Args:
        text: The text to summarize.

    Returns:
        The summary produced by the summarization backend.
    """
    word_count = len(text.split())
    print(word_count)

    wrapped = textwrap.fill(text, 80)
    print(wrapped)

    # Backend switch: the facebook_bart_large_cnn alternative is currently disabled.
    summary = google_bigbird_pegasus_large_bigpatent(text)
    return summary

In [None]:
def readoutload(text):
    """Speak ``text`` aloud and show it as a desktop notification.

    The text is synthesized with the active TTS backend, written to a WAV
    file in the current working directory, announced via a notification
    titled "summary", and then played back from that file.

    Args:
        text: The sentence (or fragment) to read out.
    """
    wav_path = "nowgeneratedspeechforstudy.wav"

    # Backend switch: the microsoft_speecht5_tts alternative is currently disabled.
    wav, rate = facebook_fastspeech2_en_ljspeech(text)

    sf.write(wav_path, wav, samplerate=rate)
    show_notification("summary", text)
    playsound(wav_path)

In [None]:
def convert_numbers_to_text(text):
    """Replace numerals in ``text`` with their spelled-out form.

    Handles plain integers ("42"), decimals ("3.14") and comma-grouped
    thousands ("1,000", "12,345.6").  Each number is treated as a single
    token, so a decimal point or thousands separator is never left behind
    as punctuation for the later sentence splitting to trip over.

    Args:
        text: Input text that may contain numerals.

    Returns:
        The text with every standalone number replaced by ``num2words`` output.
        Digits glued to letters (e.g. "abc123") are left untouched.
    """
    # First alternative: comma-grouped thousands with optional decimals.
    # Second alternative: a plain digit run with optional decimals.
    # A trailing sentence period ("in 2020.") is not consumed because the
    # fractional part requires at least one digit after the dot.
    pattern = r'\b\d{1,3}(?:,\d{3})+(?:\.\d+)?\b|\b\d+(?:\.\d+)?\b'

    def replace(match):
        digits = match.group().replace(',', '')
        number = float(digits) if '.' in digits else int(digits)
        return num2words(number)

    return re.sub(pattern, replace, text)

def preprocesstext(text):
    """Normalize copied text so it can be summarized and read aloud.

    Steps, in order:
      1. Strip surrounding whitespace and normalize CRLF / CR line endings
         to LF, so text copied on Windows is handled like any other.
      2. Re-join words hyphenated across a line break and turn the
         remaining line breaks into spaces.
      3. Spell out numbers via ``convert_numbers_to_text``.
      4. Turn brackets and double quotes into commas, and dashes,
         underscores and ellipses into spaces.

    Sentence punctuation (. ! ? ; :) is deliberately kept because
    ``generatebytext`` splits on it later.

    Args:
        text: Raw text, e.g. straight from the clipboard.

    Returns:
        The cleaned-up text.
    """
    text = text.strip()
    # Without this, '-\r\n' would not be recognised as a hyphenated line break
    # and stray '\r' characters would survive into the output.
    text = text.replace('\r\n', '\n').replace('\r', '\n')
    text = text.replace('-\n', '')
    text = text.replace('\n', ' ')

    text = convert_numbers_to_text(text)

    # One translation pass instead of a long chain of str.replace calls.
    punctuation_table = str.maketrans({
        **dict.fromkeys('()[]{}"“”', ','),
        **dict.fromkeys('-_—–…', ' '),
    })
    return text.translate(punctuation_table)

In [None]:
# Last preprocessed input that was read out; used to ignore repeated requests
# for the same text.
prev_originaltext = ''


def _split_into_utterances(text):
    """Split ``text`` on sentence punctuation, dropping blank fragments."""
    return [piece.strip() for piece in re.split(r"[.!?;:]", text) if piece.strip()]


def generatebytext(originaltext, mode):
    """Read ``originaltext`` aloud, optionally summarizing it first.

    Args:
        originaltext: Raw text to process (it is preprocessed here).
        mode: ``'stts'`` to summarize and then speak (texts of ten words or
            fewer are spoken as-is), ``'tts'`` to speak the text unchanged.
            Any other value is ignored.

    Side effects:
        Prints the list of fragments, reads each fragment out with
        ``readoutload`` and remembers the preprocessed input in the global
        ``prev_originaltext``.  A request whose preprocessed text equals the
        previous one is skipped, regardless of mode.
    """
    global prev_originaltext
    originaltext = preprocesstext(originaltext)

    if mode not in ('stts', 'tts'):
        return
    if originaltext == prev_originaltext:
        return

    if mode == 'stts' and len(originaltext.split()) > 10:
        spoken_text = summerize(originaltext)
    else:
        spoken_text = originaltext

    fragments = _split_into_utterances(spoken_text)
    print(fragments)
    for fragment in fragments:
        readoutload(fragment)

    # Always remember the *input*, not the summary: the repeat check above
    # compares against the input, so storing the summary would make the
    # check useless for summarized texts.
    prev_originaltext = originaltext


In [None]:
# Hotkey state shared by the keyboard callbacks: which of Alt, 't' and 's'
# are currently considered held down.  All start out released.
alt_pressed = t_pressed = s_pressed = False

def on_key_press(key):
    """Key-press callback: track progress through the Alt(+T)+S hotkey.

    * Either Alt key marks Alt as held.
    * 't' counts only while Alt is held.
    * 's' is recorded only while both Alt and 't' are held.

    Args:
        key: The key object delivered by the keyboard listener.  Special keys
            are identified by a ``name`` attribute, character keys by ``char``.
    """
    global alt_pressed, t_pressed, s_pressed

    if getattr(key, 'name', None) in ('alt_r', 'alt_l'):
        alt_pressed = True
        return

    pressed_char = getattr(key, 'char', None)
    if pressed_char == 't' and alt_pressed:
        t_pressed = True
    elif pressed_char == 's' and alt_pressed and t_pressed:
        s_pressed = True

def on_key_release(key):
    """Key-release callback: fire the hotkey action when 's' is released.

    * Releasing Alt or 't' just clears the corresponding flag.
    * Releasing 's' while Alt is held reads the clipboard and processes it:
      mode ``'stts'`` (summarize, then speak) if 't' is held too, otherwise
      mode ``'tts'`` (speak as-is).  All three flags are cleared afterwards.
    * Releasing 's' without Alt held does nothing.

    Args:
        key: The key object delivered by the keyboard listener.

    NOTE(review): the processing runs synchronously inside this callback, so
    the callback does not return until it has finished -- confirm this is
    acceptable for the keyboard listener in use.
    """
    global alt_pressed, t_pressed, s_pressed

    if getattr(key, 'name', None) in ('alt_r', 'alt_l'):
        alt_pressed = False
        return

    released_char = getattr(key, 'char', None)
    if released_char == 't':
        t_pressed = False
        return

    if released_char != 's' or not alt_pressed:
        return

    mode = 'stts' if t_pressed else 'tts'
    highlighted_text = pyperclip.paste()
    process_highlighted_text(highlighted_text, mode=mode)

    # The combo has been consumed; require a fresh key sequence next time.
    alt_pressed = False
    t_pressed = False
    s_pressed = False

def process_highlighted_text(text, mode):
    """Log the captured text and hand it to the read-aloud pipeline.

    Args:
        text: The text captured for processing.
        mode: Processing mode forwarded unchanged to ``generatebytext``.
    """
    print("Highlighted text:", text)
    generatebytext(text, mode)

# Run the global hotkey listener until the cell is interrupted.
# Using the listener as a context manager starts it on entry and stops it on
# exit, including when the kernel is interrupted.  Without the stop, an
# interrupted cell leaves the old listener alive, and re-running the cell
# would then react to every hotkey more than once.
with keyboard.Listener(on_press=on_key_press, on_release=on_key_release) as listener:
    # Block here so the notebook cell (or script) stays alive while listening.
    listener.join()