diff --git a/README.md b/README.md index 8925f46..b5ee15d 100644 --- a/README.md +++ b/README.md @@ -1,63 +1,26 @@
-

Demo Site | Docs | Roadmap

+

Demo Site | Documentation | Roadmap

## What is Whisper Turbo? + Whisper Turbo is a fast, **cross-platform** Whisper implementation, designed to run entirely client-side in your browser/electron app. -With Whisper Turbo, you can add transcription to any app in minutes. +Check out [Getting Started](https://ratchet.sh/whisper-turbo) for more. + +## Demo -Check out [Getting Started]() for more. +https://github.com/FL33TW00D/whisper-turbo/assets/45471420/1e19aa1f-bb56-4b5c-bc00-e79aabb4d1e0 ## Supported Platforms WebGPU is only officially supported on Chromium based browsers running on Windows & MacOS. -For Linux support, check [here](https://github.com/gpuweb/gpuweb/wiki/Implementation-Status). - -## Getting Started - -Install whisper-turbo: -```bash -npm install whisper-turbo -``` - -```typescript -const session = useRef(null); - -const loadModel = async () => { - //The session manager handles constructing the inference session. - const manager = new SessionManager(); - const loadResult = await manager.loadModel( - AvailableModels.WHISPER_TINY, - () => { console.log("loaded!") }, - (progress: number) => { console.log("Loading: ", progress) } - ); - if (loadResult.isErr) { - console.log("Failed to load!"); - } else { - session.current = loadResult.value; - } -}; - -const runSession = async () => { - await session.current.transcribe( - your_uint8_array, - true/false, - (s) => { - console.log("Segment!") - } - ); -}; -``` - -## Docs - -Coming soon +For more information, check out [Supported Platforms](https://ratchet.sh/whisper-turbo/platforms) ## Want to get involved? -- Are you a GPU wizard? -- Do you know what a HRTB is in Rust? -- Do you know what is going on [here](https://github.com/RuyiLi/cursed-typescript/blob/master/random/game-of-life.ts)? -- Reach out: chris@fleetwood.dev +- Are you a GPU wizard? +- Do you know what a HRTB is in Rust? +- Do you know what is going on [here](https://github.com/RuyiLi/cursed-typescript/blob/master/random/game-of-life.ts)? +- Reach out: chris@fleetwood.dev diff --git a/package.json b/package.json index 7aef503..1ba05a5 100644 --- a/package.json +++ b/package.json @@ -33,7 +33,7 @@ "p-retry": "^5.1.2", "true-myth": "^6.2.0", "uuid": "^9.0.1", - "whisper-webgpu": "0.8.0" + "whisper-webgpu": "0.9.0" }, "files": [ "dist/**/*" diff --git a/playground/package.json b/playground/package.json index c5a6018..6bc31e0 100644 --- a/playground/package.json +++ b/playground/package.json @@ -16,7 +16,7 @@ "react-hot-toast": "^2.4.1", "react-responsive-modal": "^6.4.2", "true-myth": "^7.1.0", - "whisper-turbo": "0.9.0" + "whisper-turbo": "file:.." }, "devDependencies": { "@tailwindcss/typography": "^0.5.10", diff --git a/playground/pnpm-lock.yaml b/playground/pnpm-lock.yaml index d7009a9..f87516f 100644 --- a/playground/pnpm-lock.yaml +++ b/playground/pnpm-lock.yaml @@ -30,8 +30,8 @@ dependencies: specifier: ^7.1.0 version: 7.1.0 whisper-turbo: - specifier: 0.9.0 - version: 0.9.0 + specifier: file:.. + version: file:.. devDependencies: '@tailwindcss/typography': @@ -2778,20 +2778,8 @@ packages: dependencies: isexe: 2.0.0 - /whisper-turbo@0.9.0: - resolution: {integrity: sha512-K/tjxD1bhgAE+MjO/u+M14kcprfbX1QNiRtWs8J2fXfb3q5QEYPBM1OCPa7yofZcQ4C8lP/k0vGRdl8OXohKOg==} - dependencies: - comlink: 4.3.1 - fix-webm-duration: 1.0.5 - idb: 7.1.1 - p-retry: 5.1.2 - true-myth: 6.2.0 - uuid: 9.0.1 - whisper-webgpu: 0.8.0 - dev: false - - /whisper-webgpu@0.8.0: - resolution: {integrity: sha512-E0JbPO09f54aMWs1NfzGLIotG/yX0sSHSFyBs6tZyvndMrXo8yeDhGf77SBB6c0KmdjuA45LXVzO7Vo8ERwsug==} + /whisper-webgpu@0.9.0: + resolution: {integrity: sha512-hMQ76a1TwHg4uiXuEPq/w9MFvDp4Ib22OdtKfqTybW6uLtqlJTQJgcGE5vgoVAeAS8MNzYCd/H7AyoVVdl09mA==} dev: false /wrappy@1.0.2: @@ -2808,3 +2796,16 @@ packages: /yocto-queue@0.1.0: resolution: {integrity: sha512-rVksvsnNCdJ/ohGc6xgPwyN8eheCxsiLM8mxuE/t/mOVqJewPuO1miLpTHQiRgTKCLexL4MeAFVagts7HmNZ2Q==} engines: {node: '>=10'} + + file:..: + resolution: {directory: .., type: directory} + name: whisper-turbo + dependencies: + comlink: 4.3.1 + fix-webm-duration: 1.0.5 + idb: 7.1.1 + p-retry: 5.1.2 + true-myth: 6.2.0 + uuid: 9.0.1 + whisper-webgpu: 0.9.0 + dev: false diff --git a/playground/src/components/configModal.tsx b/playground/src/components/configModal.tsx new file mode 100644 index 0000000..a71fb3f --- /dev/null +++ b/playground/src/components/configModal.tsx @@ -0,0 +1,90 @@ +import React, { useState, useEffect } from "react"; +import Modal from "react-responsive-modal"; +import { Task } from "whisper-webgpu"; +import LanguageDropdown from "./languageDropdown"; +import SuppressComponent from "./suppressSelector"; +import TaskComponent from "./taskSelector"; + +interface ConfigModalProps { + isModalOpen: boolean; + setIsModalOpen: React.Dispatch>; + configOptions: ConfigOptions; + setConfigOptions: React.Dispatch>; +} + +export interface ConfigOptions { + language: string | null; + task: Task; + suppress_non_speech: boolean; +} + +const ConfigModal = (props: ConfigModalProps) => { + useEffect(() => { + //@ts-ignore + if (!navigator.gpu) { + props.setIsModalOpen(true); + return; + } + }, []); + + const handleModalClose = () => { + props.setIsModalOpen(false); + }; + + const closeIcon = ( + + + + + + + + + + + + + + + + + + ); + + return ( + <> + +
+
+ + + +
+
+
+ ) + + ); +}; + +export default ConfigModal; diff --git a/playground/src/components/controlPanel.tsx b/playground/src/components/controlPanel.tsx index 359aed0..721e74c 100644 --- a/playground/src/components/controlPanel.tsx +++ b/playground/src/components/controlPanel.tsx @@ -3,27 +3,26 @@ import { AvailableModels, InferenceSession, SessionManager, + Segment, + DecodingOptionsBuilder, + initialize } from "whisper-turbo"; import toast from "react-hot-toast"; import { humanFileSize } from "../util"; import ProgressBar from "./progressBar"; import ModelSelector from "./modelSelector"; import MicButton, { AudioMetadata } from "./micButton"; +import GearIcon from "./gearIcon"; +import ConfigModal, { ConfigOptions } from "./configModal"; +import { Task } from "whisper-webgpu"; -export interface TSSegment { - text: string; - start: number; - stop: number; - last: boolean; -} - -export interface TSTranscript { - segments: Array; +export interface Transcript { + segments: Array; } interface ControlPanelProps { - transcript: TSTranscript; - setTranscript: React.Dispatch>; + transcript: Transcript; + setTranscript: React.Dispatch>; setDownloadAvailable: React.Dispatch>; } @@ -44,6 +43,12 @@ const ControlPanel = (props: ControlPanelProps) => { const [loaded, setLoaded] = useState(false); const [progress, setProgress] = useState(0); const [transcribing, setTranscribing] = useState(false); + const [isConfigOpen, setIsConfigOpen] = useState(false); + const [configOptions, setConfigOptions] = useState({ + language: null, + task: Task.Transcribe, + suppress_non_speech: true, + }); useEffect(() => { if (loadedModel && selectedModel != loadedModel && !transcribing) { @@ -108,23 +113,38 @@ const ControlPanel = (props: ControlPanelProps) => { toast.error("No audio file loaded"); return; } - props.setTranscript((transcript: TSTranscript) => { + props.setTranscript((transcript: Transcript) => { return { ...transcript, segments: [], }; }); setTranscribing(true); + await initialize(); + let builder = new DecodingOptionsBuilder(); + if (configOptions.language) + builder = builder.setLanguage(configOptions.language); + if (configOptions.suppress_non_speech) + builder = builder.setSuppressTokens(Int32Array.from([-1])); + else + builder = builder.setSuppressTokens(Int32Array.from([])); + + builder = builder.setTask(configOptions.task); + const options = builder.build(); + console.log("Options: ", options); + await session.current.transcribe( audioData!, audioMetadata!.fromMic, - (s: any) => { + options, + (s: Segment) => { + console.log(s); if (s.last) { setTranscribing(false); props.setDownloadAvailable(true); return; } - props.setTranscript((transcript: TSTranscript) => { + props.setTranscript((transcript: Transcript) => { return { ...transcript, segments: [...transcript.segments, s], @@ -135,126 +155,143 @@ const ControlPanel = (props: ControlPanelProps) => { }; return ( -
-
- - window.open( - "https://github.com/FL33TW00D/whisper-turbo", - "_blank" - ) - } - /> -
-
- - - {selectedModel != loadedModel && progress == 0 && ( -
- +
+ )} +
+
+
+ + +
- )} -
-
-
- - -
- -
- {blobUrl && ( -
- -
- )} -
- -
- +
+ +
+ + + +
+
+
+

+ Built by{" "} + + @fleetwood + +

-
-

- Built by{" "} - - @fleetwood - -

-
- + ); }; diff --git a/playground/src/components/gearIcon.tsx b/playground/src/components/gearIcon.tsx new file mode 100644 index 0000000..db17e06 --- /dev/null +++ b/playground/src/components/gearIcon.tsx @@ -0,0 +1,141 @@ +const GearIcon = () => { + return ( + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ); +}; + +export default GearIcon; diff --git a/playground/src/components/languageDropdown.tsx b/playground/src/components/languageDropdown.tsx new file mode 100644 index 0000000..ee3467f --- /dev/null +++ b/playground/src/components/languageDropdown.tsx @@ -0,0 +1,189 @@ +import React, { useState } from "react"; +import { ConfigOptions } from "./configModal"; + +const AvailableLanguages = { + en: "English", + zh: "Chinese", + de: "German", + es: "Spanish", + ru: "Russian", + ko: "Korean", + fr: "French", + ja: "Japanese", + pt: "Portuguese", + tr: "Turkish", + pl: "Polish", + ca: "Catalan", + nl: "Dutch", + ar: "Arabic", + sv: "Swedish", + it: "Italian", + id: "Indonesian", + hi: "Hindi", + fi: "Finnish", + vi: "Vietnamese", + he: "Hebrew", + uk: "Ukrainian", + el: "Greek", + ms: "Malay", + cs: "Czech", + ro: "Romanian", + da: "Danish", + hu: "Hungarian", + ta: "Tamil", + no: "Norwegian", + th: "Thai", + ur: "Urdu", + hr: "Croatian", + bg: "Bulgarian", + lt: "Lithuanian", + la: "Latin", + mi: "Maori", + ml: "Malayalam", + cy: "Welsh", + sk: "Slovak", + te: "Telugu", + fa: "Persian", + lv: "Latvian", + bn: "Bengali", + sr: "Serbian", + az: "Azerbaijani", + sl: "Slovenian", + kn: "Kannada", + et: "Estonian", + mk: "Macedonian", + br: "Breton", + eu: "Basque", + is: "Icelandic", + hy: "Armenian", + ne: "Nepali", + mn: "Mongolian", + bs: "Bosnian", + kk: "Kazakh", + sq: "Albanian", + sw: "Swahili", + gl: "Galician", + mr: "Marathi", + pa: "Punjabi", + si: "Sinhala", + km: "Khmer", + sn: "Shona", + yo: "Yoruba", + so: "Somali", + af: "Afrikaans", + oc: "Occitan", + ka: "Georgian", + be: "Belarusian", + tg: "Tajik", + sd: "Sindhi", + gu: "Gujarati", + am: "Amharic", + yi: "Yiddish", + lo: "Lao", + uz: "Uzbek", + fo: "Faroese", + ht: "Haitian creole", + ps: "Pashto", + tk: "Turkmen", + nn: "Nynorsk", + mt: "Maltese", + sa: "Sanskrit", + lb: "Luxembourgish", + my: "Myanmar", + bo: "Tibetan", + tl: "Tagalog", + mg: "Malagasy", + as: "Assamese", + tt: "Tatar", + haw: "Hawaiian", + ln: "Lingala", + ha: "Hausa", + ba: "Bashkir", + jw: "Javanese", + su: "Sundanese", + yue: "Cantonese", +}; + +interface LanguageDropdownProps { + configOptions: ConfigOptions; + setConfigOptions: React.Dispatch>; +} + +const LanguageDropdown = (props: LanguageDropdownProps) => { + const [open, setOpen] = useState(false); + const [selectedLanguage, setSelectedLanguage] = useState( + props.configOptions.language + ); + + const toggleOpen = () => setOpen((prev) => !prev); + + const selectLanguage = (lang: string) => { + props.setConfigOptions((prev: ConfigOptions) => ({ + ...prev, + language: lang, + })); + setSelectedLanguage(lang); + setOpen(false); + }; + + return ( +
+
+ + +
+ + {open && ( +
+
+ {Object.entries(AvailableLanguages).map( + ([lang, name]) => ( + selectLanguage(lang)} + > + {name} + + ) + )} +
+
+ )} +
+ ); +}; + +export default LanguageDropdown; diff --git a/playground/src/components/layout.tsx b/playground/src/components/layout.tsx index 1363817..df170f9 100644 --- a/playground/src/components/layout.tsx +++ b/playground/src/components/layout.tsx @@ -2,7 +2,7 @@ import Head from "next/head"; import { Toaster } from "react-hot-toast"; import React from "react"; -export const siteTitle = "AI Playground"; +export const siteTitle = "Whisper Turbo"; type LayoutProps = { children: React.ReactNode; diff --git a/playground/src/components/modal.tsx b/playground/src/components/modal.tsx index f77a3ed..49b23d4 100644 --- a/playground/src/components/modal.tsx +++ b/playground/src/components/modal.tsx @@ -5,8 +5,6 @@ const WebGPUModal = () => { const [hasWebGPU, setHasWebGPU] = useState(false); const [isModalOpen, setIsModalOpen] = useState(true); - const myRef = React.useRef(null); - useEffect(() => { //@ts-ignore if (!navigator.gpu) { diff --git a/playground/src/components/suppressSelector.tsx b/playground/src/components/suppressSelector.tsx new file mode 100644 index 0000000..47a643d --- /dev/null +++ b/playground/src/components/suppressSelector.tsx @@ -0,0 +1,48 @@ +import React, { useState } from "react"; +import { ConfigOptions } from "./configModal"; + +interface SuppressComponentProps { + configOptions: ConfigOptions; + setConfigOptions: React.Dispatch>; +} + +const SuppressComponent = (props: SuppressComponentProps) => { + const [checkedState, setCheckedState] = useState({ + suppress_non_speech: props.configOptions.suppress_non_speech + }); + + const handleOnChange = (event: React.ChangeEvent) => { + setCheckedState({ + ...checkedState, + [event.target.name]: event.target.checked + }); + + props.setConfigOptions({ + ...props.configOptions, + suppress_non_speech: event.target.checked + }); + }; + + return ( +
+ +
+
+ + +
+
+
+ ); +}; + +export default SuppressComponent; diff --git a/playground/src/components/taskSelector.tsx b/playground/src/components/taskSelector.tsx new file mode 100644 index 0000000..8cb9366 --- /dev/null +++ b/playground/src/components/taskSelector.tsx @@ -0,0 +1,78 @@ +import React, { useState } from "react"; +import { ConfigOptions } from "./configModal"; +import { Task } from "whisper-webgpu"; + +interface TaskComponentProps { + configOptions: ConfigOptions; + setConfigOptions: React.Dispatch>; +} + +const TaskComponent = (props: TaskComponentProps) => { + let state = { + translate: props.configOptions.task === Task.Translate, + transcribe: props.configOptions.task === Task.Transcribe, + }; + + const [checkedState, setCheckedState] = useState(state); + + const handleOnChange = (event: React.ChangeEvent) => { + setCheckedState({ + ...checkedState, + [event.target.name]: event.target.checked, + }); + if (event.target.name === "translate") + setCheckedState({ + translate: event.target.checked, + transcribe: !event.target.checked, + }); + if (event.target.name === "transcribe") + setCheckedState({ + translate: !event.target.checked, + transcribe: event.target.checked, + }); + props.setConfigOptions((prev: ConfigOptions) => ({ + ...prev, + task: + event.target.name === "translate" + ? Task.Translate + : Task.Transcribe, + })); + }; + + return ( +
+ +
+
+ + +
+ +
+ + +
+
+
+ ); +}; + +export default TaskComponent; diff --git a/playground/src/pages/index.tsx b/playground/src/pages/index.tsx index be7f354..b2ce1c7 100644 --- a/playground/src/pages/index.tsx +++ b/playground/src/pages/index.tsx @@ -2,16 +2,14 @@ import type { NextPage } from "next"; import { VT323 } from "@next/font/google"; import { useState } from "react"; import Layout from "../components/layout"; -import ControlPanel, { - TSSegment, - TSTranscript, -} from "../components/controlPanel"; import WebGPUModal from "../components/modal"; +import { Segment } from "whisper-turbo"; +import ControlPanel, { Transcript } from "../components/controlPanel"; const vt = VT323({ weight: "400", display: "swap" }); const Home: NextPage = () => { - const [transcript, setTranscript] = useState({ + const [transcript, setTranscript] = useState({ segments: [], }); const [downloadAvailable, setDownloadAvailable] = useState(false); @@ -43,7 +41,7 @@ const Home: NextPage = () => {
{transcript && transcript.segments.map( - (segment: TSSegment) => { + (segment: Segment) => { return (
>; + raw_audio: boolean, + options: any + ): Promise>; public async transcribe( audio: Uint8Array, raw_audio: boolean, + options: any, callback: (decoded: Segment) => void ): Promise>; - public async transcribe( + async transcribe( audio: Uint8Array, raw_audio: boolean, + options: any, callback?: (decoded: Segment) => void - ): Promise> { + ): Promise> { if (this.session == null) { return Result.err(new Error("Session not initialized")); } if (callback) { if (this.session instanceof Session) { - return await this.session.stream(audio, raw_audio, callback); + return await this.session.stream( + audio, + raw_audio, + options, + callback + ); } else { return await this.session!.stream( audio, raw_audio, + options, Comlink.proxy(callback) ); } } else { - return await this.session!.run(audio); + return await this.session!.run(audio, options); } } diff --git a/src/session.worker.ts b/src/session.worker.ts index 9425bd5..0a37946 100644 --- a/src/session.worker.ts +++ b/src/session.worker.ts @@ -62,8 +62,9 @@ export class Session { } public async run( - audio: Uint8Array - ): Promise> { + audio: Uint8Array, + options: any + ): Promise> { if (!this.whisperSession) { return Result.err( new Error( @@ -72,12 +73,13 @@ export class Session { ); } - return Result.ok(await this.whisperSession.run(audio)); + return Result.ok(await this.whisperSession.run(audio, options)); } public async stream( audio: Uint8Array, raw_audio: boolean, + options: any, callback: (decoded: whisper.Segment) => void ): Promise> { if (!this.whisperSession) { @@ -89,7 +91,12 @@ export class Session { } return Result.ok( - await this.whisperSession.stream(audio, raw_audio, callback) + await this.whisperSession.stream( + audio, + raw_audio, + options, + callback + ) ); } }