Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

[FEATURE] Added support for LocalAI Speech To Text configuration #2376

Merged
merged 2 commits into from
May 13, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
79 changes: 54 additions & 25 deletions packages/components/src/speechToText.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,40 +4,69 @@ import { type ClientOptions, OpenAIClient } from '@langchain/openai'
import { AssemblyAI } from 'assemblyai'
import { getFileFromStorage } from './storageUtils'

// Identifiers for the supported speech-to-text providers.
// NOTE: if implementing a new provider, the matching SpeechToTextType map in
// packages/ui/src/ui-component/extended/SpeechToText.jsx must be updated as well —
// the conversion logic below switches on these exact string values.
const SpeechToTextType = {
OPENAI_WHISPER: 'openAIWhisper',
ASSEMBLYAI_TRANSCRIBE: 'assemblyAiTranscribe',
LOCALAI_STT: 'localAISTT'
}

export const convertSpeechToText = async (upload: IFileUpload, speechToTextConfig: ICommonObject, options: ICommonObject) => {
if (speechToTextConfig) {
const credentialId = speechToTextConfig.credentialId as string
const credentialData = await getCredentialData(credentialId ?? '', options)
const audio_file = await getFileFromStorage(upload.name, options.chatflowid, options.chatId)

if (speechToTextConfig.name === 'openAIWhisper') {
const openAIClientOptions: ClientOptions = {
apiKey: credentialData.openAIApiKey
}
const openAIClient = new OpenAIClient(openAIClientOptions)
const transcription = await openAIClient.audio.transcriptions.create({
file: new File([new Blob([audio_file])], upload.name),
model: 'whisper-1',
language: speechToTextConfig?.language,
temperature: speechToTextConfig?.temperature ? parseFloat(speechToTextConfig.temperature) : undefined,
prompt: speechToTextConfig?.prompt
})
if (transcription?.text) {
return transcription.text
switch (speechToTextConfig.name) {
case SpeechToTextType.OPENAI_WHISPER: {
const openAIClientOptions: ClientOptions = {
apiKey: credentialData.openAIApiKey
}
const openAIClient = new OpenAIClient(openAIClientOptions)
const openAITranscription = await openAIClient.audio.transcriptions.create({
file: new File([new Blob([audio_file])], upload.name),
model: 'whisper-1',
language: speechToTextConfig?.language,
temperature: speechToTextConfig?.temperature ? parseFloat(speechToTextConfig.temperature) : undefined,
prompt: speechToTextConfig?.prompt
})
if (openAITranscription?.text) {
return openAITranscription.text
}
break
}
} else if (speechToTextConfig.name === 'assemblyAiTranscribe') {
const client = new AssemblyAI({
apiKey: credentialData.assemblyAIApiKey
})
case SpeechToTextType.ASSEMBLYAI_TRANSCRIBE: {
const assemblyAIClient = new AssemblyAI({
apiKey: credentialData.assemblyAIApiKey
})

const params = {
audio: audio_file,
speaker_labels: false
}
const params = {
audio: audio_file,
speaker_labels: false
}

const transcription = await client.transcripts.transcribe(params)
if (transcription?.text) {
return transcription.text
const assemblyAITranscription = await assemblyAIClient.transcripts.transcribe(params)
if (assemblyAITranscription?.text) {
return assemblyAITranscription.text
}
break
}
case SpeechToTextType.LOCALAI_STT: {
const LocalAIClientOptions: ClientOptions = {
apiKey: credentialData.localAIApiKey,
baseURL: speechToTextConfig?.baseUrl
}
const localAIClient = new OpenAIClient(LocalAIClientOptions)
const localAITranscription = await localAIClient.audio.transcriptions.create({
file: new File([new Blob([audio_file])], upload.name),
model: speechToTextConfig?.model || 'whisper-1',
language: speechToTextConfig?.language,
temperature: speechToTextConfig?.temperature ? parseFloat(speechToTextConfig.temperature) : undefined,
prompt: speechToTextConfig?.prompt
})
if (localAITranscription?.text) {
return localAITranscription.text
}
break
}
}
} else {
Expand Down
4 changes: 3 additions & 1 deletion packages/server/src/utils/buildChatflow.ts
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,8 @@ export const utilBuildChatflow = async (req: Request, socketIO?: Server, isInter
}

// Run Speech to Text conversion
if (upload.mime === 'audio/webm' || upload.mime === 'audio/mp4') {
if (upload.mime === 'audio/webm' || upload.mime === 'audio/mp4' || upload.mime === 'audio/ogg') {
logger.debug(`Attempting a speech to text conversion...`)
let speechToTextConfig: ICommonObject = {}
if (chatflow.speechToText) {
const speechToTextProviders = JSON.parse(chatflow.speechToText)
Expand All @@ -99,6 +100,7 @@ export const utilBuildChatflow = async (req: Request, socketIO?: Server, isInter
databaseEntities: databaseEntities
}
const speechToTextResult = await convertSpeechToText(upload, speechToTextConfig, options)
logger.debug(`Speech to text result: ${speechToTextResult}`)
if (speechToTextResult) {
incomingInput.question = speechToTextResult
}
Expand Down
Binary file added packages/ui/src/assets/images/localai.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
78 changes: 72 additions & 6 deletions packages/ui/src/ui-component/extended/SpeechToText.jsx
Original file line number Diff line number Diff line change
Expand Up @@ -16,17 +16,27 @@ import { StyledButton } from '@/ui-component/button/StyledButton'
import { Dropdown } from '@/ui-component/dropdown/Dropdown'
import openAISVG from '@/assets/images/openai.svg'
import assemblyAIPng from '@/assets/images/assemblyai.png'
import localAiPng from '@/assets/images/localai.png'

// store
import useNotifier from '@/utils/useNotifier'

// API
import chatflowsApi from '@/api/chatflows'

// Identifiers for the supported speech-to-text providers.
// If implementing a new provider, the matching SpeechToTextType map in
// packages/components/src/speechToText.ts must be updated as well, since the
// server-side conversion dispatches on these exact string values.
const SpeechToTextType = {
OPENAI_WHISPER: 'openAIWhisper',
ASSEMBLYAI_TRANSCRIBE: 'assemblyAiTranscribe',
LOCALAI_STT: 'localAISTT'
}

// NOTE: each object key in speechToTextProviders must exactly match its `name` property value.
const speechToTextProviders = {
openAIWhisper: {
[SpeechToTextType.OPENAI_WHISPER]: {
label: 'OpenAI Whisper',
name: 'openAIWhisper',
name: SpeechToTextType.OPENAI_WHISPER,
icon: openAISVG,
url: 'https://platform.openai.com/docs/guides/speech-to-text',
inputs: [
Expand Down Expand Up @@ -63,9 +73,9 @@ const speechToTextProviders = {
}
]
},
assemblyAiTranscribe: {
[SpeechToTextType.ASSEMBLYAI_TRANSCRIBE]: {
label: 'Assembly AI',
name: 'assemblyAiTranscribe',
name: SpeechToTextType.ASSEMBLYAI_TRANSCRIBE,
icon: assemblyAIPng,
url: 'https://www.assemblyai.com/',
inputs: [
Expand All @@ -76,6 +86,59 @@ const speechToTextProviders = {
credentialNames: ['assemblyAIApi']
}
]
},
[SpeechToTextType.LOCALAI_STT]: {
label: 'LocalAi STT',
name: SpeechToTextType.LOCALAI_STT,
icon: localAiPng,
url: 'https://localai.io/features/audio-to-text/',
inputs: [
{
label: 'Connect Credential',
name: 'credential',
type: 'credential',
credentialNames: ['localAIApi']
},
{
label: 'Base URL',
name: 'baseUrl',
type: 'string',
description: 'The base URL of the local AI server'
},
{
label: 'Language',
name: 'language',
type: 'string',
description:
'The language of the input audio. Supplying the input language in ISO-639-1 format will improve accuracy and latency.',
placeholder: 'en',
optional: true
},
{
label: 'Model',
name: 'model',
type: 'string',
description: `The STT model to load. Defaults to whisper-1 if left blank.`,
placeholder: 'whisper-1',
clates marked this conversation as resolved.
Show resolved Hide resolved
optional: true
},
{
label: 'Prompt',
name: 'prompt',
type: 'string',
rows: 4,
description: `An optional text to guide the model's style or continue a previous audio segment. The prompt should match the audio language.`,
optional: true
},
{
label: 'Temperature',
name: 'temperature',
type: 'number',
step: 0.1,
description: `The sampling temperature, between 0 and 1. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic.`,
optional: true
}
]
}
}

Expand Down Expand Up @@ -191,8 +254,11 @@ const SpeechToText = ({ dialogProps }) => {
<FormControl fullWidth>
<Select size='small' value={selectedProvider} onChange={handleProviderChange}>
<MenuItem value='none'>None</MenuItem>
<MenuItem value='openAIWhisper'>OpenAI Whisper</MenuItem>
<MenuItem value='assemblyAiTranscribe'>Assembly AI</MenuItem>
{Object.values(speechToTextProviders).map((provider) => (
<MenuItem key={provider.name} value={provider.name}>
{provider.label}
</MenuItem>
))}
</Select>
</FormControl>
</Box>
Expand Down
Loading