diff --git a/packages/components/src/speechToText.ts b/packages/components/src/speechToText.ts
index 732b574c88..1b95291308 100644
--- a/packages/components/src/speechToText.ts
+++ b/packages/components/src/speechToText.ts
@@ -4,40 +4,69 @@ import { type ClientOptions, OpenAIClient } from '@langchain/openai'
 import { AssemblyAI } from 'assemblyai'
 import { getFileFromStorage } from './storageUtils'
 
+const SpeechToTextType = {
+    OPENAI_WHISPER: 'openAIWhisper',
+    ASSEMBLYAI_TRANSCRIBE: 'assemblyAiTranscribe',
+    LOCALAI_STT: 'localAISTT'
+}
+
 export const convertSpeechToText = async (upload: IFileUpload, speechToTextConfig: ICommonObject, options: ICommonObject) => {
     if (speechToTextConfig) {
         const credentialId = speechToTextConfig.credentialId as string
         const credentialData = await getCredentialData(credentialId ?? '', options)
         const audio_file = await getFileFromStorage(upload.name, options.chatflowid, options.chatId)
 
-        if (speechToTextConfig.name === 'openAIWhisper') {
-            const openAIClientOptions: ClientOptions = {
-                apiKey: credentialData.openAIApiKey
-            }
-            const openAIClient = new OpenAIClient(openAIClientOptions)
-            const transcription = await openAIClient.audio.transcriptions.create({
-                file: new File([new Blob([audio_file])], upload.name),
-                model: 'whisper-1',
-                language: speechToTextConfig?.language,
-                temperature: speechToTextConfig?.temperature ? parseFloat(speechToTextConfig.temperature) : undefined,
-                prompt: speechToTextConfig?.prompt
-            })
-            if (transcription?.text) {
-                return transcription.text
+        switch (speechToTextConfig.name) {
+            case SpeechToTextType.OPENAI_WHISPER: {
+                const openAIClientOptions: ClientOptions = {
+                    apiKey: credentialData.openAIApiKey
+                }
+                const openAIClient = new OpenAIClient(openAIClientOptions)
+                const openAITranscription = await openAIClient.audio.transcriptions.create({
+                    file: new File([new Blob([audio_file])], upload.name),
+                    model: 'whisper-1',
+                    language: speechToTextConfig?.language,
+                    temperature: speechToTextConfig?.temperature ? parseFloat(speechToTextConfig.temperature) : undefined,
+                    prompt: speechToTextConfig?.prompt
+                })
+                if (openAITranscription?.text) {
+                    return openAITranscription.text
+                }
+                break
             }
-        } else if (speechToTextConfig.name === 'assemblyAiTranscribe') {
-            const client = new AssemblyAI({
-                apiKey: credentialData.assemblyAIApiKey
-            })
+            case SpeechToTextType.ASSEMBLYAI_TRANSCRIBE: {
+                const assemblyAIClient = new AssemblyAI({
+                    apiKey: credentialData.assemblyAIApiKey
+                })
 
-            const params = {
-                audio: audio_file,
-                speaker_labels: false
-            }
+                const params = {
+                    audio: audio_file,
+                    speaker_labels: false
+                }
 
-            const transcription = await client.transcripts.transcribe(params)
-            if (transcription?.text) {
-                return transcription.text
+                const assemblyAITranscription = await assemblyAIClient.transcripts.transcribe(params)
+                if (assemblyAITranscription?.text) {
+                    return assemblyAITranscription.text
+                }
+                break
+            }
+            case SpeechToTextType.LOCALAI_STT: {
+                const LocalAIClientOptions: ClientOptions = {
+                    apiKey: credentialData.localAIApiKey,
+                    baseURL: speechToTextConfig?.baseUrl
+                }
+                const localAIClient = new OpenAIClient(LocalAIClientOptions)
+                const localAITranscription = await localAIClient.audio.transcriptions.create({
+                    file: new File([new Blob([audio_file])], upload.name),
+                    model: speechToTextConfig?.model || 'whisper-1',
+                    language: speechToTextConfig?.language,
+                    temperature: speechToTextConfig?.temperature ? parseFloat(speechToTextConfig.temperature) : undefined,
+                    prompt: speechToTextConfig?.prompt
+                })
+                if (localAITranscription?.text) {
+                    return localAITranscription.text
+                }
+                break
             }
         }
     } else {
diff --git a/packages/server/src/utils/buildChatflow.ts b/packages/server/src/utils/buildChatflow.ts
index 3ffefff326..df946f98b9 100644
--- a/packages/server/src/utils/buildChatflow.ts
+++ b/packages/server/src/utils/buildChatflow.ts
@@ -78,7 +78,8 @@ export const utilBuildChatflow = async (req: Request, socketIO?: Server, isInter
                 }
 
                 // Run Speech to Text conversion
-                if (upload.mime === 'audio/webm' || upload.mime === 'audio/mp4') {
+                if (upload.mime === 'audio/webm' || upload.mime === 'audio/mp4' || upload.mime === 'audio/ogg') {
+                    logger.debug(`Attempting a speech to text conversion...`)
                     let speechToTextConfig: ICommonObject = {}
                     if (chatflow.speechToText) {
                         const speechToTextProviders = JSON.parse(chatflow.speechToText)
@@ -99,6 +100,7 @@ export const utilBuildChatflow = async (req: Request, socketIO?: Server, isInter
                             databaseEntities: databaseEntities
                         }
                         const speechToTextResult = await convertSpeechToText(upload, speechToTextConfig, options)
+                        logger.debug(`Speech to text result: ${speechToTextResult}`)
                         if (speechToTextResult) {
                             incomingInput.question = speechToTextResult
                         }
diff --git a/packages/ui/src/assets/images/localai.png b/packages/ui/src/assets/images/localai.png
new file mode 100644
index 0000000000..321403973d
Binary files /dev/null and b/packages/ui/src/assets/images/localai.png differ
diff --git a/packages/ui/src/ui-component/extended/SpeechToText.jsx b/packages/ui/src/ui-component/extended/SpeechToText.jsx
index 3c8fb674e0..ee376ddc5e 100644
--- a/packages/ui/src/ui-component/extended/SpeechToText.jsx
+++ b/packages/ui/src/ui-component/extended/SpeechToText.jsx
@@ -16,6 +16,7 @@ import { StyledButton } from '@/ui-component/button/StyledButton'
 import { Dropdown } from '@/ui-component/dropdown/Dropdown'
 import openAISVG from '@/assets/images/openai.svg'
 import assemblyAIPng from '@/assets/images/assemblyai.png'
+import localAiPng from '@/assets/images/localai.png'
 
 // store
 import useNotifier from '@/utils/useNotifier'
@@ -23,10 +24,19 @@ import useNotifier from '@/utils/useNotifier'
 // API
 import chatflowsApi from '@/api/chatflows'
 
+// If implementing a new provider, this must be updated in
+// components/src/speechToText.ts as well
+const SpeechToTextType = {
+    OPENAI_WHISPER: 'openAIWhisper',
+    ASSEMBLYAI_TRANSCRIBE: 'assemblyAiTranscribe',
+    LOCALAI_STT: 'localAISTT'
+}
+
+// Weird quirk - the key must match the name property value.
 const speechToTextProviders = {
-    openAIWhisper: {
+    [SpeechToTextType.OPENAI_WHISPER]: {
         label: 'OpenAI Whisper',
-        name: 'openAIWhisper',
+        name: SpeechToTextType.OPENAI_WHISPER,
         icon: openAISVG,
         url: 'https://platform.openai.com/docs/guides/speech-to-text',
         inputs: [
@@ -63,9 +73,9 @@ const speechToTextProviders = {
             }
         ]
     },
-    assemblyAiTranscribe: {
+    [SpeechToTextType.ASSEMBLYAI_TRANSCRIBE]: {
         label: 'Assembly AI',
-        name: 'assemblyAiTranscribe',
+        name: SpeechToTextType.ASSEMBLYAI_TRANSCRIBE,
         icon: assemblyAIPng,
         url: 'https://www.assemblyai.com/',
         inputs: [
@@ -76,6 +86,59 @@ const speechToTextProviders = {
                 credentialNames: ['assemblyAIApi']
             }
         ]
+    },
+    [SpeechToTextType.LOCALAI_STT]: {
+        label: 'LocalAi STT',
+        name: SpeechToTextType.LOCALAI_STT,
+        icon: localAiPng,
+        url: 'https://localai.io/features/audio-to-text/',
+        inputs: [
+            {
+                label: 'Connect Credential',
+                name: 'credential',
+                type: 'credential',
+                credentialNames: ['localAIApi']
+            },
+            {
+                label: 'Base URL',
+                name: 'baseUrl',
+                type: 'string',
+                description: 'The base URL of the local AI server'
+            },
+            {
+                label: 'Language',
+                name: 'language',
+                type: 'string',
+                description:
+                    'The language of the input audio. Supplying the input language in ISO-639-1 format will improve accuracy and latency.',
+                placeholder: 'en',
+                optional: true
+            },
+            {
+                label: 'Model',
+                name: 'model',
+                type: 'string',
+                description: `The STT model to load. Defaults to whisper-1 if left blank.`,
+                placeholder: 'whisper-1',
+                optional: true
+            },
+            {
+                label: 'Prompt',
+                name: 'prompt',
+                type: 'string',
+                rows: 4,
+                description: `An optional text to guide the model's style or continue a previous audio segment. The prompt should match the audio language.`,
+                optional: true
+            },
+            {
+                label: 'Temperature',
+                name: 'temperature',
+                type: 'number',
+                step: 0.1,
+                description: `The sampling temperature, between 0 and 1. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic.`,
+                optional: true
+            }
+        ]
     }
 }
 
@@ -191,8 +254,11 @@ const SpeechToText = ({ dialogProps }) => {
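
Reviewer note: below is a minimal sketch (not part of the patch) of how the new localAISTT provider is driven through convertSpeechToText once a chatflow has it configured. The upload, chatflow/chat IDs, credential ID, base URL, and the 'flowise-components' import path are hypothetical placeholders; in the running server, utilBuildChatflow assembles all of this from the incoming chat request and the chatflow's stored speechToText JSON, and getCredentialData resolves the credential into credentialData.localAIApiKey.

import { convertSpeechToText } from 'flowise-components' // assumed entry point for packages/components

const run = async () => {
    // Hypothetical upload — the server populates this from the chat request
    const upload = { name: 'recording.ogg', mime: 'audio/ogg', type: 'audio', data: '' } as any

    // Mirrors what the SpeechToText.jsx dialog saves for the localAISTT provider
    const speechToTextConfig = {
        name: 'localAISTT', // must equal SpeechToTextType.LOCALAI_STT in both copies of the map
        credentialId: 'localai-credential-id', // hypothetical; resolves to credentialData.localAIApiKey
        baseUrl: 'http://localhost:8080/v1', // assumed LocalAI OpenAI-compatible endpoint
        model: 'whisper-1', // optional; the switch falls back to 'whisper-1' when omitted
        language: 'en'
    }

    // Hypothetical IDs used by getFileFromStorage to locate the stored audio
    const options = { chatflowid: 'chatflow-id', chatId: 'chat-id' }

    const text = await convertSpeechToText(upload, speechToTextConfig, options)
    console.log(text) // the transcription, or undefined if the provider returned no text
}

run()

Design note: the SpeechToTextType map is deliberately duplicated in components/src/speechToText.ts and SpeechToText.jsx because the two packages do not share the constant — the patch's own comment calls this out, so any new provider key must be added in both places.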