# In [1]:
import json
import os
import speech_recognition as sr # type: ignore
from pdfminer.high_level import extract_text # type: ignore
import python_pptx # type: ignore
from docx import Document # type: ignore
from PIL import Image # type: ignore
import pytesseract # type: ignore

def extract_data_from_pdf(file_path):
    """Return the text of the PDF at *file_path*.

    Delegates to pdfminer's ``extract_text`` and returns its result
    unchanged.
    """
    pdf_text = extract_text(file_path)
    return pdf_text

def extract_data_from_ppt(file_path):
    """Collect the text of every shape in the presentation at *file_path*.

    Slides and their shapes are visited in order; each shape that exposes
    a ``text`` attribute contributes its text followed by a newline.
    Returns the concatenated string (empty when nothing matched).

    NOTE(review): python-pptx is usually imported as ``pptx``; confirm
    that the ``python_pptx`` module name used here actually resolves.
    """
    presentation = python_pptx.Presentation(file_path)
    return "".join(
        shape.text + "\n"
        for slide in presentation.slides
        for shape in slide.shapes
        if hasattr(shape, "text")
    )

def extract_data_from_docx(file_path):
    """Return the paragraph text of the .docx file at *file_path*.

    Each entry of the document's ``paragraphs`` contributes its text
    followed by a newline, in document order.
    """
    paragraphs = Document(file_path).paragraphs
    return "".join(paragraph.text + "\n" for paragraph in paragraphs)

def extract_text_from_image(file_path):
    """Run OCR on the image at *file_path* and return the recognized text.

    The image is opened in a ``with`` block so its underlying file handle
    is released as soon as OCR finishes, including when
    ``pytesseract.image_to_string`` raises.
    """
    with Image.open(file_path) as image:
        return pytesseract.image_to_string(image)

def transcribe_audio(file_path):
    """Transcribe the audio file at *file_path* and return the text.

    The whole file is recorded through ``sr.AudioFile`` and passed to
    ``Recognizer.recognize_google``. Exceptions raised by either step
    propagate to the caller.
    """
    speech_recognizer = sr.Recognizer()
    with sr.AudioFile(file_path) as audio_source:
        recording = speech_recognizer.record(audio_source)
        return speech_recognizer.recognize_google(recording)

def extract_data(file_path):
    """Extract text from *file_path* with a handler chosen by extension.

    The extension is compared case-insensitively.

    Returns:
        The extracted text, or the string ``"Unsupported file format."``
        when no handler exists for the extension.
    """
    extension = os.path.splitext(file_path)[1].lower()

    if extension == '.pdf':
        return extract_data_from_pdf(file_path)
    if extension == '.pptx':
        return extract_data_from_ppt(file_path)
    if extension == '.docx':
        return extract_data_from_docx(file_path)
    if extension in ('.jpg', '.jpeg', '.png'):
        return extract_text_from_image(file_path)
    # speech_recognition's AudioFile reads only WAV, AIFF and FLAC. MP3
    # input would raise inside transcribe_audio, so it is reported as
    # unsupported here instead of being dispatched.
    if extension in ('.wav', '.aiff', '.aif', '.flac'):
        return transcribe_audio(file_path)
    return "Unsupported file format."
def generate_json(file_path):
    """Return a JSON string describing the extraction result for *file_path*.

    The payload is ``{"data": <text>}`` on success, or
    ``{"error": "Unsupported file format."}`` when ``extract_data``
    returns that message.
    """
    extracted = extract_data(file_path)
    # The unsupported-format message doubles as the failure marker.
    key = "error" if extracted == "Unsupported file format." else "data"
    return json.dumps({key: extracted})

# Script entry point: prompt for a file path, extract its text, and print
# the resulting JSON document to stdout.
if __name__ == "__main__":
    # The path is used exactly as typed; no stripping or validation is done.
    file_path = input("Enter the path of the file: ")
    json_output = generate_json(file_path)
    print(json_output)


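# Notebook output: ModuleNotFoundError: No module named 'speech_recognition'
# (the module is provided by the SpeechRecognition package, which must be installed before running this cell)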