In [29]:
from pydub import AudioSegment
import speech_recognition as sr
import librosa
import numpy as np

from decimal import Decimal, ROUND_HALF_UP
import aubio
from scipy.fftpack import fft, ifft
from scipy.signal import find_peaks

!pip install aubio

### Library 정리
* AudioSegment는 Pydub library (음성파일조작)
* AudioFile은 Speech_recognition library (for 음성인식)

### 사람 음성의 평균 Pitch
* 남성: 대략 85 Hz에서 180 Hz
* 여성: 대략 165 Hz에서 255 Hz
* 어린이: 대략 210 Hz에서 300 Hz 

### 일반적인 대화 소리 Volume
* 60 ~ 70 dB

In [43]:
audio_file_path = './drama.wav'
interval = 5000 # 10 seconds

recognizer = sr.Recognizer()
audio = AudioSegment.from_file(audio_file_path)
audio_duration = audio.duration_seconds
print("Detailed Information...")
print(f"Duration: {audio_duration} seconds")
print(f"Channels: {audio.channels}")
print(f"Frame rate: {audio.frame_rate}\n\n")

start_time = 0
while start_time < len(audio):
    
    end_time = min((start_time + interval), len(audio))
    print(f"[start_time:{start_time}, end_time: {end_time}, total_time:{len(audio)}]")
    
    segment = audio[start_time:end_time]
    temp_file_path = "./temp_audio.wav"
    segment.export(temp_file_path, format = "wav")
    start_time = end_time
    
    try:
        with sr.AudioFile(temp_file_path) as source:
            segment_audio = recognizer.record(source)
            segment_text = recognizer.recognize_google(segment_audio)

        ###############################################################
        # 단어 또는 음절 수(모음 수를 계산하여 근사치) 계산
        words = len(segment_text.split())
        print("Segment_text.split():",segment_text.split())
        syllables = sum(1 for char in segment_text if char.lower() in 'aeiou')

        # 발화 속도 계산 (단어 당, 만약 분당 계산할 것이라면 * 60 해주기)
        speech_rate_words_per_minute = round((words / segment.duration_seconds),2)
        print("speech_segment_duration:",segment.duration_seconds,"seconds")

        # 발화 속도 계산 (음절 당)
        speech_rate_syllables_per_minute = round((syllables / segment.duration_seconds),2)

        print(f"Speech Rate (Words/Second): {speech_rate_words_per_minute}")
        print(f"Speech Rate (Syllables/Second): {speech_rate_syllables_per_minute}")
        
        ###############################################################
        # Pitch 및 Volume 계산
        y, s = librosa.load(temp_file_path)

        # Calculate pitch (fundamental frequency)
        pitches, magnitudes = librosa.core.piptrack(y=y, sr=s)
        mean_pitch_per_frame = np.mean(pitches, axis=0)
        pitch_mean = np.mean(mean_pitch_per_frame)
        pitch_mean = Decimal(str(pitch_mean)).quantize(Decimal('0.00'), rounding=ROUND_HALF_UP)
        print(f"Mean Pitch: {pitch_mean} Hz")
        
        ###############################################################

        # Calculate volume (RMS energy) in decibels
        rms = librosa.feature.rms(y=y)[0]
        volume = round(20 * np.log10(np.mean(rms)),2)
        print(f"Volume (dB): {volume} dB\n\n")
        
        ###############################################################

    except sr.UnknownValueError:
        print("No speech recognized in the current segment.\n\n")
    except sr.RequestError as e:
        print(f"Error connecting to Google Speech Recognition service: {e}\n\n")
    except Exception as e:
        print(f"An error occurred: {e}\n\n")
        
    
        
print("Finished...")
    

Detailed Information...
Duration: 37.83700680272109 seconds
Channels: 2
Frame rate: 44100


[start_time:0, end_time: 5000, total_time:37837]
Segment_text.split(): ['what', 'can', 'I', 'do', 'for', 'you', 'have', 'a', 'seat', 'please']
speech_segment_duration: 5.0 seconds
Speech Rate (Words/Second): 2.0
Speech Rate (Syllables/Second): 3.0
Mean Pitch: 14.43 Hz
Volume (dB): -43.25 dB


[start_time:5000, end_time: 10000, total_time:37837]
No speech recognized in the current segment.


[start_time:10000, end_time: 15000, total_time:37837]
No speech recognized in the current segment.


[start_time:15000, end_time: 20000, total_time:37837]
Segment_text.split(): ['I', 'would', 'like', 'to', 'know', 'why', 'you', "wouldn't", 'meet']
speech_segment_duration: 5.0 seconds
Speech Rate (Words/Second): 1.8
Speech Rate (Syllables/Second): 2.6
Mean Pitch: 29.34 Hz
Volume (dB): -43.35 dB


[start_time:20000, end_time: 25000, total_time:37837]
Segment_text.split(): ['play', 'me', 'yesterday', "I'm", 'sor

In [70]:
def remove_high_frequency_noise(signal, threshold): # 노이즈 많은 것 같아서 노이즈 리덕션 함수 정의
    
    fft_result = fft(signal) # 주파수 도메인으로 변경
    frequencies = np.fft.fftfreq(len(fft_result), d=1/audio.frame_rate) # 주파수 크기 추출
    magnitudes = np.abs(fft_result)

    high_frequency_indices = np.where(frequencies > threshold) # 높은 스펙트럼 주파수 (노이즈 추정) 제거
    magnitudes[high_frequency_indices] = 0
    filtered_signal = ifft(magnitudes) # 시간 도메인으로 변경

    return filtered_signal.real

def count_fillers(text, filler_words):
    
    return sum(text.lower().count(word) for word in filler_words)

def analyze_audio(audio_file_path, interval = 5000):

    recognizer = sr.Recognizer()
    audio = AudioSegment.from_file(audio_file_path)
    audio_duration = audio.duration_seconds
    print("Detailed Information...")
    print(f"Duration: {audio_duration} seconds")
    print(f"Channels: {audio.channels}")
    print(f"Frame rate: {audio.frame_rate}\n\n")

    start_time = 0
    filler_words_count = 0
    pause_count = 0
    turn = 0
    turn_list = []
    while start_time < len(audio):
        
        turn += 1

        end_time = min((start_time + interval), len(audio))
        print(f"[start_time:{start_time}, end_time: {end_time}, total_time:{len(audio)}]")

        segment = audio[start_time:end_time]
        temp_file_path = "./temp_audio.wav"
        segment.export(temp_file_path, format = "wav")
        start_time = end_time
                
        try:
                
            with sr.AudioFile(temp_file_path) as source:
                segment_audio = recognizer.record(source)
                segment_text = recognizer.recognize_google(segment_audio)

            
            ###############################################################

            # Filler words 계산
            filler_words_count += count_fillers(segment_text, filler_words)
            if (filler_words_count >= 5):
                print("Too much filler words!")
                filler_words_count = 0

            ###############################################################

            # 단어 또는 음절 수(모음 수를 계산하여 근사치) 계산
            words = len(segment_text.split())
            print("Segment_text.split():",segment_text.split())
            syllables = sum(1 for char in segment_text if char.lower() in 'aeiou')

            # 발화 속도 계산 (단어 당, 만약 분당 계산할 것이라면 * 60 해주기)
            speech_rate_words_per_minute = round((words / segment.duration_seconds),2)
            print("speech_segment_duration:",segment.duration_seconds,"seconds")

            # 발화 속도 계산 (음절 당)
            speech_rate_syllables_per_minute = round((syllables / segment.duration_seconds),2)

            print(f"Speech Rate (Words/Second): {speech_rate_words_per_minute}")
            print(f"Speech Rate (Syllables/Second): {speech_rate_syllables_per_minute}")

            ###############################################################

            # Pitch 및 Volume 계산
            audioPV = AudioSegment.from_file(temp_file_path)
            audioPV_array = np.array(audioPV.get_array_of_samples())

            audioPV_array = remove_high_frequency_noise(audioPV_array,2000)

            fft_result = fft(audioPV_array)
            frequencies = np.fft.fftfreq(len(fft_result), d=1/audioPV.frame_rate)
            magnitudes = np.abs(fft_result)
            peaks,_ = find_peaks(magnitudes, height = 5000)
            pitch_values = frequencies[peaks]
            valid_pitches = [p for p in pitch_values if 50 < p < 500]
            average_pitch = np.mean(valid_pitches) if valid_pitches else None
            print(f"Mean Pitch: {average_pitch} Hz")

            ###############################################################

            # Calculate volume (RMS energy) in decibels
            rms = np.sqrt(np.mean(audioPV_array**2))
            volume = 20 * np.log10(rms)
            print(f"Volume (dB): {volume} dB\n\n")

            ###############################################################

        except sr.UnknownValueError:
            ###############################################################

            # Pause 계산 ★★★ 다시 구하기 >> pyaudio할 때 ★★★
            pause_count += 1
            turn_list.append(turn)
            if pause_count >= 2 and abs(turn_list[1]-turn_list[0]) == 1: 
                print("No speech recognized in the current segment.")
                print("Too much break!\n\n")
                pause_count = 0
                turn_list = []
            else:
                print("No speech recognized in the current segment.\n\n")

            
            ###############################################################

        except sr.RequestError as e:
            print(f"Error connecting to Google Speech Recognition service: {e}\n\n")
        except Exception as e:
            print(f"An error occurred: {e}\n\n")



    print("Finished...")
    

if __name__ == "__main__":
    filler_words = ["Um","Uh","Like","You know","Well"]
    audio_file_path = './drama.wav'
    analyze_audio(audio_file_path)

    

Detailed Information...
Duration: 37.83700680272109 seconds
Channels: 2
Frame rate: 44100


[start_time:0, end_time: 5000, total_time:37837]
Segment_text.split(): ['what', 'can', 'I', 'do', 'for', 'you', 'have', 'a', 'seat', 'please']
speech_segment_duration: 5.0 seconds
Speech Rate (Words/Second): 2.0
Speech Rate (Syllables/Second): 3.0
Mean Pitch: 276.78917486752465 Hz
Volume (dB): 52.46381015579701 dB


[start_time:5000, end_time: 10000, total_time:37837]
No speech recognized in the current segment.


[start_time:10000, end_time: 15000, total_time:37837]
No speech recognized in the current segment.
Too much break!


[start_time:15000, end_time: 20000, total_time:37837]
Segment_text.split(): ['I', 'would', 'like', 'to', 'know', 'why', 'you', "wouldn't", 'meet']
speech_segment_duration: 5.0 seconds
Speech Rate (Words/Second): 1.8
Speech Rate (Syllables/Second): 2.6
Mean Pitch: 279.47322957198446 Hz
Volume (dB): 50.747818952087336 dB


[start_time:20000, end_time: 25000, total_time:378

In [None]:
import pyaudio
import numpy as np
import time

def detect_silence(audio_data, silence_threshold=-40):
    # audio_data에서 음성 중단을 감지하고 해당 구간을 반환
    is_silent = np.max(audio_data) < silence_threshold
    return is_silent

def monitor_microphone(input_device_index=None, silence_threshold=-40, break_duration_threshold=10):
    CHUNK = 1024  # 오디오 스트림에서 한 번에 읽을 데이터 크기
    FORMAT = pyaudio.paInt16
    CHANNELS = 1
    RATE = 44100  # 샘플링 레이트 (Hz)

    p = pyaudio.PyAudio()

    stream = p.open(format=FORMAT,
                    channels=CHANNELS,
                    rate=RATE,
                    input=True,
                    input_device_index=input_device_index,
                    frames_per_buffer=CHUNK)

    print("Monitoring microphone...")

    start_time = time.time()
    is_breaking = False

    while True:
        try:
            data = np.frombuffer(stream.read(CHUNK), dtype=np.int16)
            if detect_silence(data, silence_threshold=silence_threshold):
                if not is_breaking:
                    is_breaking = True
                    start_time = time.time()
            else:
                is_breaking = False

            if is_breaking and time.time() - start_time > break_duration_threshold:
                print(f"Too much break! ({break_duration_threshold} seconds)")
                is_breaking = False

        except KeyboardInterrupt:
            break

    print("Monitoring stopped.")
    stream.stop_stream()
    stream.close()
    p.terminate()

# 마이크 입력 모니터링 시작
monitor_microphone(input_device_index=None, silence_threshold=-40, break_duration_threshold=10)
