# 침묵 비침묵 분류

In [None]:
from pydub import AudioSegment
from pydub.silence import detect_nonsilent

import speech_recognition as sr
import json

#adjust target amplitude
def match_target_amplitude(sound, target_dBFS):
    """Apply the gain needed to move ``sound`` to ``target_dBFS``.

    Args:
        sound: Object exposing a ``dBFS`` attribute and an ``apply_gain``
            method (the callers in this file pass a pydub ``AudioSegment``).
        target_dBFS: Desired loudness in dBFS.

    Returns:
        Whatever ``sound.apply_gain`` returns for the computed gain
        ``target_dBFS - sound.dBFS``.
    """
    gain_needed = target_dBFS - sound.dBFS
    adjusted = sound.apply_gain(gain_needed)
    return adjusted

def shorter_filler(json_result, audio_file, min_silence_len, start_time, non_silence_start):
    """Recursively re-scan long non-silent stretches with a shorter silence window.

    Each call shrinks the silence window to ``int(min_silence_len / 1.2)``,
    runs ``detect_nonsilent`` on ``audio_file`` with that window, and recurses
    into every detected interval whose length is at least 460 (presumably
    milliseconds, like pydub segment positions - confirm).

    Args:
        json_result: Forwarded unchanged to the recursive calls; it is neither
            read nor modified in this function.
        audio_file: Audio segment to scan; must support slicing.
        min_silence_len: Silence window used by the caller; this call uses a
            window 1.2x shorter.
        start_time: Offset of ``audio_file`` within the outer recording, added
            to each interval start when recursing.
        non_silence_start: Value threaded through the recursion.

    Returns:
        ``non_silence_start`` as returned by the last recursive call, or the
        value passed in when no interval is long enough to recurse into.
    """
    # Use a shorter silence window than the caller did.
    shrunk_window = int(min_silence_len / 1.2)

    spans = detect_nonsilent(
        audio_file,
        min_silence_len=shrunk_window,
        silence_thresh=-32.64,
    )

    for span in spans:
        span_start, span_end = span[0], span[1]
        clip = audio_file[span_start:span_end]

        # Only stretches of 460 or more are split further.
        if span_end - span_start < 460:
            continue

        non_silence_start = shorter_filler(
            json_result,
            clip,
            shrunk_window,
            span_start + start_time,
            non_silence_start,
        )

    return non_silence_start

def create_json(audio_file):
    """Split a recording into tagged silence / non-silence segments.

    Non-silent intervals are detected with a 70 ms silence window and a
    -32.64 dBFS threshold. Neighbouring intervals separated by less than
    800 ms are merged into one non-silence segment; a gap of 800 ms or more
    is emitted as its own silence segment, and the non-silence segments on
    either side are padded 200 ms into that gap.

    Args:
        audio_file: Audio segment to analyse; must support ``len()`` and be
            accepted by ``detect_nonsilent``.

    Returns:
        A list of ``{'start', 'end', 'tag'}`` dicts, where tag ``'0000'``
        marks silence and ``'1000'`` marks non-silence. The last non-silence
        segment always runs to the end of the recording. A recording with no
        non-silent interval yields a single silence segment covering it (or an
        empty list when the recording has zero length).
    """
    min_silence_length = 70
    total_length = len(audio_file)

    intervals = detect_nonsilent(
        audio_file,
        min_silence_len=min_silence_length,
        silence_thresh=-32.64,
    )

    # Nothing rose above the threshold: indexing intervals[0] would raise,
    # so report the whole recording as silence instead.
    if not intervals:
        if total_length == 0:
            return []
        return [{'start': 0, 'end': total_length, 'tag': '0000'}]

    intervals_jsons = []

    first_start = intervals[0][0]
    if first_start != 0:
        # Leading silence before the first sound.
        intervals_jsons.append({'start': 0, 'end': first_start, 'tag': '0000'})

    non_silence_start = first_start
    previous_end = intervals[0][1]

    for start, end in intervals:
        # A pause of 800 ms or more closes the current non-silence segment.
        if start - previous_end >= 800:
            intervals_jsons.append(
                {'start': non_silence_start, 'end': previous_end + 200, 'tag': '1000'}
            )
            non_silence_start = start - 200
            intervals_jsons.append({'start': previous_end, 'end': start, 'tag': '0000'})

        previous_end = end

    if non_silence_start != total_length:
        intervals_jsons.append(
            {'start': non_silence_start, 'end': total_length, 'tag': '1000'}
        )

    return intervals_jsons

def STT_with_json(audio_file, jsons):
    """Transcribe the non-silence segments and summarise silence statistics.

    Silence segments (tag ``'0000'``) seen before the first successful
    recognition are accumulated into an opening delay, which is emitted as one
    transcript entry just before the first recognized text. Later silence
    segments are emitted individually. Non-silence segments (tag ``'1000'``)
    are exported to ``temp.wav`` and sent to Google speech recognition
    (``ko-KR``); when recognition fails, the segment is retried together with
    the following non-silence segment(s) until one succeeds.

    Args:
        audio_file: Audio segment supporting slicing, ``export`` and
            ``duration_seconds``.
        jsons: Iterable of ``{'start', 'end', 'tag'}`` dicts as produced by
            ``create_json``.

    Returns:
        A tuple ``(transcript_json, statistics_silence_json)``: the list of
        transcript entries (each with ``start``, ``end``, ``tag``, ``result``)
        and a one-element list holding the silence / speech percentages and
        durations in seconds.
    """
    recognizer = sr.Recognizer()
    transcript_json = []
    statistics_silence_json = []

    audio_total_length = audio_file.duration_seconds
    first_silence = 0      # seconds of silence before the first recognized speech
    silence_interval = 0   # seconds of silence after the first recognized speech
    recognized_count = 0
    # Start of a pending span that could not be recognized yet. None (not 0)
    # marks "nothing pending", because 0 is a valid start position.
    unrecognizable_start = None

    for segment in jsons:
        tag = segment['tag']

        if tag == '0000':
            seconds = (segment['end'] - segment['start']) / 1000
            if recognized_count == 0:
                # Still part of the opening delay.
                first_silence = first_silence + seconds
            else:
                silence_interval = silence_interval + seconds
                silence = "(" + str(round(seconds)) + "초).."
                transcript_json.append({'start': segment['start'], 'end': segment['end'],
                                        'tag': '0000', 'result': silence})

        elif tag == '1000':
            # Include any earlier span that failed recognition.
            if unrecognizable_start is None:
                clip_start = segment['start']
            else:
                clip_start = unrecognizable_start

            audio_file[clip_start:segment['end']].export("temp.wav", format="wav")
            with sr.AudioFile('temp.wav') as source:
                audio = recognizer.record(source)

            try:
                stt = recognizer.recognize_google(audio_data=audio, language="ko-KR")
            except (sr.UnknownValueError, sr.RequestError):
                # Best effort: keep the span and retry it merged with the next
                # non-silence segment.
                if unrecognizable_start is None:
                    unrecognizable_start = segment['start']
                continue

            if recognized_count == 0:
                # Emit the accumulated opening delay once, before the first text.
                silence = "(" + str(round(first_silence)) + "초).."
                transcript_json.append({'start': 0, 'end': segment['start'],
                                        'tag': '0000', 'result': silence})

            transcript_json.append({'start': clip_start, 'end': segment['end'],
                                    'tag': '1000', 'result': stt})
            unrecognizable_start = None
            recognized_count = recognized_count + 1

    silence_seconds = first_silence + silence_interval
    speech_seconds = audio_total_length - first_silence - silence_interval
    if audio_total_length:
        silence_percent = 100 * silence_seconds / audio_total_length
        speech_percent = 100 * speech_seconds / audio_total_length
    else:
        # Zero-length recording: avoid dividing by zero.
        silence_percent = 0.0
        speech_percent = 0.0

    statistics_silence_json.append({'침묵시간': silence_percent, '발화시간': speech_percent,
                                    '침묵시간(초)': silence_seconds, '발화시간(초)': speech_seconds,
                                    '총시간': audio_total_length})
    return transcript_json, statistics_silence_json

def make_transcript(audio_file_path):
    """Run the full pipeline on one MP3 file.

    The file is loaded with ``AudioSegment.from_mp3``, normalized to
    -20.0 dBFS, segmented by ``create_json`` and transcribed by
    ``STT_with_json``.

    Args:
        audio_file_path: Path of the MP3 file to process.

    Returns:
        The value returned by ``STT_with_json``, i.e. the
        ``(transcript, statistics)`` pair.
    """
    normalized_audio = match_target_amplitude(
        AudioSegment.from_mp3(audio_file_path), -20.0
    )
    segments = create_json(normalized_audio)
    return STT_with_json(normalized_audio, segments)

# 분석파일 만들기

In [None]:
import glob
import json
import os
import re
import natsort
import pandas as pd
import numpy as np

# Raw string: the Windows-style backslash before '[' is not a valid escape
# sequence, so a plain literal triggers an invalid-escape warning.
folder_path = r'이국희\[2024 수능특강] 이국희의 확률과 통계'  # change to the actual folder path

# Transcript JSON files found in the folder, in natural sort order.
files_list = os.listdir(folder_path)
json_files = [folder_path + '/' + file for file in files_list if 'transcript' in file.lower() and file.endswith('.json')]
json_files = natsort.natsorted(json_files)

# Absolute differences between consecutive silence totals, accumulated over
# every file processed so far.
delta = []
# One single-row DataFrame per lecture; concatenated once after the loop
# (DataFrame.append no longer exists in pandas 2.x).
lecture_frames = []

folder_name = os.path.basename(folder_path)  # course name
silence_run_limit = 20  # runs of this many consecutive silence rows are dropped

for file_path in json_files:
    column1 = [folder_name]

    # Lecture name: file name without extension, up to the first underscore.
    # The directory part is stripped first so an underscore in the folder
    # name cannot truncate the result.
    file_name = os.path.splitext(os.path.basename(file_path))[0].split('_')[0]
    column2 = [file_name]

    with open(file_path, 'r', encoding='utf-8') as file:
        transcript = pd.read_json(file)

    # Find blocks of consecutive tag == 0 rows and mark every row of a block
    # that reaches the limit for deletion.
    run_length = 0
    delete_indices = []
    for i, value in enumerate(transcript['tag']):
        run_length = run_length + 1 if value == 0 else 0
        if run_length == silence_run_limit:
            # The run just reached the limit: mark all of its rows so far.
            delete_indices.extend(range(i - silence_run_limit + 1, i + 1))
        elif run_length > silence_run_limit:
            delete_indices.append(i)

    transcript = transcript[~transcript.index.isin(delete_indices)]
    data_len = len(transcript)

    # Sum the seconds written in consecutive silence rows; each tag == 1000
    # row closes the running sum and starts a new one.
    numbers = 0
    total_list = []
    for tag, result_value in zip(transcript['tag'], transcript['result']):
        if tag == 0:
            numbers += int(re.findall(r'\d+', result_value)[0])
        elif tag == 1000:
            total_list.append(numbers)
            numbers = 0

    # Keep the trailing sum, then drop the first entry.
    total_list.append(numbers)
    total_list = total_list[1:]
    total_silence = sum(total_list)

    data_med = np.median(total_list)
    data_mean = np.mean(total_list)
    data_std = np.std(total_list)

    # NOTE(review): delta keeps growing across files, so delta_std for a
    # lecture also covers every lecture processed before it - confirm this
    # is intended rather than a per-lecture value.
    delta = delta + [abs(later - earlier) for earlier, later in zip(total_list, total_list[1:])]
    delta_std = np.std(delta)

    lecture_frames.append(
        pd.DataFrame({'강좌명': column1, '강의명': column2, '중앙값': data_med, '평균': data_mean,
                      '표준편차': data_std, 'delta_std': delta_std, 'data_len': data_len,
                      '침묵시간': total_silence})
    )

df = pd.concat(lecture_frames, ignore_index=True) if lecture_frames else pd.DataFrame()


In [None]:
# Statistics JSON files from the same folder, in natural sort order.
stat_files = [folder_path + '/' + file for file in files_list if 'statistics' in file.lower() and file.endswith('.json')]
stat_files = natsort.natsorted(stat_files)

import json
import pandas as pd

# Load each statistics file into its own frame and concatenate once
# (DataFrame.append no longer exists in pandas 2.x). The files are read as
# UTF-8, matching how the transcript files are opened.
stat_frames = []
for stat_json in stat_files:
    with open(stat_json, encoding='utf-8') as f:
        stat_frames.append(pd.DataFrame(json.load(f)))
statdf = pd.concat(stat_frames) if stat_frames else pd.DataFrame()

# Keep only the columns needed for the merged table.
statdf = statdf[['통역개시지연시간(초)', '발화시간(초)']]

statdf = statdf.reset_index(drop=True)
print(statdf)

# The two tables are joined by row position, so their lengths should match.
print(len(df), len(statdf))

datadf = pd.concat([df, statdf], axis=1)
datadf['침묵시간(초)'] = datadf['침묵시간'] + datadf['통역개시지연시간(초)']

# .copy() so the column added below is written to an independent frame.
datadf = datadf[['강좌명', '강의명', '중앙값', '평균', '표준편차', 'delta_std', 'data_len', '침묵시간(초)', '발화시간(초)']].copy()
datadf['총시간'] = datadf['침묵시간(초)'] + datadf['발화시간(초)']

In [None]:
# Keep only lectures whose transcript has more than 5 rows left after cleaning.
has_enough_rows = datadf['data_len'] > 5
datadf = datadf.loc[has_enough_rows]

In [None]:
# Write the merged table to CSV. The output folder is created first because
# to_csv does not create missing directories.
output_path = './공백분석결과/이국희2024공백.csv'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
datadf.to_csv(output_path, index=False)

# 시각화

In [None]:
# 정승제
import matplotlib.pyplot as plt

# Horizontal box plot of total_list, which at this point holds the silence
# totals of the last transcript file processed by the analysis loop.
# NOTE(review): the axis label reads 'Absolute Differences', but the plotted
# series is total_list, not delta - confirm which one is intended.
plt.boxplot(total_list, vert=False)
plt.xlabel('Absolute Differences')
plt.show()