In [69]:
from dataclasses import dataclass
from datetime import datetime
import json

In [60]:
class DFAFilter():

    '''Mask keywords inside messages.

    Keywords are stored in a character trie (nested dicts), so scanning a
    message walks the trie once per start position instead of testing every
    keyword separately. Matching is case-insensitive and stops at the
    shortest keyword found at a position.

    >>> f = DFAFilter()
    >>> f.add("sexy")
    >>> f.filter("hello sexy baby")
    'hello **** baby'
    '''

    def __init__(self):
        # Trie root: each key is a character, each value the child node.
        self.keyword_chains = {}
        # Special key stored in a node to mark "a keyword ends here".
        self.delimit = '\x00'

    def add(self, keyword):
        '''Register one keyword.

        Args:
            keyword: ``str``, or ``bytes`` holding UTF-8 text. It is
                lower-cased and stripped of surrounding whitespace; an empty
                result is ignored.

        Raises:
            ValueError: if the keyword contains the end-of-keyword marker
                character, which would corrupt the trie.
        '''
        if not isinstance(keyword, str):
            keyword = keyword.decode('utf-8')
        chars = keyword.lower().strip()
        if not chars:
            return
        if self.delimit in chars:
            raise ValueError(
                'keyword must not contain the delimiter character {!r}'.format(
                    self.delimit))
        level = self.keyword_chains
        for char in chars:
            # Reuse the existing branch when present, otherwise grow one.
            level = level.setdefault(char, {})
        level[self.delimit] = 0

    def parse(self, path, encoding='utf-8'):
        '''Add every line of a text file as a keyword.

        Args:
            path: file with one keyword per line.
            encoding: text encoding of the file. Defaults to UTF-8, matching
                how ``add`` decodes byte strings, so the result does not
                depend on the platform's locale encoding.
        '''
        with open(path, encoding=encoding) as f:
            for keyword in f:
                self.add(keyword.strip())

    def filter(self, message, repl="*"):
        '''Return ``message`` with every keyword occurrence masked.

        Args:
            message: ``str``, or ``bytes`` holding UTF-8 text.
            repl: string emitted once per masked character.

        Returns:
            The message with each matched keyword replaced by ``repl``
            repeated for the length of the match. Text outside matches keeps
            its original casing.
        '''
        if not isinstance(message, str):
            message = message.decode('utf-8')
        lowered = message.lower()
        # Lower-casing can lengthen a string for a few characters (e.g. 'İ').
        # Only when the lengths agree do positions line up one-to-one with
        # the original text; otherwise fall back to emitting the lowered text.
        source = message if len(lowered) == len(message) else lowered

        ret = []
        start = 0
        while start < len(lowered):
            matched = self._match_length(lowered, start)
            if matched:
                ret.append(repl * matched)
                start += matched
            else:
                ret.append(source[start])
                start += 1
        return ''.join(ret)

    def _match_length(self, text, start):
        '''Length of the shortest keyword starting at ``text[start]``, or 0.'''
        level = self.keyword_chains
        for index in range(start, len(text)):
            level = level.get(text[index])
            if level is None:
                return 0
            if self.delimit in level:
                return index - start + 1
        # The text ended in the middle of a keyword prefix.
        return 0
        

In [61]:
# Name the encoding explicitly: without it the file is decoded with the
# platform's locale encoding, which breaks non-ASCII JSON on some systems.
with open("files/words.json", "r", encoding="utf-8") as f:
    origin_data = json.load(f)

# The "data" field holds a JSON document serialized as a string, so it needs
# a second decoding pass. This no longer needs the file to be open.
data = json.loads(origin_data['data'])


In [62]:
def convert_millis(millis):
    """Format a millisecond count as a zero-padded ``HH:MM:SS`` string.

    ``millis`` is coerced with ``int()``, so numeric strings are accepted.
    Fractions of a second are dropped and the hour field wraps modulo 24.
    """
    total_ms = int(millis)
    hh = int((total_ms / (1000 * 60 * 60)) % 24)
    mm = int((total_ms / (1000 * 60)) % 60)
    ss = int((total_ms / 1000) % 60)
    return f"{hh:02d}:{mm:02d}:{ss:02d}"


In [63]:
@dataclass
class Audio:
    """One entry of the transcript data."""
    # Start and end are stored as "HH:MM:SS" strings (the output of
    # convert_millis), not as integers.
    bg: str
    ed: str
    # Text taken from the entry's "onebest" key.
    onebest: str
    # Integer taken from the entry's "speaker" key.
    speaker: int

In [64]:
# Turn each raw entry into an Audio record: both timestamps are formatted
# as HH:MM:SS, and the text and speaker fields are coerced to str and int.
audio_list = list(
    Audio(
        bg=convert_millis(entry["bg"]),
        ed=convert_millis(entry["ed"]),
        onebest=str(entry["onebest"]),
        speaker=int(entry["speaker"]),
    )
    for entry in data
)


In [65]:
# Keywords to search for in the transcript text.
sensitive_words = [
    "考试",
    "真题",
]

In [66]:
# Build a fresh filter and register every sensitive word with it.
hd_filter = DFAFilter()
for word in sensitive_words:
    hd_filter.add(word)

In [67]:
# A segment is flagged when the filter masked at least one keyword in it.
# Masking the same text with two different characters gives different
# results exactly when something was masked. Merely looking for the mask
# character would also flag text that already contains that character.
invalid_audio_list = [
    audio for audio in audio_list
    if hd_filter.filter(audio.onebest, repl="&")
    != hd_filter.filter(audio.onebest, repl="*")
]

print("敏感语音/所有语音: {}/{}".format(len(invalid_audio_list), len(audio_list)))

敏感语音/所有语音: 2/2009


In [68]:
# Show each flagged segment on its own line.
for flagged in invalid_audio_list:
    print(flagged)

Audio(bg='00:56:59', ed='00:57:08', onebest='想说大家后面这个期末考试的时候是嗯可以可以到时候参考一下，就是你看到s三a的相关图的时候，', speaker=0)
Audio(bg='01:06:07', ed='01:06:12', onebest='那最后的最后的那个考试有可能会有', speaker=0)


In [80]:
TIME_FORMAT = '%H:%M:%S'
# Gaps at least this long (in seconds) are printed individually.
LONG_SILENCE_SECONDS = 60.0

# Sum the gap between the end of each segment and the start of the next.
total_mute_duration = 0.0
for previous, current in zip(audio_list, audio_list[1:]):
    mute_duration = (
        datetime.strptime(current.bg, TIME_FORMAT) -
        datetime.strptime(previous.ed, TIME_FORMAT)).total_seconds()
    total_mute_duration += mute_duration
    if mute_duration >= LONG_SILENCE_SECONDS:
        print(previous)
        print(current)
        print()

# Total duration is the end time of the last segment measured from 00:00:00.
# An empty transcript has no last segment, so treat its duration as zero.
if audio_list:
    audio_duration = (datetime.strptime(audio_list[-1].ed, TIME_FORMAT) -
                      datetime.strptime('00:00:00', TIME_FORMAT)).total_seconds()
else:
    audio_duration = 0.0

# Guard the ratio so a zero-length recording does not raise ZeroDivisionError.
mute_percentage = (100 * total_mute_duration / audio_duration
                   if audio_duration else 0.0)
print("静默时间/总体时间: {}/{}      {:.2f}%".format(
    total_mute_duration, audio_duration, mute_percentage))


Audio(bg='01:11:11', ed='01:11:13', onebest='我们大概过一下这次的一个作业，', speaker=0)
Audio(bg='01:14:47', ed='01:14:50', onebest='嗯等一下这个 Website刷出来，啊', speaker=0)

静默时间/总体时间: 1163.0/7390.0      15.74%
