In [1]:
import logging
# Timestamped INFO-level logging so the progress messages emitted by splitter() are displayed.
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO)
# Segment-length bounds, presumably in seconds (splitter()'s warning prints them with an "s" suffix).
# NOTE: is_ok(), splitter() and splitter2() each take MIN/MAX as keyword parameters with these
# same defaults, so those functions never read these module-level names.
MIN = 10
MAX = 20

def is_ok(pauses, duration, MIN=10, MAX=20):
    """Tell whether cutting at ``pauses`` yields only acceptably long segments.

    The cut positions are framed by 0 at the front and ``duration`` at the
    back; the gap between every pair of neighbouring boundaries must fall
    inside the closed interval ``[MIN, MAX]``.

    Args:
        pauses: Iterable of cut positions, in the order they should be applied.
        duration: Total length of the recording (the final boundary).
        MIN: Smallest allowed segment length (inclusive).
        MAX: Largest allowed segment length (inclusive).

    Returns:
        True if every segment length is within bounds, otherwise False.
    """
    boundaries = [0, *pauses, duration]
    segment_lengths = (
        right - left for left, right in zip(boundaries, boundaries[1:])
    )
    return all(MIN <= length <= MAX for length in segment_lengths)


def splitter(pauses, duration, MIN=10, MAX=20):
    """Brute-force search for a subset of ``pauses`` that gives valid segments.

    Every subset of the candidate pauses is tried, smallest subsets first
    (and, within one size, in the order ``itertools.combinations`` yields
    them). The first subset accepted by ``is_ok`` is returned, so the result
    uses as few cuts as possible.

    The number of subsets is ``2 ** len(pauses)``, so the running time grows
    exponentially with the number of candidate pauses. The subsets are
    generated lazily, so memory use stays small.

    Args:
        pauses: Iterable of candidate cut positions.
        duration: Total length of the recording.
        MIN: Smallest allowed segment length (inclusive).
        MAX: Largest allowed segment length (inclusive).

    Returns:
        A list with the chosen cut positions, or None (after logging a
        warning) when no subset satisfies the bounds.
    """
    from itertools import chain, combinations

    candidates = list(pauses)
    # n candidates have exactly 2**n subsets, so the count can be logged
    # without building the whole power set in memory first.
    logging.info(f"Testing {2 ** len(candidates)} combinations....")
    subsets = chain.from_iterable(
        combinations(candidates, size) for size in range(len(candidates) + 1)
    )
    for combination in subsets:
        if is_ok(combination, duration, MIN=MIN, MAX=MAX):
            return list(combination)
    logging.warning(f"No solution found so that {MIN=}s <= duration <= {MAX=}s.")
    return None


# Example: candidate pauses at 7, 15, 16 and 20 in a recording of length 28,
# with segment lengths restricted to 5..10.
pauses = [7, 15, 16, 20]
splitter(pauses, 28, MIN=5, MAX=10)


2021-12-14 09:51:20,926 - INFO - Testing 16 combinations....


[7, 15, 20]

In [2]:
def splitter2(pauses, duration, MIN=10, MAX=20):
    """Collect the pauses that border at least one acceptably long gap.

    Only *neighbouring* boundaries are compared (0, each pause in turn, then
    ``duration``). Whenever the gap between two neighbours lies in the closed
    interval ``[MIN, MAX]``, both of its end points are kept, except for the
    values 0 and ``duration``, which mark the edges of the recording.

    Args:
        pauses: Iterable of candidate cut positions, in order.
        duration: Total length of the recording.
        MIN: Smallest allowed gap (inclusive).
        MAX: Largest allowed gap (inclusive).

    Returns:
        Sorted list of the distinct pauses that were kept.
    """
    boundaries = [0, *pauses, duration]
    kept = set()
    for left, right in zip(boundaries, boundaries[1:]):
        if not (MIN <= right - left <= MAX):
            continue
        # The recording's own start and end are never reported as pauses.
        if left != 0:
            kept.add(left)
        if right != duration:
            kept.add(right)
    return sorted(kept)
    


# Same example input as used for splitter(), now run through the neighbour-only splitter2().
pauses = [7, 15, 16, 20]
splitter2(pauses, 28, MIN=5, MAX=10)

[7, 15, 20]

In [3]:
from pydub import AudioSegment
from pydub.silence import split_on_silence, detect_silence
# NOTE(review): absolute, machine-specific path — this cell only runs where that file exists.
file = "/home/peterr/macocu/task6_speech/data/00009875.flac.wav"
sound_file = AudioSegment.from_wav(file)
# Cut the audio at silences; per the pydub documentation min_silence_len is given in
# milliseconds and silence_thresh in dBFS.
audio_chunks = split_on_silence(sound_file, min_silence_len=1000, silence_thresh=-40 )
# Show the length of every resulting chunk in seconds.
print([i.duration_seconds for i in audio_chunks])

[4.306, 1.813, 2.121, 2.242, 1.549, 2.547, 1.912, 2.422, 2.178, 3.492, 2.824, 3.945, 5.646, 3.345, 12.492, 6.429, 6.731, 7.331, 7.547, 11.441]


In [4]:
# Toy inputs for get_subset(): a three-sentence reference text and a short model
# transcript that resembles its middle sentence.
gold = "Recimo da je to prvi chunk. To drugi. In končno tretji."
model_output = "To je ta drugi."

from fuzzywuzzy import fuzz
def get_subset(gold, model_output):
    """Find the contiguous run of words in ``gold`` that best matches ``model_output``.

    ``gold`` is split on single spaces and every word slice
    ``words[start:end]`` is scored against ``model_output`` with
    ``fuzz.ratio``. This costs a quadratic number of ratio computations in
    the number of words.

    Args:
        gold: Reference text.
        model_output: Text to locate inside ``gold``.

    Returns:
        A tuple ``(ratio, start, end, text)`` for the best-scoring slice,
        where ``start``/``end`` are word indices (``end`` exclusive) and
        ``text`` is the slice joined with spaces. On ties, the candidate with
        the smallest ``start`` and then the smallest ``end`` wins.
    """
    words = gold.split(" ")
    word_count = len(words)
    best = None
    for start in range(word_count):
        # ``end`` is an exclusive slice bound, so it has to be able to reach
        # ``word_count``; stopping one short would make it impossible for the
        # last word of ``gold`` to be part of any candidate.
        # ``end == start`` gives the empty candidate, which is kept so that an
        # empty ``model_output`` is still compared against an empty slice.
        for end in range(start, word_count + 1):
            subset = " ".join(words[start:end])
            ratio = fuzz.ratio(model_output, subset)
            # Strict ">" keeps the earliest candidate among equal scores.
            if best is None or ratio > best[0]:
                best = (ratio, start, end, subset)
    return best


# Locate the transcript inside the reference; the result is (ratio, start word, end word, matched text).
get_subset(gold, model_output)


(75, 6, 8, 'To drugi.')

In [5]:
# Stress case: the model output has lost all of its spaces (and the word "i").
get_subset(
    "a o tome govori i tematika zbog kojeg su se sve žalili",
    "aotomegovoritematika"
)

(87, 0, 6, 'a o tome govori i tematika')

In [6]:
# Milder case: only two words are run together ("ikolege") in the model output.
get_subset(
    "poštovane kolegice i kolege zastupnici molio bi da nastavimo sa radom sjednice",
    "poštovane kolegice ikolege zastupnici"
)

(99, 0, 5, 'poštovane kolegice i kolege zastupnici')

In [7]:
import difflib

# Character-level comparison of a reference sentence (l1) with a shorter transcript (l2)
# in which two words are run together.
l1 = "poštovane kolegice i kolege zastupnici molio bi da nastavimo sa radom sjednice"
l2 = "poštovane kolegice ikolege zastupnici"

seq_mat = difflib.SequenceMatcher(a=l1, b=l2)

# Longest common contiguous block over the full extent of both strings.
match = seq_mat.find_longest_match(alo=0, ahi=len(l1), blo=0, bhi=len(l2))

print("============ Longest Matching Sequence ==================")
print("\nMatch Object : {}".format(match))
print("Matching Sequence from l1 : {}".format(l1[match.a:match.a+match.size]))
print("Matching Sequence from l2 : {}\n".format(l2[match.b:match.b+match.size]))

print("============ All Matching Sequences ==================")
# NOTE: the loop reuses the name `match`, so the longest-match result above is overwritten.
# The last block reported is a zero-length terminator (see the final entry of the output).
for match in seq_mat.get_matching_blocks():
    print("\nMatch Object : {}".format(match))
    print("Matching Sequence from l1 : {}".format(l1[match.a:match.a+match.size]))
    print("Matching Sequence from l2 : {}".format(l2[match.b:match.b+match.size]))


Match Object : Match(a=0, b=0, size=20)
Matching Sequence from l1 : poštovane kolegice i
Matching Sequence from l2 : poštovane kolegice i


Match Object : Match(a=0, b=0, size=20)
Matching Sequence from l1 : poštovane kolegice i
Matching Sequence from l2 : poštovane kolegice i

Match Object : Match(a=21, b=20, size=17)
Matching Sequence from l1 : kolege zastupnici
Matching Sequence from l2 : kolege zastupnici

Match Object : Match(a=78, b=37, size=0)
Matching Sequence from l1 : 
Matching Sequence from l2 : 


In [8]:
!pip install nwalign3



In [9]:
import nwalign3 as nw

In [10]:
# NOTE(review): as the recorded output shows, this call raises TypeError because
# global_align_no_matrix() needs 5 positional arguments and only the two sequences are given.
# TODO: supply the three missing arguments (check the nwalign3 documentation) or remove the cell.
nw.global_align_no_matrix("EEAEE-", "EF-EEG")

TypeError: global_align_no_matrix() takes exactly 5 positional arguments (2 given)

In [None]:
from Bio import pairwise2
from Bio.Seq import Seq
# Two short toy sequences for a first global-alignment experiment.
seq1 = Seq("ACCGGT") 
seq2 = Seq("ACGT")

# Global alignment; per the Biopython pairwise2 naming, "xx" means a match scores 1 and
# mismatches/gaps cost nothing. The recorded output lists four alignments with score 4.0.
pairwise2.align.globalxx(seq1, seq2)

[Alignment(seqA='ACCGGT', seqB='A-C-GT', score=4.0, start=0, end=6),
 Alignment(seqA='ACCGGT', seqB='AC--GT', score=4.0, start=0, end=6),
 Alignment(seqA='ACCGGT', seqB='A-CG-T', score=4.0, start=0, end=6),
 Alignment(seqA='ACCGGT', seqB='AC-G-T', score=4.0, start=0, end=6)]

In [None]:
# Align a full sentence with a shorter text in which two words are run together ("itematika").
# NOTE(review): the first sequence is passed as ASCII-encoded bytes while the second is a
# plain str — confirm that mixing the two types is intentional.
alignments = pairwise2.align.globalxx(
        Seq("a o tome govori i tematika zbog kojeg su se sve zalili".encode("ascii")),
        Seq("a o tome govori itematika")
)
alignments

[Alignment(seqA='a o tome govori i tematika zbog kojeg su se sve zalili', seqB='a o tome govori i-temati--------k----------------a----', score=25.0, start=0, end=54),
 Alignment(seqA='a o tome govori i tematika zbog kojeg su se sve zalili', seqB='a o tome govori i-tematik------------------------a----', score=25.0, start=0, end=54)]

In [None]:
# Gap-padded form of the second (shorter) input, taken from the second returned alignment.
alignments[1].seqB

'a o tome govori i-tematik------------------------a----'

In [None]:
# ASCII-only sentences ("postovane" rather than "poštovane"); l2 has two words run together
# and a different ending from l1.
l1 = "postovane kolegice i kolege zastupnici molio bi da nastavimo sa radom sjednice"
l2 = "postovane kolegice ikolege zastupnici ja bi molio"

# Note the argument order: l2 is passed first, so it is seqA and l1 is seqB in the results.
alignments = pairwise2.align.globalxx(
        Seq(l2),
        Seq(l1))
alignments

[Alignment(seqA='postovane kolegice i-kolege zastupnici ja----- bi ----------moli-------o----------', seqB='postovane kolegice i kolege zastupnici --molio bi da nastavimo-- sa radom sjednice', score=45.0, start=0, end=82),
 Alignment(seqA='postovane kolegice i-kolege zastupnici ja---- bi ----------moli-------o----------', seqB='postovane kolegice i kolege zastupnici -molio bi da nastavimo-- sa radom sjednice', score=45.0, start=0, end=81),
 Alignment(seqA='postovane kolegice i-kolege zastupnici ja---- bi ----------moli-------o----------', seqB='postovane kolegice i kolege zastupnici m-olio bi da nastavimo-- sa radom sjednice', score=45.0, start=0, end=81),
 Alignment(seqA='postovane kolegice i-kolege zastupnici j-a--- bi ----------moli-------o----------', seqB='postovane kolegice i kolege zastupnici -molio bi da nastavimo-- sa radom sjednice', score=45.0, start=0, end=81),
 Alignment(seqA='postovane kolegice i-kolege zastupnici ja--- bi ----------moli-------o----------', seqB='postovan

In [None]:
# Gap-padded l1 (passed as the second sequence) from the second returned alignment.
alignments[1].seqB

'postovane kolegice i kolege zastupnici -molio bi da nastavimo-- sa radom sjednice'

In [None]:
# For comparison: the word-slice search from get_subset() on the same pair of sentences.
get_subset(l1, l2)

(92, 0, 6, 'postovane kolegice i kolege zastupnici molio')

In [None]:
# Edge case: an empty model output; the recorded result is the empty slice at (0, 0).
get_subset(l1, "")

(100, 0, 0, '')

In [6]:
def Getletterfromindex(num):
    """Convert a 1-based index into a spreadsheet-style lowercase letter label.

    Examples: 1 -> "a", 2 -> "b", 26 -> "z", 27 -> "aa", 28 -> "ab",
    52 -> "az", 53 -> "ba", 54 -> "bb".

    Args:
        num: Positive integer index (1-based).

    Returns:
        The letter label as a string.

    Raises:
        ValueError: If ``num`` is smaller than 1, for which no label exists.
    """
    import string

    if num < 1:
        raise ValueError(f"num must be a positive integer, got {num!r}")

    letters = []
    while num > 0:
        # Shifting by one first turns the 1-based "digits" a..z into 0..25, so
        # 26 maps to a single "z" instead of rolling over into a second letter.
        num, offset = divmod(num - 1, 26)
        letters.append(string.ascii_lowercase[offset])
    # Letters were produced least-significant first.
    return "".join(reversed(letters))

# Boundary check: 26 is the last index that still maps to a single letter.
Getletterfromindex(26)

'z'