In [1]:
import os
import glob
import json
import nltk
import math
import numpy as np
from nltk.tokenize import TweetTokenizer

In [37]:
# Shared tokenizer used by every diversity detector in this notebook.
# reduce_len=True asks NLTK to shorten over-long runs of a repeated character
# (see the TweetTokenizer documentation for the exact rule).
tknzr = TweetTokenizer(reduce_len=True)

# Root folder of the crawled data; the functions below read
# <DATA_PATH><channel>/<video>/info.json, .../Message-*.json and .../clip/*.
DATA_PATH = "../TwitchHighlightCrawler/vod/" # const

# Clip length bounds. Offsets in this notebook are whole seconds, so these are
# seconds too. The scoring functions use them to decide how far before a clip a
# predicted fragment may start; the *Expand detectors use MAX_CLIP_LENGTH to
# cap how far a fragment can grow.
MAX_CLIP_LENGTH = 60 # const
MIN_CLIP_LENGTH = 5 # const

# Diversity detectors score consecutive windows of DIVERSITY_BASE_WINDOW
# seconds and flag a window when its normalized entropy is below
# DIVERSITY_THRESHOLD.
DIVERSITY_BASE_WINDOW = 30 # const for diversity method
DIVERSITY_THRESHOLD = 0.835 # const for diversity method

# Channel/video id pairs; presumably sample inputs for manual runs (not used
# by any cell visible here).
_test = [{'channel': 'ninja', 'video':'425622866'}, {'channel': 'shroud', 'video':'259514478'}]

In [None]:
# Scratch values for the hard-coded experiment cells further down:
# videoLength sizes the per-second buffers there, and numberOfMessage is the
# highest N of the Message-N.json files that the loading cell iterates over.
videoLength = 28662 # temp const
numberOfMessage = 1276 # temp const
# commentFQ is later indexed by second (commentFQ[j]); presumably the number of
# chat comments in each second of the video -- confirm against the crawler.
with open("../TwitchHighlightCrawler/comments.json", "r", encoding="utf-8") as file:
    commentFQ = file.read()
commentFQ = json.loads(commentFQ)

In [None]:
# Load c.txt (a JSON document) and keep only its top-level 'data' entry.
# NOTE(review): presumably the clip list returned by the Twitch API; nothing
# visible in this notebook reads globalClip afterwards -- confirm before removing.
with open("c.txt", "r") as clip_file:
    globalClip = clip_file.read()
    
globalClip = json.loads(globalClip)['data'] 

In [3]:
def normalized_shannon_entropy(text): # entropy diversity measure (normalized)
    """Return the Shannon entropy of the items in ``text``, scaled by log2(len(text)).

    ``text`` is any sized iterable of hashable items (a list of tokens, an
    ``nltk.text.Text``, ...).  The entropy of the item distribution is divided
    by ``log2(len(text))``, i.e. the entropy the sequence would have if every
    item were distinct, so the result is 1 when nothing repeats and 0 when
    every item is the same.

    Sequences with zero or one item carry no diversity information (and
    ``log2(1)`` would be a zero divisor), so 0 is returned for them.
    """
    # Local import keeps this cell runnable without touching the import cell.
    from collections import Counter

    textLength = len(text)
    if textLength <= 1:
        return 0

    entropy = 0
    # One counting pass over the tokens instead of calling text.count() once
    # per distinct word, which was quadratic in the window size.
    for occurrences in Counter(text).values():
        p = occurrences / textLength
        entropy -= p * math.log2(p)

    return entropy / math.log2(textLength)

In [13]:
def getBaseline(channel, video, length = None, messages = None): # use comment frequency to get baseline
    """Baseline detector: flag runs of seconds whose chat rate beats the video average.

    Parameters
    ----------
    channel, video : used only to build the data path when a hint is missing.
    length : optional hint, number of seconds in the video; read from
        ``info.json`` when falsy.
    messages : optional hint, ``messages[s]`` is the list of chat message
        bodies posted during second ``s``; rebuilt from the ``Message-*.json``
        files when falsy.

    Returns
    -------
    dict with
        'macro'  : list of ``length`` entries; 0, or at the first second of a
                   fragment ``{'duration': seconds, 'precision': []}``,
        'global' : int numpy array of ``length`` entries, 1 on predicted seconds,
        'count'  : number of predicted fragments.

    A fragment is a run of at least 5 consecutive seconds whose message count
    is strictly greater than the per-second average over the whole video.
    """
    if not length: # no usable hint: take the length from the video metadata
        with open(DATA_PATH + channel + "/" + str(video) + "/info.json", "r", encoding="utf-8") as infoFile:
            length = json.load(infoFile)['length']

    if not messages: # no usable hint: bucket the crawled chat by second
        messages = [[] for _ in range(length)]
        for messagePath in glob.glob(DATA_PATH + channel + "/" + str(video) + "/Message-*.json"):
            with open(messagePath, "r", encoding="utf-8") as messageFile:
                comments = json.load(messageFile)['comments']
            for comment in comments:
                second = math.floor(comment['content_offset_seconds'])
                if second >= length:
                    break # the rest of this file is skipped once a comment is past the end
                messages[second].append(comment['message']['body'])

    perSecondCounts = [len(messages[second]) for second in range(length)]
    averageCount = sum(perSecondCounts) / length

    predictedFragment = { # initialize the object that stores what we predict
        'macro': [0] * length,
        'global': np.zeros(length, dtype=int),
        'count': 0 # predicted fragment count for macro precision computing
    }

    runLength = 0
    # Iterate one step past the last second: the extra step acts as a
    # "not above average" sentinel that closes a run reaching the video's end.
    for second in range(length + 1):
        if second < length and perSecondCounts[second] > averageCount:
            runLength += 1
            continue

        if runLength >= 5:
            runStart = second - runLength # `second` is one past the end of the run
            predictedFragment['count'] += 1
            predictedFragment['macro'][runStart] = { # mark prediction in macro
                'duration': runLength,
                'precision': []
            }
            predictedFragment['global'][runStart:second] = 1 # mark prediction in global metrics
        runLength = 0

    return predictedFragment

In [14]:
def detectLowDiversity(channel, video, length = None, messages = None):
    """Flag fixed-size windows whose chat text has low lexical diversity.

    The video is cut into consecutive windows of DIVERSITY_BASE_WINDOW seconds
    (the last one may be shorter).  All messages of a window are concatenated,
    tokenized with ``tknzr`` and scored with ``normalized_shannon_entropy``;
    a window scoring below DIVERSITY_THRESHOLD becomes a predicted fragment.

    Parameters
    ----------
    channel, video : used only to build the data path when a hint is missing.
    length : optional hint, number of seconds in the video; read from
        ``info.json`` when falsy.
    messages : optional hint, ``messages[s]`` is the list of chat message
        bodies posted during second ``s``; rebuilt from the ``Message-*.json``
        files when falsy.

    Returns
    -------
    dict with 'macro' (0 or ``{'duration', 'precision'}`` at each fragment's
    first second), 'global' (int numpy array, 1 on predicted seconds) and
    'count' (number of fragments).
    """
    if not length: # no usable hint: take the length from the video metadata
        with open(DATA_PATH + channel + "/" + str(video) + "/info.json", "r", encoding="utf-8") as infoFile:
            length = json.load(infoFile)['length']

    if not messages: # no usable hint: bucket the crawled chat by second
        messages = [[] for _ in range(length)]
        for messagePath in glob.glob(DATA_PATH + channel + "/" + str(video) + "/Message-*.json"):
            with open(messagePath, "r", encoding="utf-8") as messageFile:
                comments = json.load(messageFile)['comments']
            for comment in comments:
                second = math.floor(comment['content_offset_seconds'])
                if second >= length:
                    break # the rest of this file is skipped once a comment is past the end
                messages[second].append(comment['message']['body'])

    predictedFragment = { # initialize the object that stores what we predict
        'macro': [0] * length,
        'global': np.zeros(length, dtype=int),
        'count': 0 # predicted fragment count for macro precision computing
    }

    for windowStart in range(0, length, DIVERSITY_BASE_WINDOW):
        windowStop = min(windowStart + DIVERSITY_BASE_WINDOW, length) # exclusive; clipped at the video's end

        # every second contributes its messages followed by a single space
        windowText = ''.join(
            ' '.join(messages[second]) + ' '
            for second in range(windowStart, windowStop)
        )
        windowTokens = nltk.text.Text(tknzr.tokenize(windowText))

        if normalized_shannon_entropy(windowTokens) < DIVERSITY_THRESHOLD:
            predictedFragment['count'] += 1
            predictedFragment['macro'][windowStart] = { # mark prediction in macro
                'duration': windowStop - windowStart,
                'precision': []
            }
            predictedFragment['global'][windowStart:windowStop] = 1 # mark prediction in global metrics

    return predictedFragment

In [15]:
def detectLowDiversityWithoutLowFrequency(channel, video, length = None, messages = None, totalNumberOfMessages = None):
    """Flag low-diversity windows, but only where chat is busier than average.

    Works like ``detectLowDiversity`` (consecutive windows of
    DIVERSITY_BASE_WINDOW seconds scored with ``normalized_shannon_entropy``)
    and additionally requires the window's messages-per-second to be strictly
    greater than the average over the whole video.

    Parameters
    ----------
    channel, video : used only to build the data path when a hint is missing.
    length : optional hint, number of seconds in the video; read from
        ``info.json`` when falsy.
    messages : optional hint, ``messages[s]`` is the list of chat message
        bodies posted during second ``s``; rebuilt from the ``Message-*.json``
        files when falsy.
    totalNumberOfMessages : optional hint accompanying ``messages``.  When
        ``messages`` is given without it, the total is counted from the
        buckets (previously this combination raised ``TypeError``).  It is
        always recomputed when the messages are loaded from disk.

    Returns
    -------
    dict with 'macro' (0 or ``{'duration', 'precision'}`` at each fragment's
    first second), 'global' (int numpy array, 1 on predicted seconds) and
    'count' (number of fragments).
    """
    if not length: # no usable hint: take the length from the video metadata
        with open(DATA_PATH + channel + "/" + str(video) + "/info.json", "r", encoding="utf-8") as file:
            length = json.load(file)['length']

    if not messages: # no usable hint: bucket the crawled chat by second
        messages = [[] for i in range(length)]
        totalNumberOfMessages = 0 # a caller-supplied total is meaningless without its messages

        messagePathList = glob.glob(DATA_PATH + channel + "/" + str(video) + "/Message-*.json")
        for path in messagePathList:
            with open(path, "r", encoding="utf-8") as file:
                commentData = json.load(file)['comments']

            for comment in commentData:
                offset = math.floor( comment['content_offset_seconds'] ) # get comment offset

                if offset >= length:
                    break # the rest of this file is skipped once a comment is past the end

                messages[offset].append( comment['message']['body'] )
                totalNumberOfMessages += 1
    elif totalNumberOfMessages is None:
        # messages were supplied without their total: derive it instead of
        # failing on `None / length` below
        totalNumberOfMessages = sum(len(messages[second]) for second in range(length))

    predictedFragment = { # initialize the object that stores what we predict
        'macro': [0] * length,
        'global': np.zeros(length, dtype=int),
        'count': 0 # predicted fragment count for macro precision computing
    }

    avgMessageFrequency = totalNumberOfMessages / length

    startIndex = 0
    while startIndex < length:
        stopIndex = min(startIndex + DIVERSITY_BASE_WINDOW, length) # exclusive; clipped at the video's end
        duration = stopIndex - startIndex

        localMessage = ''
        localFrequency = 0
        for second in range(startIndex, stopIndex):
            localMessage += ' '.join(messages[second]) + ' '
            localFrequency += len(messages[second])

        localTokens = tknzr.tokenize(localMessage) # tokenization
        localText = nltk.text.Text(localTokens) # convert tokens to NLTK text

        score = normalized_shannon_entropy(localText)

        # low diversity AND more messages per second than the video average
        if score < DIVERSITY_THRESHOLD and localFrequency / duration > avgMessageFrequency:
            predictedFragment['count'] += 1

            predictedFragment['macro'][startIndex] = { # mark prediction in macro
                'duration': duration,
                'precision': []
            }

            for global_index in range(startIndex, stopIndex): # mark prediction in global metrics
                predictedFragment['global'][global_index] = 1

        startIndex += DIVERSITY_BASE_WINDOW # move to next period

    return predictedFragment

In [16]:
def detectLowDiversityExpand(channel, video, length = None, messages = None): # now only expand the tail of fragment
    """Flag low-diversity windows and try to extend each one at its tail.

    A base window of DIVERSITY_BASE_WINDOW seconds is scored with
    ``normalized_shannon_entropy``.  When the score is below
    DIVERSITY_THRESHOLD the window becomes a fragment, and the fragment is
    grown in steps of 5 seconds (up to MAX_CLIP_LENGTH seconds in total) for
    as long as ``diversity / messages-per-second`` of the grown fragment stays
    strictly below that ratio for the base window.  The next base window
    starts right after the previous window or fragment.

    Parameters
    ----------
    channel, video : used only to build the data path when a hint is missing.
    length : optional hint, number of seconds in the video; read from
        ``info.json`` when falsy.
    messages : optional hint, ``messages[s]`` is the list of chat message
        bodies posted during second ``s``; rebuilt from the ``Message-*.json``
        files when falsy.

    Returns
    -------
    dict with 'macro' (0 or ``{'duration', 'precision'}`` at each fragment's
    first second), 'global' (int numpy array, 1 on predicted seconds) and
    'count' (number of fragments).
    """
    if not length: # no usable hint: take the length from the video metadata
        with open(DATA_PATH + channel + "/" + str(video) + "/info.json", "r", encoding="utf-8") as file:
            length = json.load(file)['length']

    if not messages: # no usable hint: bucket the crawled chat by second
        messages = [[] for i in range(length)]

        messagePathList = glob.glob(DATA_PATH + channel + "/" + str(video) + "/Message-*.json")
        for path in messagePathList:
            with open(path, "r", encoding="utf-8") as file:
                commentData = json.load(file)['comments']

            for comment in commentData:
                offset = math.floor( comment['content_offset_seconds'] ) # get comment offset

                if offset >= length:
                    break # the rest of this file is skipped once a comment is past the end

                messages[offset].append( comment['message']['body'] )

    predictedFragment = { # initialize the object that stores what we predict
        'macro': [0] * length,
        'global': np.zeros(length, dtype=int),
        'count': 0 # predicted fragment count for macro precision computing
    }

    RESHARP_WINDOW = 5 # seconds added per expansion attempt

    startIndex = 0
    while startIndex < length:
        endIndex = min(startIndex + DIVERSITY_BASE_WINDOW, length) - 1 # inclusive; clipped at the video's end
        duration = endIndex - startIndex + 1

        localMessage = ''
        numberOfMessages = 0
        for second in range(startIndex, endIndex + 1):
            localMessage += ' '.join(messages[second]) + ' '
            numberOfMessages += len(messages[second])

        localTokens = tknzr.tokenize(localMessage) # tokenization
        localText = nltk.text.Text(localTokens) # convert tokens to NLTK text

        score = normalized_shannon_entropy(localText)

        # Detect if it is highlight
        # NOTE(review): a window without any message scores 0 and is therefore
        # reported as a fragment too -- confirm that this is intended.
        if score < DIVERSITY_THRESHOLD:
            localFrequency = numberOfMessages / duration

            # A silent base window has no diversity/frequency ratio to compare
            # against; evaluating score / localFrequency used to raise
            # ZeroDivisionError as soon as the expansion window held a message.
            # Such a window is kept as-is, which is what already happened
            # whenever the expansion window was silent as well.
            if localFrequency > 0:
                baseRatio = score / localFrequency

                for i in range(endIndex + 1, endIndex + (MAX_CLIP_LENGTH - DIVERSITY_BASE_WINDOW + 1), RESHARP_WINDOW):
                    if i >= length: # over video length
                        break

                    newEndIndex = min(i + RESHARP_WINDOW, length) - 1 # inclusive; clipped at the video's end
                    for second in range(i, newEndIndex + 1):
                        localMessage += ' '.join(messages[second]) + ' '
                        numberOfMessages += len(messages[second])

                    localTokens = tknzr.tokenize(localMessage) # tokenization
                    localText = nltk.text.Text(localTokens) # convert tokens to NLTK text

                    newScore = normalized_shannon_entropy(localText)
                    newDuration = newEndIndex - startIndex + 1

                    # numberOfMessages > 0 here: it started positive and only grows
                    if newScore / (numberOfMessages / newDuration) < baseRatio:
                        duration = newDuration
                        endIndex = newEndIndex
                    else:
                        break # the first rejected step ends the expansion

            predictedFragment['count'] += 1

            predictedFragment['macro'][startIndex] = { # mark prediction in macro
                'duration': duration,
                'precision': []
            }

            for global_index in range(startIndex, endIndex+1): # mark prediction in global metrics
                predictedFragment['global'][global_index] = 1

        startIndex += duration # move to next period

    return predictedFragment

In [17]:
def detectLowDiversityWithoutLowFrequencyExpand(channel, video, length = None, messages = None, totalNumberOfMessages = None): # now only expand the tail of fragment
    """Flag busy low-diversity windows and try to extend each one at its tail.

    A base window of DIVERSITY_BASE_WINDOW seconds becomes a fragment when its
    ``normalized_shannon_entropy`` is below DIVERSITY_THRESHOLD and its
    messages-per-second is strictly greater than the video average.  The
    fragment is then grown in steps of 5 seconds (up to MAX_CLIP_LENGTH
    seconds in total) for as long as ``diversity / messages-per-second`` of
    the grown fragment stays strictly below that ratio for the base window.
    The next base window starts right after the previous window or fragment.

    Parameters
    ----------
    channel, video : used only to build the data path when a hint is missing.
    length : optional hint, number of seconds in the video; read from
        ``info.json`` when falsy.
    messages : optional hint, ``messages[s]`` is the list of chat message
        bodies posted during second ``s``; rebuilt from the ``Message-*.json``
        files when falsy.
    totalNumberOfMessages : optional hint accompanying ``messages``.  When
        ``messages`` is given without it, the total is counted from the
        buckets (previously this combination raised ``TypeError``).  It is
        always recomputed when the messages are loaded from disk.

    Returns
    -------
    dict with 'macro' (0 or ``{'duration', 'precision'}`` at each fragment's
    first second), 'global' (int numpy array, 1 on predicted seconds) and
    'count' (number of fragments).
    """
    if not length: # no usable hint: take the length from the video metadata
        with open(DATA_PATH + channel + "/" + str(video) + "/info.json", "r", encoding="utf-8") as file:
            length = json.load(file)['length']

    if not messages: # no usable hint: bucket the crawled chat by second
        messages = [[] for i in range(length)]
        totalNumberOfMessages = 0 # a caller-supplied total is meaningless without its messages

        messagePathList = glob.glob(DATA_PATH + channel + "/" + str(video) + "/Message-*.json")
        for path in messagePathList:
            with open(path, "r", encoding="utf-8") as file:
                commentData = json.load(file)['comments']

            for comment in commentData:
                offset = math.floor( comment['content_offset_seconds'] ) # get comment offset

                if offset >= length:
                    break # the rest of this file is skipped once a comment is past the end

                messages[offset].append( comment['message']['body'] )
                totalNumberOfMessages += 1
    elif totalNumberOfMessages is None:
        # messages were supplied without their total: derive it instead of
        # failing on `None / length` below
        totalNumberOfMessages = sum(len(messages[second]) for second in range(length))

    predictedFragment = { # initialize the object that stores what we predict
        'macro': [0] * length,
        'global': np.zeros(length, dtype=int),
        'count': 0 # predicted fragment count for macro precision computing
    }

    avgMessageFrequency = totalNumberOfMessages / length

    RESHARP_WINDOW = 5 # seconds added per expansion attempt

    startIndex = 0
    while startIndex < length:
        endIndex = min(startIndex + DIVERSITY_BASE_WINDOW, length) - 1 # inclusive; clipped at the video's end
        duration = endIndex - startIndex + 1

        localMessage = ''
        numberOfMessages = 0
        for second in range(startIndex, endIndex + 1):
            localMessage += ' '.join(messages[second]) + ' '
            numberOfMessages += len(messages[second])

        localTokens = tknzr.tokenize(localMessage) # tokenization
        localText = nltk.text.Text(localTokens) # convert tokens to NLTK text

        score = normalized_shannon_entropy(localText)

        # Detect if it is highlight
        if score < DIVERSITY_THRESHOLD:
            localFrequency = numberOfMessages / duration
            if localFrequency > avgMessageFrequency:
                for i in range(endIndex + 1, endIndex + (MAX_CLIP_LENGTH - DIVERSITY_BASE_WINDOW + 1), RESHARP_WINDOW):
                    if i >= length: # over video length
                        break

                    newEndIndex = min(i + RESHARP_WINDOW, length) - 1 # inclusive; clipped at the video's end
                    for second in range(i, newEndIndex + 1):
                        localMessage += ' '.join(messages[second]) + ' '
                        numberOfMessages += len(messages[second])

                    localTokens = tknzr.tokenize(localMessage) # tokenization
                    localText = nltk.text.Text(localTokens) # convert tokens to NLTK text

                    newScore = normalized_shannon_entropy(localText)
                    newDuration = newEndIndex - startIndex + 1

                    # keep growing only while diversity per unit of chat rate improves on the base window
                    if numberOfMessages != 0 and (newScore / (numberOfMessages / newDuration) < score / localFrequency):
                        duration = newDuration
                        endIndex = newEndIndex
                    else:
                        break # the first rejected step ends the expansion

                predictedFragment['count'] += 1

                predictedFragment['macro'][startIndex] = { # mark prediction in macro
                    'duration': duration,
                    'precision': []
                }

                for global_index in range(startIndex, endIndex+1): # mark prediction in global metrics
                    predictedFragment['global'][global_index] = 1

        startIndex += duration # move to next period

    return predictedFragment

In [None]:
# Scratch cell: build messageEachSecond, one string per second of the
# hard-coded lirik/389178879 video, holding that second's chat message bodies
# joined by single spaces ('' when nobody wrote anything).
# Relies on videoLength and numberOfMessage set in an earlier cell.
messageEachSecond = [''] * videoLength

# Message files are numbered from 1 to numberOfMessage.
for i in range(1, numberOfMessage + 1):
    with open("../TwitchHighlightCrawler/vod/lirik/389178879/Message-" + str(i) + ".json", "r", encoding="utf-8") as file:
        messages = file.read()
        
    messages = json.loads(messages)['comments']
    for message in messages:
        offset = math.floor( message['content_offset_seconds'] ) # whole second the comment was posted at
        
        if offset >= videoLength: # comment past the end: skip the rest of this file
            break
        
        if messageEachSecond[offset]: # append to what this second already holds
            messageEachSecond[offset] += ' ' + message['message']['body']
        else:
            messageEachSecond[offset] = message['message']['body']

In [None]:
# Scratch cell: inline version of the "low diversity + above-average frequency"
# detection over fixed 30-second windows, using the globals messageEachSecond
# and commentFQ. Results are stored under 'macro' / 'micro' / 'count'.
predictedFragment = {
    'macro': [0] * videoLength,
    'micro': np.zeros(videoLength, dtype=int),
    'count': 0 # predicted fragment count for macro precision computing # 356
}

i = 0
while i < videoLength:
    localMessage = ''
    localFQ = 0
    
    for j in range(i, i+30):
        if j >= videoLength: # over video length
            j -= 1 # step back to the last valid second so the duration below is right
            break
            
        localMessage += messageEachSecond[j] + ' '
        localFQ += commentFQ[j]
        
    localTokens = tknzr.tokenize(localMessage) # tokenization
    localText = nltk.text.Text(localTokens) # convert tokens to NLTK text
    
    score = normalized_shannon_entropy(localText)
    
    # 0.835 is the same value as DIVERSITY_THRESHOLD.
    if score < 0.835:
        # NOTE(review): 2.526795059660875 is presumably the average number of
        # comments per second for this video -- confirm and name it.
        if localFQ / (j - i + 1) > 2.526795059660875:
            # The string literal below is a disabled experiment kept as a bare
            # expression; it has no effect at run time.
            '''scoreList = []
            localMessage = '' # reset
            localFQ = 0 # reset
            shape_window = 5
            for macro_index in range(i, i+30, shape_window):
                for macro_seek in range(macro_index, macro_index + shape_window):
                    localMessage += messageEachSecond[j] + ' '
                    localFQ += commentFQ[j]
                    
                localTokens = tknzr.tokenize(localMessage) # tokenization
                localText = nltk.text.Text(localTokens) # convert tokens to NLTK text
                
                scoreList.append({
                    'frequency': localFQ / (macro_seek - i + 1),
                    'diversity': normalized_shannon_entropy(localText)
                })'''
            
            predictedFragment['count'] += 1
            
            predictedFragment['macro'][i] = { # mark prediction in macro
                'duration': j - i + 1, # length of this fragment = j - i + 1
                'precision': []
            }
            
            for micro_index in range(i, j+1): # mark prediction in micro
                predictedFragment['micro'][micro_index] = 1
    
    i += 30 # move to the next window

In [22]:
def local_score(clipDataList, predict, numberOfPredictedFragment):
    """Fragment-level precision / recall / F1 of predictions against ground-truth clips.

    Parameters
    ----------
    clipDataList : list of ``{'offset': int, 'duration': int}`` ground-truth
        clips (seconds).
    predict : per-second list; 0, or at the first second of a predicted
        fragment ``{'duration': int, 'precision': list}``.
    numberOfPredictedFragment : number of predicted fragments (denominator of
        the precision).

    For every clip, the predicted fragments that start at most
    MAX_CLIP_LENGTH - 1 seconds before the clip (or inside it) and overlap it
    are collected.  Each overlap contributes ``overlap / fragment duration`` to
    that fragment's 'precision' list and ``overlap / clip duration`` to the
    clip's recall.  Recall is averaged per clip, then over all clips; precision
    is averaged per fragment, then over ``numberOfPredictedFragment``.

    Side effects: appends to ``predict[i]['precision']`` and prints
    "precision recall F1".

    Returns
    -------
    ``{'precision', 'recall', 'F1'}`` holding the aggregated scores.  Empty
    inputs and a zero precision + recall yield 0 instead of raising.
    """
    videoLength = len(predict)

    localPrecision = 0
    localRecall = 0

    for clip in clipDataList:
        startOfClip = clip['offset']
        endOfClip = startOfClip + clip['duration'] # exclusive end of the clip

        recall = 0
        overlapTime = 0 # number of fragments overlapping this clip

        # Candidate start seconds, clipped to the video: a negative index would
        # silently read fragments from the END of `predict`, and an index past
        # the last second would raise IndexError.
        firstCandidate = max(startOfClip - (MAX_CLIP_LENGTH - 1), 0)
        stopCandidate = min(endOfClip, videoLength)

        for i in range(firstCandidate, stopCandidate):
            if not predict[i]:
                continue

            endOfPrediction = i + predict[i]['duration'] # exclusive end of the fragment
            # length of the intersection of [i, endOfPrediction) and [startOfClip, endOfClip)
            overlap = min(endOfPrediction, endOfClip) - max(i, startOfClip)
            if overlap <= 0: # fragment ends before the clip starts
                continue

            predict[i]['precision'].append( overlap / predict[i]['duration'] )
            recall += overlap / clip['duration']
            overlapTime += 1

        if overlapTime:
            localRecall += recall / overlapTime

    for fragment in predict:
        if fragment and fragment['precision']: # fragment overlaps at least one clip
            localPrecision += sum(fragment['precision']) / len(fragment['precision'])

    localPrecision = localPrecision / numberOfPredictedFragment if numberOfPredictedFragment else 0.0
    localRecall = localRecall / len(clipDataList) if clipDataList else 0.0

    denominator = localPrecision + localRecall
    F1 = (2 * localPrecision * localRecall) / denominator if denominator else 0.0
    print(str(localPrecision) + ' ' + str(localRecall) + ' ' + str(F1))

    # Return the aggregated scores (the per-fragment loop variables were
    # returned before, and were undefined when nothing overlapped).
    return {
        'precision': localPrecision,
        'recall': localRecall,
        'F1': F1
    }
    
def macro_score(clipDataList, predict):
    """Clip-averaged precision / recall / F1 of predictions against ground-truth clips.

    Parameters
    ----------
    clipDataList : list of ``{'offset': int, 'duration': int}`` ground-truth
        clips (seconds).
    predict : per-second list; 0, or at the first second of a predicted
        fragment a dict with at least a 'duration' entry.

    For every clip, each predicted fragment that starts at most
    MAX_CLIP_LENGTH - 1 seconds before the clip (or inside it) and overlaps it
    yields a precision (``overlap / fragment duration``), a recall
    (``overlap / clip duration``) and their F1.  The three values are averaged
    over the fragments of the clip and then over all clips; clips without any
    overlapping fragment count as 0.

    Prints "precision recall F1" and returns the same values as
    ``{'precision', 'recall', 'F1'}``.  ``predict`` is not modified.
    """
    videoLength = len(predict)

    macroPrecision = 0
    macroRecall = 0
    macroF1 = 0

    for clip in clipDataList:
        startOfClip = clip['offset']
        endOfClip = startOfClip + clip['duration'] # exclusive end of the clip

        precision = 0
        recall = 0
        F1 = 0
        overlapTime = 0 # number of fragments overlapping this clip

        # Candidate start seconds, clipped to the video: a negative index would
        # silently read fragments from the END of `predict`, and an index past
        # the last second would raise IndexError.
        firstCandidate = max(startOfClip - (MAX_CLIP_LENGTH - 1), 0)
        stopCandidate = min(endOfClip, videoLength)

        for i in range(firstCandidate, stopCandidate):
            if not predict[i]:
                continue

            endOfPrediction = i + predict[i]['duration'] # exclusive end of the fragment
            # length of the intersection of [i, endOfPrediction) and [startOfClip, endOfClip)
            overlap = min(endOfPrediction, endOfClip) - max(i, startOfClip)
            if overlap <= 0: # no overlap: would give a zero or negative F1 denominator
                continue

            singlePrecisionValue = overlap / predict[i]['duration']
            singleRecallValue = overlap / clip['duration']

            precision += singlePrecisionValue
            recall += singleRecallValue
            F1 += (2 * singlePrecisionValue * singleRecallValue) / (singlePrecisionValue + singleRecallValue)

            overlapTime += 1

        if overlapTime:
            macroPrecision += precision / overlapTime
            macroRecall += recall / overlapTime
            macroF1 += F1 / overlapTime

    numberOfAnswer = len(clipDataList)
    if numberOfAnswer: # nothing to average over otherwise; the scores stay 0
        macroPrecision /= numberOfAnswer
        macroRecall /= numberOfAnswer
        macroF1 /= numberOfAnswer
    print(str(macroPrecision) + ' ' + str(macroRecall) + ' ' + str(macroF1))

    return {
        'precision': macroPrecision,
        'recall': macroRecall,
        'F1': macroF1
    }
    
def global_score(groundTruth, predict):
    """Second-level precision / recall / F1, plain and weighted by clip multiplicity.

    Parameters
    ----------
    groundTruth : per-second sequence; ``groundTruth[i]`` is the number of
        ground-truth clips covering second ``i`` (0 when none).
    predict : per-second sequence, truthy on predicted seconds.  Only the
        first ``len(groundTruth)`` entries are read.

    The plain scores count every second once.  In the weighted scores a second
    covered by ``k`` clips counts ``k`` times; a predicted second covered by
    no clip counts once on the prediction side.

    Prints the plain scores, then the weighted scores, and returns
    ``{'normal': {...}, 'weighted': {...}}``, each with 'precision', 'recall'
    and 'F1'.  A score whose denominator is zero (no predicted second, no clip
    second, or precision + recall == 0) is reported as 0.0 instead of raising
    ZeroDivisionError.
    """
    def ratio(numerator, denominator):
        # 0.0 when there is nothing to divide by
        return numerator / denominator if denominator else 0.0

    def harmonicMean(precision, recall):
        return ratio(2 * precision * recall, precision + recall)

    videoLength = len(groundTruth)

    # normal score
    overlap = 0
    totalClipSeconds = 0
    totalPredictSeconds = 0

    # weighted score
    weightedOverlap = 0
    totalWeightedClipSeconds = 0
    totalWeightedPredictSeconds = 0

    for i in range(videoLength):
        clipCount = groundTruth[i]
        predicted = predict[i]

        if clipCount:
            totalClipSeconds += 1
            totalWeightedClipSeconds += clipCount

            if predicted:
                totalPredictSeconds += 1
                totalWeightedPredictSeconds += clipCount

                overlap += 1
                weightedOverlap += clipCount
        elif predicted:
            totalPredictSeconds += 1
            totalWeightedPredictSeconds += 1 # no clip on this second, so weight is 1

    precision = ratio(overlap, totalPredictSeconds)
    recall = ratio(overlap, totalClipSeconds)
    F1 = harmonicMean(precision, recall)

    weightedPrecision = ratio(weightedOverlap, totalWeightedPredictSeconds)
    weightedRecall = ratio(weightedOverlap, totalWeightedClipSeconds)
    weightedF1 = harmonicMean(weightedPrecision, weightedRecall)
    print(str(precision) + ' ' + str(recall) + ' ' + str(F1))
    print(str(weightedPrecision) + ' ' + str(weightedRecall) + ' ' + str(weightedF1))

    return {
        'normal': {
            'precision': precision,
            'recall': recall,
            'F1': F1
        },
        'weighted': {
            'precision': weightedPrecision,
            'recall': weightedRecall,
            'F1': weightedF1
        }
    }

def evaluation(channel, video, predict):
    """Score a detector's output against the crawled clips of one video.

    Parameters
    ----------
    channel, video : locate ``<DATA_PATH><channel>/<video>/clip/``; ``video``
        may be a str or an int (it is converted with ``str`` like in the
        detectors).
    predict : dict with 'macro', 'global' and 'count' entries, as returned by
        the detectors.

    Every file of the clip folder is read as JSON; its ``vod.offset`` and the
    ceiling of its ``duration`` describe one ground-truth clip.  The video
    length is taken from ``len(predict['macro'])``.

    Prints the fragment count, then whatever ``local_score`` and
    ``global_score`` print, and returns ``{'local': ..., 'global': ...}`` with
    the two score dicts.
    """
    videoLength = len(predict['macro']) # the prediction defines the video length here
    global_answer = np.zeros(videoLength, dtype=int) # clips covering each second

    clipDirectory = DATA_PATH + channel + '/' + str(video) + '/clip'
    clipDataList = [] # ground truth for local & macro
    for clip in sorted(os.listdir(clipDirectory)): # sorted: same clip order on every run
        with open(clipDirectory + '/' + clip, "r", encoding="utf-8") as file:
            data = json.load(file)

        # generate ground truth for local & macro evaluation metrics
        clipData = {
            'offset': data['vod']['offset'],
            'duration': math.ceil( data['duration'] ) # length of ground truth clip
        }
        clipDataList.append(clipData)

        # generate ground truth for global evaluation metrics; seconds outside
        # the video are dropped (they used to wrap around or raise IndexError)
        firstSecond = max(clipData['offset'], 0)
        stopSecond = min(clipData['offset'] + clipData['duration'], videoLength)
        for i in range(firstSecond, stopSecond):
            global_answer[i] += 1

    print(predict['count'])
    localResult = local_score(clipDataList, predict['macro'], predict['count'])
    #macro_score(clipDataList, predict['macro'])
    globalResult = global_score(global_answer, predict['global'])

    return {
        'local': localResult,
        'global': globalResult
    }

In [55]:
# 'lirik', '389178879'
# DIVERSITY_THRESHOLD < 0.835

# getBaseline
515
0.31255464027255025 0.2876522567032966 0.299586852701908
0.39937353171495693 0.2872430301323571 0.33415233415233414
#0.6573413100600842 0.2876522567032966 0.3614468198549178
0.7179628608200037 0.40441176470588236 0.5173898641934416

# detectLowDiversity
356
0.2628395718152477 0.4893461761926615 0.34198877008915485
0.3429000187582067 0.5147845677274008 0.41161900472866464
#0.45959520239879975 0.4893461761926615 0.46529794868318325
0.6652331804281345 0.7208989229494615 0.691948310139165

# detectLowDiversityWithoutLowFrequency
253
0.31302776542041605 0.48823340732165216 0.3814751486696332
0.40737812911725957 0.43537031822021965 0.4209093384154642
#0.45743794769281965 0.48823340732165216 0.46380044758987177
0.7378940621175922 0.6557062966031483 0.6943766621884683

# detectLowDiversityReshap
291
0.2607106187891543 0.6286878185352636 0.3685762945314142
0.3512001443782711 0.5480146437623205 0.4280686317641883
#0.45190903466015914 0.6286878185352636 0.5041619874828696
0.6524051244863428 0.6987883181441591 0.6748006100457534

# detectLowDiversityWithoutLowFrequencyReshap
198
0.3020856113494304 0.6081953340588494 0.40367110887207064
0.4196286472148541 0.4455083075190087 0.4321813959841552
#0.45583974787498033 0.6081953340588494 0.5007325842535026
0.7422243166823751 0.6524440762220381 0.6944444444444444

0.31302776542041605
0.46581959122317684


In [None]:
# 'lirik', '389178879'
# DIVERSITY_THRESHOLD < 0.8

# getBaseline
515
0.31255464027255025 0.2876522567032966 0.299586852701908
0.39937353171495693 0.2872430301323571 0.33415233415233414
0.7179628608200037 0.40441176470588236 0.5173898641934416

# detectLowDiversity
229
0.28183306622319965 0.43600120533922376 0.34236191122585213
0.37098657326328077 0.3579273444100253 0.3643399742009459
0.7054803881372147 0.5345898922949461 0.608260177929653

# detectLowDiversityWithoutLowFrequency
166
0.32917472583340396 0.4344338179064545 0.37454958846252523
0.43775100401606426 0.3069557871022247 0.3608674060585995
0.7699827487061529 0.48534589892294944 0.595394632364618

# detectLowDiversityReshap
192
0.2740116097248532 0.43936409740730725 0.33752442754146034
0.3683712121212121 0.38341312306392566 0.3757416862149855
0.6500524658971668 0.4490990057995029 0.5312059778281374

# detectLowDiversityWithoutLowFrequencyReshap
143
0.31189666808158234 0.40969243856628224 0.35416750433124056
0.4261029411764706 0.3263869332582371 0.36963801626534837
0.7202007528230866 0.41611433305716655 0.5274696422710863

In [None]:
# 'lirik', '389178879'
# DIVERSITY_THRESHOLD < 0.4

# getBaseline
515
0.31255464027255025 0.2876522567032966 0.299586852701908
0.39937353171495693 0.2872430301323571 0.33415233415233414
0.7179628608200037 0.40441176470588236 0.5173898641934416

# detectLowDiversity
379
0.2533532653462485 0.4943392632744201 0.33501061397651916
0.33033826638477803 0.5280202759785976 0.40641595318088225
0.6486573924296344 0.7267502071251035 0.6854868250750934

# detectLowDiversityWithoutLowFrequency
269
0.2976308227436131 0.4896959067560582 0.37023662518000683
0.3876084262701363 0.4404393128696142 0.412338518323227
0.7200158631239023 0.6580882352941176 0.6876606336065795

# detectLowDiversityReshap
297
0.25854672514892657 0.631986448774863 0.3669667373519406
0.3494538407329105 0.5585750492818924 0.42993388967161594
0.6480484201496449 0.7041217895608948 0.6749224469537163

# detectLowDiversityWithoutLowFrequencyReshap
204
0.2963177650150011 0.609231027051812 0.39871065567158764
0.41049935979513447 0.45142213460996905 0.4299892703862661
0.7331169207582169 0.6548777961888981 0.6917922489948856

In [28]:
# Score the fragments held in `predictedFragment` for channel 'lirik', video '389178879'.
# NOTE(review): `predictedFragment` is not assigned in this cell -- confirm an
# earlier cell defines it, otherwise this fails on a fresh kernel.
evaluation('lirik', '389178879', predictedFragment)

0.31302776542041605
0.48823340732165216
0.40737812911725957
0.43537031822021965


In [36]:
# Un-comment exactly ONE of the detector calls below to build `kk`, then score it.
# NOTE(review): with every assignment commented out, `kk` is not set in this cell,
# so it must already exist in the kernel; on Restart & Run All this raises
# NameError unless another cell assigns it -- confirm.
#kk = getBaseline('lirik', '389178879')
#kk = detectLowDiversity('lirik', '389178879')
#kk = detectLowDiversityWithoutLowFrequency('lirik', '389178879')
#kk = detectLowDiversityReshap('lirik', '389178879')
#kk = detectLowDiversityWithoutLowFrequencyReshap('lirik', '389178879')
#print(kk['count'])
evaluation('lirik', '389178879', kk)

204
0.2963177650150011 0.609231027051812 0.39871065567158764
0.41049935979513447 0.45142213460996905 0.4299892703862661
0.7331169207582169 0.6548777961888981 0.6917922489948856


In [None]:
# Scratch notes: 356 and 253 match the fragment counts recorded above for
# detectLowDiversity and detectLowDiversityWithoutLowFrequency
# (DIVERSITY_THRESHOLD < 0.835). As bare expressions they have no effect.
356
253

# NOTE(review): `localPrecision` / `localRecall` are not assigned in this cell --
# presumably leftovers from an earlier kernel session; confirm before running.
# The numbers in the trailing comments are hand-pasted values, not live output.
print(localPrecision) # 0.2628395718152477 0.31302776542041605
print(localRecall) # 0.4646896538639685 0.46581959122317684