In [1]:
import glob
import json
import math
import os
from collections import Counter

import nltk
from nltk.tokenize import TweetTokenizer

In [2]:
# Root directory of the crawled VOD data; the functions below read
# <DATA_PATH>/<channel>/<video>/... from it.
DATA_PATH = "../TwitchHighlightCrawler/vod/" # const

# Clip length bounds (presumably in seconds -- TODO confirm against the code that uses them).
MIN_CLIP_LENGTH = 5 # const
MAX_CLIP_LENGTH = 60 # const

# Shared tokenizer for chat messages. reduce_len=True makes NLTK shorten
# long runs of a repeated character before tokenizing.
tknzr = TweetTokenizer(reduce_len=True)

In [3]:
def normalized_shannon_entropy(text): # entropy diversity measure (normalized)
    """Return the Shannon entropy of ``text`` normalized by ``log2(len(text))``.

    Parameters
    ----------
    text : sequence of hashable items
        The tokens to measure (a list of strings, an ``nltk.text.Text``, ...).
        It must support ``len()`` and iteration.

    Returns
    -------
    int or float
        ``0`` when ``text`` has at most one item (the normalizer ``log2(1)``
        would be zero); otherwise the entropy of the empirical item
        distribution divided by ``log2(len(text))``. The result is ``0.0``
        when every item is identical and ``1.0`` when every item is distinct.
    """
    textLength = len(text)
    if textLength <= 1:
        return 0

    # Count every distinct item in one pass. Calling text.count() once per
    # vocabulary entry rescans the whole sequence each time, which is
    # quadratic on the cumulative texts this function is fed. Counter also
    # iterates in first-seen order, so the floating-point sum no longer
    # depends on set/hash ordering.
    entropy = 0
    for occurrences in Counter(text).values():
        p = occurrences / textLength
        entropy -= p * math.log2(p)

    return entropy / math.log2(textLength)

In [10]:
def dependentFrequencyDistribution(channel, video, overflow = 0, length = None, messages = None):
    """Write the per-second chat message count around every clip of a video.

    For each file in ``DATA_PATH/<channel>/<video>/clip`` one line is written
    to ``'<video> - dependentFrequency(overflow-<overflow>).txt'`` in the
    current working directory. The line lists, each followed by a space, the
    number of chat messages in every second from ``offset - overflow`` up to
    (not including) ``offset + ceil(duration) + overflow``, where ``offset``
    and ``duration`` come from the clip's ``['vod']['offset']`` and
    ``['duration']`` fields. Seconds that fall outside the video are skipped.

    Parameters
    ----------
    channel : str
        Channel directory name under ``DATA_PATH``.
    video : str or int
        Video id; it is converted with ``str()`` wherever a path is built.
    overflow : int
        Extra seconds included before and after each clip.
    length : int, optional
        Hint for the video length in seconds. When falsy it is read from the
        ``'length'`` field of the video's ``info.json``.
    messages : list of list of str, optional
        Hint for the chat log: ``messages[s]`` holds the message bodies posted
        during second ``s``. When falsy it is rebuilt from the video's
        ``Message-*.json`` files.
    """
    videoPath = DATA_PATH + channel + "/" + str(video)

    # length is a hint for the length of this video
    if not length:
        with open(videoPath + "/info.json", "r", encoding="utf-8") as file:
            length = json.load(file)['length']

    # messages is a hint for comment messages in this video
    if not messages:
        messages = [[] for _ in range(length)]

        for path in glob.glob(videoPath + "/Message-*.json"):
            with open(path, "r", encoding="utf-8") as file:
                commentData = json.load(file)['comments']

            for comment in commentData:
                offset = math.floor(comment['content_offset_seconds']) # get comment offset

                # Skip (rather than stop at) an out-of-range comment so that
                # one stray offset cannot drop the remainder of the file, and
                # so a negative offset never indexes from the end of the list.
                if not 0 <= offset < length:
                    continue

                messages[offset].append(comment['message']['body'])

    clipDirectory = videoPath + '/clip'
    # overflow is an int, so it has to be converted before concatenation.
    outputName = str(video) + ' - dependentFrequency(overflow-' + str(overflow) + ').txt'
    with open(outputName, mode='w') as outputFile:
        # os.listdir order is kept as-is: rows carry no clip id, so the order
        # is the only link between a row and its clip.
        for clip in os.listdir(clipDirectory):
            with open(clipDirectory + '/' + clip, "r", encoding="utf-8") as file:
                data = json.load(file)

            # Clamp the window to the video: a negative start would wrap
            # around to the end of `messages`, and a stop past the end would
            # raise IndexError.
            start = max(0, data['vod']['offset'] - overflow)
            stop = min(len(messages), data['vod']['offset'] + math.ceil(data['duration']) + overflow)

            output = ''.join(str(len(messages[i])) + ' ' for i in range(start, stop))
            outputFile.write(output + '\n')

In [11]:
def frequencyDistribution(channel, video, overflow = 0, length = None, messages = None):
    """Write the running average chat frequency around every clip of a video.

    For each file in ``DATA_PATH/<channel>/<video>/clip`` one line is written
    to ``'<video> - frequency(overflow-<overflow>).txt'`` in the current
    working directory. Walking the seconds from ``offset - overflow`` up to
    (not including) ``offset + ceil(duration) + overflow``, the line lists,
    each followed by a space, the number of messages seen so far divided by
    the number of seconds walked so far. ``offset`` and ``duration`` come from
    the clip's ``['vod']['offset']`` and ``['duration']`` fields. Seconds that
    fall outside the video are skipped and do not count as walked.

    Parameters
    ----------
    channel : str
        Channel directory name under ``DATA_PATH``.
    video : str or int
        Video id; it is converted with ``str()`` wherever a path is built.
    overflow : int
        Extra seconds included before and after each clip.
    length : int, optional
        Hint for the video length in seconds. When falsy it is read from the
        ``'length'`` field of the video's ``info.json``.
    messages : list of list of str, optional
        Hint for the chat log: ``messages[s]`` holds the message bodies posted
        during second ``s``. When falsy it is rebuilt from the video's
        ``Message-*.json`` files.
    """
    videoPath = DATA_PATH + channel + "/" + str(video)

    # length is a hint for the length of this video
    if not length:
        with open(videoPath + "/info.json", "r", encoding="utf-8") as file:
            length = json.load(file)['length']

    # messages is a hint for comment messages in this video
    if not messages:
        messages = [[] for _ in range(length)]

        for path in glob.glob(videoPath + "/Message-*.json"):
            with open(path, "r", encoding="utf-8") as file:
                commentData = json.load(file)['comments']

            for comment in commentData:
                offset = math.floor(comment['content_offset_seconds']) # get comment offset

                # Skip (rather than stop at) an out-of-range comment so that
                # one stray offset cannot drop the remainder of the file, and
                # so a negative offset never indexes from the end of the list.
                if not 0 <= offset < length:
                    continue

                messages[offset].append(comment['message']['body'])

    clipDirectory = videoPath + '/clip'
    # overflow is an int, so it has to be converted before concatenation.
    outputName = str(video) + ' - frequency(overflow-' + str(overflow) + ').txt'
    with open(outputName, mode='w') as outputFile:
        # os.listdir order is kept as-is: rows carry no clip id, so the order
        # is the only link between a row and its clip.
        for clip in os.listdir(clipDirectory):
            with open(clipDirectory + '/' + clip, "r", encoding="utf-8") as file:
                data = json.load(file)

            # Clamp the window to the video: a negative start would wrap
            # around to the end of `messages`, and a stop past the end would
            # raise IndexError.
            start = max(0, data['vod']['offset'] - overflow)
            stop = min(len(messages), data['vod']['offset'] + math.ceil(data['duration']) + overflow)

            output = ''
            totalNumberOfMessages = 0
            for totalSecond, i in enumerate(range(start, stop), start=1):
                totalNumberOfMessages += len(messages[i])
                output += str(totalNumberOfMessages / totalSecond) + ' '

            outputFile.write(output + '\n')

In [12]:
def dependentDiversityDistribution(channel, video, overflow = 0, length = None, messages = None):
    """Write the per-second chat diversity around every clip of a video.

    For each file in ``DATA_PATH/<channel>/<video>/clip`` one line is written
    to ``'<video> - dependentDiversity(overflow-<overflow>).txt'`` in the
    current working directory. For every second from ``offset - overflow`` up
    to (not including) ``offset + ceil(duration) + overflow`` the messages of
    that second alone are joined, tokenized with ``tknzr`` and scored with
    ``normalized_shannon_entropy``; the scores are listed, each followed by a
    space. ``offset`` and ``duration`` come from the clip's
    ``['vod']['offset']`` and ``['duration']`` fields. Seconds that fall
    outside the video are skipped.

    Parameters
    ----------
    channel : str
        Channel directory name under ``DATA_PATH``.
    video : str or int
        Video id; it is converted with ``str()`` wherever a path is built.
    overflow : int
        Extra seconds included before and after each clip.
    length : int, optional
        Hint for the video length in seconds. When falsy it is read from the
        ``'length'`` field of the video's ``info.json``.
    messages : list of list of str, optional
        Hint for the chat log: ``messages[s]`` holds the message bodies posted
        during second ``s``. When falsy it is rebuilt from the video's
        ``Message-*.json`` files.
    """
    videoPath = DATA_PATH + channel + "/" + str(video)

    # length is a hint for the length of this video
    if not length:
        with open(videoPath + "/info.json", "r", encoding="utf-8") as file:
            length = json.load(file)['length']

    # messages is a hint for comment messages in this video
    if not messages:
        messages = [[] for _ in range(length)]

        for path in glob.glob(videoPath + "/Message-*.json"):
            with open(path, "r", encoding="utf-8") as file:
                commentData = json.load(file)['comments']

            for comment in commentData:
                offset = math.floor(comment['content_offset_seconds']) # get comment offset

                # Skip (rather than stop at) an out-of-range comment so that
                # one stray offset cannot drop the remainder of the file, and
                # so a negative offset never indexes from the end of the list.
                if not 0 <= offset < length:
                    continue

                messages[offset].append(comment['message']['body'])

    clipDirectory = videoPath + '/clip'
    # overflow is an int, so it has to be converted before concatenation.
    outputName = str(video) + ' - dependentDiversity(overflow-' + str(overflow) + ').txt'
    with open(outputName, mode='w') as outputFile:
        # os.listdir order is kept as-is: rows carry no clip id, so the order
        # is the only link between a row and its clip.
        for clip in os.listdir(clipDirectory):
            with open(clipDirectory + '/' + clip, "r", encoding="utf-8") as file:
                data = json.load(file)

            # Clamp the window to the video: a negative start would wrap
            # around to the end of `messages`, and a stop past the end would
            # raise IndexError.
            start = max(0, data['vod']['offset'] - overflow)
            stop = min(len(messages), data['vod']['offset'] + math.ceil(data['duration']) + overflow)

            output = ''
            for i in range(start, stop):
                # concatenate the messages of this second only
                totalMessage = ''.join(message + ' ' for message in messages[i])

                tokens = tknzr.tokenize(totalMessage) # tokenization
                text = nltk.text.Text(tokens) # convert tokens to NLTK text

                output += str(normalized_shannon_entropy(text)) + ' '

            outputFile.write(output + '\n')

In [13]:
def diversityDistribution(channel, video, overflow = 0, length = None, messages = None):
    """Write the cumulative chat diversity around every clip of a video.

    For each file in ``DATA_PATH/<channel>/<video>/clip`` one line is written
    to ``'<video> - diversity(overflow-<overflow>).txt'`` in the current
    working directory. Walking the seconds from ``offset - overflow`` up to
    (not including) ``offset + ceil(duration) + overflow``, the messages of
    each second are appended to everything gathered so far for this clip; the
    accumulated text is tokenized with ``tknzr`` and scored with
    ``normalized_shannon_entropy``. The scores are listed, each followed by a
    space. ``offset`` and ``duration`` come from the clip's
    ``['vod']['offset']`` and ``['duration']`` fields. Seconds that fall
    outside the video are skipped.

    Parameters
    ----------
    channel : str
        Channel directory name under ``DATA_PATH``.
    video : str or int
        Video id; it is converted with ``str()`` wherever a path is built.
    overflow : int
        Extra seconds included before and after each clip.
    length : int, optional
        Hint for the video length in seconds. When falsy it is read from the
        ``'length'`` field of the video's ``info.json``.
    messages : list of list of str, optional
        Hint for the chat log: ``messages[s]`` holds the message bodies posted
        during second ``s``. When falsy it is rebuilt from the video's
        ``Message-*.json`` files.
    """
    videoPath = DATA_PATH + channel + "/" + str(video)

    # length is a hint for the length of this video
    if not length:
        with open(videoPath + "/info.json", "r", encoding="utf-8") as file:
            length = json.load(file)['length']

    # messages is a hint for comment messages in this video
    if not messages:
        messages = [[] for _ in range(length)]

        for path in glob.glob(videoPath + "/Message-*.json"):
            with open(path, "r", encoding="utf-8") as file:
                commentData = json.load(file)['comments']

            for comment in commentData:
                offset = math.floor(comment['content_offset_seconds']) # get comment offset

                # Skip (rather than stop at) an out-of-range comment so that
                # one stray offset cannot drop the remainder of the file, and
                # so a negative offset never indexes from the end of the list.
                if not 0 <= offset < length:
                    continue

                messages[offset].append(comment['message']['body'])

    clipDirectory = videoPath + '/clip'
    # overflow is an int, so it has to be converted before concatenation.
    outputName = str(video) + ' - diversity(overflow-' + str(overflow) + ').txt'
    with open(outputName, mode='w') as outputFile:
        # os.listdir order is kept as-is: rows carry no clip id, so the order
        # is the only link between a row and its clip.
        for clip in os.listdir(clipDirectory):
            with open(clipDirectory + '/' + clip, "r", encoding="utf-8") as file:
                data = json.load(file)

            # Clamp the window to the video: a negative start would wrap
            # around to the end of `messages`, and a stop past the end would
            # raise IndexError.
            start = max(0, data['vod']['offset'] - overflow)
            stop = min(len(messages), data['vod']['offset'] + math.ceil(data['duration']) + overflow)

            output = ''
            totalMessage = ''
            for i in range(start, stop):
                # append the messages of this second to the previous ones
                totalMessage += ''.join(message + ' ' for message in messages[i])

                # The whole accumulated string is re-tokenized on purpose:
                # tokenizing per second and concatenating is not guaranteed
                # to give the same tokens.
                tokens = tknzr.tokenize(totalMessage) # tokenization
                text = nltk.text.Text(tokens) # convert tokens to NLTK text

                output += str(normalized_shannon_entropy(text)) + ' '

            outputFile.write(output + '\n')

In [14]:
def DFRatio(channel, video, overflow = 0, length = None, messages = None):
    """Write the cumulative diversity / frequency ratio around every clip.

    For each file in ``DATA_PATH/<channel>/<video>/clip`` one line is written
    to ``'<video> - DFRatio(overflow-<overflow>).txt'`` in the current working
    directory. Walking the seconds from ``offset - overflow`` up to (not
    including) ``offset + ceil(duration) + overflow``, two running quantities
    are kept for the clip:

    * diversity -- ``normalized_shannon_entropy`` of the ``tknzr`` tokens of
      all messages gathered so far;
    * frequency -- messages gathered so far divided by seconds walked so far.

    The value written for each second is ``diversity / frequency``, or ``0``
    while no message has been gathered yet. Values are listed, each followed
    by a space. ``offset`` and ``duration`` come from the clip's
    ``['vod']['offset']`` and ``['duration']`` fields. Seconds that fall
    outside the video are skipped and do not count as walked.

    Parameters
    ----------
    channel : str
        Channel directory name under ``DATA_PATH``.
    video : str or int
        Video id; it is converted with ``str()`` wherever a path is built.
    overflow : int
        Extra seconds included before and after each clip.
    length : int, optional
        Hint for the video length in seconds. When falsy it is read from the
        ``'length'`` field of the video's ``info.json``.
    messages : list of list of str, optional
        Hint for the chat log: ``messages[s]`` holds the message bodies posted
        during second ``s``. When falsy it is rebuilt from the video's
        ``Message-*.json`` files.
    """
    videoPath = DATA_PATH + channel + "/" + str(video)

    # length is a hint for the length of this video
    if not length:
        with open(videoPath + "/info.json", "r", encoding="utf-8") as file:
            length = json.load(file)['length']

    # messages is a hint for comment messages in this video
    if not messages:
        messages = [[] for _ in range(length)]

        for path in glob.glob(videoPath + "/Message-*.json"):
            with open(path, "r", encoding="utf-8") as file:
                commentData = json.load(file)['comments']

            for comment in commentData:
                offset = math.floor(comment['content_offset_seconds']) # get comment offset

                # Skip (rather than stop at) an out-of-range comment so that
                # one stray offset cannot drop the remainder of the file, and
                # so a negative offset never indexes from the end of the list.
                if not 0 <= offset < length:
                    continue

                messages[offset].append(comment['message']['body'])

    clipDirectory = videoPath + '/clip'
    # overflow is an int, so it has to be converted before concatenation.
    outputName = str(video) + ' - DFRatio(overflow-' + str(overflow) + ').txt'
    with open(outputName, mode='w') as outputFile:
        # os.listdir order is kept as-is: rows carry no clip id, so the order
        # is the only link between a row and its clip.
        for clip in os.listdir(clipDirectory):
            with open(clipDirectory + '/' + clip, "r", encoding="utf-8") as file:
                data = json.load(file)

            # Clamp the window to the video: a negative start would wrap
            # around to the end of `messages`, and a stop past the end would
            # raise IndexError.
            start = max(0, data['vod']['offset'] - overflow)
            stop = min(len(messages), data['vod']['offset'] + math.ceil(data['duration']) + overflow)

            output = ''
            totalMessage = ''
            totalNumberOfMessages = 0
            for totalSecond, i in enumerate(range(start, stop), start=1):
                # append the messages of this second to the previous ones
                totalMessage += ''.join(message + ' ' for message in messages[i])
                totalNumberOfMessages += len(messages[i])

                tokens = tknzr.tokenize(totalMessage) # tokenization
                text = nltk.text.Text(tokens) # convert tokens to NLTK text

                # diversity / frequency; 0 until the first message arrives,
                # which also avoids dividing by a zero frequency
                if totalNumberOfMessages:
                    ratio = normalized_shannon_entropy(text) / (totalNumberOfMessages / totalSecond)
                else:
                    ratio = 0
                output += str(ratio) + ' '

            outputFile.write(output + '\n')

In [9]:
# Produce all five distribution files for one VOD of channel 'lirik',
# widening every clip window by 15 seconds on each side.
dependentFrequencyDistribution(channel='lirik', video='389178879', overflow=15)
frequencyDistribution(channel='lirik', video='389178879', overflow=15)
dependentDiversityDistribution(channel='lirik', video='389178879', overflow=15)
diversityDistribution(channel='lirik', video='389178879', overflow=15)
DFRatio(channel='lirik', video='389178879', overflow=15)