In [1]:
import datetime as dt
import glob
import json
import math
import os
import random
import string
from collections import Counter

import nltk
import numpy as np

In [6]:
# ---- crawler output locations and channel selection ----
DATA_PATH = "../TwitchHighlightCrawler/vod/" # const, one sub-folder per channel
TARGET_CHANNEL_LIST = [
    'ninja',
    'shroud',
    'tfue',
    'lirik',
    'summit1g',
    'sodapoppin',
    'timthetatman',
    'loltyler1',
    'drdisrespect',
    'asmongold',
    'dakotaz',
    'nickmercs',
    'tsm_daequan',
    'xqcow',
    'castro_1021',
] # const

# ---- channel name -> channel id lookup, loaded from the crawler's JSON file ----
with open("../TwitchHighlightCrawler/json/Channel_ID.json", mode="r", encoding="utf-8") as file:
    data = file.read()
CHANNEL_ID = json.loads(data)

# ---- date windows (same timestamp format that stringToDateTime parses) ----
# The training window ends exactly where the testing window starts.
TRAINING_DATA_START_DATE = '2019-04-01T00:00:00Z'
TRAINING_DATA_END_DATE = '2019-04-15T00:00:00Z'
TESTING_DATA_START_DATE = '2019-04-15T00:00:00Z'
TESTING_DATA_END_DATE = '2019-04-22T00:00:00Z'

# ---- shared tokenizer used when computing the chat diversity feature ----
tknzr = nltk.tokenize.TweetTokenizer(reduce_len=True)

In [3]:
def log():
    """Debug helper: print the single letter 'a' to standard output."""
    marker = 'a'
    print(marker)
def stringToDateTime(str): # only parse twitch info format
    """Parse a 'YYYY-MM-DDTHH:MM:SSZ' timestamp into a naive datetime.

    Inputs longer than 20 characters (for example ones carrying fractional
    seconds) are cut down to their first 19 characters plus a trailing 'Z'
    before parsing. Anything that still does not match the format makes
    strptime raise ValueError.
    """
    TWITCH_TIME_FORMAT = '%Y-%m-%dT%H:%M:%SZ'

    if len(str) > 20:
        # keep 'YYYY-MM-DDTHH:MM:SS' and restore the terminating 'Z'
        str = str[:19] + 'Z'

    return dt.datetime.strptime(str, TWITCH_TIME_FORMAT)

In [8]:
def shannon_entropy(text): # entropy diversity measure
    """Return the Shannon entropy (in bits) of the token distribution of `text`.

    Args:
        text: a sized iterable of hashable tokens (callers in this file pass
            an nltk Text built from tokenized chat messages).

    Returns:
        -sum(p * log2(p)) over the distinct tokens, where p is a token's
        relative frequency in `text`. An empty `text` yields the integer 0.
    """
    entropy = 0

    textLength = len(text)
    # Counter tallies every token in one pass; calling text.count() once per
    # distinct token would rescan the whole text each time.
    for occurrences in Counter(text).values():
        p = occurrences / textLength

        entropy -= p * math.log2(p)

    return entropy

def normalized_shannon_entropy(text): # entropy diversity measure (normalized)
    """Return the Shannon entropy of `text` divided by log2(len(text)).

    Args:
        text: a sized iterable of hashable tokens (callers in this file pass
            an nltk Text built from tokenized chat messages).

    Returns:
        The entropy of the token distribution divided by log2 of the token
        count, so a text whose tokens are all distinct scores 1.0. Texts with
        zero or one token return the integer 0, because log2(1) == 0 would
        otherwise be the divisor.
    """
    textLength = len(text)
    if textLength <= 1:
        return 0

    entropy = 0
    # Counter tallies every token in one pass; calling text.count() once per
    # distinct token would rescan the whole text each time.
    for occurrences in Counter(text).values():
        p = occurrences / textLength

        entropy -= p * math.log2(p)

    return entropy / math.log2(textLength)

In [9]:
def _loadJsonFile(path):
    """Read the UTF-8 encoded JSON file at `path` and return the decoded object."""
    with open(path, "r", encoding="utf-8") as file:
        return json.load(file)


def _buildClipLabel(channel, video, clipPath, videoLength, clipDeadline, is_testing, filter_low_views, viewsThreshold):
    """Convert the clip files of one video into a per-second label array.

    Returns a tuple (labelData, clipCount). clipCount is the number of clips
    that passed every filter. labelData has shape (videoLength, 1) holding 0/1
    flags when training, and shape (videoLength,) holding the number of
    overlapping clips per second when testing.
    """
    if is_testing:
        labelData = np.zeros( videoLength, dtype=int ) # clip count per second
    else:
        labelData = np.zeros( (videoLength, 1), dtype=int ) # binary flag per second

    clipCount = 0 # valid clip count
    for clip in os.listdir(clipPath): # clip loop
        clipInfo = _loadJsonFile(clipPath + clip)

        if filter_low_views and clipInfo['views'] < viewsThreshold:
            continue # filter out a clip with low views
        if stringToDateTime(clipInfo['created_at']) >= clipDeadline:
            continue # filter out a clip created outside the range

        offset = clipInfo['vod']['offset']
        duration = math.ceil( clipInfo['duration'] ) # length of ground truth clip

        if offset >= videoLength: # the clip starts after the video ends
            print(channel + ' ' + video + '(outside) ' + clip)
            continue

        end = offset + duration
        if end > videoLength: # the clip runs past the end of the video: report once and truncate
            print(channel + ' ' + video + '(inside) ' + clip)
            end = videoLength

        # A negative offset would otherwise wrap around and mark seconds at the end of the video.
        start = max(offset, 0)

        if is_testing:
            labelData[start:end] += 1
        else:
            labelData[start:end, 0] = 1

        clipCount += 1

    return labelData, clipCount


def _collectMessagesPerSecond(channel, video, videoPath, videoLength, only_chat):
    """Bucket the chat message bodies of one video by whole-second offset.

    Returns a list with videoLength entries; entry i holds the bodies of the
    messages whose floored content offset equals i.
    """
    messages = [[] for _ in range(videoLength)]

    for path in glob.glob(videoPath + 'Message-*.json'):
        for comment in _loadJsonFile(path)['comments']:
            offset = math.floor( comment['content_offset_seconds'] ) # get comment offset

            if offset >= videoLength:
                # NOTE(review): stops reading this file at the first out-of-range
                # comment, which presumes comments are ordered by offset - confirm
                break
            if only_chat and comment['source'] != 'chat':
                if comment['source'] != 'comment':
                    print(channel + ' ' + video + ' ' + str(offset) + ' ' + comment['source'])
                continue
            if comment['state'] != 'published':
                # reported only; the message is still counted below
                print(channel + ' ' + video + ' ' + str(offset) + ' ' + comment['state'])

            messages[offset].append( comment['message']['body'] )

    return messages


def _extractFeaturesPerSecond(messages):
    """Build the [frequency, diversity] feature pair for every second of a video."""
    featureData = []
    for secondMessages in messages:
        localTokens = tknzr.tokenize(' '.join(secondMessages)) # tokenization
        localText = nltk.text.Text(localTokens) # convert tokens to NLTK text

        featureData.append([
            len(secondMessages), # frequency
            normalized_shannon_entropy(localText), # diversity
        ])
    return featureData


def _stackLabelArrays(labels):
    """Combine per-video label arrays into one numpy array.

    Equal-length labels stack into a regular 2-D array. Labels of different
    lengths are stored in a 1-D object array, one element per video, because
    np.array() on ragged input is rejected by recent NumPy releases.
    """
    if len({len(label) for label in labels}) <= 1:
        return np.array(labels)

    stacked = np.empty(len(labels), dtype=object)
    for s, label in enumerate(labels):
        stacked[s] = label
    return stacked


def generateFittingData(start_datetime = dt.datetime(1,1,1), end_datetime = dt.datetime(9999,12,31,23,59,59), only_chat = False, is_testing = False, filter_low_views = False): # end_datetime default value is not enough
    """Build per-second chat features and clip labels and save them as .npy files.

    Every video folder of every channel in TARGET_CHANNEL_LIST is inspected.
    A video is used when its info.json exists and has no 'error', it is a
    public 'archive' broadcast of the expected channel, its 'created_at' lies
    in [start_datetime, end_datetime), and at least one clip passes the clip
    filters.

    Args:
        start_datetime: inclusive lower bound on the video creation time.
        end_datetime: exclusive upper bound on the video creation time.
        only_chat: when True, only comments whose source is 'chat' are used.
        is_testing: False writes 'training_*' files with 0/1 labels padded to
            the longest video; True writes 'testing_*' files with unpadded
            per-second clip counts.
        filter_low_views: when True, clips with fewer than VIEWS_THRESHOLD
            views are ignored.

    Returns:
        The random task id embedded in the three file names written by
        np.save ('<prefix>_mark-<id>', '<prefix>_data-<id>', '<prefix>_label-<id>').
    """
    LENGTH_OF_TASK_ID = 13 # const

    POSSIBLE_VIDEO_TYPE = {'archive', 'upload', 'highlight'} # const
    NUMBER_OF_FEATURES = 2 # const, frequency and diversity

    VIEWS_THRESHOLD = 3 # const
    CLIP_GRACE_PERIOD = 14 # const, 14 days for recording clip

    DUMMY_VALUE = -1 # padding value

    task_id = ''.join(random.choices(string.ascii_lowercase + string.digits, k=LENGTH_OF_TASK_ID))

    fittingData = {
        'mark': [], # store channel name and video id as [channel, video]
        'data': [],
        'label': [] # this would be different between training label and testing label
    }

    max_video_length = 0

    for channel in TARGET_CHANNEL_LIST: # channel loop
        for video in os.listdir(DATA_PATH + channel): # video loop
            VIDEO_PATH = DATA_PATH + channel + '/'+ video +'/' # local const

            if not os.path.isfile(VIDEO_PATH + 'info.json'): # check file exist
                print(channel + ' ' + video + ' info.json file does not exist')
                continue

            videoInfo = _loadJsonFile(VIDEO_PATH + 'info.json')

            if 'error' in videoInfo: # check info is correct
                print(channel + ' ' + video + ' ' + videoInfo['error'])
                continue
            if videoInfo['broadcast_type'] != 'archive': # check video type
                if videoInfo['broadcast_type'] not in POSSIBLE_VIDEO_TYPE: # only unknown types are reported
                    print(channel + ' ' + video + ' ' + videoInfo['broadcast_type'])
                continue
            if str(videoInfo['channel']['_id']) != CHANNEL_ID[channel]: # check channel is correct
                print(channel + ' ' + video + ' ' + videoInfo['channel']['name'])
                continue
            if videoInfo['viewable'] != 'public': # check video accessible
                print(channel + ' ' + video + ' ' + videoInfo['viewable'])
                continue

            videoDateTime = stringToDateTime(videoInfo['created_at'])
            if not (start_datetime <= videoDateTime < end_datetime):
                continue # the video is outside the requested range

            videoLength = videoInfo['length']

            # process label first: a video without any valid clip is skipped entirely
            clipDeadline = videoDateTime + dt.timedelta(seconds=videoLength) + dt.timedelta(days=CLIP_GRACE_PERIOD)
            labelData, clipCount = _buildClipLabel(
                channel, video, VIDEO_PATH + 'clip/', videoLength, clipDeadline,
                is_testing, filter_low_views, VIEWS_THRESHOLD
            )
            if clipCount < 1:
                continue # skip this video

            fittingData['mark'].append([channel, video])
            fittingData['label'].append(labelData)

            max_video_length = max(max_video_length, videoLength) # store max video length for padding

            # process features
            messages = _collectMessagesPerSecond(channel, video, VIDEO_PATH, videoLength, only_chat)
            fittingData['data'].append(_extractFeaturesPerSecond(messages))

    # pad each data to the same length
    numberOfData = len(fittingData['data'])
    # The feature width comes from NUMBER_OF_FEATURES rather than from the first
    # sample, so an empty selection yields an empty array instead of an IndexError.
    paddingData = np.full( (numberOfData, max_video_length, NUMBER_OF_FEATURES), fill_value=DUMMY_VALUE, dtype=np.float64 )
    for s, x in enumerate(fittingData['data']):
        paddingData[s, 0:len(x), :] = x
    fittingData['data'] = paddingData

    # transform mark list to numpy array (this step just for easy processing)
    fittingData['mark'] = np.array(fittingData['mark'])
    if not is_testing:
        # pad each label data to the same length
        paddingLabel = np.full( (numberOfData, max_video_length, 1), fill_value=DUMMY_VALUE, dtype=int )
        for s, y in enumerate(fittingData['label']):
            paddingLabel[s, 0:len(y), :] = y
        fittingData['label'] = paddingLabel

        np.save('training_mark-' + task_id, fittingData['mark'])
        np.save('training_data-' + task_id, fittingData['data'])
        np.save('training_label-' + task_id, fittingData['label'])
    else:
        # testing labels stay unpadded, so videos of different length give a ragged collection
        fittingData['label'] = _stackLabelArrays(fittingData['label'])

        np.save('testing_mark-' + task_id, fittingData['mark'])
        np.save('testing_data-' + task_id, fittingData['data'])
        np.save('testing_label-' + task_id, fittingData['label'])

    return task_id

def generateTestingData(start_datetime = dt.datetime(1,1,1), end_datetime = dt.datetime(9999,12,31,23,59,59), only_chat = False): # end_datetime default value is not enough
    """Build per-second chat features (no labels) for testing and save them as .npy files.

    Every video folder of every channel in TARGET_CHANNEL_LIST is inspected.
    A video is used when its info.json exists and has no 'error', it is a
    public 'archive' broadcast, and its 'created_at' lies in
    [start_datetime, end_datetime). Clips are not read here.

    Args:
        start_datetime: inclusive lower bound on the video creation time.
        end_datetime: exclusive upper bound on the video creation time.
        only_chat: when True, only comments whose source is 'chat' are used.

    Returns:
        The random task id. The feature data is saved under
        'testing-<id>' and the [channel, video] marks under
        'testing_mark-<id>'.

    NOTE(review): the diversity feature here uses shannon_entropy, while
    generateFittingData uses normalized_shannon_entropy - confirm which one
    the consuming model expects.
    """
    LENGTH_OF_TASK_ID = 13 # const

    POSSIBLE_VIDEO_TYPE = {'archive', 'upload', 'highlight'} # const

    task_id = ''.join(random.choices(string.ascii_lowercase + string.digits, k=LENGTH_OF_TASK_ID))

    markList = [] # store channel name and video id for easy evaluation
    fittingDataList = [] # We don't pad value here. Instead, we store all testing data in a list

    for channel in TARGET_CHANNEL_LIST: # channel loop
        for video in os.listdir(DATA_PATH + channel): # video loop
            VIDEO_PATH = DATA_PATH + channel + '/'+ video +'/' # local const

            if not os.path.isfile(VIDEO_PATH + 'info.json'): # check file exist
                print(channel + ' ' + video + ' info.json file does not exist')
                continue

            # read video info
            with open(VIDEO_PATH + 'info.json', "r", encoding="utf-8") as file:
                videoInfo = json.load(file)

            if 'error' in videoInfo: # check info is correct
                print(channel + ' ' + video + ' ' + videoInfo['error'])
                continue
            if videoInfo['broadcast_type'] != 'archive': # check video type
                if videoInfo['broadcast_type'] not in POSSIBLE_VIDEO_TYPE: # only unknown types are reported
                    print(channel + ' ' + video + ' ' + videoInfo['broadcast_type'])
                continue
            if videoInfo['viewable'] != 'public': # check video accessible
                print(channel + ' ' + video + ' ' + videoInfo['viewable'])
                continue

            videoDateTime = stringToDateTime(videoInfo['created_at'])
            if not (start_datetime <= videoDateTime < end_datetime):
                continue # the video is outside the requested range

            videoLength = videoInfo['length']

            # process message data: bucket message bodies by whole-second offset
            messages = [[] for _ in range(videoLength)]

            for path in glob.glob(VIDEO_PATH + 'Message-*.json'):
                with open(path, "r", encoding="utf-8") as file:
                    commentData = json.load(file)['comments']

                for comment in commentData:
                    offset = math.floor( comment['content_offset_seconds'] ) # get comment offset

                    if offset >= videoLength:
                        break
                    if only_chat and comment['source'] != 'chat':
                        if comment['source'] != 'comment':
                            print(channel + ' ' + video + ' ' + str(offset) + ' ' + comment['source'])
                        continue
                    if comment['state'] != 'published':
                        # reported only; the message is still counted below
                        print(channel + ' ' + video + ' ' + str(offset) + ' ' + comment['state'])

                    messages[offset].append( comment['message']['body'] )

            # process features
            featureData = [] # the feature data of this video
            for secondMessages in messages:
                localTokens = tknzr.tokenize(' '.join(secondMessages)) # tokenization
                localText = nltk.text.Text(localTokens) # convert tokens to NLTK text

                featureData.append([
                    len(secondMessages), # frequency
                    shannon_entropy(localText), # diversity
                ])

            # process data
            markList.append([channel, video]) # 2 dimension, [number of data, info]
            fittingDataList.append([featureData]) # [number of data, 0, video length, features], the last 3 dimension is RNN input (batch_size, timesteps, input_dim)

    markList = np.array(markList)

    if len({len(sample[0]) for sample in fittingDataList}) <= 1:
        # every video has the same length: a regular array can be built directly
        fittingDataList = np.array(fittingDataList)
    else:
        # Videos differ in length. np.array() on ragged input is rejected by recent
        # NumPy releases, so store one feature list per video in an object array.
        raggedData = np.empty( (len(fittingDataList), 1), dtype=object )
        for s, sample in enumerate(fittingDataList):
            raggedData[s, 0] = sample[0]
        fittingDataList = raggedData

    # The marks get their own file; saving both arrays under 'testing-<id>'
    # made the data overwrite the marks.
    np.save('testing_mark-' + task_id, markList)
    np.save('testing-' + task_id, fittingDataList)

    return task_id

In [21]:
# Training-set generation (disabled):
#print( generateFittingData(stringToDateTime(TRAINING_DATA_START_DATE), stringToDateTime(TRAINING_DATA_END_DATE), only_chat=True, filter_low_views=True) )

# Testing-set generation with labels; timings noted by the author: 1h 18m 1s, 14m 53s
print(
    generateFittingData(
        start_datetime=stringToDateTime(TESTING_DATA_START_DATE),
        end_datetime=stringToDateTime(TESTING_DATA_END_DATE),
        only_chat=True,
        is_testing=True,
    )
)

# Label-free testing-set generation (disabled):
#print( generateTestingData(stringToDateTime(TESTING_DATA_START_DATE), stringToDateTime(TESTING_DATA_END_DATE), True) )

lirik 386059282 Not Found
summit1g 397613985 Not Found
summit1g 398121286 Not Found
summit1g 398614891 Not Found
timthetatman 397423734 Not Found
timthetatman 397682683 Not Found
timthetatman 397865909 Not Found
timthetatman 398222973 Not Found
timthetatman 398648650 Not Found
timthetatman 398850412 Not Found
drdisrespect 397935556 Not Found
drdisrespect 398441715 Not Found
drdisrespect 398921124 Not Found
dakotaz 397343707 Not Found
dakotaz 397811515 Not Found
dakotaz 398302034 Not Found
dakotaz 398305477 Not Found
dakotaz 398780595 Not Found
nickmercs 398455211 Not Found
nickmercs 398657494 Not Found
nickmercs 399051300 Not Found
tsm_daequan 398159685 Not Found
tsm_daequan 398775122 Not Found
xqcow 397036781 Not Found
xqcow 397465468 Not Found
xqcow 397898610 Not Found
xqcow 398544427 Not Found
xqcow 398796877 Not Found
xqcow 399105570 Not Found
castro_1021 397950594 Not Found
castro_1021 398447100 Not Found
castro_1021 399403171 Not Found
5g15zuybkddvt


In [4]:
# Scratch cell: sanity checks for datetime parsing and timedelta arithmetic.
# Only the last expression is displayed as the cell output.
aaa = dt.datetime(9999,12,31,23,59,59)#dt.datetime.strptime('2019-04-01T23:57:01Z', '%Y-%m-%dT%H:%M:%SZ')
# the same timestamp parsed through the helper and through strptime directly
bbb = stringToDateTime('2019-04-01T00:00:00Z')
ccc = dt.datetime.strptime('2019-04-01T00:00:00Z', '%Y-%m-%dT%H:%M:%SZ')
#aaa
#bbb >= ccc
# a timestamp with fractional seconds, which stringToDateTime truncates before parsing
ddd = stringToDateTime('2019-04-01T00:00:56.12345Z')
#len('2019-04-01T00:00:00Z'[:20])
ddd
# start time + length in seconds + a number of days, the same shape as the clipDeadline computation
bbb + dt.timedelta(seconds=97) + dt.timedelta(days=3)

datetime.datetime(2019, 4, 4, 0, 1, 37)

In [1]:
# Scratch cell: check how nested Python lists convert to a numpy array.
import numpy as np

# three timesteps with two feature values each
fee = []
for i in range(3):
    fe = []
    fe.append(2.0)
    fe.append(3.9)
    fee.append(fe)
    
# two samples sharing the same timestep list; printed as a 2 x 3 x 2 array
data = []
data.append(fee)
data.append(fee)
print(np.array(data))

[[[2.  3.9]
  [2.  3.9]
  [2.  3.9]]

 [[2.  3.9]
  [2.  3.9]
  [2.  3.9]]]


In [2]:
# Scratch cell: rebind the scratch names to None
# (presumably to drop large objects before the timing cells - TODO confirm).
import numpy as np
a = None
b = None

In [3]:
# Scratch timing experiment: fill a preallocated (10000000, 2) array element by element.
a = np.empty( (10000000, 2) )
for i in range(10000000):
    for j in range(2):
        # trailing note is presumably the measured run time of this cell
        a[i][j] = 1.666 #8.76s

In [8]:
# Scratch cell: rebind the scratch names to None
# (presumably to drop large objects between timing runs - TODO confirm).
a = None
b = None

In [9]:
# Scratch timing experiment: build the same 10000000 x 2 values as nested
# Python lists first, then convert them to a numpy array in one call.
b = []
for i in range(10000000):
    c = []
    for j in range(2):
        c.append(1.666)
    
    # trailing notes are presumably measured times (list build, then total after conversion)
    b.append(c) # 9.8
    
b = np.array(b)#12.6s

In [None]:
# Scratch cell (no execution count recorded): allocate a (1000000, 2) array ten times.
# The list `c` is created but never filled; each new array replaces the previous `d`.
c = []
for i in range(10):
    d = np.empty( (1000000, 2) )

In [31]:
# Scratch check: the middle level of a triple-nested empty list still has one element.
a = [[[]]]
len(a[0])

1

In [77]:
# Scratch check: a -1 stored as float64 compares equal to the float literal -1.
# (-1 is the DUMMY_VALUE padding used by generateFittingData).
a = np.array([-1], dtype=np.float64)
a[0] == -1.

True

In [85]:
# Scratch check: a length stored in a variable is unaffected by rebinding the list afterwards.
a = [2,3,4]
b = len(a)
a = []
b

3

In [12]:
# Scratch check: `a` is ragged (three rows in the first sample, two in the second).
# Wrapping one sample of the resulting array in a list gives a 1 x 3 x 2 array.
# NOTE(review): newer NumPy releases reportedly raise ValueError for ragged
# input unless dtype=object is passed - confirm against the installed version.
a = [[[2,5],[3,6],[4,8]], [[7,9],[3,6]]]
b = [[2,5],[3,6],[4,8]]
#a.append(b)
n = np.array(a)
np.array([n[0]])

array([[[2, 5],
        [3, 6],
        [4, 8]]])

In [13]:
# Scratch check: [channel, video id] string pairs convert to a 2-D unicode array
# whose item width follows the longest string.
a = []
a.append(['ninja', '406168424'])
a.append(['sodapoppin', '4061684249'])

np.array(a)

array([['ninja', '406168424'],
       ['sodapoppin', '4061684249']], dtype='<U10')

In [15]:
# Scratch check: storing an array in a dict keeps a reference, not a copy.
a = {
    'np': [1,2,3]
}

b = np.array([5,6,7])
a['np'] = b

# mutating b afterwards is visible through the dict entry
b[0] = 9
a

{'np': array([9, 6, 7])}

In [4]:
# Scratch check: np.sum over a (100, 1) integer array adds up every element.
a = np.zeros( (100, 1), dtype=int )
a[0][0] = 26
a[25][0] = 31
np.sum(a)

57

In [3]:
# Scratch check: mixing positional and keyword arguments with defaults.
def a(b=0, c=1, d=2, e=3):
    # print each argument on its own line, in declaration order
    print(b)
    print(c)
    print(d)
    print(e)
# b and c are passed positionally, d keeps its default, e is passed by keyword
a(5, 6, e=9)

5
6
2
9


In [1]:
# Scratch check: a name assigned inside an `if` body is still available after the block.
if 1==1:
    a = 'ggyoyo'
a + '6653'

'ggyoyo6653'