In [2]:
import os
import glob
import json
import math
import datetime as dt
import numpy as np
import nltk
import random
import string
import re
import unicodedata
from sklearn.preprocessing import minmax_scale
from sklearn.utils import class_weight
from gensim.models import Word2Vec



In [16]:
DATA_PATH = "../TwitchHighlightCrawler/vod/" # const
TARGET_CHANNEL_LIST = ['ninja', 'shroud', 'tfue', 'lirik', 'summit1g', 'sodapoppin', 'timthetatman', 'loltyler1', 'drdisrespect', 'asmongold', 'dakotaz', 'nickmercs', 'tsm_daequan', 'xqcow', 'castro_1021'] # const

with open("../TwitchHighlightCrawler/json/Channel_ID.json", "r", encoding="utf-8") as file:
    data = file.read()
CHANNEL_ID = json.loads(data)

TRAINING_DATA_START_DATE = '2019-04-01T00:00:00Z'
TRAINING_DATA_END_DATE = '2019-05-01T00:00:00Z'
TESTING_DATA_START_DATE = '2019-05-01T00:00:00Z'
TESTING_DATA_END_DATE = '2019-05-08T00:00:00Z'

DUMMY_VALUE = -1

VIEWS_THRESHOLD = 3 # const
CLIP_GRACE_PERIOD = 14 # const, 14 days for recording clip

tknzr = nltk.tokenize.TweetTokenizer(reduce_len=True)
stemmer = nltk.stem.snowball.SnowballStemmer("english")

FULL2HALF = dict((i + 0xFEE0, i) for i in range(0x21, 0x7F))
FULL2HALF[0x3000] = 0x20

APOSTROPHE = {
    "aren't": ["are", "not"],

    "can't": ["can", "not"],

    "couldn't": ["could", "not"],
    
    "couldn't've": ["could", "not", "have"], # 另外網頁+的

    "didn't": ["did", "not"],

    "doesn't": ["does", "not"],

    "don't": ["do", "not"],

    "hadn't": ["had", "not"],

    "hasn't": ["has", "not"],

    "haven't": ["have", "not"],
    
    #"he'd" : ["he", "would"],

    "he'll": ["he", "will"],

    "he's": ["he", "is"],
    
    #"i'd" : ["i", "would"],

    #"i'd" : ["i", "had"],

    "i'll": ["i", "will"],

    "i'm": ["i", "am"],

    "isn't": ["is", "not"],

    "it's": ["it", "is"],

    "it'll": ["it", "will"],

    "i've": ["i", "have"],

    "let's": ["let", "us"],

    "mightn't": ["might", "not"],

    "mustn't": ["must", "not"],

    "shan't": ["shall", "not"],
    
    #"she'd" : ["she", "would"],

    "she'll": ["she", "will"],

    "she's": ["she", "is"],

    "shouldn't": ["should", "not"],
    
    "shouldn't've": ["should", "not", "have"], # 另外網頁+的

    "that's": ["that", "is"],

    "there's": ["there", "is"],
    
    #"they'd" : ["they", "would"],

    "they'll": ["they", "will"],

    "they're": ["they", "are"],

    "they've": ["they", "have"],
    
    #"we'd" : ["we", "would"],

    "we're": ["we", "are"],

    "weren't": ["were", "not"],

    "we've": ["we", "have"],

    "what'll": ["what", "will"],

    "what're": ["what", "are"],

    "what's": ["what", "is"],

    "what've": ["what", "have"],

    "where's": ["where", "is"],

    "where're": ["where", "are"], # 我+的
    
    #"who'd" : ["who", "would"],

    "who'll": ["who", "will"],

    "who're": ["who", "are"],

    "who's": ["who", "is"],

    "who've": ["who", "have"],

    "won't": ["will", "not"],

    "wouldn't": ["would", "not"],
    
    #"you'd" : ["you", "would"],

    "you'll": ["you", "will"],

    "you're": ["you", "are"],

    "you've": ["you", "have"],
    
    #"'re": ["are"],

    "that're": ["that", "are"], # 我+的

    "wasn't": ["was", "not"],

    "we'll": ["we", "will"], # 我修正的

    "didn't": ["did", "not"]
}

WORD_EMBEDDING_SIZE = 300
#WORD_VECTOR = model.wv

FEATURE_DATA_TYPE = np.float64

In [21]:
def log():
    print('a')

def stringToDateTime(s): # only parse twitch info format
    s = (s[:19] + 'Z') if len(s) > 20 else s
    return dt.datetime.strptime(s, '%Y-%m-%dT%H:%M:%SZ')

def customMessageTokenizer(message):
    #message = message.strip().translate(FULL2HALF).casefold() # remove space in head and tail # fullwidth to halfwidth # to lower case
    message = unicodedata.normalize('NFKC', message) # fullwidth to halfwidth and normalize all character
    message = message.strip().casefold() # remove space in head and tail # to lower case
    message = re.sub(r'(.{1})\1{2,}', r'\g<1>\g<1>', message) # replace redundant characters (repeat more than 3 times) to 2 character (eg. LUUUUUL -> LUUL)
    message = re.sub(r'(\S{2})\1{2,}', r'\g<1>\g<1>', message) # replace redundant connected words (repeat more than 3 times) (eg. PogPogPog -> PogPog)
    message = re.sub(r'(\S{3})\1{2,}', r'\g<1>\g<1>', message)
    message = re.sub(r'(\S{4})\1{2,}', r'\g<1>\g<1>', message)
    message = re.sub(r'(\S{5})\1{2,}', r'\g<1>\g<1>', message)
    message = re.sub(r'(\S{6,})\1{2,}', r'\g<1>\g<1>', message) # not perform well in too many connection (eg. LULULULULULULULULULULULUL)
    message = re.sub(r'(?<=[^.])\.$', '', message) # remove period
    
    # re-construct abbreviations that use space (eg. T H I C C -> THICC)
    matches = re.finditer(r"(?:(?:(?<=\s)|(?<=^))(?:\w\s)(?!\w{2})){2,}", message)
    matchIndexs = [i.span(0) for i in matches]
    if len(matchIndexs):
        msg = message # tmp
        message = ''
        
        lastEndIndex = 0
        for i in matchIndexs:
            message += msg[lastEndIndex:i[0]] + msg[i[0]:i[1]].replace(' ', '')
            lastEndIndex = i[1]
        message += msg[lastEndIndex:]
        
    # re-construct abbreviations that use dot (eg. U.S.A -> USA)
    matches = re.finditer(r"(?:(?:(?<=\.)|)(?:\w\.)){2,}", message)
    matchIndexs = [i.span(0) for i in matches]
    if len(matchIndexs):
        msg = message # tmp
        message = ''
        
        lastEndIndex = 0
        for i in matchIndexs:
            message += msg[lastEndIndex:i[0]] + msg[i[0]:i[1]].replace('.', '')
            lastEndIndex = i[1]
        message += msg[lastEndIndex:]
        
    message = message.replace('/', ' / ') # split slash by ourselves
    
    tokens = tknzr.tokenize(message) # tokenize
    tokens = [apostrophe for token in tokens for apostrophe in ([token] if token not in APOSTROPHE else APOSTROPHE[token])]
    result = []
    for token in tokens:
        match = re.search(r"'s(?=\s|$)|(?<=s)'(?:\s|$)", token)
        if match:
            splitBoundary = match.start()
            result += [token[:splitBoundary], token[splitBoundary:]]
        else:
            result += [token]
            
    return result

In [27]:
a = "May's :) LUUUUUUUUL?? parents' we're 1995/08/30 U.S.A!!!!!!=    = T H I C C :p PogPogPogLULULULULULULULULULULULUL 1 0 0 43kg I'd D:D:."
customMessageTokenizer(a)

['may',
 "'s",
 ':)',
 'luul',
 '?',
 '?',
 'parents',
 "'",
 'we',
 'are',
 '1995',
 '/',
 '08',
 '/',
 '30',
 'usa',
 '!',
 '!',
 '=',
 '=',
 'thicc',
 ':p',
 'pogpoglulul',
 '100',
 '43kg',
 "i'd",
 'd:',
 'd:']

In [4]:
def shannon_entropy(text): # entropy diversity measure
    entropy = 0
    
    vocabulary = set(text)
    textLength = len(text)
    for word in vocabulary:
        p = text.count(word) / textLength
        
        entropy -= p * math.log2(p)
        
    return entropy

def normalized_shannon_entropy(text): # entropy diversity measure (normalized)
    entropy = 0
    
    textLength = len(text)
    if textLength <= 1:
        return entropy # 0
    else:
        vocabulary = set(text)
        for word in vocabulary:
            p = text.count(word) / textLength

            entropy -= p * math.log2(p)

        return entropy / math.log2(textLength)

In [39]:
def getNFnDFeatureWithMinMaxNormalization(messages, total_timesteps, window_size):
    videoLength = len(messages)
    
    # process feature data
    unnormalizedFeatureData = np.empty( (total_timesteps, 1), dtype=FEATURE_DATA_TYPE ) # features that need to normalize after processing
    normalizedFeatureData = np.empty( (total_timesteps, 1), dtype=FEATURE_DATA_TYPE )

    for t in range(0, total_timesteps):
        realVideoIndex = t * window_size
        '''windowUpperBound = realVideoIndex + window_size
        if windowUpperBound > videoLength: # check if the window out of the range
            windowUpperBound = videoLength'''
        
        messageList = [] # store messages in this window
        for i in range(realVideoIndex, realVideoIndex + window_size): # generate message data about this window range
            if i >= videoLength: # check if end
                break
            messageList += messages[i]
        
        # features that need to normalize
        # frequency
        unnormalizedFeatureData[t][0] = len(messageList)

        # features that doens't need to normalize
        # diversity
        localMessage = ' '.join(messageList).casefold()
        localTokens = tknzr.tokenize(localMessage) # tokenization
        localText = nltk.text.Text(localTokens) # convert tokens to NLTK text

        normalizedFeatureData[t][0] =  normalized_shannon_entropy(localText)

    unnormalizedFeatureData = minmax_scale(unnormalizedFeatureData) # normalize unnormalized features
    return np.concatenate((unnormalizedFeatureData, normalizedFeatureData), axis=1) # combine

def getNFNLnDFeatureWithMinMaxNormalization(messages, total_timesteps, window_size):
    videoLength = len(messages)
    
    # process feature data
    unnormalizedFeatureData = np.empty( (total_timesteps, 2), dtype=FEATURE_DATA_TYPE ) # features that need to normalize after processing
    normalizedFeatureData = np.empty( (total_timesteps, 1), dtype=FEATURE_DATA_TYPE )

    for t in range(0, total_timesteps):
        realVideoIndex = t * window_size
        
        messageList = [] # store messages in this window
        for i in range(realVideoIndex, realVideoIndex + window_size): # generate message data about this window range
            if i >= videoLength: # check if end
                break
            messageList += messages[i]
            
        # features that need to normalize
        # frequency
        unnormalizedFeatureData[t][0] = len(messageList)

        # length
        localMessage = ' '.join(messageList).casefold()
        localTokens = tknzr.tokenize(localMessage) # tokenization
        unnormalizedFeatureData[t][1] = (len(localTokens) / unnormalizedFeatureData[t][0]) if unnormalizedFeatureData[t][0] > 0 else 0 # token count / message count

        # features that doens't need to normalize
        # diversity
        localText = nltk.text.Text(localTokens) # convert tokens to NLTK text

        normalizedFeatureData[t][0] =  normalized_shannon_entropy(localText)

    unnormalizedFeatureData = minmax_scale(unnormalizedFeatureData) # normalize unnormalized features
    return np.concatenate((unnormalizedFeatureData, normalizedFeatureData), axis=1) # combine

def getNFNLnDFeatureWithTanhEstimators(messages, total_timesteps, window_size):
    videoLength = len(messages)
    
    # process feature data
    unnormalizedFeatureData = np.empty( (total_timesteps, 2), dtype=FEATURE_DATA_TYPE ) # features that need to normalize after processing
    normalizedFeatureData = np.empty( (total_timesteps, 1), dtype=FEATURE_DATA_TYPE )

    for t in range(0, total_timesteps):
        realVideoIndex = t * window_size
        
        messageList = [] # store messages in this window
        for i in range(realVideoIndex, realVideoIndex + window_size): # generate message data about this window range
            if i >= videoLength: # check if end
                break
            messageList += messages[i]
        
        # features that need to normalize
        # frequency
        unnormalizedFeatureData[t][0] = len(messageList)

        # length
        localMessage = ' '.join(messageList).casefold()
        localTokens = tknzr.tokenize(localMessage) # tokenization
        unnormalizedFeatureData[t][1] = (len(localTokens) / unnormalizedFeatureData[t][0]) if unnormalizedFeatureData[t][0] > 0 else 0 # token count / message count

        # features that doens't need to normalize
        # diversity
        localText = nltk.text.Text(localTokens) # convert tokens to NLTK text

        normalizedFeatureData[t][0] =  normalized_shannon_entropy(localText)
    
    m = np.mean(unnormalizedFeatureData, axis=0)
    std = np.std(unnormalizedFeatureData, axis=0)
    unnormalizedFeatureData = 0.5 * (np.tanh(0.01 * ((unnormalizedFeatureData - m) / std)) + 1) # normalize unnormalized features
    return np.concatenate((unnormalizedFeatureData, normalizedFeatureData), axis=1) # combine

def getNFnDFeatureWithMinMaxNormalizationByProcessedData(messages, total_timesteps, window_size):
    videoLength = len(messages)
    
    # process feature data
    unnormalizedFeatureData = np.empty( (total_timesteps, 1), dtype=FEATURE_DATA_TYPE ) # features that need to normalize after processing
    normalizedFeatureData = np.empty( (total_timesteps, 1), dtype=FEATURE_DATA_TYPE )

    for t in range(0, total_timesteps):
        realVideoIndex = t * window_size
        windowUpperBound = realVideoIndex + window_size
        if windowUpperBound > videoLength: # check if the window out of the range
            windowUpperBound = videoLength        
        
        # features that need to normalize
        # frequency
        totalMessage = 0
        for i in range(realVideoIndex, windowUpperBound):
            totalMessage += len(messages[i])
        
        unnormalizedFeatureData[t][0] = totalMessage

        # features that doens't need to normalize
        # diversity
        tokenList = [token for i in range(realVideoIndex, windowUpperBound) for message in messages[i] for token in message] # generate token data about this window range
        localText = nltk.text.Text(tokenList) # convert tokens to NLTK text

        normalizedFeatureData[t][0] =  normalized_shannon_entropy(localText)

    unnormalizedFeatureData = minmax_scale(unnormalizedFeatureData) # normalize unnormalized features
    return np.concatenate((unnormalizedFeatureData, normalizedFeatureData), axis=1) # combine

def getWEFeatureByProcessedData(messages, total_timesteps, window_size):
    videoLength = len(messages)
    
    # process feature data
    featureData = np.zeros( (total_timesteps, WORD_EMBEDDING_SIZE), dtype=FEATURE_DATA_TYPE )
    
    for t in range(0, total_timesteps):
        realVideoIndex = t * window_size
        windowUpperBound = realVideoIndex + window_size
        if windowUpperBound > videoLength: # check if the window out of the range
            windowUpperBound = videoLength
            
        # word embeddings
        tokenList = [token for i in range(realVideoIndex, windowUpperBound) for message in messages[i] for token in message] # generate token data about this window range
        nltkText = nltk.text.Text(tokenList)
        
        numOfTokens = len(tokenList)
        vocabulary = set(nltkText)
        for word in vocabulary:
            if word in WORD_VECTOR.vocab:
                featureData[t] += (nltkText.count(word) / numOfTokens) * WORD_VECTOR[word]
        
    return featureData

In [40]:
def generateFeatureWithMark(filename = '', only_chat = True, window = 1):
    task_id = random.randrange(100)
    
    markList = np.load(filename, allow_pickle=True)
    
    fittingData = {
        'data': []
    }
    
    max_data_length = 0
    
    for mark in markList:
        VIDEO_PATH = DATA_PATH + mark[0] + '/' + mark[1] + '/' # local const
        
        # read video info
        with open(VIDEO_PATH + 'info.json', "r", encoding="utf-8") as file:
            data = file.read()
        videoInfo = json.loads(data)
        
        dataLength = math.ceil(videoInfo['length'] / window)
        if dataLength > max_data_length: # store max data length for padding
            max_data_length = dataLength
        
        # process features                
        # process message data (it can be optimized)
        messages = [[] for i in range(videoInfo['length'])]

        messagePathList = glob.glob(VIDEO_PATH + 'Message-*.json')
        for path in messagePathList:
            with open(path, "r", encoding="utf-8") as file:
                data = file.read()

            commentData = json.loads(data)['comments']
            for comment in commentData:
                offset = math.floor( comment['content_offset_seconds'] ) # get comment offset

                if offset >= videoInfo['length']:
                    break
                if only_chat and comment['source'] != 'chat':
                    if comment['source'] != 'comment':
                        print(channel + ' ' + video + ' ' + str(offset) + ' ' + comment['source'])
                    continue
                if comment['state'] != 'published':
                    print(channel + ' ' + video + ' ' + str(offset) + ' ' + comment['state'])

                messages[offset].append( comment['message']['body'] )
        
        fittingData['data'].append( getNFnDFeatureWithMinMaxNormalization(messages, dataLength, window) )
        
    # pad each data to the same length
    numberOfData = len(fittingData['data'])
    paddingData = np.full( (numberOfData, max_data_length, len(fittingData['data'][0][0])), fill_value=DUMMY_VALUE, dtype=FEATURE_DATA_TYPE )
    for s, x in enumerate(fittingData['data']):
        video_length = len(x)
        paddingData[s, 0:video_length, :] = x
    fittingData['data'] = paddingData
    #paddingData = [] # free memory
    
    np.save(re.sub(r'mark(?=-)', 'data', filename[:-4]) + '-' + f"{task_id:02}", fittingData['data'])
    return task_id

def generateLabelWithMark(filename = '', filter_low_views = False):   
    task_id = random.randrange(100)
    
    markList = np.load(filename, allow_pickle=True)
    
    fittingData = {
        'label': []
    }
    
    max_data_length = 0
    
    for mark in markList:
        VIDEO_PATH = DATA_PATH + mark[0] + '/' + mark[1] + '/' # local const
        
        # read video info
        with open(VIDEO_PATH + 'info.json', "r", encoding="utf-8") as file:
            data = file.read()
        videoInfo = json.loads(data)
        
        #dataLength = math.ceil(videoInfo['length'] / window)
        if videoInfo['length'] > max_data_length: # store max data length for padding
            max_data_length = videoInfo['length']
        
        # process label
        clipDeadline = stringToDateTime(videoInfo['created_at']) + dt.timedelta(seconds=videoInfo['length']) + dt.timedelta(days=CLIP_GRACE_PERIOD)
        
        CLIP_PATH = VIDEO_PATH + 'clip/'
        clipList = os.listdir(CLIP_PATH)
        
        if (re.match(r'(.+)_mark', filename)[1] == 'training'): # training
            is_testing = False
            
            labelData = np.zeros( (videoInfo['length'], 1), dtype=int )
            
            for clip in clipList: # clip loop
                with open(CLIP_PATH + clip, "r", encoding="utf-8") as file:
                    data = file.read()
                clipInfo = json.loads(data)

                if filter_low_views and clipInfo['views'] < VIEWS_THRESHOLD:
                    continue # filter out a clip with low views
                if stringToDateTime(clipInfo['created_at']) >= clipDeadline:
                    continue # filter out a clip created outside the range
                
                offset = clipInfo['vod']['offset']
                duration = math.ceil( clipInfo['duration'] ) # length of ground true clip

                if offset >= videoInfo['length']: # check a clip is locate in the video range
                    continue

                for i in range(offset, offset + duration): # loop the clip range
                    if i >= videoInfo['length']: # check a index is locate in the video range
                        break

                    if not labelData[i][0]:
                        labelData[i][0] = 1                        
        else: # testing
            is_testing = True
            
            labelData = np.zeros( videoInfo['length'], dtype=int )
                    
            for clip in clipList: # clip loop
                with open(CLIP_PATH + clip, "r", encoding="utf-8") as file:
                    data = file.read()
                clipInfo = json.loads(data)

                if filter_low_views and clipInfo['views'] < VIEWS_THRESHOLD:
                    continue # filter out a clip with low views
                if stringToDateTime(clipInfo['created_at']) >= clipDeadline:
                    continue # filter out a clip created outside the range

                offset = clipInfo['vod']['offset']
                duration = math.ceil( clipInfo['duration'] ) # length of ground true clip

                if offset >= videoInfo['length']: # check a clip is locate in the video range
                    continue

                # generate ground true for global evaluation metrics
                for i in range(offset, offset + duration): # loop the clip range
                    if i >= videoInfo['length']: # check a index is locate in the video range
                        break

                    labelData[i] += 1
        
        fittingData['label'].append(labelData)
        
    if not is_testing:
        # pad each leabel data to the same length
        paddingLabel = np.full( (len(fittingData['label']), max_data_length, 1), fill_value=DUMMY_VALUE, dtype=int )
        for s, y in enumerate(fittingData['label']):
            video_length = len(y)
            paddingLabel[s, 0:video_length, :] = y
        fittingData['label'] = paddingLabel
        #paddingLabel = [] # free memory
    else:
        fittingData['label'] = np.array(fittingData['label'])
    
    np.save(re.sub(r'mark(?=-)', 'label', filename[:-4]) + '-' + f"{task_id:02}", fittingData['label'])
    return task_id

def generateGroundTruthWithMark(filename = '', filter_low_views = False):
    markList = np.load(filename, allow_pickle=True)
    
    fittingData = {
        'label': []
    }
    
    max_data_length = 0
    
    for mark in markList:
        VIDEO_PATH = DATA_PATH + mark[0] + '/' + mark[1] + '/' # local const
        
        # read video info
        with open(VIDEO_PATH + 'info.json', "r", encoding="utf-8") as file:
            data = file.read()
        videoInfo = json.loads(data)
        
        #dataLength = math.ceil(videoInfo['length'] / window)
        if videoInfo['length'] > max_data_length: # store max data length for padding
            max_data_length = videoInfo['length']
        
        # process label
        clipDeadline = stringToDateTime(videoInfo['created_at']) + dt.timedelta(seconds=videoInfo['length']) + dt.timedelta(days=CLIP_GRACE_PERIOD)
        
        CLIP_PATH = VIDEO_PATH + 'clip/'
        clipList = os.listdir(CLIP_PATH)
        
        labelData = np.zeros( videoInfo['length'], dtype=int )

        for clip in clipList: # clip loop
            with open(CLIP_PATH + clip, "r", encoding="utf-8") as file:
                data = file.read()
            clipInfo = json.loads(data)

            if filter_low_views and clipInfo['views'] < VIEWS_THRESHOLD:
                continue # filter out a clip with low views
            if stringToDateTime(clipInfo['created_at']) >= clipDeadline:
                continue # filter out a clip created outside the range

            offset = clipInfo['vod']['offset']
            duration = math.ceil( clipInfo['duration'] ) # length of ground true clip

            if offset >= videoInfo['length']: # check a clip is locate in the video range
                continue

            # generate ground true for global evaluation metrics
            for i in range(offset, offset + duration): # loop the clip range
                if i >= videoInfo['length']: # check a index is locate in the video range
                    break

                labelData[i] += 1
        
        fittingData['label'].append(labelData)
        
    fittingData['label'] = np.array(fittingData['label'])
    np.save('ground_truth-' + filename[-17:-4], fittingData['label'])

def regenerateLabel(filename = '', window = 1, task_id = ''): # it is useful when regenerating label from original window size (1s) to other window size
    oldLabelData = np.load(filename)
    
    newLabelData = []
    max_data_length = 0
    for sample in oldLabelData:
        indexArray = np.where(sample == -1)[0]
        if len(indexArray):
            sampleLength = indexArray[0]
        else:
            sampleLength = len(sample)
        
        newSampleLength = math.ceil(sampleLength / window)
        if newSampleLength > max_data_length: # store max data length for padding
            max_data_length = newSampleLength
        
        label = np.empty((newSampleLength, 1), dtype=np.float64)
        for timestep in range(0, sampleLength, window):
            upperBound = timestep + window
            if upperBound < sampleLength:
                label[int(timestep/window)] = np.mean( sample[timestep:upperBound], dtype=np.float64 )
            else:
                label[int(timestep/window)] = np.mean( sample[timestep:sampleLength], dtype=np.float64 )
                
        newLabelData.append(label)
        
    # pad each data to the same length
    numberOfData = len(newLabelData)
    paddingLabel = np.full( (numberOfData, max_data_length, 1), fill_value=DUMMY_VALUE, dtype=np.float64 )
    for s, y in enumerate(newLabelData):
        data_length = len(y)
        paddingLabel[s, 0:data_length, :] = y
    
    np.save(filename[:-4] + '-' + task_id, paddingLabel)

def generateFeatureWithMarkByProcessedData(filename = '', dataPath = '', window = 1):
    task_id = random.randrange(100)
    
    fittingData = {
        'data': []
    }
    
    max_data_length = 0
    
    markList = np.load(filename, allow_pickle=True)
    for mark in markList:
        messages = np.load(dataPath + mark[0] + '/' + mark[1] + '/' + 'messages.npy', allow_pickle=True)
        videoLength = len(messages)
        
        dataLength = dataLength = math.ceil(videoLength / window)
        if dataLength > max_data_length: # store max data length for padding
            max_data_length = dataLength

        fittingData['data'].append( getNFnDFeatureWithMinMaxNormalizationByProcessedData(messages, dataLength, window) )
    
    # pad each data to the same length
    numberOfData = len(fittingData['data'])
    paddingData = np.full( (numberOfData, max_data_length, len(fittingData['data'][0][0])), fill_value=DUMMY_VALUE, dtype=FEATURE_DATA_TYPE )
    for s, x in enumerate(fittingData['data']):
        video_length = len(x)
        paddingData[s, 0:video_length, :] = x
    fittingData['data'] = paddingData
    #paddingData = [] # free memory
    
    np.save(re.sub(r'mark(?=-)', 'data', filename[:-4]) + '-' + f"{task_id:02}", fittingData['data'])
    return task_id
    
def generateFittingData(start_datetime = dt.datetime(1,1,1), end_datetime = dt.datetime(9999,12,31,23,59,59), only_chat = True, is_testing = False, filter_low_views = False, window=1): # end_datetime default value is not enough
    LENGTH_OF_TASK_ID = 13 # const
    
    POSSIBLE_VIDEO_TYPE = {'archive', 'upload', 'highlight'} # const
    #NUMBER_OF_FEATURES = 2 # const
        
    task_id = ''.join(random.choices(string.ascii_lowercase + string.digits, k=LENGTH_OF_TASK_ID))
    
    fittingData = {
        'mark': [], # store channel name and video id in a tuple (channel, video)
        'data': [],
        'label': [] # this would be different between training label and testing label
    }
    
    max_data_length = 0
    
    for channel in TARGET_CHANNEL_LIST: # channel loop
        CHANNEL_PATH = DATA_PATH + channel + '/'
        
        videoList = os.listdir(DATA_PATH + channel)
        for video in videoList: # video loop
            VIDEO_PATH = CHANNEL_PATH + video +'/' # local const
            
            if not os.path.isfile(VIDEO_PATH + 'info.json'): # check file exist
                print(channel + ' ' + video + ' info.json file does not exist')
                continue
            
            # read video info
            with open(VIDEO_PATH + 'info.json', "r", encoding="utf-8") as file:
                data = file.read()
            videoInfo = json.loads(data)
            
            if 'error' in videoInfo: # check info is correct
                print(channel + ' ' + video + ' ' + videoInfo['error'])
                continue
            if videoInfo['broadcast_type'] != 'archive': # check video type
                #print(channel + ' ' + video + ' ' + videoInfo['broadcast_type'])
                if videoInfo['broadcast_type'] not in POSSIBLE_VIDEO_TYPE:
                    print(channel + ' ' + video + ' ' + videoInfo['broadcast_type'])
                continue
            if str(videoInfo['channel']['_id']) != CHANNEL_ID[channel]: # check channel is correct
                print(channel + ' ' + video + ' ' + videoInfo['channel']['name'])
                continue
            if videoInfo['viewable'] != 'public': # check video accessible
                print(channel + ' ' + video + ' ' + videoInfo['viewable'])
                continue
            '''if videoInfo['created_at'] != videoInfo['published_at'] or videoInfo['created_at'] != videoInfo['recorded_at'] or videoInfo['published_at'] != videoInfo['recorded_at']: # check the difference in datetime data field
                print(channel + ' ' + video + ' ' + videoInfo['created_at'] + ' ' + videoInfo['published_at'] + ' ' + videoInfo['recorded_at'])
            '''
            videoDateTime = stringToDateTime(videoInfo['created_at'])
            if videoDateTime >= start_datetime and videoDateTime < end_datetime: # if the video is in the specific range, then generate fitting data
                # process label (we need to check this video is valid first)
                clipCount = 0 # valid clip count
                    
                clipDeadline = videoDateTime + dt.timedelta(seconds=videoInfo['length']) + dt.timedelta(days=CLIP_GRACE_PERIOD)

                CLIP_PATH = VIDEO_PATH + 'clip/'
                clipList = os.listdir(CLIP_PATH)
                if not is_testing: # training
                    labelData = np.zeros( (videoInfo['length'], 1), dtype=int )
                    
                    for clip in clipList: # clip loop
                        with open(CLIP_PATH + clip, "r", encoding="utf-8") as file:
                            data = file.read()
                        clipInfo = json.loads(data)
                        
                        if filter_low_views and clipInfo['views'] < VIEWS_THRESHOLD:
                            continue # filter out a clip with low views
                        if stringToDateTime(clipInfo['created_at']) >= clipDeadline:
                            continue # filter out a clip created outside the range

                        offset = clipInfo['vod']['offset']
                        duration = math.ceil( clipInfo['duration'] ) # length of ground true clip

                        if offset >= videoInfo['length']: # check a clip is locate in the video range
                            print(channel + ' ' + video + '(outside) ' + clip)
                            continue

                        for i in range(offset, offset + duration): # loop the clip range
                            if i >= videoInfo['length']: # check a index is locate in the video range
                                print(channel + ' ' + video + '(inside) ' + clip)
                                break

                            if not labelData[i][0]:
                                labelData[i][0] = 1
                                
                        clipCount += 1
                else: # testing
                    labelData = np.zeros( videoInfo['length'], dtype=int )
                    
                    for clip in clipList: # clip loop
                        with open(CLIP_PATH + clip, "r", encoding="utf-8") as file:
                            data = file.read()
                        clipInfo = json.loads(data)
                        
                        if filter_low_views and clipInfo['views'] < VIEWS_THRESHOLD:
                            continue # filter out a clip with low views
                        if stringToDateTime(clipInfo['created_at']) >= clipDeadline:
                            continue # filter out a clip created outside the range

                        offset = clipInfo['vod']['offset']
                        duration = math.ceil( clipInfo['duration'] ) # length of ground true clip

                        if offset >= videoInfo['length']: # check a clip is locate in the video range
                            print(channel + ' ' + video + '(outside) ' + clip)
                            continue

                        # generate ground true for global evaluation metrics
                        for i in range(offset, offset + duration): # loop the clip range
                            if i >= videoInfo['length']: # check a index is locate in the video range
                                print(channel + ' ' + video + '(inside) ' + clip)
                                break
                            
                            labelData[i] += 1
                                
                        clipCount += 1
                        
                if clipCount < 1:
                    continue # skip this video (do not store label data)
                else:
                    fittingData['mark'].append([channel, video])
                    fittingData['label'].append(labelData)
                
                timesteps = math.ceil(videoInfo['length'] / window)
                if timesteps > max_data_length: # store max data length for padding
                    max_data_length = timesteps
                
                # process features                
                # process message data (it can be optimized)
                messages = [[] for i in range(videoInfo['length'])]

                messagePathList = glob.glob(VIDEO_PATH + 'Message-*.json')
                for path in messagePathList:
                    with open(path, "r", encoding="utf-8") as file:
                        data = file.read()

                    commentData = json.loads(data)['comments']
                    for comment in commentData:
                        offset = math.floor( comment['content_offset_seconds'] ) # get comment offset

                        if offset >= videoInfo['length']:
                            break
                        if only_chat and comment['source'] != 'chat':
                            if comment['source'] != 'comment':
                                print(channel + ' ' + video + ' ' + str(offset) + ' ' + comment['source'])
                            continue
                        if comment['state'] != 'published':
                            print(channel + ' ' + video + ' ' + str(offset) + ' ' + comment['state'])

                        messages[offset].append( comment['message']['body'] )
                        
                fittingData['data'].append( getNFnDFeatureWithMinMaxNormalization(messages, timesteps, window) )
    
    # pad each data to the same length
    numberOfData = len(fittingData['data'])
    paddingData = np.full( (numberOfData, max_data_length, len(fittingData['data'][0][0])), fill_value=DUMMY_VALUE, dtype=FEATURE_DATA_TYPE )
    for s, x in enumerate(fittingData['data']):
        video_length = len(x)
        paddingData[s, 0:video_length, :] = x
    fittingData['data'] = paddingData
    #paddingData = [] # free memory
    
    # transform mark list to numpy array (this step just for easy processing later)
    fittingData['mark'] = np.array(fittingData['mark'])
    if not is_testing:
        # pad each leabel data to the same length
        paddingLabel = np.full( (numberOfData, max_data_length, 1), fill_value=DUMMY_VALUE, dtype=int )
        for s, y in enumerate(fittingData['label']):
            video_length = len(y)
            paddingLabel[s, 0:video_length, :] = y
        fittingData['label'] = paddingLabel
        #paddingLabel = [] # free memory
        
        np.save('training_mark-' + task_id, fittingData['mark'])
        np.save('training_data-' + task_id, fittingData['data'])
        np.save('training_label-' + task_id, fittingData['label'])
    else:
        fittingData['label'] = np.array(fittingData['label']) # also need to transform label list when testing
        
        np.save('testing_mark-' + task_id, fittingData['mark'])
        np.save('testing_data-' + task_id, fittingData['data'])
        np.save('testing_label-' + task_id, fittingData['label'])
        
    return task_id

def storeProcessedMessageWithMark(mark_filename = '', destination = '', only_chat = True):
    markList = np.load(mark_filename, allow_pickle=True)
    for mark in markList:
        VIDEO_PATH = DATA_PATH + mark[0] + '/' + mark[1] + '/' # local const
        
        # read video info
        with open(VIDEO_PATH + 'info.json', "r", encoding="utf-8") as file:
            data = file.read()
        videoInfo = json.loads(data)
        
        # process message data (it can be optimized)
        messages = [[] for i in range(videoInfo['length'])]

        messagePathList = glob.glob(VIDEO_PATH + 'Message-*.json')
        for path in messagePathList:
            with open(path, "r", encoding="utf-8") as file:
                data = file.read()

            commentData = json.loads(data)['comments']
            for comment in commentData:
                offset = math.floor( comment['content_offset_seconds'] ) # get comment offset

                if offset >= videoInfo['length']:
                    break
                if only_chat and comment['source'] != 'chat':
                    if comment['source'] != 'comment':
                        print(channel + ' ' + video + ' ' + str(offset) + ' ' + comment['source'])
                    continue
                if comment['state'] != 'published':
                    print(channel + ' ' + video + ' ' + str(offset) + ' ' + comment['state'])
                
                messages[offset].append( customMessageTokenizer(comment['message']['body']) )
        
        folder_path = destination + mark[0] + '/' + mark[1] + '/'
        os.makedirs(folder_path) # create folder
        messageData = np.array(messages) # transform to numpy array for easy processing later
        np.save(folder_path + 'messages', messageData)

def trainWordEmbeddingModel(data_path = '', model_name = ''):
    trainingSentences = []
    
    channelList = os.listdir(data_path)
    for channel in channelList:
        CHANNEL_PATH = data_path + channel + '/'
        
        videoList = os.listdir(CHANNEL_PATH)
        for video in videoList:
            messages = np.load(CHANNEL_PATH + video + '/' + 'messages.npy', allow_pickle=True)
            trainingSentences += [[token for sentence in timestep for token in sentence] for timestep in messages if timestep]
            
    model = Word2Vec( sentences = trainingSentences, size = WORD_EMBEDDING_SIZE, workers = 6, sg = 0, hs = 0 ) # window = 20, iter = 50, min_count = 4, alpha = 0.75
    model.save(model_name + ".model")

In [21]:
print( generateFittingData(stringToDateTime(TRAINING_DATA_START_DATE), stringToDateTime(TRAINING_DATA_END_DATE), only_chat=True, filter_low_views=True) )
print( generateFittingData(stringToDateTime(TESTING_DATA_START_DATE), stringToDateTime(TESTING_DATA_END_DATE), only_chat=True, is_testing=True, filter_low_views=True) ) # 1h 18m 1s, 14m 53s
#print( generateTestingData(stringToDateTime(TESTING_DATA_START_DATE), stringToDateTime(TESTING_DATA_END_DATE), True) )

lirik 386059282 Not Found
summit1g 397613985 Not Found
summit1g 398121286 Not Found
summit1g 398614891 Not Found
timthetatman 397423734 Not Found
timthetatman 397682683 Not Found
timthetatman 397865909 Not Found
timthetatman 398222973 Not Found
timthetatman 398648650 Not Found
timthetatman 398850412 Not Found
drdisrespect 397935556 Not Found
drdisrespect 398441715 Not Found
drdisrespect 398921124 Not Found
dakotaz 397343707 Not Found
dakotaz 397811515 Not Found
dakotaz 398302034 Not Found
dakotaz 398305477 Not Found
dakotaz 398780595 Not Found
nickmercs 398455211 Not Found
nickmercs 398657494 Not Found
nickmercs 399051300 Not Found
tsm_daequan 398159685 Not Found
tsm_daequan 398775122 Not Found
xqcow 397036781 Not Found
xqcow 397465468 Not Found
xqcow 397898610 Not Found
xqcow 398544427 Not Found
xqcow 398796877 Not Found
xqcow 399105570 Not Found
castro_1021 397950594 Not Found
castro_1021 398447100 Not Found
castro_1021 399403171 Not Found
5g15zuybkddvt


In [4]:
aaa = dt.datetime(9999,12,31,23,59,59)#dt.datetime.strptime('2019-04-01T23:57:01Z', '%Y-%m-%dT%H:%M:%SZ')
bbb = stringToDateTime('2019-04-01T00:00:00Z')
ccc = dt.datetime.strptime('2019-04-01T00:00:00Z', '%Y-%m-%dT%H:%M:%SZ')
#aaa
#bbb >= ccc
ddd = stringToDateTime('2019-04-01T00:00:56.12345Z')
#len('2019-04-01T00:00:00Z'[:20])
ddd
bbb + dt.timedelta(seconds=97) + dt.timedelta(days=3)

datetime.datetime(2019, 4, 4, 0, 1, 37)

In [1]:
import numpy as np

fee = []
for i in range(3):
    fe = []
    fe.append(2.0)
    fe.append(3.9)
    fee.append(fe)
    
data = []
data.append(fee)
data.append(fee)
print(np.array(data))

[[[2.  3.9]
  [2.  3.9]
  [2.  3.9]]

 [[2.  3.9]
  [2.  3.9]
  [2.  3.9]]]


In [2]:
import numpy as np
a = None
b = None

In [3]:
a = np.empty( (10000000, 2) )
for i in range(10000000):
    for j in range(2):
        a[i][j] = 1.666 #8.76s

In [8]:
a = None
b = None

In [9]:
b = []
for i in range(10000000):
    c = []
    for j in range(2):
        c.append(1.666)
    
    b.append(c) # 9.8
    
b = np.array(b)#12.6s

In [None]:
c = []
for i in range(10):
    d = np.empty( (1000000, 2) )

In [31]:
a = [[[]]]
len(a[0])

1

In [77]:
a = np.array([-1], dtype=np.float64)
a[0] == -1.

True

In [85]:
a = [2,3,4]
b = len(a)
a = []
b

3

In [12]:
a = [[[2,5],[3,6],[4,8]], [[7,9],[3,6]]]
b = [[2,5],[3,6],[4,8]]
#a.append(b)
n = np.array(a)
np.array([n[0]])

array([[[2, 5],
        [3, 6],
        [4, 8]]])

In [13]:
a = []
a.append(['ninja', '406168424'])
a.append(['sodapoppin', '4061684249'])

np.array(a)

array([['ninja', '406168424'],
       ['sodapoppin', '4061684249']], dtype='<U10')

In [15]:
a = {
    'np': [1,2,3]
}

b = np.array([5,6,7])
a['np'] = b

b[0] = 9
a

{'np': array([9, 6, 7])}

In [4]:
a = np.zeros( (100, 1), dtype=int )
a[0][0] = 26
a[25][0] = 31
np.sum(a)

57

In [3]:
def a(b=0, c=1, d=2, e=3):
    print(b)
    print(c)
    print(d)
    print(e)
a(5, 6, e=9)

5
6
2
9


In [1]:
if 1==1:
    a = 'ggyoyo'
a + '6653'

'ggyoyo6653'

In [18]:
from sklearn.preprocessing import minmax_scale
'''foo = np.array([[15,0.9],[5,0.34],[45,0.1],[0,0.78]])
column_1 = foo[:,1] #first column you don't want to scale
column_2 = minmax_scale(foo[:,0]) #second column you want to scale
foo_norm = np.stack((column_2, column_1), axis=1) #stack both columns to get a 2d array
foo_norm'''

a = np.array([[15,60],[5,15],[45,0],[0,30]], dtype=np.float64)
b = np.array([[0.9],[0.34],[0.1],[0.78]])
c = minmax_scale(a)
bc = np.concatenate((b,c), axis=1)
bc

array([[0.9       , 0.33333333, 1.        ],
       [0.34      , 0.11111111, 0.25      ],
       [0.1       , 1.        , 0.        ],
       [0.78      , 0.        , 0.5       ]])

In [22]:
a = {
    'aa': []
}
b = np.full(6, -1)
a['aa'] = b
b = []
a['aa']

array([-1, -1, -1, -1, -1, -1])

In [7]:
a = [[[1], [0], [0], [0], [[1],[0]]]]
#np.unique(a)
class_weight.compute_class_weight('balanced', [0, 1], a)

TypeError: unhashable type: 'list'

In [8]:
a = 'aaa bbb ccc 　  cccccc d'

localTokens = tknzr.tokenize(a) # tokenization
print(len(localTokens))
print(localTokens)
#localText = nltk.text.Text(localTokens) # convert tokens to NLTK text

5
['aaa', 'bbb', 'ccc', 'ccc', 'd']


In [10]:
random.randrange(20)

3

In [6]:
a = 3
f"{a:02}"

'03'

In [7]:
'aaa.npy'[:-4] + '-' + f"{a:02}"
re.sub(r'mark(?=-)', 'data', 'training_mark-aaa.npy'[:-4]) + '-' + f"{a:02}"

'training_data-aaa-03'

In [19]:
a  = np.load('training_data-yb9ap3dscbr3p')

FileNotFoundError: [Errno 2] No such file or directory: 'training_data-yb9ap3dscbr3p'

In [48]:
a = np.array([[15,60],[5,15],[10000,0],[0,30]], dtype=np.float64)
b = np.array([[0.9],[0.34],[0.1],[0.78]])

m = np.mean(a, axis=0) # array([16.25, 26.25])
std = np.std(a, axis=0) # array([17.45530005, 22.18529919])
def my_func(a, i):
    return a - m[i]

#np.apply_along_axis(my_func, 0, a[:,0], 0)
print( 0.5 * (np.tanh(0.01 * ((a - m) / std)) + 1) )

[[0.49712291 0.5076058 ]
 [0.49711136 0.49746456]
 [0.50865938 0.4940842 ]
 [0.49710558 0.50084515]]


In [None]:
# 減完m
array([[ -1.25,  33.75],
       [-11.25, -11.25],
       [ 28.75, -26.25],
       [-16.25,   3.75]])

# 除以std
array([[-0.07161149,  1.52127766],
       [-0.64450339, -0.50709255],
       [ 1.64706421, -1.18321596],
       [-0.93094934,  0.16903085]])

# 乘0.01
array([[-0.00071611,  0.01521278],
       [-0.00644503, -0.00507093],
       [ 0.01647064, -0.01183216],
       [-0.00930949,  0.00169031]])

# tanh運算
array([[-0.00071611,  0.0152116 ],
       [-0.00644494, -0.00507088],
       [ 0.01646915, -0.01183161],
       [-0.00930922,  0.00169031]])

# +1
array([[0.99928389, 1.0152116 ],
       [0.99355506, 0.99492912],
       [1.01646915, 0.98816839],
       [0.99069078, 1.00169031]])

# 乘0.5
array([[0.49964194, 0.5076058 ],
       [0.49677753, 0.49746456],
       [0.50823458, 0.4940842 ],
       [0.49534539, 0.50084515]])

In [12]:
a = [['aa', 'bb', 'cc'], ['dd','ee']]
b = []
b += a[0]
b += a[1]
aa = ' '.join(b)
aa

c = []
c += a[0]
print(c)
c += a[1]
print(c)

['aa', 'bb', 'cc']
['aa', 'bb', 'cc', 'dd', 'ee']


In [15]:
a = np.array([[[1],[1],[0],[-1],[-1]], [[0],[1],[-1],[-1],[-1]], [[0],[1],[1],[0],[0]]], dtype=np.float64)
for s in a:
    if len(np.where(s == -1)[0]):
        print(np.where(s == -1)[0][0])
    else:
        print(-1)

3
2
-1


In [22]:
a = np.array([[[1],[1],[0],[-1],[-1]], [[0],[1],[-1],[-1],[-1]], [[0],[1],[1],[0],[0]]], dtype=np.float64)
a[0][0][0] == [1]

array([ True])

In [29]:
np.mean(a[2][0:3])

0.6666666666666666

In [37]:
a = [1,2,3,4]
a[4/2]

TypeError: list indices must be integers or slices, not float

In [6]:
a = ["Maße".casefold(), "MASSE".casefold()]
b = ["Maße".lower(), "MASSE".lower()]
print(set(a))
print(set(b))

{'masse'}
{'maße', 'masse'}


In [20]:
from nltk.stem.snowball import SnowballStemmer
stemmer.stem("We're trying to plan our future generously")
#SnowballStemmer("english").stem("We're trying to plan our future generously")

"we're trying to plan our future gener"

In [9]:
a = "May's :) LUUUUUUUUL?? MAY's we're 1995/08/30 U.S.A!!!!!!=    = T H I C C :p　！！!！2⁵"
tknzr.tokenize(a)
#b = nltk.tokenize.casual.reduce_lengthening(a)
#nltk.word_tokenize(a)

["May's",
 ':)',
 'LUUUL',
 '?',
 '?',
 "MAY's",
 "we're",
 '1995/08',
 '/',
 '30',
 'U',
 '.',
 'S',
 '.',
 'A',
 '!',
 '!',
 '!',
 '=',
 '=',
 'T',
 'H',
 'I',
 'C',
 'C',
 ':p',
 '！',
 '！',
 '!',
 '！',
 '2⁵']

In [12]:
re.match(r'(.+)_mark', 'testing_mark-5g15zuybkddvt')[1]

'testing'

In [2]:
'training_data-yb9ap3dscbr3p.npy'[-17:-4]

'yb9ap3dscbr3p'

In [9]:
a = [[] for i in range(5)]
b = ['aaa', 'bbb', 'ccc']
#b = 'aaaaaaccccc'
c = ['kk','gg']
d = ['pp','554258','!!']
a[0].append(b)
a[2].append(c)
a[0].append(d)
a

[[['aaa', 'bbb', 'ccc'], ['pp', '554258', '!!']], [], [['kk', 'gg']], [], []]

In [None]:
(.*)\1{2,}

T.H.I.C.C.K AAA U.S.A bbbb A.A E.D.F.....
T H I C C K AA L U L  bbbb A A... abcdefg L U L !!!


abc.....
abc.

a's May's US's girls' aaaa'saaaa

In [25]:
re.sub(r'(.{1})\1{2,}', r'\g<1>\g<1>', 'LUUUUUUUUUUUUUUUUUUUUL.....!!!!aaa') # LUUL..!!aa
re.sub(r'(\S+)\1{2,}', r'\g<1>', 'POGPOGPOG') # POG
#re.sub(r'((?<=\s)|(?<=^))(\w\s){2}', '', 'T H I C C AAA L U L')
#a = re.finditer(r"(?='s\s)|(?<=s)(?='\s)", "it's")
'''a = re.finditer(r"'s(?=\s|$)|(?<=s)'(?:\s|$)", "it's")
for i in a:
    print(i.start())'''
re.search(r"'s(?=\s|$)|(?<=s)'(?:\s|$)", "it's").start()

2

In [64]:
a = re.finditer(r"(?:(?<=\s)|(?<=^))(?:\w\s){2}", 'aaaaaa')
b =  [m.span(0) for m in a]
b

0

In [63]:
if a:
    print(87)
else:
    print(a)

87


In [73]:
a = '5sdsdsds0'
a[3:8] + a[8:]

'sdsds0'

In [74]:
if 'LUL' == 'ＬＵＬ':
    print(87)

In [87]:
FULL2HALF = dict((i + 0xFEE0, i) for i in range(0x21, 0x7F))
FULL2HALF[0x3000] = 0x20

'abＬＵcccsＬ'.translate(FULL2HALF)
#FULL2HALF

'abLUcccsL'

In [91]:
'abＬ ＵcccsＬaa　'.strip().translate(FULL2HALF).casefold()

'abl ucccslaa'

In [5]:
a = ['abc', 'aaa', 'cbc']
d = {"abc": ['ab', 'c']}

#b = [d[token] if token in d else token for token in a]
#b = [t for token in a if token in d for t in d[token]]
b = [t for token in a for t in ([token] if token not in d else d[token])]
b

['ab', 'c', 'aaa', 'cbc']

In [27]:
a = ["abc'sa", "a'aa", "cbc's", "ccgs'", "bbb"]

b = [t for token in a for t in ([token] if not re.search(r"'s(?=\s|$)|(?<=s)'(?:\s|$)", token) else for i in )]
re.search(r"'s(?=\s|$)|(?<=s)'(?:\s|$)", "its").start()

AttributeError: 'NoneType' object has no attribute 'start'

In [28]:
re.split(r"'s(?=\s|$)|(?<=s)'(?:\s|$)", "it's")

['it', '']

In [42]:
import timeit 

timeit.timeit(stmt='''\
tokens = range(10000)
t = []
for i in range(10000):
    t.append(i)''', number=10000) # 8.948452900000007

timeit.timeit(stmt='''\
tokens = range(10000)
a = {}
for i in range(10000, 20000):
    a[i] = i
b = [apostrophe for token in tokens for apostrophe in ([token] if token not in a else a[token])]''', number=10000) # 21.459299200000032

timeit.timeit(stmt='''\
import numpy as np
tokens = range(10000)
a = {}
for i in range(10000, 20000):
    if np.random.choice(np.arange(0,2), p=[0.2, 0.8]):
        a[i] = i
b = [apostrophe for token in tokens for apostrophe in ([token] if token not in a else a[token])]''', number=10000)

7.492903800000022

In [69]:
def a(path='5566'):
    for path in range(10):
        print(path)
    print(path)
a()

0
1
2
3
4
5
6
7
8
9
9


In [70]:
os.makedirs('test/abc/')

In [28]:
import unicodedata
foo = 'ｎ　３死幹!！！！！2⁵  ǆ ㌀①ﻌ アパート1/4 ع'
unicodedata.normalize('NFKC', foo)

'n 3死幹!!!!!25  dž アパート1ع アパート1/4 ع'

In [29]:
a=np.array([foo])
np.save('kkk', a)

In [30]:
b = np.load('kkk.npy')
b

array(['ｎ\u3000３死幹!！！！！2⁵  ǆ ㌀①ﻌ アパート1/4 ع'], dtype='<U29')

In [2]:
a = [[['aaa','bbb','ccc'], ['kkk','ccc']], [], [['g87','964']]]
tokenList = [token for i in range(0, 3) for message in a[i] for token in message]
tokenList

['aaa', 'bbb', 'ccc', 'kkk', 'ccc', 'g87', '964']

In [4]:
a = [[], [], [['aaa','bbb','ccc'], ['kkk','ccc']], [], [['g87','964']]]
#[[token for sentence in timestep for token in sentence] for timestep in a]
[[token for sentence in timestep for token in sentence] for timestep in a if timestep]

[['aaa', 'bbb', 'ccc', 'kkk', 'ccc'], ['g87', '964']]

In [5]:
a = [['aaa', 'bbb', 'ccc', 'kkk', 'ccc'], ['g87', '964']]
b = [["cat", "say", "meow"], ["dog", "say", "woof"]]
a + b

[['aaa', 'bbb', 'ccc', 'kkk', 'ccc'],
 ['g87', '964'],
 ['cat', 'say', 'meow'],
 ['dog', 'say', 'woof']]

In [12]:
a = np.array( [5/6], dtype=np.float32 )
b = np.empty( 1, dtype=np.float64 )
c = np.array( a, dtype=np.float64 )
#b[0] += a[0]
b[0] = float(a[0])
print(5/6)
print(a[0])
print(b[0])
print(c[0])
print(float(a[0]))

0.8333333333333334
0.8333333
0.8333333134651184
0.8333333134651184
0.8333333134651184
