### Talk Time

In [1]:
import os
import multiprocessing
#import multiprocessing as mp
from multiprocessing import Process, Manager, Pool, Queue
from itertools import islice
from collections import Counter
import re
import pandas as pd
import numpy as np
import glob
import nltk
from nltk import word_tokenize
from nltk import ngrams
from functools import reduce
from pathlib import Path
import shutil

In [2]:
# Report how many CPU cores multiprocessing sees on this machine.
# NOTE(review): nprocs is not referenced again in the visible part of the notebook.
nprocs = multiprocessing.cpu_count()
print(f"Number of CPU cores: {nprocs}")

Number of CPU cores: 16


In [3]:
# Target language of the learner; the capitalized value is used as a folder / file-name part in the paths below.
# Listed options: Arabic, English, French, German, Turkish, Spanish, Portuguese, Dutch, Italian
lang_folder = "Dutch"
#lang_pair = "Intersect"  # (disabled) native language: Arabic, English, French, German, Turkish, Spanish, Portuguese, Dutch, Italian

# Row range taken from the word table loaded below (df_word_all.iloc[word_start:word_end]).
word_start = 0  # first row of the selection
word_end = 200  # end of the selection (exclusive); also embedded in the input/output file names; a previous setting was presumably 28 - TODO confirm

# Values embedded in the names of the prepared Youtube link files that are read below.
sample_num = 1  # a previous setting was presumably 6 - TODO confirm
time_shift = 0.6  # the file names append "s" to this value, so presumably seconds

In [4]:
def word_in_wordgroup_simple(source_word_list, df_target, target_column, word_sample_num, simple=False):
    '''Collect the first ``word_sample_num`` rows of ``df_target`` that contain each search word.

    Example: word_in_wordgroup_simple(word_list, df_youtube_sentence, "sentence", 5, simple=False)

    A word matches when it appears in ``target_column`` delimited by whitespace or by the
    start/end of the text. Words are matched literally (regex metacharacters are escaped)
    and rows whose target value is missing never match.

    source_word_list: iterable of words to search for
    df_target: dataframe to search in
    target_column: name of the text column of ``df_target``
    word_sample_num: maximum number of rows kept per word (the first ones in frame order)
    simple: True  -> result holds only ``target_column`` (rows with missing values dropped)
            False -> result holds every column of ``df_target``

    Returns a dataframe with a leading "search_string" column and a fresh 0..n-1 index;
    an empty dataframe when ``source_word_list`` is empty.
    '''
    column = f"{target_column}"
    if simple:
        df_select = df_target[[column]].dropna()
    else:
        df_select = df_target

    matches_per_word = []
    for word in source_word_list:
        # Escape the word so characters such as "+" or "." are not read as regex syntax.
        pattern = fr"(?:\s|^){re.escape(f'{word}')}(?:\s|$)"
        # na=False: a missing sentence must not count as a hit for every word.
        mask = df_select[column].str.contains(pattern, na=False)
        # copy() so that insert() works on an independent frame, not on a slice of the input.
        df_matches = df_select[mask].head(word_sample_num).copy()
        df_matches.insert(0, "search_string", word)
        matches_per_word.append(df_matches)

    if not matches_per_word:
        return pd.DataFrame()

    # One concat at the end instead of growing the result inside the loop.
    return pd.concat(matches_per_word, axis=0).reset_index(drop=True)

In [5]:
def word_group_youtube(df, search_list, target_column, sample_num, random_state=None):
    '''Pick up to ``sample_num`` rows of ``df`` for every search string.

    Example: word_group_youtube(df_youtube_sentence, search_list, "sentence", 6)

    A search string matches when it appears in ``target_column`` delimited by whitespace
    or by the start/end of the text. Strings are matched literally (regex metacharacters
    are escaped) and rows whose target value is missing never match.

    df: dataframe to search in
    search_list: iterable of words / word groups to search for
    target_column: name of the text column of ``df``
    sample_num: number of rows wanted per search string; when more rows match, a random
                sample of that size is drawn, otherwise all matching rows are returned
                in their original order
    random_state: optional seed passed to ``DataFrame.sample`` to make the draw reproducible

    Returns a dataframe with a leading "search_string" column and a fresh 0..n-1 index;
    an empty dataframe when ``search_list`` is empty.
    '''
    column = f"{target_column}"

    sampled_per_string = []
    for search in search_list:
        escaped = re.escape(f"{search}")
        pattern = fr"(?:\s|^){escaped}(?:\s|$)"
        # The mask is computed once; na=False keeps missing sentences out of the result.
        df_matches = df[df[column].str.contains(pattern, na=False)]

        # Explicit size check instead of catching the ValueError that sample() raises
        # when fewer rows exist than requested.
        if len(df_matches) > sample_num:
            df_select = df_matches.sample(sample_num, random_state=random_state)
        else:
            df_select = df_matches.copy()

        df_select.insert(0, "search_string", search)
        sampled_per_string.append(df_select)

    if not sampled_per_string:
        return pd.DataFrame()

    return pd.concat(sampled_per_string, axis=0).reset_index(drop=True)

In [6]:
def word_group_time_loc(df, search, start_sent, end_sent, sent, sent_video_id):
    '''Estimate when a search string is spoken inside its sentence.

    Example: word_group_time_loc(df_search_result, "search_string", "start_time", "end_time", "sentence", "video_id")

    The sentence duration is spread evenly over its characters and the character span of
    the first case-insensitive match is converted to a time span. The matched span
    includes the single whitespace character before/after the search string, if present.
    When the search string is not found (or the sentence is empty or not text) the
    start/end times of the whole sentence are returned for that row.

    df: dataframe holding the columns named by the remaining arguments; it is not modified
    search: column with the search string
    start_sent / end_sent: columns with the numeric start and end time of the sentence
    sent: column with the sentence text
    sent_video_id: column with the video id

    Returns a new dataframe with the same five column names, where the two time columns
    hold the estimated start and end of the search string.
    '''
    columns = [f"{search}", f"{start_sent}", f"{end_sent}", f"{sent}", f"{sent_video_id}"]

    word_time_loc_list = []
    # Iterate over plain column values so the caller's index is neither needed nor reset.
    rows = zip(*(df[column].tolist() for column in columns))
    for word, start_time, end_time, sentence, video_id in rows:
        # Fallback: the whole sentence. Without it a row with no match would reuse the
        # values computed for the previous row.
        start_loc, end_loc = start_time, end_time

        if isinstance(sentence, str) and sentence:
            pattern = fr"(?:\s|^){re.escape(f'{word}')}(?:\s|$)"
            match = re.search(pattern, sentence, re.IGNORECASE | re.UNICODE)
            if match is not None:
                time_length_ratio = (end_time - start_time) / len(sentence)
                start_loc = start_time + (match.start() * time_length_ratio)
                end_loc = start_time + (match.end() * time_length_ratio)

        word_time_loc_list.append([word, start_loc, end_loc, sentence, video_id])

    df_word_time_loc = pd.DataFrame(word_time_loc_list, columns=columns)

    return df_word_time_loc

In [7]:
def word_count_result(df, column_list, set_condition=False): # df is dataframe, column_list is list value
    '''Count how often each word occurs in the given text columns.

    Example: word_count_result(df, ["word","twogram"])

    df: dataframe holding the columns
    column_list: list of column names whose text is counted together
    set_condition: True  -> every distinct cell value of a column is counted once
                   False -> every cell is counted

    Missing cells are ignored, other cells are converted to text, and words are the
    ``\\w+`` runs of that text.

    Returns a dataframe with the columns "word" and "word_count", sorted by descending
    count; words with equal counts keep the order in which they were first seen.
    '''
    texts = []
    for column in column_list:
        values = df[f"{column}"].dropna().tolist()
        if set_condition:
            # Order-preserving de-duplication keeps the result deterministic.
            values = list(dict.fromkeys(values))
        # str() so that numeric cells do not break the join below.
        texts.extend(str(value) for value in values)

    word_list = re.findall(r"\w+", " ".join(texts))

    # Counter gives explicit column names, independent of how the installed pandas
    # version names the result of value_counts().
    df_word_count = pd.DataFrame(Counter(word_list).most_common(), columns=["word", "word_count"])

    return  df_word_count

In [None]:
def df_col_value_join_comma(df, df_columns_list):
    '''Join all values of each selected column into one comma-separated string.

    Example: df_col_value_join_comma(df_test, ["video_id","start_time","end_time"])

    df: dataframe holding the columns
    df_columns_list: list of column names to join; any number of columns is accepted

    Returns a one-row dataframe with the same column names, where each cell holds the
    values of that column converted to text and joined with ",".
    '''
    column_value_list = [
        ",".join(str(value) for value in df[f"{column}"].to_list())
        for column in df_columns_list
    ]

    # Build the row from the whole list rather than from fixed positions 0, 1 and 2.
    df_result = pd.DataFrame([column_value_list], columns=df_columns_list)

    return df_result

In [8]:
# Result folder for this notebook, built from the capitalized language name.
# NOTE(review): absolute, machine-specific location; the Excel files written by the
# "Rev 4" cell further down use bare file names and do not reference `path`.
path = f"/media/kurubal/SSD/Data Scientist/Work/Modern Ways/Project/{lang_folder.capitalize()}/\
Talk Time/Result/4-Talk Time Data Result"

# Create the folder (and missing parents); an already existing folder is not an error.
Path(path).mkdir(parents=True, exist_ok=True)

In [9]:
# Load the word table of the selected language (columns shown below: word, frequency).
df_word_all = pd.read_excel(f"/media/kurubal/SSD/Data Scientist/Work/Modern Ways/Project/{lang_folder.lower().capitalize()}/Deployment/Data/Word/Word_Merge_Preprocess.xlsx")
df_word_all

Unnamed: 0,word,frequency
0,ik,22539531
1,je,20769946
2,het,15696161
3,de,15258816
4,dat,13387137
...,...,...
331769,oorlogsgezichten,5
331770,opdrachtenlijst,5
331771,verlsaafde,5
331772,oxidatieproces,5


In [10]:
# Keep the rows word_start .. word_end-1 of the word table.
df_word_select = df_word_all.iloc[word_start:word_end,]

In [11]:
df_word_select

Unnamed: 0,word,frequency
0,ik,22539531
1,je,20769946
2,het,15696161
3,de,15258816
4,dat,13387137
...,...,...
195,genoeg,353128
196,wist,352948
197,oh,351988
198,klaar,350506


In [12]:
#df_word_select.to_excel(f"{lang_folder.capitalize()}_200_Word.xlsx", index=False)

In [13]:
# Plain list of the selected words; the cell displays how many there are.
word_list = df_word_select["word"].to_list()
len(word_list)

200

In [14]:
# Load the merged Youtube subtitle sentences (columns shown below: start_time, end_time, sentence, video_id).
df_youtube_sentence = pd.read_csv(f"/media/kurubal/SSD/Data Scientist/Work/Modern Ways/Project/Youtube/Result/{lang_folder.capitalize()}/Sentence Clean Merge/Clean_Youtube_Sentence_Merge_Result.csv")
df_youtube_sentence

Unnamed: 0,start_time,end_time,sentence,video_id
0,00:00:00.000,00:00:02.501,dit is de pangale v,BoaYsdPtJYA
1,00:00:04.501,00:00:05.434,unreal,BoaYsdPtJYA
2,00:00:08.518,00:00:13.066,dit is de eerste keer ik rijd in een supersport,BoaYsdPtJYA
3,00:00:13.066,00:00:14.316,ik heb dit in mijn hele leven,BoaYsdPtJYA
4,00:00:14.316,00:00:15.889,nog nooit eerder gedaan,BoaYsdPtJYA
...,...,...,...,...
252485,00:01:37.135,00:01:47.040,soms zitten de volwassenen met natte ogen te k...,MZUDbceIw4E
252486,00:01:47.040,00:01:49.340,dit ziet er al was super indrukwekkend uit en,MZUDbceIw4E
252487,00:01:49.340,00:01:53.600,het is nog maar een voorproefje dus kom zeker ...,MZUDbceIw4E
252488,00:01:53.600,00:01:57.914,tijdsblok te kiezen en zeker ook op tijd te ko...,MZUDbceIw4E


In [15]:
# Parse the "HH:MM:SS.mmm" strings into timedeltas so the next cell can turn them into seconds.
# NOTE(review): the columns are overwritten in place, so this cell is meant to run once per load.
df_youtube_sentence['start_time'] = pd.to_timedelta(df_youtube_sentence['start_time']) # data type converted timedelta for second 
df_youtube_sentence['end_time'] = pd.to_timedelta(df_youtube_sentence['end_time'])

In [16]:
# Convert the timedelta columns to float seconds. The vectorised .dt accessor replaces a
# Python-level call per row on a table of roughly 250k sentences.
# NOTE(review): the columns are overwritten in place, so this cell is meant to run once
# after the timedelta conversion above.
df_youtube_sentence['start_time'] = df_youtube_sentence['start_time'].dt.total_seconds()
df_youtube_sentence['end_time'] = df_youtube_sentence['end_time'].dt.total_seconds()
df_youtube_sentence

Unnamed: 0,start_time,end_time,sentence,video_id
0,0.000,2.501,dit is de pangale v,BoaYsdPtJYA
1,4.501,5.434,unreal,BoaYsdPtJYA
2,8.518,13.066,dit is de eerste keer ik rijd in een supersport,BoaYsdPtJYA
3,13.066,14.316,ik heb dit in mijn hele leven,BoaYsdPtJYA
4,14.316,15.889,nog nooit eerder gedaan,BoaYsdPtJYA
...,...,...,...,...
252485,97.135,107.040,soms zitten de volwassenen met natte ogen te k...,MZUDbceIw4E
252486,107.040,109.340,dit ziet er al was super indrukwekkend uit en,MZUDbceIw4E
252487,109.340,113.600,het is nog maar een voorproefje dus kom zeker ...,MZUDbceIw4E
252488,113.600,117.914,tijdsblok te kiezen en zeker ook op tijd te ko...,MZUDbceIw4E


In [17]:
# Single-row template for the filler clip that the playlist cells below append after a
# spoken clip; those cells adjust its end_time per clip.
# Other video ids that were considered:
#   mUf7VNqChac => black screen
#   0_CDMstFg7M => 10 s (presumably the clip length - TODO confirm)
#   bj1JRuyYeco => 20 s (presumably the clip length - TODO confirm)
#   cElhIDdGz7M => screensaver
default_video_id = "Q-8I-uMUMYA"
df_link_default = pd.DataFrame(data=[["repeat",5,7,"repeat_again",f"{default_video_id}",f"https://www.youtube.com/watch?v={default_video_id}&t=0s"]], columns=["search_string","start_time","end_time","sentence","video_id","video_url"])
df_link_default

Unnamed: 0,search_string,start_time,end_time,sentence,video_id,video_url
0,repeat,5,7,repeat_again,Q-8I-uMUMYA,https://www.youtube.com/watch?v=Q-8I-uMUMYA&t=0s


In [18]:
# Load the prepared single-word clips: one row per (word, Youtube clip) with start/end time and URL.
# The file name is assembled from word_end, sample_num and time_shift.
df_word_link = pd.read_excel(f"/media/kurubal/SSD/Data Scientist/Work/Modern Ways/Project/{lang_folder.capitalize()}/Talk Time/Data/3-Talk Time Data Prepare/{lang_folder.capitalize()}_{word_end}_Word_{sample_num}_Youtube_{time_shift}s_Timeshift_For_Talk_Time_Result_Manuel.xlsx")
df_word_link

Unnamed: 0,search_string,start_time,end_time,sentence,video_id,video_url
0,ik,19,21,we zijn een tijdje geleden opgericht maar wat ...,8jmOuLksBJ0,https://www.youtube.com/watch?v=8jmOuLksBJ0&t=19s
1,ik,2213,2214,ik zal ophouden over de oorzaak van die schaar...,QAjt9liE_a4,https://www.youtube.com/watch?v=QAjt9liE_a4&t=...
2,ik,547,549,ik vertel het verhaal aan tom en we passen de ...,HKCKdkGrazU,https://www.youtube.com/watch?v=HKCKdkGrazU&t=...
3,ik,38,39,ik denk dat dat de maatregelen zijn die je nie...,KgYv5wjScaM,https://www.youtube.com/watch?v=KgYv5wjScaM&t=38s
4,ik,1057,1059,ja dat denk ik wel,gFA9Q_RrOVo,https://www.youtube.com/watch?v=gFA9Q_RrOVo&t=...
...,...,...,...,...,...,...
995,neem,1030,1033,neem maar mee,Gj1X8jth_Bw,https://www.youtube.com/watch?v=Gj1X8jth_Bw&t=...
996,neem,313,315,op stoom geholpen dus mijn advies neem de tijd...,lYKUmLypP5s,https://www.youtube.com/watch?v=lYKUmLypP5s&t=...
997,neem,230,232,ik neem de volgende dan ga ik deze,ZqkfywfGIIY,https://www.youtube.com/watch?v=ZqkfywfGIIY&t=...
998,neem,48,49,en neem gerust een dekentje mee extra gezellig...,ELg3rJ5NeRk,https://www.youtube.com/watch?v=ELg3rJ5NeRk&t=48s


In [19]:
# Show every row whose search_string already appeared in an earlier row, i.e. all clips of a word except its first.
df_word_link[df_word_link["search_string"].duplicated()]

Unnamed: 0,search_string,start_time,end_time,sentence,video_id,video_url
1,ik,2213,2214,ik zal ophouden over de oorzaak van die schaar...,QAjt9liE_a4,https://www.youtube.com/watch?v=QAjt9liE_a4&t=...
2,ik,547,549,ik vertel het verhaal aan tom en we passen de ...,HKCKdkGrazU,https://www.youtube.com/watch?v=HKCKdkGrazU&t=...
3,ik,38,39,ik denk dat dat de maatregelen zijn die je nie...,KgYv5wjScaM,https://www.youtube.com/watch?v=KgYv5wjScaM&t=38s
4,ik,1057,1059,ja dat denk ik wel,gFA9Q_RrOVo,https://www.youtube.com/watch?v=gFA9Q_RrOVo&t=...
6,je,12415,12416,maar het gebruik ervan dat kun je gebieden dus...,uJhEbO42qRo,https://www.youtube.com/watch?v=uJhEbO42qRo&t=...
...,...,...,...,...,...,...
994,klaar,478,480,als ik klaar ben met werken dan zit ik op de b...,P9u2BkJamhg,https://www.youtube.com/watch?v=P9u2BkJamhg&t=...
996,neem,313,315,op stoom geholpen dus mijn advies neem de tijd...,lYKUmLypP5s,https://www.youtube.com/watch?v=lYKUmLypP5s&t=...
997,neem,230,232,ik neem de volgende dan ga ik deze,ZqkfywfGIIY,https://www.youtube.com/watch?v=ZqkfywfGIIY&t=...
998,neem,48,49,en neem gerust een dekentje mee extra gezellig...,ELg3rJ5NeRk,https://www.youtube.com/watch?v=ELg3rJ5NeRk&t=48s


In [20]:
# Load the prepared two-word (twogram) clips; same column layout as df_word_link.
df_twogram_link = pd.read_excel(f"/media/kurubal/SSD/Data Scientist/Work/Modern Ways/Project/{lang_folder.capitalize()}/Talk Time/Data/3-Talk Time Data Prepare/{lang_folder.capitalize()}_Twogram_With_{word_end}_Word_{sample_num}_Youtube_{time_shift}s_Timeshift_For_Talk_Time_Result_Manuel.xlsx")
df_twogram_link

Unnamed: 0,search_string,start_time,end_time,sentence,video_id,video_url
0,weet je,1434,1436,weet je wat lopen we daar binnen,fPzfhneNmcc,https://www.youtube.com/watch?v=fPzfhneNmcc&t=...
1,hoe dan,137,139,stukje belgische geschiedenis en hoe dan de ch...,Dd6N0ZdQTvc,https://www.youtube.com/watch?v=Dd6N0ZdQTvc&t=...
2,ik ga,1137,1139,ik ga daarop klikken en dan krijg ik de mogeli...,p4Xe249nsTU,https://www.youtube.com/watch?v=p4Xe249nsTU&t=...
3,het is,68,70,want het is groen dat deze wijken met elkaar k...,Kh9znOzlKPw,https://www.youtube.com/watch?v=Kh9znOzlKPw&t=68s
4,dat is,215,216,dat is eigenlijk een levenswerk voor u,kRLm5pQi_xE,https://www.youtube.com/watch?v=kRLm5pQi_xE&t=...
5,ik ben,68,69,nooit in positieve zin en daarom ben ik op een...,8oxf7nVWqIE,https://www.youtube.com/watch?v=8oxf7nVWqIE&t=68s
6,dit is,1381,1383,moet ik zeggen dat als ik even voor ons beide ...,XhcjYIknnfA,https://www.youtube.com/watch?v=XhcjYIknnfA&t=...
7,dat kan,134,136,als je vragen hebt of je bent een beetje bang ...,rWNpd_54cqM,https://www.youtube.com/watch?v=rWNpd_54cqM&t=...
8,ik denk,563,565,ik denk daar is men dan ook wel weer mee bezig,qmqWWRrEvZk,https://www.youtube.com/watch?v=qmqWWRrEvZk&t=...
9,ik heb,129,132,hoe kijk jij daarnaar amber ik heb die spotjes...,Ud7n8yMn_h8,https://www.youtube.com/watch?v=Ud7n8yMn_h8&t=...


In [21]:
# Load the prepared three-word (threegram) clips; same column layout as df_word_link.
df_threegram_link = pd.read_excel(f"/media/kurubal/SSD/Data Scientist/Work/Modern Ways/Project/{lang_folder.capitalize()}/Talk Time/Data/3-Talk Time Data Prepare/{lang_folder.capitalize()}_Threegram_With_{word_end}_Word_{sample_num}_Youtube_{time_shift}s_Timeshift_For_Talk_Time_Result_Manuel.xlsx")
df_threegram_link

Unnamed: 0,search_string,start_time,end_time,sentence,video_id,video_url
0,weet je wat,4406,4408,meneer weet u dat u meneer weet u zo maar weet...,lApJy0bJNk0,https://www.youtube.com/watch?v=lApJy0bJNk0&t=...
1,en dat is,2593,2595,van opstellen en dat is ook de reden dat wij d...,idTEcq_P1WY,https://www.youtube.com/watch?v=idTEcq_P1WY&t=...
2,hoe dan ook,5212,5214,en des tilleuls hoe dan ook opgenomen te,R4Q1cNbNKU0,https://www.youtube.com/watch?v=R4Q1cNbNKU0&t=...
3,ik denk dat,228,230,in het onderwijs bijvoorbeeld en ik denk dat d...,S0MhQFLZGYg,https://www.youtube.com/watch?v=S0MhQFLZGYg&t=...
4,dat is wat,61,64,intense weekenden dat is wat ik zeggen,rdLMBm1Rtcs,https://www.youtube.com/watch?v=rdLMBm1Rtcs&t=61s
5,ik ben ik,430,432,alleen ben ik nog wel heel lang bezig en ik be...,NxX9X3tkSXw,https://www.youtube.com/watch?v=NxX9X3tkSXw&t=...
6,je weet wat,264,267,je weet wat te doen dankjewel,tHFynkakpVk,https://www.youtube.com/watch?v=tHFynkakpVk&t=...
7,nog veel meer,15,17,wie er belt waarom diegene belt en nog veel meer,Hvc226NRmSQ,https://www.youtube.com/watch?v=Hvc226NRmSQ&t=15s
8,wat het is,3928,3930,mensen willen aansluiten nou weet je wat het i...,lApJy0bJNk0,https://www.youtube.com/watch?v=lApJy0bJNk0&t=...
9,waar ik ben,116,118,waar ik ben ik zou mijn fiets moeten en ik ga ...,BoaYsdPtJYA,https://www.youtube.com/watch?v=BoaYsdPtJYA&t=...


In [22]:
# Load the sentence-level clips: search_string is the word group, sentence the full subtitle line it was found in.
df_sentence_link = pd.read_excel(f"/media/kurubal/SSD/Data Scientist/Work/Modern Ways/Project/{lang_folder.capitalize()}/\
Talk Time/Result/2-Adjust Word Group In Youtube Sentence Word Usage Analysis/{lang_folder.capitalize()}_{word_end}_Word_Group_In_Youtube_Sentence_Sample_Manuel.xlsx")
df_sentence_link

Unnamed: 0,search_string,start_time,end_time,sentence,video_id,video_url
0,van wat zou dat nou voor de,141,143,maar ik heb wel beter nagedacht van wat zou da...,JUQTVKIBFns,https://www.youtube.com/watch?v=JUQTVKIBFns&t=...
1,en we daar hebben wij een,167,168,en we daar hebben wij een efteling variant van...,JUQTVKIBFns,https://www.youtube.com/watch?v=JUQTVKIBFns&t=...
2,en we hebben weer een heel,332,334,het staat natuurlijk gewoon in de efteling en ...,JUQTVKIBFns,https://www.youtube.com/watch?v=JUQTVKIBFns&t=...
3,me af of dat wel goed zou komen,230,232,dus ik vroeg me af of dat wel goed zou komen,UltPsmBVTV0,https://www.youtube.com/watch?v=UltPsmBVTV0&t=...
4,dus ja er zijn veel mensen daar,480,483,ah het is dit hele complex dus ja er zijn veel...,UltPsmBVTV0,https://www.youtube.com/watch?v=UltPsmBVTV0&t=...
5,kunnen doen ja dan moet de,194,195,maar je moet het snel kunnen doen ja dan moet ...,4YSldeXmuXw,https://www.youtube.com/watch?v=4YSldeXmuXw&t=...
6,we hebben hem toch in een,274,275,we hebben hem toch in een korte deadline neerg...,KmcDJ7GyLNk,https://www.youtube.com/watch?v=KmcDJ7GyLNk&t=...
7,maar door me in hem te,102,103,ik kende alleen maar het verhaal van max amp m...,IJ2LOTPGSvA,https://www.youtube.com/watch?v=IJ2LOTPGSvA&t=...
8,en moeten wij hier weg zijn,65,67,en moeten wij hier weg zijn,b_HmOS8lNl4,https://www.youtube.com/watch?v=b_HmOS8lNl4&t=65s
9,dan moeten wij hier weg zijn anders,66,68,dan moeten wij hier weg zijn anders krijgen we...,b_HmOS8lNl4,https://www.youtube.com/watch?v=b_HmOS8lNl4&t=66s


In [23]:
# Count how often each word occurs across all sentence search strings (every row counted, no de-duplication).
df_sentence_link_word_count = word_count_result(df_sentence_link, ["search_string"], set_condition=False)
df_sentence_link_word_count

Unnamed: 0,word,word_count
0,een,6
1,en,5
2,te,4
3,de,4
4,hier,4
...,...,...
61,zien,1
62,werd,1
63,om,1
64,veel,1


In [24]:
# Word sets for the coverage check in the next cell:
# words that occur in the sentence search strings vs. the selected word list.
set_sent_link_word = set(df_sentence_link_word_count["word"])
set_word_list = set(word_list)

In [25]:
# Selected words that do not occur in any sentence search string.
set_word_list.difference(set_sent_link_word)

{'aan',
 'al',
 'alleen',
 'allemaal',
 'alles',
 'als',
 'andere',
 'bedankt',
 'ben',
 'bent',
 'beter',
 'bij',
 'dacht',
 'dag',
 'dank',
 'denk',
 'deze',
 'doe',
 'dood',
 'echt',
 'erg',
 'ga',
 'gaan',
 'gaat',
 'gedaan',
 'geef',
 'geen',
 'geld',
 'genoeg',
 'gewoon',
 'gezien',
 'h',
 'haar',
 'had',
 'heb',
 'hebt',
 'heeft',
 'helpen',
 'het',
 'hij',
 'hoe',
 'hou',
 'hun',
 'iedereen',
 'iemand',
 'iets',
 'ik',
 'is',
 'jaar',
 'je',
 'jij',
 'jou',
 'jouw',
 'jullie',
 'kan',
 'keer',
 'kijk',
 'klaar',
 'komt',
 'kon',
 'kunt',
 'laat',
 'laten',
 'leven',
 'm',
 'mag',
 'maken',
 'man',
 'meer',
 'met',
 'mijn',
 'misschien',
 'moeder',
 'mr',
 'n',
 'na',
 'naar',
 'neem',
 'net',
 'niemand',
 'niks',
 'nodig',
 'nooit',
 'nu',
 'oh',
 'ok',
 'onze',
 'over',
 'praten',
 's',
 'spijt',
 't',
 'tegen',
 'terug',
 'tijd',
 'toen',
 'tot',
 'twee',
 'u',
 'uit',
 'uw',
 'vader',
 'vind',
 'vinden',
 'vrouw',
 'waar',
 'waarom',
 'wacht',
 'waren',
 'was',
 'weet',
 'we

In [34]:
# All sentence search strings as a plain list.
# NOTE(review): this cell and the two after it carry the execution counts 34, 36 and 32,
# so they were not run in the order in which they appear.
sent_string_list = df_sentence_link["search_string"].to_list()
sent_string_list

['van wat zou dat nou voor de',
 'en we daar hebben wij een',
 'en we hebben weer een heel',
 'me af of dat wel goed zou komen',
 'dus ja er zijn veel mensen daar',
 'kunnen doen ja dan moet de',
 'we hebben hem toch in een',
 'maar door me in hem te',
 'en moeten wij hier weg zijn',
 'dan moeten wij hier weg zijn anders',
 'dus hier staat er een en binnen staat er nog een',
 'ook nog wel eens in de',
 'om te weten of zij nog',
 'nee die hebben daar niets mee van doen dat doet de',
 'maar dat werd hem niet en',
 'kom dan in een huis omdat',
 'staat op wat voor mij op dit',
 'ook hier nog even te zien',
 'die ons toch altijd weer weten te']

In [36]:
# Keep only the first 10 sentence search strings; this rebinds sent_string_list from the previous cell.
sent_string_list = df_sentence_link["search_string"][:10].to_list()
sent_string_list

['van wat zou dat nou voor de',
 'en we daar hebben wij een',
 'en we hebben weer een heel',
 'me af of dat wel goed zou komen',
 'dus ja er zijn veel mensen daar',
 'kunnen doen ja dan moet de',
 'we hebben hem toch in een',
 'maar door me in hem te',
 'en moeten wij hier weg zijn',
 'dan moeten wij hier weg zijn anders']

In [32]:
# Peek at list positions 18..24. The stored output has one item because it was produced
# while sent_string_list still held the full list; with the 10-item list from the cell
# above this slice is empty.
sent_string_list[18:25]

['die ons toch altijd weer weten te']

In [None]:
#Rev 4
# Builds the talk-time clip sequence for the first `total_sent_num` sentences and writes one
# pair of Excel files per part of `string_step_num` sentences.
# Sequence per sentence:
#   1. every word of the sentence: its first clip `repeat_count` times, then each of its other clips once
#   2. the sentence clip `repeat_count` times
#   3. one randomly drawn clip per word, then every matching twogram, every matching threegram
#      and the sentence itself - each followed by the filler clip from df_link_default
# NOTE(review): the random draw is not seeded, so repeated runs give different word clips.
# Parameter
part_num = 1
total_sent_num = 10 
string_step_num = 5
string_start_num = 0
string_end_num = string_step_num
repeat_count = 3             # repetitions of the first word clip and of the sentence clip
min_pause_seconds = 4        # filler length used for clips shorter than this
pause_padding_seconds = 1.0  # filler length = clip length + this, for all other clips

# The sentence list is identical for every part, so it is built once.
sent_string_list = df_sentence_link["search_string"][:total_sent_num].to_list()

# Ceiling division on the number of sentences actually available: range() needs an int
# (the plain "/" quotient is a float and raises TypeError) and a shorter last part is kept.
for_loop_repeat = -(-len(sent_string_list) // string_step_num)

for part in range(for_loop_repeat):
    result_frames = []  # clips in playback order; concatenated once per part

    for sent in sent_string_list[string_start_num:string_end_num]:

        # words
        sent_words = word_tokenize(sent)

        # word repeat: first clip of the word several times, remaining clips once each
        for sent_word in sent_words:
            df_word_matches = df_word_link[df_word_link["search_string"] == sent_word]
            if df_word_matches.empty:
                continue
            result_frames.extend([df_word_matches.head(1)] * repeat_count)
            if len(df_word_matches) > 1:
                result_frames.append(df_word_matches.iloc[1:])

        # twogram
        twogram_list = [" ".join(x) for x in ngrams(sent.split(), 2)]

        # threegram
        threegram_list = [" ".join(y) for y in ngrams(sent.split(), 3)]

        # begin sent repeat
        df_sent_matches = df_sentence_link[df_sentence_link["search_string"] == sent]
        result_frames.extend([df_sent_matches] * repeat_count)

        # Clip groups that get a filler clip after every row, in playback order:
        # one random clip per word, twograms, threegrams, the sentence itself.
        clip_groups = []
        for word in sent_words:
            df_word_matches = df_word_link[df_word_link["search_string"] == word]
            if df_word_matches.empty:
                # No clip for this word: skip it instead of re-using the previous word's clip.
                continue
            clip_groups.append(df_word_matches.sample(1))
        clip_groups.append(df_twogram_link[df_twogram_link["search_string"].isin(twogram_list)])
        clip_groups.append(df_threegram_link[df_threegram_link["search_string"].isin(threegram_list)])
        clip_groups.append(df_sent_matches)

        for df_clips in clip_groups:
            for clip_pos in range(len(df_clips)):
                df_clip = df_clips.iloc[[clip_pos]]
                try:
                    clip_length = df_clip["end_time"].iloc[0] - df_clip["start_time"].iloc[0]
                    if clip_length < min_pause_seconds:
                        pause_length = min_pause_seconds
                    else:
                        pause_length = clip_length + pause_padding_seconds
                except TypeError:
                    # Non-numeric times: leave this clip out, but say so.
                    print(f"Skipped clip with non-numeric times: {df_clip['search_string'].iloc[0]}")
                    continue
                # Work on a copy so the shared single-row template keeps its original end_time.
                df_pause = df_link_default.copy()
                df_pause["end_time"] = df_pause["start_time"] + pause_length
                result_frames.extend([df_clip, df_pause])

    df_result = pd.concat(result_frames, axis=0).reset_index(drop=True)
    df_result_join = df_col_value_join_comma(df_result, ["video_id","start_time","end_time"])
    # NOTE(review): both files are written to the current working directory.
    df_result.to_excel(f"{lang_folder.capitalize()}_{word_end}_Word_Talk_Time{part_num}.xlsx", index=False)
    df_result_join.to_excel(f"{lang_folder.capitalize()}_{word_end}_Word_Talk_Time1_Join{part_num}.xlsx", index=False)

    string_start_num += string_step_num
    string_end_num += string_step_num
    part_num += 1  

In [116]:
#Rev 3
# Builds the talk-time clip sequence for every sentence in df_sentence_link and keeps it in
# df_result (nothing is written to disk).
# Sequence per sentence:
#   1. every word of the sentence: its first clip `repeat_count` times, then each of its other clips once
#   2. the sentence clip `repeat_count` times
#   3. one randomly drawn clip per word, then every matching twogram, every matching threegram
#      and the sentence itself - each followed by the filler clip from df_link_default
# NOTE(review): the random draw is not seeded, so repeated runs give different word clips.
repeat_count = 3             # repetitions of the first word clip and of the sentence clip
min_pause_seconds = 4        # filler length used for clips shorter than this
pause_padding_seconds = 1.0  # filler length = clip length + this, for all other clips

result_frames = []  # clips in playback order; concatenated once at the end
for sent in df_sentence_link["search_string"]:

    # words
    sent_words = word_tokenize(sent)

    # word repeat: first clip of the word several times, remaining clips once each
    for sent_word in sent_words:
        df_word_matches = df_word_link[df_word_link["search_string"] == sent_word]
        if df_word_matches.empty:
            continue
        result_frames.extend([df_word_matches.head(1)] * repeat_count)
        if len(df_word_matches) > 1:
            result_frames.append(df_word_matches.iloc[1:])

    # twogram
    twogram_list = [" ".join(x) for x in ngrams(sent.split(), 2)]

    # threegram
    threegram_list = [" ".join(y) for y in ngrams(sent.split(), 3)]

    # begin sent repeat
    df_sent_matches = df_sentence_link[df_sentence_link["search_string"] == sent]
    result_frames.extend([df_sent_matches] * repeat_count)

    # Clip groups that get a filler clip after every row, in playback order:
    # one random clip per word, twograms, threegrams, the sentence itself.
    clip_groups = []
    for word in sent_words:
        df_word_matches = df_word_link[df_word_link["search_string"] == word]
        if df_word_matches.empty:
            # No clip for this word: skip it instead of re-using the previous word's clip.
            continue
        clip_groups.append(df_word_matches.sample(1))
    clip_groups.append(df_twogram_link[df_twogram_link["search_string"].isin(twogram_list)])
    clip_groups.append(df_threegram_link[df_threegram_link["search_string"].isin(threegram_list)])
    clip_groups.append(df_sent_matches)

    for df_clips in clip_groups:
        for clip_pos in range(len(df_clips)):
            df_clip = df_clips.iloc[[clip_pos]]
            try:
                clip_length = df_clip["end_time"].iloc[0] - df_clip["start_time"].iloc[0]
                if clip_length < min_pause_seconds:
                    pause_length = min_pause_seconds
                else:
                    pause_length = clip_length + pause_padding_seconds
            except TypeError:
                # Non-numeric times: leave this clip out, but say so.
                print(f"Skipped clip with non-numeric times: {df_clip['search_string'].iloc[0]}")
                continue
            # Work on a copy so the shared single-row template keeps its original end_time.
            df_pause = df_link_default.copy()
            df_pause["end_time"] = df_pause["start_time"] + pause_length
            result_frames.extend([df_clip, df_pause])

# pd.concat rejects an empty list, so fall back to an empty frame when nothing was collected.
df_result = pd.concat(result_frames, axis=0) if result_frames else pd.DataFrame()
df_result = df_result.reset_index(drop=True)

In [117]:
##Rev 2
## used for multi search result and with word repeat that repeat 7 times same word
##twogram_link_list = df_twogram_link["search_string"].to_list()
##threegram_link_list = df_threegram_link["search_string"].to_list()
#df_result = pd.DataFrame()
#for sent in df_sentence_link["search_string"]:
#
#    # words
#    sent_words = word_tokenize(sent)
#
#    # word repeat (word search result should one result each words)
#    for sent_word in sent_words:
#        df_word_search_repeat1 = df_word_link[df_word_link["search_string"] == sent_word]
#        for word_repeat_num in range(7):
#            df_result = pd.concat([df_result,df_word_search_repeat1], axis=0)
#
#    # twogram
#    twogram_zip = ngrams(sent.split(), 2)
#    twogram_list = [" ".join(x) for x in twogram_zip]
#    #df_twogram_var = pd.DataFrame(data=twogram, columns=["twogram"])
#    
#    # threegram
#    threegram_zip = ngrams(sent.split(), 3)
#    threegram_list = [" ".join(y) for y in threegram_zip]
#    #df_threegram_var = pd.DataFrame(data=threegram, columns=["threegram"])
#
#    # begin sent repeat 
#    df_sent_search_begin = df_sentence_link[df_sentence_link["search_string"] == sent]
#    for sent_repeat_num in range(3):
#        df_result = pd.concat([df_result,df_sent_search_begin], axis=0)
#
#    # word result
#    for word in sent_words:        
#        df_word_search_var = df_word_link[df_word_link["search_string"] == word]
#        df_word_search_var.reset_index(drop=True, inplace=True)
#        for i in range(len(df_word_search_var)):
#            df_link_default_var = df_link_default
#            try:
#                word_time_diff_var = df_word_search_var.loc[i,"end_time"] - df_word_search_var.loc[i,"start_time"]
#                if word_time_diff_var < 4:
#                    word_time_diff_var2 = 4
#                else:
#                    word_time_diff_var2 = word_time_diff_var+1.0                                       
#                df_link_default_var.loc[0,"end_time"] = df_link_default_var.loc[0,"start_time"] + word_time_diff_var2
#                df_result = pd.concat([df_result,df_word_search_var.iloc[[i,]]], axis=0)
#                df_result = pd.concat([df_result,df_link_default_var], axis=0)
#            except:
#                pass
#
#    # twogram result
#    df_twogram_search_var = df_twogram_link[df_twogram_link["search_string"].isin(twogram_list)]
#    df_twogram_search_var.reset_index(drop=True, inplace=True)
#    for j in range(len(df_twogram_search_var)):
#        df_link_default_var = df_link_default
#        try:
#            twogram_time_diff_var = df_twogram_search_var.loc[j,"end_time"] - df_twogram_search_var.loc[j,"start_time"]
#            if twogram_time_diff_var < 4:
#                twogram_time_diff_var2 = 4
#            else:
#                twogram_time_diff_var2 = twogram_time_diff_var+1.0 
#            df_link_default_var.loc[0,"end_time"] = df_link_default_var.loc[0,"start_time"] + twogram_time_diff_var2
#            df_result = pd.concat([df_result,df_twogram_search_var.iloc[[j,]]], axis=0)
#            df_result = pd.concat([df_result,df_link_default_var], axis=0)
#        except:
#            pass
#
#    # threegram result
#    df_threegram_search_var = df_threegram_link[df_threegram_link["search_string"].isin(threegram_list)]
#    df_threegram_search_var.reset_index(drop=True, inplace=True)
#    for k in range(len(df_threegram_search_var)):
#        df_link_default_var = df_link_default
#        try:
#            threegram_time_diff_var = df_threegram_search_var.loc[k,"end_time"] - df_threegram_search_var.loc[k,"start_time"]
#            if threegram_time_diff_var < 4:
#                threegram_time_diff_var2 = 4
#            else:
#                threegram_time_diff_var2 = threegram_time_diff_var+1.0 
#            df_link_default_var.loc[0,"end_time"] = df_link_default_var.loc[0,"start_time"] + threegram_time_diff_var2
#            df_result = pd.concat([df_result,df_threegram_search_var.iloc[[k,]]], axis=0)
#            df_result = pd.concat([df_result,df_link_default_var], axis=0)
#        except:
#            pass
#
#    # sentence added
#    df_sent_search_var = df_sentence_link[df_sentence_link["search_string"] == sent]
#    df_sent_search_var.reset_index(drop=True, inplace=True)
#    for l in range(len(df_sent_search_var)):
#        df_link_default_var = df_link_default
#        try:
#            sent_time_diff_var = df_sent_search_var.loc[l,"end_time"] - df_sent_search_var.loc[l,"start_time"]
#            if sent_time_diff_var < 4:
#                sent_time_diff_var2 = 4
#            else:
#                sent_time_diff_var2 = sent_time_diff_var+1.0 
#            df_link_default_var.loc[0,"end_time"] = df_link_default_var.loc[0,"start_time"] + sent_time_diff_var2
#            df_result = pd.concat([df_result,df_sent_search_var.iloc[[l,]]], axis=0)
#            df_result = pd.concat([df_result,df_link_default_var], axis=0)
#        except:
#            pass
#
#df_result.reset_index(drop=True, inplace=True)   

In [118]:
## Rev 1 (disabled)
## Handles multiple search results per item; no word repetition, only the sentence rows are repeated 3 times.
##twogram_link_list = df_twogram_link["search_string"].to_list()
##threegram_link_list = df_threegram_link["search_string"].to_list()
#df_result = pd.DataFrame()
#for sent in df_sentence_link["search_string"]:
#
#    # words
#    sent_words = word_tokenize(sent)
#
#    # twogram
#    twogram_zip = ngrams(sent.split(), 2)
#    twogram_list = [" ".join(x) for x in twogram_zip]
#    #df_twogram_var = pd.DataFrame(data=twogram, columns=["twogram"])
#    
#    # threegram
#    threegram_zip = ngrams(sent.split(), 3)
#    threegram_list = [" ".join(y) for y in threegram_zip]
#    #df_threegram_var = pd.DataFrame(data=threegram, columns=["threegram"])
#
#    # begin sent repeat 
#    df_sent_search_begin = df_sentence_link[df_sentence_link["search_string"] == sent]
#    for repeat_num in range(3):
#        df_result = pd.concat([df_result,df_sent_search_begin], axis=0)
#
#    # word result
#    for word in sent_words:        
#        df_word_search_var = df_word_link[df_word_link["search_string"] == word]
#        df_word_search_var.reset_index(drop=True, inplace=True)
#        for i in range(len(df_word_search_var)):
#            df_link_default_var = df_link_default
#            try:
#                word_time_diff_var = df_word_search_var.loc[i,"end_time"] - df_word_search_var.loc[i,"start_time"]
#                if word_time_diff_var < 4:
#                    word_time_diff_var2 = 4
#                else:
#                    word_time_diff_var2 = word_time_diff_var+1.0                                       
#                df_link_default_var.loc[0,"end_time"] = df_link_default_var.loc[0,"start_time"] + word_time_diff_var2
#                df_result = pd.concat([df_result,df_word_search_var.iloc[[i,]]], axis=0)
#                df_result = pd.concat([df_result,df_link_default_var], axis=0)
#            except:
#                pass
#
#    # twogram result
#    df_twogram_search_var = df_twogram_link[df_twogram_link["search_string"].isin(twogram_list)]
#    df_twogram_search_var.reset_index(drop=True, inplace=True)
#    for j in range(len(df_twogram_search_var)):
#        df_link_default_var = df_link_default
#        try:
#            twogram_time_diff_var = df_twogram_search_var.loc[j,"end_time"] - df_twogram_search_var.loc[j,"start_time"]
#            if twogram_time_diff_var < 4:
#                twogram_time_diff_var2 = 4
#            else:
#                twogram_time_diff_var2 = twogram_time_diff_var+1.0 
#            df_link_default_var.loc[0,"end_time"] = df_link_default_var.loc[0,"start_time"] + twogram_time_diff_var2
#            df_result = pd.concat([df_result,df_twogram_search_var.iloc[[j,]]], axis=0)
#            df_result = pd.concat([df_result,df_link_default_var], axis=0)
#        except:
#            pass
#
#    # threegram result
#    df_threegram_search_var = df_threegram_link[df_threegram_link["search_string"].isin(threegram_list)]
#    df_threegram_search_var.reset_index(drop=True, inplace=True)
#    for k in range(len(df_threegram_search_var)):
#        df_link_default_var = df_link_default
#        try:
#            threegram_time_diff_var = df_threegram_search_var.loc[k,"end_time"] - df_threegram_search_var.loc[k,"start_time"]
#            if threegram_time_diff_var < 4:
#                threegram_time_diff_var2 = 4
#            else:
#                threegram_time_diff_var2 = threegram_time_diff_var+1.0 
#            df_link_default_var.loc[0,"end_time"] = df_link_default_var.loc[0,"start_time"] + threegram_time_diff_var2
#            df_result = pd.concat([df_result,df_threegram_search_var.iloc[[k,]]], axis=0)
#            df_result = pd.concat([df_result,df_link_default_var], axis=0)
#        except:
#            pass
#
#    # sentence added
#    df_sent_search_var = df_sentence_link[df_sentence_link["search_string"] == sent]
#    df_sent_search_var.reset_index(drop=True, inplace=True)
#    for l in range(len(df_sent_search_var)):
#        df_link_default_var = df_link_default
#        try:
#            sent_time_diff_var = df_sent_search_var.loc[l,"end_time"] - df_sent_search_var.loc[l,"start_time"]
#            if sent_time_diff_var < 4:
#                sent_time_diff_var2 = 4
#            else:
#                sent_time_diff_var2 = sent_time_diff_var+1.0 
#            df_link_default_var.loc[0,"end_time"] = df_link_default_var.loc[0,"start_time"] + sent_time_diff_var2
#            df_result = pd.concat([df_result,df_sent_search_var.iloc[[l,]]], axis=0)
#            df_result = pd.concat([df_result,df_link_default_var], axis=0)
#        except:
#            pass
#
#df_result.reset_index(drop=True, inplace=True)   

In [119]:
# Display the assembled result frame (pandas truncates the rendering of long frames).
df_result

Unnamed: 0,search_string,start_time,end_time,sentence,video_id,video_url
0,van,376,377,en merk je dat je weer kunt genieten van je leven,Imj3JLDlIZE,https://www.youtube.com/watch?v=Imj3JLDlIZE&t=...
1,van,376,377,en merk je dat je weer kunt genieten van je leven,Imj3JLDlIZE,https://www.youtube.com/watch?v=Imj3JLDlIZE&t=...
2,van,376,377,en merk je dat je weer kunt genieten van je leven,Imj3JLDlIZE,https://www.youtube.com/watch?v=Imj3JLDlIZE&t=...
3,van,60,62,om de kwaliteit van je producten te garanderen,mgH9ulHIsd8,https://www.youtube.com/watch?v=mgH9ulHIsd8&t=60s
4,van,612,614,ik binnen een halfjaar een woning had terwijl ...,miouy02UT58,https://www.youtube.com/watch?v=miouy02UT58&t=...
...,...,...,...,...,...,...
1279,repeat,5,9,repeat_again,Q-8I-uMUMYA,https://www.youtube.com/watch?v=Q-8I-uMUMYA&t=0s
1280,te,59,60,om de berg weer af te gaan nu is het alleen ja...,SicFPD83ToU,https://www.youtube.com/watch?v=SicFPD83ToU&t=59s
1281,repeat,5,9,repeat_again,Q-8I-uMUMYA,https://www.youtube.com/watch?v=Q-8I-uMUMYA&t=0s
1282,die ons toch altijd weer weten te,639,641,die ons toch altijd weer weten te verrassen,aWOJx8liOyU,https://www.youtube.com/watch?v=aWOJx8liOyU&t=...


In [120]:
# Largest end_time among the rows whose search_string is "repeat".
df_result[df_result["search_string"] == "repeat"]["end_time"].max()

9

In [121]:
# Number of non-null end_time values among the rows whose search_string is "repeat".
# NOTE(review): this rebinds `sample_num`, which the configuration cell at the top of the
# notebook also sets (sample_num = 1); any cell re-run after this one sees the count instead.
sample_num = df_result[df_result["search_string"] == "repeat"]["end_time"].count()
sample_num

155

In [122]:
# Twice the summed end_time of the "repeat" rows, minus sample_num, divided by 60.
# NOTE(review): presumably an estimate of the total playback length in minutes - confirm the intent.
((df_result[df_result["search_string"] == "repeat"]["end_time"].sum()*2)-sample_num)/60

43.916666666666664

In [123]:
# Write the complete result frame to an Excel file named after the language folder and word_end.
df_result.to_excel(f"{lang_folder.capitalize()}_{word_end}_Word_Talk_Time_All.xlsx", index=False)

In [32]:
# used for divide part
# Split df_result into `divide_num` consecutive slices and, for each slice, write a
# one-row workbook whose cells hold the comma-joined video ids, start times and end times.
from math import ceil

divide_num = 12
# Slice size is derived from divide_num (previously a second hard-coded 12), so changing
# the number of parts cannot silently drop or duplicate rows.
part_sample_num = ceil(len(df_result) / divide_num)

for part_index in range(divide_num):
    sample_num_start = part_index * part_sample_num
    sample_num_end = sample_num_start + part_sample_num
    # Slices past the end of df_result are simply empty; a workbook is still written for them.
    df_var = df_result.iloc[sample_num_start:sample_num_end]

    # str() every value first so numeric columns can be joined; no loop variable named
    # `id` is used, so the builtin is not shadowed at notebook scope.
    id_join = ",".join(map(str, df_var["video_id"].to_list()))
    start_join = ",".join(map(str, df_var["start_time"].to_list()))
    end_join = ",".join(map(str, df_var["end_time"].to_list()))

    df_result_for_embedded = pd.DataFrame(
        data=[[id_join, start_join, end_join]],
        columns=["id", "start_time", "end_time"],
    )
    df_result_for_embedded.to_excel(
        f"{lang_folder.capitalize()}_{word_end}_Word_Talk_Time1_Join_Sample{part_index + 1}.xlsx",
        index=False,
    )

#### Copy, Move and Delete

In [None]:
# Collect every workbook in the working directory whose name matches the
# "<Language>_*_Word_Talk_Time*.xlsx" pattern used by the export cells.
output_file = glob.glob(f"{lang_folder.capitalize()}_*_Word_Talk_Time*.xlsx")
output_file

In [None]:
# Copy each collected workbook (with metadata, via copy2) to `path`.
# NOTE(review): `path` is not defined in this part of the notebook - presumably set in an earlier cell.
for workbook in output_file:
    shutil.copy2(workbook, path)

In [None]:
# Best-effort removal of the collected workbooks from the working directory.
for z in output_file:
    try:
        os.remove(z)
    except OSError as err:
        # Only filesystem errors are tolerated (missing file, permissions, ...); they are
        # reported instead of being swallowed silently, and the loop carries on.
        print(f"Could not remove {z}: {err}")

In [None]:
# used for one search result
# For every word of every sentence: append the word's link rows, followed by a copy of
# the df_link_default row whose end_time is start_time + (duration of the word's first
# clip) + 1.0.
# NOTE(review): df_link_default is defined in an earlier cell - presumably the "repeat"
# placeholder row seen in the displayed results; confirm.
result_frames = []
for sent in df_sentence_link["search_string"]:
    for word in word_tokenize(sent):
        df_var = df_word_link[df_word_link["search_string"] == word].reset_index(drop=True)
        if df_var.empty:
            # No link for this word: skip it explicitly (this case used to surface as a
            # KeyError that a bare `except: pass` swallowed).
            continue

        var_time_diff = df_var.loc[0, "end_time"] - df_var.loc[0, "start_time"]

        # Work on a copy so the shared df_link_default template is never mutated.
        df_link_default_var = df_link_default.copy()
        df_link_default_var.loc[0, "end_time"] = (
            df_link_default_var.loc[0, "start_time"] + var_time_diff + 1.0
        )
        result_frames.extend([df_var, df_link_default_var])

# Concatenate once at the end instead of growing the frame inside the loop.
if result_frames:
    df_result = pd.concat(result_frames, axis=0, ignore_index=True)
else:
    df_result = pd.DataFrame()
df_result

In [124]:
# Keep only the first 300 rows of df_result as a test sample.
df_result_select = df_result.head(300)
df_result_select

Unnamed: 0,search_string,start_time,end_time,sentence,video_id,video_url
0,van,376,377,en merk je dat je weer kunt genieten van je leven,Imj3JLDlIZE,https://www.youtube.com/watch?v=Imj3JLDlIZE&t=...
1,van,376,377,en merk je dat je weer kunt genieten van je leven,Imj3JLDlIZE,https://www.youtube.com/watch?v=Imj3JLDlIZE&t=...
2,van,376,377,en merk je dat je weer kunt genieten van je leven,Imj3JLDlIZE,https://www.youtube.com/watch?v=Imj3JLDlIZE&t=...
3,van,60,62,om de kwaliteit van je producten te garanderen,mgH9ulHIsd8,https://www.youtube.com/watch?v=mgH9ulHIsd8&t=60s
4,van,612,614,ik binnen een halfjaar een woning had terwijl ...,miouy02UT58,https://www.youtube.com/watch?v=miouy02UT58&t=...
...,...,...,...,...,...,...
295,veel,1162,1163,dus je krijgt eigenlijk heel veel informatie d...,pruDyt__jYs,https://www.youtube.com/watch?v=pruDyt__jYs&t=...
296,veel,5099,5100,meer kneedbaar veel meer toegankelijk en jij k...,EZMgSCfbiGs,https://www.youtube.com/watch?v=EZMgSCfbiGs&t=...
297,veel,286,288,ja heel veel eigenlijk te veel vertel eens wat...,tfzC9AvItfg,https://www.youtube.com/watch?v=tfzC9AvItfg&t=...
298,mensen,108,110,mij zeg voor mensen in een rolstoel voor ander...,1KaoCu5SII8,https://www.youtube.com/watch?v=1KaoCu5SII8&t=...


In [125]:
# Write the test sample to Excel.
# NOTE(review): the file name is hard-coded ("Dutch_200_...") rather than built from
# lang_folder / word_end like the other exports; a later cell reads this exact path.
df_result_select.to_excel("Dutch_200_Word_Talk_Time1_Test.xlsx", index=False)

In [126]:
# Collapse the selected rows into a single row of comma-separated strings: one cell each
# for the video ids, the start times and the end times.
# Comprehensions are used so no loop variable named `id` leaks into the notebook
# namespace and shadows the builtin.
id_list = [str(video_id) for video_id in df_result_select["video_id"].to_list()]
start_list = [str(start_time) for start_time in df_result_select["start_time"].to_list()]
end_list = [str(end_time) for end_time in df_result_select["end_time"].to_list()]

id_join = ",".join(id_list)
start_join = ",".join(start_list)
end_join = ",".join(end_list)

df_result_for_embedded = pd.DataFrame(
    data=[[id_join, start_join, end_join]],
    columns=["id", "start_time", "end_time"],
)
df_result_for_embedded

Unnamed: 0,id,start_time,end_time
0,"Imj3JLDlIZE,Imj3JLDlIZE,Imj3JLDlIZE,mgH9ulHIsd...","376,376,376,60,612,296,9634,2970,2970,2970,813...","377,377,377,62,614,297,9635,2972,2972,2972,815..."


In [127]:
# Write the single joined row to Excel.
# NOTE(review): hard-coded file name; it does not follow lang_folder / word_end like the other exports.
df_result_for_embedded.to_excel("Dutch_200_Word_Talk_Time1_Join_Test.xlsx", index=False)

#### Temp

In [46]:
def df_col_value_join_comma(df, df_columns_list):
    '''
    Join the values of each selected column into one comma-separated string.

    Example: df_col_value_join_comma(df_test, ["video_id","start_time","end_time"])

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to read the columns from.
    df_columns_list : list
        Names of the columns to collapse; any number of columns is accepted.

    Returns
    -------
    pandas.DataFrame
        A single-row frame with the same column names, where each cell holds the
        str() of every value of that column joined with ",".

    Raises
    ------
    KeyError
        If a requested column is not present in df.
    '''
    # Build the row from however many columns were requested instead of indexing
    # positions 0..2, which only worked for exactly three columns.
    joined_values = [
        ",".join(str(value) for value in df[column].to_list())
        for column in df_columns_list
    ]

    df_result = pd.DataFrame([joined_values], columns=df_columns_list)

    return df_result

In [44]:
# Load the test workbook from its hard-coded path (the same file name the test-sample export writes).
df_test = pd.read_excel("Dutch_200_Word_Talk_Time1_Test.xlsx")
df_test

Unnamed: 0,search_string,start_time,end_time,sentence,video_id,video_url
0,en,2592,2593,jaar en niet heel realistisch zijn geworden ov...,A0Fij7grUvs,https://www.youtube.com/watch?v=A0Fij7grUvs&t=...
1,en,2592,2593,jaar en niet heel realistisch zijn geworden ov...,A0Fij7grUvs,https://www.youtube.com/watch?v=A0Fij7grUvs&t=...
2,en,2592,2593,jaar en niet heel realistisch zijn geworden ov...,A0Fij7grUvs,https://www.youtube.com/watch?v=A0Fij7grUvs&t=...
3,en,2592,2593,jaar en niet heel realistisch zijn geworden ov...,A0Fij7grUvs,https://www.youtube.com/watch?v=A0Fij7grUvs&t=...
4,en,818,820,cultureel echt serieus meedeed en je ziet ook ...,AaVi4igmpYQ,https://www.youtube.com/watch?v=AaVi4igmpYQ&t=...
...,...,...,...,...,...,...
295,hoe dan ook zo in de,557,559,hoe dan ook zo in de,BoaYsdPtJYA,https://www.youtube.com/watch?v=BoaYsdPtJYA&t=...
296,hoe dan ook zo in de,557,559,hoe dan ook zo in de,BoaYsdPtJYA,https://www.youtube.com/watch?v=BoaYsdPtJYA&t=...
297,hoe,1291,1292,we nauw samenwerken met bedrijven die vanuit d...,xKNDkrAQ3iM,https://www.youtube.com/watch?v=xKNDkrAQ3iM&t=...
298,repeat,5,9,repeat_again,Q-8I-uMUMYA,https://www.youtube.com/watch?v=Q-8I-uMUMYA&t=0s


In [47]:
# Collapse the three link columns of df_test into one row of comma-separated strings.
df_test_join = df_col_value_join_comma(df_test, ["video_id","start_time","end_time"])
df_test_join

Unnamed: 0,video_id,start_time,end_time
0,"A0Fij7grUvs,A0Fij7grUvs,A0Fij7grUvs,A0Fij7grUv...","2592,2592,2592,2592,818,818,818,1998,1998,1998...","2593,2593,2593,2593,820,820,820,1999,1999,1999..."


In [None]:
# Write the joined test row to Excel.
# NOTE(review): this is the same hard-coded file name the earlier df_result_for_embedded
# export uses, so whichever cell runs last overwrites the other's file.
df_test_join.to_excel("Dutch_200_Word_Talk_Time1_Join_Test.xlsx", index=False)

In [None]:
# used for divide part will test
# Split df_result into `divide_num` consecutive slices and write each slice as a single
# comma-joined row (built by df_col_value_join_comma) to its own workbook.
from math import ceil

divide_num = 12
# Slice size is derived from divide_num (previously a second hard-coded 12), so the two
# values cannot drift apart when the number of parts is changed.
part_sample_num = ceil(len(df_result) / divide_num)

for part_index in range(divide_num):
    sample_num_start = part_index * part_sample_num
    sample_num_end = sample_num_start + part_sample_num
    # Slices past the end of df_result are empty; a workbook is still written for them.
    df_var = df_result.iloc[sample_num_start:sample_num_end]

    df_result_for_embedded = df_col_value_join_comma(df_var, ["video_id", "start_time", "end_time"])
    df_result_for_embedded.to_excel(
        f"{lang_folder.capitalize()}_{word_end}_Word_Talk_Time1_Join_Sample{part_index + 1}.xlsx",
        index=False,
    )