### Adjust Repeat Data

In [1]:
import os
import multiprocessing
#import multiprocessing as mp
from multiprocessing import Process, Manager, Pool, Queue
from itertools import islice
from collections import Counter
import re
import pandas as pd
import numpy as np
import glob
import nltk
from nltk import word_tokenize
from nltk import ngrams
from functools import reduce
from pathlib import Path
import shutil

In [2]:
nprocs = multiprocessing.cpu_count()
print(f"Number of CPU cores: {nprocs}")

Number of CPU cores: 16


In [3]:
# language pair
lang_folder = "Dutch"  # Arabic, English, French, German, Turkish, Spanish, Portuguese, Dutch, Italian ==> target language for learner
#lang_pair = "Intersect"  # Arabic, English, French, German, Turkish, Spanish, Portuguese, Dutch, Italian ==> native language

# adding native word to shared word
word_start = 0  # 0 native word start index
word_end = 200  # 28 native word end index

# word usage
min_word_use_num = 1
max_word_use_num = 6

# sentence check
twogram_sentence_check = False  # True, False
threegram_sentence_check = False

# youtube
sample_num = 5  # 6
time_shift = 0.6

In [4]:
def df_col_value_join_comma(df, df_columns_list):
    '''
    df_col_value_join_comma(df_test, ["video_id","start_time","end_time"])\n
    function used for selected column value join with comma in one row
    '''
    column_value_list = []
    for column in df_columns_list:
        list_var = df[f"{column}"].to_list()
        list_var_string = [str(x) for x in list_var] 
        list_var_join = ",".join(list_var_string)
        column_value_list.append(list_var_join)

    df_result = pd.DataFrame([[column_value_list[0],column_value_list[1],column_value_list[2]]], columns=df_columns_list)

    return df_result

In [5]:
def word_in_wordgroup_simple(source_word_list, df_target, target_column, word_sample_num, simple=False):

    '''word_in_wordgroup(not_in_sent_word_list, df_youtube_sent_select, "search_string", 5, simple=False)\n
       source_word_list is searching word list\n
       df_target is dataframe, target_column are dataframe column string name\n
       word_sample_num is searching sample number.
       simple use for all column row result or only target column result 
    '''
    if simple:
        df_select = df_target[[f"{target_column}"]].dropna()
    else:
        df_select = df_target
        
    df_result = pd.DataFrame()
    for i in source_word_list:
        try:
            word_in_word_cluster = df_select[df_select[f"{target_column}"].str.contains(fr"(?:\s|^){i}(?:\s|$)", na=True)].head(word_sample_num)    
        except:
            pass        
        word_in_word_cluster.insert(0,"search_string",i)
        df_result = pd.concat([df_result,word_in_word_cluster], axis=0)
    df_result.reset_index(drop=True, inplace=True)

    return df_result

In [6]:
def word_group_youtube(df, search_list, target_column, sample_num):
    '''
    word_group_youtube(df_youtube_sentence, search_list, "sentence", 6)\n
    df_youtube_sentence is dataframe and "sentence" is its column for external searching_list
    ''' 
    df_search_result = pd.DataFrame()
    for j in search_list:
        try:
            df_select = df[df[f"{target_column}"].str.contains(fr"(?:\s|^){j}(?:\s|$)", na=True)].sample(sample_num)
        except:
            df_select = df[df[f"{target_column}"].str.contains(fr"(?:\s|^){j}(?:\s|$)", na=True)].head(sample_num)
        #df_result = df[df[f"{target_column}"].str.contains(fr"(?:\s|^){j}(?:\s|$)", na=True)]  # sentence length part
        #df_result.sort_values(f"{target_column}",key=lambda x:x.str.len(), inplace=True)
        #df_select = df_result.head(sample_num)
        df_select.insert(0,"search_string",j)
        df_search_result = pd.concat([df_search_result,df_select], axis=0)
    df_search_result.reset_index(inplace=True, drop=True)
    
    return df_search_result

In [7]:
def word_group_time_loc(df, search, start_sent, end_sent, sent, sent_video_id):
    '''
    word_group_time_loc(df_search_result, "search_string", "start_time", "end_time", "sentence", "video_id")\n
    df_search_result is dataframe and "search_string", "start_time", "end_time", "sentence", "video_id" are its columns
    '''
    df.reset_index(drop=True, inplace=True)
    word_time_loc_list = []
    for i in range(len(df)):
        word = df.loc[i,f"{search}"]
        start_time = df.loc[i,f"{start_sent}"]
        end_time = df.loc[i,f"{end_sent}"]
        sentence = df.loc[i,f"{sent}"]
        video_id = df.loc[i,f"{sent_video_id}"]
        time_length = end_time-start_time
        sentence_length = len(sentence)
        time_length_ratio = time_length/sentence_length
        loc_list = []
        for j in re.finditer(fr"(?:\s|^){word}(?:\s|$)", sentence, re.IGNORECASE|re.UNICODE):
            loc_list.append(j)
            start = loc_list[0].start()
            end = loc_list[0].end()
            start_loc = start_time+(start*time_length_ratio)
            end_loc = start_time+(end*time_length_ratio)
        word_time_loc_list.append([word,start_loc,end_loc,sentence,video_id])
    df_word_time_loc = pd.DataFrame(word_time_loc_list, columns=[f"{search}",f"{start_sent}",f"{end_sent}",f"{sent}",f"{sent_video_id}"])

    return df_word_time_loc

In [8]:
def word_count_result(df, column_list, set_condition=False): # df is dataframe, column_list is list value
    '''
    word_count_bool(df, column_list): df columns word count for word frequency\n
    df is dataframe, column_list is list value\n
    word_count_bool(df, ["word","twogram"]):
    '''
    list_all = []
    for i in df.loc[:,[x for x in column_list]].columns:
        if set_condition:
            var_list = set(df[f"{i}"].dropna().tolist())
        else:
            var_list = df[f"{i}"].dropna().tolist()
        for j in var_list:
            list_all.append(j)
    text = " ".join(list_all)
    word_list = re.findall(r"\w+",text, re.UNICODE)
    df_word_list = pd.DataFrame(word_list, columns=["word"])
    #df_word_list.rename(columns={0:"word"}, inplace=True)
    df_word_count = pd.DataFrame(df_word_list.value_counts())
    df_word_count.reset_index(inplace=True)
    df_word_count.rename(columns={0:"word_count"}, inplace=True)
    df_word_count.sort_values("word_count", ascending=False, inplace=True)
    df_word_count.reset_index(inplace=True, drop=True)
    
    return  df_word_count

In [9]:
def word_usage_result(word_list, df_target, target_column, target_opt_column, word_usage_min, word_usage_max):
    '''
    word_usage_result(word_list, df_ngram_pair, "threegram", "frequency", 1, 5) \n
    word_list is a list, df_target is a dateframe, target_column is df_target dataframe target column, \n
    target_opt_column is df_target dataframe opt_target column, \n
    word_usage_min and word_usage_max word usage condition.
    '''    
    word_num_dict = {}
    for i in word_list:
        word_num_dict[f"{i}"] = 0
    
    result_list_select = []
    var_list = []
    for i in range(len(df_target)):
        target_value = df_target.loc[i,f"{target_column}"]
        opt_value = df_target.loc[i,f"{target_opt_column}"]
        words = word_tokenize(target_value)   
        temp_list = [word for word in words]
        temp_list = temp_list + var_list
        # word count for max
        dict_list_count = Counter(temp_list)
        count_list = list(dict_list_count.values())
        # word count for min
        count_list2 = list(word_num_dict.values())
    
        if any([True if i>word_usage_max else False for i in count_list]) or not(any([True if j<word_usage_min else False for j in count_list2])):
            pass
        else:
            var_list = temp_list
            result_list_select.append([target_value,opt_value]) 
    
            for item2 in dict_list_count.items(): 
                word_num_dict[item2[0]] = item2[1]        
    df_result = pd.DataFrame(result_list_select, columns=[f"{target_column}",f"{target_opt_column}"])
    df_result.sort_values(by="frequency", ascending=False, inplace=True)
    df_result.reset_index(drop=True, inplace=True)
    
    return df_result

In [10]:
path = f"/media/kurubal/SSD/Data Scientist/Work/Modern Ways/Project/{lang_folder.capitalize()}/\
Talk Time/Result/Adjust Data Repeat"

Path(path).mkdir(parents=True, exist_ok=True)

In [11]:
disable_video_id_list = ["H6E6N70jYqI","BoaYsdPtJYA","HUZINsU40Fk","Et3diPcEmfY"]

In [12]:
df_word_all = pd.read_excel(f"/media/kurubal/SSD/Data Scientist/Work/Modern Ways/Project/{lang_folder.lower().capitalize()}/Deployment/Data/Word/Word_Merge_Preprocess.xlsx")
df_word_all

Unnamed: 0,word,frequency
0,ik,22539531
1,je,20769946
2,het,15696161
3,de,15258816
4,dat,13387137
...,...,...
331769,oorlogsgezichten,5
331770,opdrachtenlijst,5
331771,verlsaafde,5
331772,oxidatieproces,5


In [13]:
df_word_select = df_word_all.iloc[word_start:word_end,]

In [14]:
df_word_select

Unnamed: 0,word,frequency
0,ik,22539531
1,je,20769946
2,het,15696161
3,de,15258816
4,dat,13387137
...,...,...
195,genoeg,353128
196,wist,352948
197,oh,351988
198,klaar,350506


In [15]:
word_list = df_word_select["word"].to_list()
len(word_list)

200

In [16]:
if twogram_sentence_check:
    df_twogram_sent = pd.read_csv(f"/media/kurubal/SSD/Data Scientist/Work/Modern Ways/Project/{lang_folder.capitalize()}/Result/N Gram/N Gram And Sentence/Two_Gram_And_Sentence.csv")  # ext. sentence and ngram
    df_twogram_sent.rename(columns={"two_gram":"twogram"}, inplace=True)  # ext. Not: Two_Gram_And_Sentence.csv convert to Two_Gram_And_Sentence_All.csv
    df_twogram = df_twogram_sent.loc[:,["twogram","frequency"]]
else:
    df_twogram = pd.read_csv(f"/media/kurubal/SSD/Data Scientist/Work/Modern Ways/Project/{lang_folder.capitalize()}/Result/N Gram/Merge/Twogram_Merge.csv")  
    df_twogram = df_twogram.loc[:,["twogram","frequency"]]

df_twogram

Unnamed: 0,twogram,frequency
0,ik heb,1113451
1,het is,1102152
2,ik ben,1025494
3,dat is,939846
4,wat is,698111
...,...,...
3892596,schat euh,3
3892597,schat eten,3
3892598,levensgevaarlijk volgens,3
3892599,schat erg,3


In [17]:
# Twogram Func
d_list2  = df_twogram.iloc[:,0].values.tolist()

resultlist2 = []
manager = multiprocessing.Manager()
resultlist2 = manager.list()

def word_in_wordgroup(d_list2):
    mergelist2 = []
    try:
        word = d_list2.split()
    except:
        word = []
        #pass  disabled for non split value
    var1 = range(len(word))
    for j in var1:
        if word[j] in word_list:
            mergelist2.append(word[j])
            if len(mergelist2) == len(word):
                    resultlist2.append(d_list2)
                        
if __name__ == '__main__':
    # with Pool(16) as p:
    with Pool(nprocs) as p: # Pool number CPU sayısına eşit olursa tüm CPU lar çalışır
        p.map(word_in_wordgroup, d_list2) # string_word liste

result_list2 = list(resultlist2)
df_result2 = pd.DataFrame(result_list2, columns=[0])  # add columns parameter for empty result
df_result2 = df_result2.rename(columns = {0: "twogram"})
df_merge2 = pd.merge(df_result2, df_twogram, how="left", on="twogram")
df_merge_result2 = df_merge2.sort_values(by="frequency", ascending=False)
df_merge_result2.drop_duplicates(inplace=True)
df_merge_result2.reset_index(drop=True, inplace=True)
df_twogram_select = df_merge_result2
df_twogram_select

Unnamed: 0,twogram,frequency
0,ik heb,1113451
1,het is,1102152
2,ik ben,1025494
3,dat is,939846
4,wat is,698111
...,...,...
34489,weten wacht,3
34490,weten vader,3
34491,over huis,3
34492,niks niemand,3


In [25]:
df_twogram_word_usage = word_usage_result(word_list, df_twogram_select, "twogram", "frequency", min_word_use_num, max_word_use_num)
df_twogram_word_usage

Unnamed: 0,twogram,frequency
0,ik heb,1113451
1,het is,1102152
2,ik ben,1025494
3,dat is,939846
4,wat is,698111
...,...,...
525,toen waren,607
526,kon worden,607
527,dood h,604
528,maken zich,589


In [18]:
if threegram_sentence_check:
    df_threegram_sent = pd.read_csv(f"/media/kurubal/SSD/Data Scientist/Work/Modern Ways/Project/{lang_folder.capitalize()}/Result/N Gram/N Gram And Sentence/Three_Gram_And_Sentence.csv")  # ext. sentence and ngram
    df_threegram_sent.rename(columns={"three_gram":"threegram"}, inplace=True)  # ext. Not: Two_Gram_And_Sentence.csv convert to Two_Gram_And_Sentence_All.csv
    df_threegram = df_threegram_sent.loc[:,["threegram","frequency"]]
else:
    df_threegram = pd.read_csv(f"/media/kurubal/SSD/Data Scientist/Work/Modern Ways/Project/{lang_folder.capitalize()}/Result/N Gram/Merge/Threegram_Merge.csv")  
    df_threegram = df_threegram.loc[:,["threegram","frequency"]]

df_threegram

Unnamed: 0,threegram,frequency
0,wat is er,364876
1,het spijt me,258840
2,ik weet het,237218
3,wat doe je,137651
4,het is een,135216
...,...,...
9257115,leven iets voorstelt,3
9257116,leven iets voor,3
9257117,leven iets slecht,3
9257118,leven iets onnatuurlijks,3


In [19]:
# Threegram Func
d_list3  = df_threegram.iloc[:,0].values.tolist()

resultlist3 = []
manager = multiprocessing.Manager()
resultlist3 = manager.list()

def word_in_wordgroup(d_list3):
    mergelist3 = []
    try:
        word = d_list3.split()
    except:
        word = []
        #pass  disabled for non split value
    var1 = range(len(word))
    for j in var1:
        if word[j] in word_list:
            mergelist3.append(word[j])
            if len(mergelist3) == len(word):
                    resultlist3.append(d_list3)
                        
if __name__ == '__main__':
    # with Pool(16) as p:
    with Pool(nprocs) as p: # Pool number CPU sayısına eşit olursa tüm CPU lar çalışır
        p.map(word_in_wordgroup, d_list3) # string_word liste

result_list3 = list(resultlist3)
df_result3 = pd.DataFrame(result_list3, columns=[0])  # add columns parameter for empty result
df_result3 = df_result3.rename(columns = {0: "threegram"})
df_merge3 = pd.merge(df_result3, df_threegram, how="left", on="threegram")
df_merge_result3 = df_merge3.sort_values(by="frequency", ascending=False)
df_merge_result3.drop_duplicates(inplace=True)
df_merge_result3.reset_index(drop=True, inplace=True)
df_threegram_select = df_merge_result3
df_threegram_select

Unnamed: 0,threegram,frequency
0,wat is er,364876
1,het spijt me,258840
2,ik weet het,237218
3,wat doe je,137651
4,het is een,135216
...,...,...
549820,ga leven zoals,3
549821,hij iets naar,3
549822,dan jou op,3
549823,was het kijk,3


In [26]:
df_threegram_word_usage = word_usage_result(word_list, df_threegram_select, "threegram", "frequency", min_word_use_num, max_word_use_num)
df_threegram_word_usage

Unnamed: 0,threegram,frequency
0,wat is er,364876
1,het spijt me,258840
2,ik weet het,237218
3,wat doe je,137651
4,het is een,135216
...,...,...
372,gezien onze staat,4
373,spijt spijt spijt,4
374,jaar gedaan denk,3
375,mr mr mr,3


In [20]:
df_youtube_sentence = pd.read_csv(f"/media/kurubal/SSD/Data Scientist/Work/Modern Ways/Project/Youtube/Result/{lang_folder.capitalize()}/Sentence Clean Merge/Clean_Youtube_Sentence_Merge_Result.csv")
df_youtube_sentence

Unnamed: 0,start_time,end_time,sentence,video_id
0,00:00:10.760,00:00:13.200,jongens hier ben ik dan,WA8MV8nX9s0
1,00:00:13.200,00:00:17.040,en we gaan iets heel maar dan ook heel geks me...,WA8MV8nX9s0
2,00:00:17.040,00:00:19.320,ik bedoel we gaan naar de royal bridge suite,WA8MV8nX9s0
3,00:00:19.900,00:00:23.760,dit is de meest luxe suite die je maar kan bed...,WA8MV8nX9s0
4,00:00:23.760,00:00:25.860,de meest bekende mensen hebben hier geslapen,WA8MV8nX9s0
...,...,...,...,...
324877,00:01:17.570,00:01:26.570,met de beschikbare instructie videos is het vr...,c8S8UMw5674
324878,00:01:26.570,00:01:35.090,alle materialen die nodig zijn om de montage z...,c8S8UMw5674
324879,00:01:37.610,00:01:41.690,het is ook mogelijk om de montage geheel door ...,c8S8UMw5674
324880,00:01:43.100,00:01:48.080,vraag daarvoor een offerte aan via telefoon of...,c8S8UMw5674


In [21]:
df_youtube_sentence = df_youtube_sentence[~df_youtube_sentence["video_id"].isin(disable_video_id_list)]
df_youtube_sentence

Unnamed: 0,start_time,end_time,sentence,video_id
0,00:00:10.760,00:00:13.200,jongens hier ben ik dan,WA8MV8nX9s0
1,00:00:13.200,00:00:17.040,en we gaan iets heel maar dan ook heel geks me...,WA8MV8nX9s0
2,00:00:17.040,00:00:19.320,ik bedoel we gaan naar de royal bridge suite,WA8MV8nX9s0
3,00:00:19.900,00:00:23.760,dit is de meest luxe suite die je maar kan bed...,WA8MV8nX9s0
4,00:00:23.760,00:00:25.860,de meest bekende mensen hebben hier geslapen,WA8MV8nX9s0
...,...,...,...,...
324877,00:01:17.570,00:01:26.570,met de beschikbare instructie videos is het vr...,c8S8UMw5674
324878,00:01:26.570,00:01:35.090,alle materialen die nodig zijn om de montage z...,c8S8UMw5674
324879,00:01:37.610,00:01:41.690,het is ook mogelijk om de montage geheel door ...,c8S8UMw5674
324880,00:01:43.100,00:01:48.080,vraag daarvoor een offerte aan via telefoon of...,c8S8UMw5674


In [22]:
df_youtube_sentence['start_time'] = pd.to_timedelta(df_youtube_sentence['start_time']) # data type converted timedelta for second 
df_youtube_sentence['end_time'] = pd.to_timedelta(df_youtube_sentence['end_time'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_youtube_sentence['start_time'] = pd.to_timedelta(df_youtube_sentence['start_time']) # data type converted timedelta for second
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_youtube_sentence['end_time'] = pd.to_timedelta(df_youtube_sentence['end_time'])


In [23]:
df_youtube_sentence['start_time'] = df_youtube_sentence['start_time'].apply(lambda x: x.total_seconds()) # convert seconds
df_youtube_sentence['end_time'] = df_youtube_sentence['end_time'].apply(lambda x: x.total_seconds())
df_youtube_sentence

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_youtube_sentence['start_time'] = df_youtube_sentence['start_time'].apply(lambda x: x.total_seconds()) # convert seconds
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_youtube_sentence['end_time'] = df_youtube_sentence['end_time'].apply(lambda x: x.total_seconds())


Unnamed: 0,start_time,end_time,sentence,video_id
0,10.76,13.20,jongens hier ben ik dan,WA8MV8nX9s0
1,13.20,17.04,en we gaan iets heel maar dan ook heel geks me...,WA8MV8nX9s0
2,17.04,19.32,ik bedoel we gaan naar de royal bridge suite,WA8MV8nX9s0
3,19.90,23.76,dit is de meest luxe suite die je maar kan bed...,WA8MV8nX9s0
4,23.76,25.86,de meest bekende mensen hebben hier geslapen,WA8MV8nX9s0
...,...,...,...,...
324877,77.57,86.57,met de beschikbare instructie videos is het vr...,c8S8UMw5674
324878,86.57,95.09,alle materialen die nodig zijn om de montage z...,c8S8UMw5674
324879,97.61,101.69,het is ook mogelijk om de montage geheel door ...,c8S8UMw5674
324880,103.10,108.08,vraag daarvoor een offerte aan via telefoon of...,c8S8UMw5674


In [27]:
search_list = df_twogram_word_usage["twogram"].to_list()
search_list = df_threegram_word_usage["threegram"].to_list()

In [28]:
df_word_group = word_group_youtube(df_youtube_sentence, search_list, "sentence", sample_num)
df_word_group

Unnamed: 0,search_string,start_time,end_time,sentence,video_id
0,ik heb,157.480,161.240,ik heb vaak gezegd witte mensen zijn niet de v...,4NNIn0HGnzs
1,ik heb,1172.410,1181.680,ik heb hier twee driehoekige houten bakken en ...,qFmXQ000zP0
2,ik heb,157.039,160.680,ik heb mijn collegas niet horen klagen,9NgBUpYt8q8
3,ik heb,19.349,21.901,en ik heb er zin in lets go,cgnJ5d6uakg
4,ik heb,42.190,45.230,ik heb bijvoorbeeld een patissier in dienst al...,2yENubQEjIw
...,...,...,...,...,...
2403,maken zich,194.630,201.710,daarnaast is een grote zorg economen maken zic...,5pDwEvEN_dA
2404,maken zich,1764.269,1772.529,ze maken zich zorgen over een tekort aan grond...,ycB_yozQW6U
2405,maken zich,8.000,10.520,mensen maken zich zorgen over hun bedrijf,rdFMsYyW0Oo
2406,erg vind,4941.630,4947.810,dit hele gevoelige dossier en blijft het erg v...,6b2CjjmooDA


In [29]:
df_word_group_time_loc = word_group_time_loc(df_word_group, "search_string", "start_time", "end_time", "sentence", "video_id")
df_word_group_time_loc

Unnamed: 0,search_string,start_time,end_time,sentence,video_id
0,ik heb,157.480000,157.996078,ik heb vaak gezegd witte mensen zijn niet de v...,4NNIn0HGnzs
1,ik heb,1172.410000,1173.548421,ik heb hier twee driehoekige houten bakken en ...,qFmXQ000zP0
2,ik heb,157.039000,157.709711,ik heb mijn collegas niet horen klagen,9NgBUpYt8q8
3,ik heb,19.538037,20.294185,en ik heb er zin in lets go,cgnJ5d6uakg
4,ik heb,42.190000,42.481507,ik heb bijvoorbeeld een patissier in dienst al...,2yENubQEjIw
...,...,...,...,...,...
2403,maken zich,197.341489,198.245319,daarnaast is een grote zorg economen maken zic...,5pDwEvEN_dA
2404,maken zich,1764.430961,1765.402725,ze maken zich zorgen over een tekort aan grond...,ycB_yozQW6U
2405,maken zich,8.368780,9.106341,mensen maken zich zorgen over hun bedrijf,rdFMsYyW0Oo
2406,erg vind,4945.750000,4946.780000,dit hele gevoelige dossier en blijft het erg v...,6b2CjjmooDA


In [30]:
df_word_group_time_loc.start_time = df_word_group_time_loc.start_time.apply(lambda x: (x-time_shift))
df_word_group_time_loc.end_time = df_word_group_time_loc.end_time.apply(lambda x: (x+time_shift))
df_word_group_time_loc

Unnamed: 0,search_string,start_time,end_time,sentence,video_id
0,ik heb,156.880000,158.596078,ik heb vaak gezegd witte mensen zijn niet de v...,4NNIn0HGnzs
1,ik heb,1171.810000,1174.148421,ik heb hier twee driehoekige houten bakken en ...,qFmXQ000zP0
2,ik heb,156.439000,158.309711,ik heb mijn collegas niet horen klagen,9NgBUpYt8q8
3,ik heb,18.938037,20.894185,en ik heb er zin in lets go,cgnJ5d6uakg
4,ik heb,41.590000,43.081507,ik heb bijvoorbeeld een patissier in dienst al...,2yENubQEjIw
...,...,...,...,...,...
2403,maken zich,196.741489,198.845319,daarnaast is een grote zorg economen maken zic...,5pDwEvEN_dA
2404,maken zich,1763.830961,1766.002725,ze maken zich zorgen over een tekort aan grond...,ycB_yozQW6U
2405,maken zich,7.768780,9.706341,mensen maken zich zorgen over hun bedrijf,rdFMsYyW0Oo
2406,erg vind,4945.150000,4947.380000,dit hele gevoelige dossier en blijft het erg v...,6b2CjjmooDA


In [31]:
df_word_group_time_loc.start_time = df_word_group_time_loc.start_time.apply(lambda x: round(x))
df_word_group_time_loc.end_time = df_word_group_time_loc.end_time.apply(lambda x: round(x))
df_word_group_time_loc 

Unnamed: 0,search_string,start_time,end_time,sentence,video_id
0,ik heb,157,159,ik heb vaak gezegd witte mensen zijn niet de v...,4NNIn0HGnzs
1,ik heb,1172,1174,ik heb hier twee driehoekige houten bakken en ...,qFmXQ000zP0
2,ik heb,156,158,ik heb mijn collegas niet horen klagen,9NgBUpYt8q8
3,ik heb,19,21,en ik heb er zin in lets go,cgnJ5d6uakg
4,ik heb,42,43,ik heb bijvoorbeeld een patissier in dienst al...,2yENubQEjIw
...,...,...,...,...,...
2403,maken zich,197,199,daarnaast is een grote zorg economen maken zic...,5pDwEvEN_dA
2404,maken zich,1764,1766,ze maken zich zorgen over een tekort aan grond...,ycB_yozQW6U
2405,maken zich,8,10,mensen maken zich zorgen over hun bedrijf,rdFMsYyW0Oo
2406,erg vind,4945,4947,dit hele gevoelige dossier en blijft het erg v...,6b2CjjmooDA


In [32]:
df_word_group_time_loc["video_url"] = "https://www.youtube.com/watch?v="+df_word_group_time_loc['video_id'].map(str)+"&t="+df_word_group_time_loc['start_time'].map(str)+"s"
df_word_group_time_loc

Unnamed: 0,search_string,start_time,end_time,sentence,video_id,video_url
0,ik heb,157,159,ik heb vaak gezegd witte mensen zijn niet de v...,4NNIn0HGnzs,https://www.youtube.com/watch?v=4NNIn0HGnzs&t=...
1,ik heb,1172,1174,ik heb hier twee driehoekige houten bakken en ...,qFmXQ000zP0,https://www.youtube.com/watch?v=qFmXQ000zP0&t=...
2,ik heb,156,158,ik heb mijn collegas niet horen klagen,9NgBUpYt8q8,https://www.youtube.com/watch?v=9NgBUpYt8q8&t=...
3,ik heb,19,21,en ik heb er zin in lets go,cgnJ5d6uakg,https://www.youtube.com/watch?v=cgnJ5d6uakg&t=19s
4,ik heb,42,43,ik heb bijvoorbeeld een patissier in dienst al...,2yENubQEjIw,https://www.youtube.com/watch?v=2yENubQEjIw&t=42s
...,...,...,...,...,...,...
2403,maken zich,197,199,daarnaast is een grote zorg economen maken zic...,5pDwEvEN_dA,https://www.youtube.com/watch?v=5pDwEvEN_dA&t=...
2404,maken zich,1764,1766,ze maken zich zorgen over een tekort aan grond...,ycB_yozQW6U,https://www.youtube.com/watch?v=ycB_yozQW6U&t=...
2405,maken zich,8,10,mensen maken zich zorgen over hun bedrijf,rdFMsYyW0Oo,https://www.youtube.com/watch?v=rdFMsYyW0Oo&t=8s
2406,erg vind,4945,4947,dit hele gevoelige dossier en blijft het erg v...,6b2CjjmooDA,https://www.youtube.com/watch?v=6b2CjjmooDA&t=...


In [None]:
df_word_group_time_loc.to_excel(f"{lang_folder.capitalize()}_Word_Group_With_{word_end}_Word_{sample_num}_Youtube_{time_shift}s_Timeshift_For_Talk_Time_Result.xlsx", index=False) 