### Adjust Repeat Data

In [98]:
import os
import multiprocessing
#import multiprocessing as mp
from multiprocessing import Process, Manager, Pool, Queue
from itertools import islice
from collections import Counter
import re
import pandas as pd
import numpy as np
import glob
import nltk
from nltk import word_tokenize
from nltk import ngrams
from functools import reduce
from pathlib import Path
import shutil

In [99]:
nprocs = multiprocessing.cpu_count()
print(f"Number of CPU cores: {nprocs}")

Number of CPU cores: 16


In [100]:
# language pair
lang_folder = "Dutch"  # Arabic, English, French, German, Turkish, Spanish, Portuguese, Dutch, Italian ==> target language for learner
#lang_pair = "Intersect"  # Arabic, English, French, German, Turkish, Spanish, Portuguese, Dutch, Italian ==> native language

# adding native word to shared word
word_start = 0  # 0 native word start index
word_end = 200  # 28 native word end index

# word usage
min_word_use_num = 1
max_word_use_num = 4

# sentence check
twogram_sentence_check = True  # True, False
threegram_sentence_check = True

# youtube
sample_num = 5  # 6
time_shift = 0.6

In [101]:
def df_col_value_join_comma(df, df_columns_list):
    '''
    df_col_value_join_comma(df_test, ["video_id","start_time","end_time"])\n
    function used for selected column value join with comma in one row
    '''
    column_value_list = []
    for column in df_columns_list:
        list_var = df[f"{column}"].to_list()
        list_var_string = [str(x) for x in list_var] 
        list_var_join = ",".join(list_var_string)
        column_value_list.append(list_var_join)

    df_result = pd.DataFrame([[column_value_list[0],column_value_list[1],column_value_list[2]]], columns=df_columns_list)

    return df_result

In [102]:
def word_in_wordgroup_simple(source_word_list, df_target, target_column, word_sample_num, simple=False):

    '''word_in_wordgroup(not_in_sent_word_list, df_youtube_sent_select, "search_string", 5, simple=False)\n
       source_word_list is searching word list\n
       df_target is dataframe, target_column are dataframe column string name\n
       word_sample_num is searching sample number.
       simple use for all column row result or only target column result 
    '''
    if simple:
        df_select = df_target[[f"{target_column}"]].dropna()
    else:
        df_select = df_target
        
    df_result = pd.DataFrame()
    for i in source_word_list:
        try:
            word_in_word_cluster = df_select[df_select[f"{target_column}"].str.contains(fr"(?:\s|^){i}(?:\s|$)", na=True)].head(word_sample_num)    
        except:
            pass        
        word_in_word_cluster.insert(0,"search_string",i)
        df_result = pd.concat([df_result,word_in_word_cluster], axis=0)
    df_result.reset_index(drop=True, inplace=True)

    return df_result

In [103]:
def word_group_youtube(df, search_list, target_column, sample_num):
    '''
    word_group_youtube(df_youtube_sentence, search_list, "sentence", 6)\n
    df_youtube_sentence is dataframe and "sentence" is its column for external searching_list
    ''' 
    df_search_result = pd.DataFrame()
    for j in search_list:
        try:
            df_select = df[df[f"{target_column}"].str.contains(fr"(?:\s|^){j}(?:\s|$)", na=True)].sample(sample_num)
        except:
            df_select = df[df[f"{target_column}"].str.contains(fr"(?:\s|^){j}(?:\s|$)", na=True)].head(sample_num)
        #df_result = df[df[f"{target_column}"].str.contains(fr"(?:\s|^){j}(?:\s|$)", na=True)]  # sentence length part
        #df_result.sort_values(f"{target_column}",key=lambda x:x.str.len(), inplace=True)
        #df_select = df_result.head(sample_num)
        df_select.insert(0,"search_string",j)
        df_search_result = pd.concat([df_search_result,df_select], axis=0)
    df_search_result.reset_index(inplace=True, drop=True)
    
    return df_search_result

In [104]:
def word_group_time_loc(df, search, start_sent, end_sent, sent, sent_video_id):
    '''
    word_group_time_loc(df_search_result, "search_string", "start_time", "end_time", "sentence", "video_id")\n
    df_search_result is dataframe and "search_string", "start_time", "end_time", "sentence", "video_id" are its columns
    '''
    df.reset_index(drop=True, inplace=True)
    word_time_loc_list = []
    for i in range(len(df)):
        word = df.loc[i,f"{search}"]
        start_time = df.loc[i,f"{start_sent}"]
        end_time = df.loc[i,f"{end_sent}"]
        sentence = df.loc[i,f"{sent}"]
        video_id = df.loc[i,f"{sent_video_id}"]
        time_length = end_time-start_time
        sentence_length = len(sentence)
        time_length_ratio = time_length/sentence_length
        loc_list = []
        for j in re.finditer(fr"(?:\s|^){word}(?:\s|$)", sentence, re.IGNORECASE|re.UNICODE):
            loc_list.append(j)
            start = loc_list[0].start()
            end = loc_list[0].end()
            start_loc = start_time+(start*time_length_ratio)
            end_loc = start_time+(end*time_length_ratio)
        word_time_loc_list.append([word,start_loc,end_loc,sentence,video_id])
    df_word_time_loc = pd.DataFrame(word_time_loc_list, columns=[f"{search}",f"{start_sent}",f"{end_sent}",f"{sent}",f"{sent_video_id}"])

    return df_word_time_loc

In [105]:
def word_count_result(df, column_list, set_condition=False): # df is dataframe, column_list is list value
    '''
    word_count_bool(df, column_list): df columns word count for word frequency\n
    df is dataframe, column_list is list value\n
    word_count_bool(df, ["word","twogram"]):
    '''
    list_all = []
    for i in df.loc[:,[x for x in column_list]].columns:
        if set_condition:
            var_list = set(df[f"{i}"].dropna().tolist())
        else:
            var_list = df[f"{i}"].dropna().tolist()
        for j in var_list:
            list_all.append(j)
    text = " ".join(list_all)
    word_list = re.findall(r"\w+",text, re.UNICODE)
    df_word_list = pd.DataFrame(word_list, columns=["word"])
    #df_word_list.rename(columns={0:"word"}, inplace=True)
    df_word_count = pd.DataFrame(df_word_list.value_counts())
    df_word_count.reset_index(inplace=True)
    df_word_count.rename(columns={0:"word_count"}, inplace=True)
    df_word_count.sort_values("word_count", ascending=False, inplace=True)
    df_word_count.reset_index(inplace=True, drop=True)
    
    return  df_word_count

In [106]:
def word_usage_result(word_list, df_target, target_column, target_opt_column, word_usage_min, word_usage_max):
    '''
    word_usage_result(word_list, df_ngram_pair, "threegram", "frequency", 1, 5) \n
    word_list is a list, df_target is a dateframe, target_column is df_target dataframe target column, \n
    target_opt_column is df_target dataframe opt_target column, \n
    word_usage_min and word_usage_max word usage condition.
    '''    
    word_num_dict = {}
    for i in word_list:
        word_num_dict[f"{i}"] = 0
    
    result_list_select = []
    var_list = []
    for i in range(len(df_target)):
        target_value = df_target.loc[i,f"{target_column}"]
        opt_value = df_target.loc[i,f"{target_opt_column}"]
        words = word_tokenize(target_value)   
        temp_list = [word for word in words]
        temp_list = temp_list + var_list
        # word count for max
        dict_list_count = Counter(temp_list)
        count_list = list(dict_list_count.values())
        # word count for min
        count_list2 = list(word_num_dict.values())
    
        if any([True if i>word_usage_max else False for i in count_list]) or not(any([True if j<word_usage_min else False for j in count_list2])):
            pass
        else:
            var_list = temp_list
            result_list_select.append([target_value,opt_value]) 
    
            for item2 in dict_list_count.items(): 
                word_num_dict[item2[0]] = item2[1]        
    df_result = pd.DataFrame(result_list_select, columns=[f"{target_column}",f"{target_opt_column}"])
    df_result.sort_values(by="frequency", ascending=False, inplace=True)
    df_result.reset_index(drop=True, inplace=True)
    
    return df_result

In [107]:
path = f"/media/kurubal/SSD/Data Scientist/Work/Modern Ways/Project/{lang_folder.capitalize()}/\
Talk Time/Result/Adjust Data Repeat"

Path(path).mkdir(parents=True, exist_ok=True)

In [108]:
disable_video_id_list = ["H6E6N70jYqI","BoaYsdPtJYA","HUZINsU40Fk","Et3diPcEmfY"]

In [109]:
df_word_all = pd.read_excel(f"/media/kurubal/SSD/Data Scientist/Work/Modern Ways/Project/{lang_folder.lower().capitalize()}/Deployment/Data/Word/Word_Merge_Preprocess.xlsx")
df_word_all

Unnamed: 0,word,frequency
0,ik,22539531
1,je,20769946
2,het,15696161
3,de,15258816
4,dat,13387137
...,...,...
331769,oorlogsgezichten,5
331770,opdrachtenlijst,5
331771,verlsaafde,5
331772,oxidatieproces,5


In [110]:
df_word_select = df_word_all.iloc[word_start:word_end,]

In [111]:
df_word_select

Unnamed: 0,word,frequency
0,ik,22539531
1,je,20769946
2,het,15696161
3,de,15258816
4,dat,13387137
...,...,...
195,genoeg,353128
196,wist,352948
197,oh,351988
198,klaar,350506


In [112]:
word_list = df_word_select["word"].to_list()
len(word_list)

200

In [113]:
if twogram_sentence_check:
    df_twogram_sent = pd.read_csv(f"/media/kurubal/SSD/Data Scientist/Work/Modern Ways/Project/{lang_folder.capitalize()}/Result/N Gram/N Gram And Sentence/Two_Gram_And_Sentence.csv")  # ext. sentence and ngram
    df_twogram_sent.rename(columns={"two_gram":"twogram"}, inplace=True)  # ext. Not: Two_Gram_And_Sentence.csv convert to Two_Gram_And_Sentence_All.csv
    df_twogram = df_twogram_sent.loc[:,["twogram","frequency"]]
else:
    df_twogram = pd.read_csv(f"/media/kurubal/SSD/Data Scientist/Work/Modern Ways/Project/{lang_folder.capitalize()}/Result/N Gram/Merge/Twogram_Merge.csv")  
    df_twogram = df_twogram.loc[:,["twogram","frequency"]]

df_twogram

Unnamed: 0,twogram,frequency
0,kom op,184578
1,dank je,127178
2,dank u,64014
3,wacht even,63826
4,echt waar,59238
...,...,...
328541,zesentwintig horizontaal,3
328542,volledig gedisciplineerd,3
328543,volledig geconcentreerd,3
328544,hero netegengif,3


In [114]:
# Twogram Func
d_list2  = df_twogram.iloc[:,0].values.tolist()

resultlist2 = []
manager = multiprocessing.Manager()
resultlist2 = manager.list()

def word_in_wordgroup(d_list2):
    mergelist2 = []
    try:
        word = d_list2.split()
    except:
        word = []
        #pass  disabled for non split value
    var1 = range(len(word))
    for j in var1:
        if word[j] in word_list:
            mergelist2.append(word[j])
            if len(mergelist2) == len(word):
                    resultlist2.append(d_list2)
                        
if __name__ == '__main__':
    # with Pool(16) as p:
    with Pool(nprocs) as p: # Pool number CPU sayısına eşit olursa tüm CPU lar çalışır
        p.map(word_in_wordgroup, d_list2) # string_word liste

result_list2 = list(resultlist2)
df_result2 = pd.DataFrame(result_list2, columns=[0])  # add columns parameter for empty result
df_result2 = df_result2.rename(columns = {0: "twogram"})
df_merge2 = pd.merge(df_result2, df_twogram, how="left", on="twogram")
df_merge_result2 = df_merge2.sort_values(by="frequency", ascending=False)
df_merge_result2.drop_duplicates(inplace=True)
df_merge_result2.reset_index(drop=True, inplace=True)
df_twogram_select = df_merge_result2
df_twogram_select

Unnamed: 0,twogram,frequency
0,kom op,184578
1,dank je,127178
2,dank u,64014
3,wacht even,63826
4,echt waar,59238
...,...,...
6564,of wist,3
6565,geen van,3
6566,wil ons,3
6567,allemaal wel,3


In [115]:
df_twogram_word_usage = word_usage_result(word_list, df_twogram_select, "twogram", "frequency", min_word_use_num, max_word_use_num)
df_twogram_word_usage

Unnamed: 0,twogram,frequency
0,kom op,184578
1,dank je,127178
2,dank u,64014
3,wacht even,63826
4,echt waar,59238
...,...,...
339,bij zich,3
340,heb nodig,3
341,toen zeker,3
342,toen dacht,3


In [116]:
if threegram_sentence_check:
    df_threegram_sent = pd.read_csv(f"/media/kurubal/SSD/Data Scientist/Work/Modern Ways/Project/{lang_folder.capitalize()}/Result/N Gram/N Gram And Sentence/Three_Gram_And_Sentence.csv")  # ext. sentence and ngram
    df_threegram_sent.rename(columns={"three_gram":"threegram"}, inplace=True)  # ext. Not: Two_Gram_And_Sentence.csv convert to Two_Gram_And_Sentence_All.csv
    df_threegram = df_threegram_sent.loc[:,["threegram","frequency"]]
else:
    df_threegram = pd.read_csv(f"/media/kurubal/SSD/Data Scientist/Work/Modern Ways/Project/{lang_folder.capitalize()}/Result/N Gram/Merge/Threegram_Merge.csv")  
    df_threegram = df_threegram.loc[:,["threegram","frequency"]]

df_threegram

Unnamed: 0,threegram,frequency
0,het spijt me,132758
1,wat is er,106497
2,ik weet het,67518
3,wat is dat,62768
4,wat doe je,60323
...,...,...
550436,vliegen onbekend aantal,3
550437,vliegen obstakel afnemen,3
550438,voor achtduizend euro,3
550439,voor de kijkcijfers,3


In [117]:
# Threegram Func
d_list3  = df_threegram.iloc[:,0].values.tolist()

resultlist3 = []
manager = multiprocessing.Manager()
resultlist3 = manager.list()

def word_in_wordgroup(d_list3):
    mergelist3 = []
    try:
        word = d_list3.split()
    except:
        word = []
        #pass  disabled for non split value
    var1 = range(len(word))
    for j in var1:
        if word[j] in word_list:
            mergelist3.append(word[j])
            if len(mergelist3) == len(word):
                    resultlist3.append(d_list3)
                        
if __name__ == '__main__':
    # with Pool(16) as p:
    with Pool(nprocs) as p: # Pool number CPU sayısına eşit olursa tüm CPU lar çalışır
        p.map(word_in_wordgroup, d_list3) # string_word liste

result_list3 = list(resultlist3)
df_result3 = pd.DataFrame(result_list3, columns=[0])  # add columns parameter for empty result
df_result3 = df_result3.rename(columns = {0: "threegram"})
df_merge3 = pd.merge(df_result3, df_threegram, how="left", on="threegram")
df_merge_result3 = df_merge3.sort_values(by="frequency", ascending=False)
df_merge_result3.drop_duplicates(inplace=True)
df_merge_result3.reset_index(drop=True, inplace=True)
df_threegram_select = df_merge_result3
df_threegram_select

Unnamed: 0,threegram,frequency
0,het spijt me,132758
1,wat is er,106497
2,ik weet het,67518
3,wat is dat,62768
4,wat doe je,60323
...,...,...
29304,wat doen praten,3
29305,en niets zijn,3
29306,n is t,3
29307,iedereen vind dat,3


In [118]:
df_threegram_word_usage = word_usage_result(word_list, df_threegram_select, "threegram", "frequency", min_word_use_num, max_word_use_num)
df_threegram_word_usage

Unnamed: 0,threegram,frequency
0,het spijt me,132758
1,wat is er,106497
2,ik weet het,67518
3,wat is dat,62768
4,wat doe je,60323
...,...,...
205,omdat omdat omdat,4
206,binnen binnen binnen,4
207,ben heeft geen,3
208,wordt beter mr,3


In [119]:
df_youtube_sentence = pd.read_csv(f"/media/kurubal/SSD/Data Scientist/Work/Modern Ways/Project/Youtube/Result/{lang_folder.capitalize()}/Sentence Clean Merge/Clean_Youtube_Sentence_Merge_Result.csv")
df_youtube_sentence

Unnamed: 0,start_time,end_time,sentence,video_id
0,00:00:10.760,00:00:13.200,jongens hier ben ik dan,WA8MV8nX9s0
1,00:00:13.200,00:00:17.040,en we gaan iets heel maar dan ook heel geks me...,WA8MV8nX9s0
2,00:00:17.040,00:00:19.320,ik bedoel we gaan naar de royal bridge suite,WA8MV8nX9s0
3,00:00:19.900,00:00:23.760,dit is de meest luxe suite die je maar kan bed...,WA8MV8nX9s0
4,00:00:23.760,00:00:25.860,de meest bekende mensen hebben hier geslapen,WA8MV8nX9s0
...,...,...,...,...
324877,00:01:17.570,00:01:26.570,met de beschikbare instructie videos is het vr...,c8S8UMw5674
324878,00:01:26.570,00:01:35.090,alle materialen die nodig zijn om de montage z...,c8S8UMw5674
324879,00:01:37.610,00:01:41.690,het is ook mogelijk om de montage geheel door ...,c8S8UMw5674
324880,00:01:43.100,00:01:48.080,vraag daarvoor een offerte aan via telefoon of...,c8S8UMw5674


In [120]:
df_youtube_sentence = df_youtube_sentence[~df_youtube_sentence["video_id"].isin(disable_video_id_list)]
df_youtube_sentence

Unnamed: 0,start_time,end_time,sentence,video_id
0,00:00:10.760,00:00:13.200,jongens hier ben ik dan,WA8MV8nX9s0
1,00:00:13.200,00:00:17.040,en we gaan iets heel maar dan ook heel geks me...,WA8MV8nX9s0
2,00:00:17.040,00:00:19.320,ik bedoel we gaan naar de royal bridge suite,WA8MV8nX9s0
3,00:00:19.900,00:00:23.760,dit is de meest luxe suite die je maar kan bed...,WA8MV8nX9s0
4,00:00:23.760,00:00:25.860,de meest bekende mensen hebben hier geslapen,WA8MV8nX9s0
...,...,...,...,...
324877,00:01:17.570,00:01:26.570,met de beschikbare instructie videos is het vr...,c8S8UMw5674
324878,00:01:26.570,00:01:35.090,alle materialen die nodig zijn om de montage z...,c8S8UMw5674
324879,00:01:37.610,00:01:41.690,het is ook mogelijk om de montage geheel door ...,c8S8UMw5674
324880,00:01:43.100,00:01:48.080,vraag daarvoor een offerte aan via telefoon of...,c8S8UMw5674


In [121]:
df_youtube_sentence['start_time'] = pd.to_timedelta(df_youtube_sentence['start_time']) # data type converted timedelta for second 
df_youtube_sentence['end_time'] = pd.to_timedelta(df_youtube_sentence['end_time'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_youtube_sentence['start_time'] = pd.to_timedelta(df_youtube_sentence['start_time']) # data type converted timedelta for second
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_youtube_sentence['end_time'] = pd.to_timedelta(df_youtube_sentence['end_time'])


In [122]:
df_youtube_sentence['start_time'] = df_youtube_sentence['start_time'].apply(lambda x: x.total_seconds()) # convert seconds
df_youtube_sentence['end_time'] = df_youtube_sentence['end_time'].apply(lambda x: x.total_seconds())
df_youtube_sentence

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_youtube_sentence['start_time'] = df_youtube_sentence['start_time'].apply(lambda x: x.total_seconds()) # convert seconds
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_youtube_sentence['end_time'] = df_youtube_sentence['end_time'].apply(lambda x: x.total_seconds())


Unnamed: 0,start_time,end_time,sentence,video_id
0,10.76,13.20,jongens hier ben ik dan,WA8MV8nX9s0
1,13.20,17.04,en we gaan iets heel maar dan ook heel geks me...,WA8MV8nX9s0
2,17.04,19.32,ik bedoel we gaan naar de royal bridge suite,WA8MV8nX9s0
3,19.90,23.76,dit is de meest luxe suite die je maar kan bed...,WA8MV8nX9s0
4,23.76,25.86,de meest bekende mensen hebben hier geslapen,WA8MV8nX9s0
...,...,...,...,...
324877,77.57,86.57,met de beschikbare instructie videos is het vr...,c8S8UMw5674
324878,86.57,95.09,alle materialen die nodig zijn om de montage z...,c8S8UMw5674
324879,97.61,101.69,het is ook mogelijk om de montage geheel door ...,c8S8UMw5674
324880,103.10,108.08,vraag daarvoor een offerte aan via telefoon of...,c8S8UMw5674


In [123]:
search_list = df_twogram_word_usage["twogram"].to_list()
#search_list = df_threegram_word_usage["threegram"].to_list()

In [124]:
df_word_group = word_group_youtube(df_youtube_sentence, search_list, "sentence", sample_num)
df_word_group

Unnamed: 0,search_string,start_time,end_time,sentence,video_id
0,kom op,13091.090,13093.840,ik kom op de brexit,mdNyTqKmEMs
1,kom op,441.692,446.125,tim nee maar ik wil gerrit kom op nee dat meen...,7Y8WyNm70Pg
2,kom op,256.360,258.960,jongens kom op zeg,D4hOqM6VeS0
3,kom op,1033.990,1041.850,sparen al tien jaar lang voor een borstvergrot...,FtX0SOorrFI
4,kom op,462.755,466.255,we gaan lekker jongens we gaan lekker kom op o...,3QeDhBvT2QM
...,...,...,...,...,...
1446,willen weten,351.510,355.740,lijst schriftelijke vragen hebben gesteld omda...,Xy97DH85zFk
1447,willen weten,72.390,81.960,wat zou je willen weten over de groep of wat z...,LPMFkDtXKvk
1448,willen weten,1926.980,1930.730,erg tof nog hier nou meer over willen weten ch...,po3SMpTUHow
1449,willen weten,12146.530,12146.820,dat is de kern van wat wij willen weten,mdNyTqKmEMs


In [125]:
df_word_group_time_loc = word_group_time_loc(df_word_group, "search_string", "start_time", "end_time", "sentence", "video_id")
df_word_group_time_loc

Unnamed: 0,search_string,start_time,end_time,sentence,video_id
0,kom op,13091.379474,13092.537368,ik kom op de brexit,mdNyTqKmEMs
1,kom op,443.362406,443.876377,tim nee maar ik wil gerrit kom op nee dat meen...,7Y8WyNm70Pg
2,kom op,257.371111,258.526667,jongens kom op zeg,D4hOqM6VeS0
3,kom op,1040.976667,1041.850000,sparen al tien jaar lang voor een borstvergrot...,FtX0SOorrFI
4,kom op,465.026930,465.518158,we gaan lekker jongens we gaan lekker kom op o...,3QeDhBvT2QM
...,...,...,...,...,...
1446,willen weten,353.736316,354.359684,lijst schriftelijke vragen hebben gesteld omda...,Xy97DH85zFk
1447,willen weten,73.666000,75.452400,wat zou je willen weten over de groep of wat z...,LPMFkDtXKvk
1448,willen weten,1928.105000,1928.630000,erg tof nog hier nou meer over willen weten ch...,po3SMpTUHow
1449,willen weten,12146.723333,12146.820000,dat is de kern van wat wij willen weten,mdNyTqKmEMs


In [126]:
df_word_group_time_loc.start_time = df_word_group_time_loc.start_time.apply(lambda x: (x-time_shift))
df_word_group_time_loc.end_time = df_word_group_time_loc.end_time.apply(lambda x: (x+time_shift))
df_word_group_time_loc

Unnamed: 0,search_string,start_time,end_time,sentence,video_id
0,kom op,13090.779474,13093.137368,ik kom op de brexit,mdNyTqKmEMs
1,kom op,442.762406,444.476377,tim nee maar ik wil gerrit kom op nee dat meen...,7Y8WyNm70Pg
2,kom op,256.771111,259.126667,jongens kom op zeg,D4hOqM6VeS0
3,kom op,1040.376667,1042.450000,sparen al tien jaar lang voor een borstvergrot...,FtX0SOorrFI
4,kom op,464.426930,466.118158,we gaan lekker jongens we gaan lekker kom op o...,3QeDhBvT2QM
...,...,...,...,...,...
1446,willen weten,353.136316,354.959684,lijst schriftelijke vragen hebben gesteld omda...,Xy97DH85zFk
1447,willen weten,73.066000,76.052400,wat zou je willen weten over de groep of wat z...,LPMFkDtXKvk
1448,willen weten,1927.505000,1929.230000,erg tof nog hier nou meer over willen weten ch...,po3SMpTUHow
1449,willen weten,12146.123333,12147.420000,dat is de kern van wat wij willen weten,mdNyTqKmEMs


In [127]:
df_word_group_time_loc.start_time = df_word_group_time_loc.start_time.apply(lambda x: round(x))
df_word_group_time_loc.end_time = df_word_group_time_loc.end_time.apply(lambda x: round(x))
df_word_group_time_loc 

Unnamed: 0,search_string,start_time,end_time,sentence,video_id
0,kom op,13091,13093,ik kom op de brexit,mdNyTqKmEMs
1,kom op,443,444,tim nee maar ik wil gerrit kom op nee dat meen...,7Y8WyNm70Pg
2,kom op,257,259,jongens kom op zeg,D4hOqM6VeS0
3,kom op,1040,1042,sparen al tien jaar lang voor een borstvergrot...,FtX0SOorrFI
4,kom op,464,466,we gaan lekker jongens we gaan lekker kom op o...,3QeDhBvT2QM
...,...,...,...,...,...
1446,willen weten,353,355,lijst schriftelijke vragen hebben gesteld omda...,Xy97DH85zFk
1447,willen weten,73,76,wat zou je willen weten over de groep of wat z...,LPMFkDtXKvk
1448,willen weten,1928,1929,erg tof nog hier nou meer over willen weten ch...,po3SMpTUHow
1449,willen weten,12146,12147,dat is de kern van wat wij willen weten,mdNyTqKmEMs


In [128]:
df_word_group_time_loc["video_url"] = "https://www.youtube.com/watch?v="+df_word_group_time_loc['video_id'].map(str)+"&t="+df_word_group_time_loc['start_time'].map(str)+"s"
df_word_group_time_loc

Unnamed: 0,search_string,start_time,end_time,sentence,video_id,video_url
0,kom op,13091,13093,ik kom op de brexit,mdNyTqKmEMs,https://www.youtube.com/watch?v=mdNyTqKmEMs&t=...
1,kom op,443,444,tim nee maar ik wil gerrit kom op nee dat meen...,7Y8WyNm70Pg,https://www.youtube.com/watch?v=7Y8WyNm70Pg&t=...
2,kom op,257,259,jongens kom op zeg,D4hOqM6VeS0,https://www.youtube.com/watch?v=D4hOqM6VeS0&t=...
3,kom op,1040,1042,sparen al tien jaar lang voor een borstvergrot...,FtX0SOorrFI,https://www.youtube.com/watch?v=FtX0SOorrFI&t=...
4,kom op,464,466,we gaan lekker jongens we gaan lekker kom op o...,3QeDhBvT2QM,https://www.youtube.com/watch?v=3QeDhBvT2QM&t=...
...,...,...,...,...,...,...
1446,willen weten,353,355,lijst schriftelijke vragen hebben gesteld omda...,Xy97DH85zFk,https://www.youtube.com/watch?v=Xy97DH85zFk&t=...
1447,willen weten,73,76,wat zou je willen weten over de groep of wat z...,LPMFkDtXKvk,https://www.youtube.com/watch?v=LPMFkDtXKvk&t=73s
1448,willen weten,1928,1929,erg tof nog hier nou meer over willen weten ch...,po3SMpTUHow,https://www.youtube.com/watch?v=po3SMpTUHow&t=...
1449,willen weten,12146,12147,dat is de kern van wat wij willen weten,mdNyTqKmEMs,https://www.youtube.com/watch?v=mdNyTqKmEMs&t=...


In [129]:
word_count_result(df_word_group_time_loc, ["search_string"], set_condition=True)

Unnamed: 0,word,word_count
0,aan,4
1,gedaan,4
2,doe,4
3,doen,4
4,is,4
...,...,...
182,wordt,1
183,zal,1
184,jaar,1
185,zich,1


In [None]:
df_word_group_time_loc.to_excel(f"{lang_folder.capitalize()}_Word_Group_With_{word_end}_Word_{sample_num}_Youtube_{time_shift}s_Timeshift_For_Talk_Time_Result.xlsx", index=False) 