### Talk Time Data Prepare

In [1]:
import os
import multiprocessing
#import multiprocessing as mp
from multiprocessing import Process, Manager, Pool, Queue
from itertools import islice
from collections import Counter
import re
import pandas as pd
import numpy as np
import glob
import nltk
from nltk import word_tokenize
from nltk import ngrams
from functools import reduce
from pathlib import Path
import shutil

In [2]:
nprocs = multiprocessing.cpu_count()
print(f"Number of CPU cores: {nprocs}")

Number of CPU cores: 16


In [3]:
# language pair
lang_folder = "Dutch"  # Arabic, English, French, German, Turkish, Spanish, Portuguese, Dutch, Italian ==> target language for learner
#lang_pair = "Intersect"  # Arabic, English, French, German, Turkish, Spanish, Portuguese, Dutch, Italian ==> native language

# adding native word to shared word
word_start = 0  # 0 native word start index
word_end = 200  # 28 native word end index

# word usage
min_word_use_num = 1
max_word_use_num = 6

# sentence check
twogram_sentence_check = False  # True, False
threegram_sentence_check = False

# youtube
sample_num = 1  # 6
time_shift = 0.6

In [4]:
def word_in_wordgroup_simple(source_word_list, df_target, target_column, word_sample_num, simple=False):

    '''word_in_wordgroup(not_in_sent_word_list, df_youtube_sent_select, "search_string", 5, simple=False)\n
       source_word_list is searching word list\n
       df_target is dataframe, target_column are dataframe column string name\n
       word_sample_num is searching sample number.
       simple use for all column row result or only target column result 
    '''
    if simple:
        df_select = df_target[[f"{target_column}"]].dropna()
    else:
        df_select = df_target
        
    df_result = pd.DataFrame()
    for i in source_word_list:
        try:
            word_in_word_cluster = df_select[df_select[f"{target_column}"].str.contains(fr"(?:\s|^){i}(?:\s|$)", na=True)].head(word_sample_num)    
        except:
            pass        
        word_in_word_cluster.insert(0,"search_string",i)
        df_result = pd.concat([df_result,word_in_word_cluster], axis=0)
    df_result.reset_index(drop=True, inplace=True)

    return df_result

In [5]:
def word_group_youtube(df, search_list, target_column, sample_num):
    '''
    word_group_youtube(df_youtube_sentence, search_list, "sentence", 6)\n
    df_youtube_sentence is dataframe and "sentence" is its column for external searching_list
    ''' 
    df_search_result = pd.DataFrame()
    for j in search_list:
        try:
            df_select = df[df[f"{target_column}"].str.contains(fr"(?:\s|^){j}(?:\s|$)", na=True)].sample(sample_num)
        except:
            df_select = df[df[f"{target_column}"].str.contains(fr"(?:\s|^){j}(?:\s|$)", na=True)].head(sample_num)
        #df_result = df[df[f"{target_column}"].str.contains(fr"(?:\s|^){j}(?:\s|$)", na=True)]  # sentence length part
        #df_result.sort_values(f"{target_column}",key=lambda x:x.str.len(), inplace=True)
        #df_select = df_result.head(sample_num)
        df_select.insert(0,"search_string",j)
        df_search_result = pd.concat([df_search_result,df_select], axis=0)
    df_search_result.reset_index(inplace=True, drop=True)
    
    return df_search_result

In [6]:
def word_group_time_loc(df, search, start_sent, end_sent, sent, sent_video_id):
    '''
    word_group_time_loc(df_search_result, "search_string", "start_time", "end_time", "sentence", "video_id")\n
    df_search_result is dataframe and "search_string", "start_time", "end_time", "sentence", "video_id" are its columns
    '''
    df.reset_index(drop=True, inplace=True)
    word_time_loc_list = []
    for i in range(len(df)):
        word = df.loc[i,f"{search}"]
        start_time = df.loc[i,f"{start_sent}"]
        end_time = df.loc[i,f"{end_sent}"]
        sentence = df.loc[i,f"{sent}"]
        video_id = df.loc[i,f"{sent_video_id}"]
        time_length = end_time-start_time
        sentence_length = len(sentence)
        time_length_ratio = time_length/sentence_length
        loc_list = []
        for j in re.finditer(fr"(?:\s|^){word}(?:\s|$)", sentence, re.IGNORECASE|re.UNICODE):
            loc_list.append(j)
            start = loc_list[0].start()
            end = loc_list[0].end()
            start_loc = start_time+(start*time_length_ratio)
            end_loc = start_time+(end*time_length_ratio)
        word_time_loc_list.append([word,start_loc,end_loc,sentence,video_id])
    df_word_time_loc = pd.DataFrame(word_time_loc_list, columns=[f"{search}",f"{start_sent}",f"{end_sent}",f"{sent}",f"{sent_video_id}"])

    return df_word_time_loc

In [7]:
def word_count_result(df, column_list, set_condition=False): # df is dataframe, column_list is list value
    '''
    word_count_bool(df, column_list): df columns word count for word frequency\n
    df is dataframe, column_list is list value\n
    word_count_bool(df, ["word","twogram"]):
    '''
    list_all = []
    for i in df.loc[:,[x for x in column_list]].columns:
        if set_condition:
            var_list = set(df[f"{i}"].dropna().tolist())
        else:
            var_list = df[f"{i}"].dropna().tolist()
        for j in var_list:
            list_all.append(j)
    text = " ".join(list_all)
    word_list = re.findall(r"\w+",text, re.UNICODE)
    df_word_list = pd.DataFrame(word_list, columns=["word"])
    #df_word_list.rename(columns={0:"word"}, inplace=True)
    df_word_count = pd.DataFrame(df_word_list.value_counts())
    df_word_count.reset_index(inplace=True)
    df_word_count.rename(columns={0:"word_count"}, inplace=True)
    df_word_count.sort_values("word_count", ascending=False, inplace=True)
    df_word_count.reset_index(inplace=True, drop=True)
    
    return  df_word_count

In [8]:
def word_usage_result(word_list, df_target, target_column, target_opt_column, word_usage_min, word_usage_max):
    '''
    word_usage_result(word_list, df_ngram_pair, "threegram", "frequency", 1, 5) \n
    word_list is a list, df_target is a dateframe, target_column is df_target dataframe target column, \n
    target_opt_column is df_target dataframe opt_target column, \n
    word_usage_min and word_usage_max word usage condition.
    '''    
    word_num_dict = {}
    for i in word_list:
        word_num_dict[f"{i}"] = 0
    
    result_list_select = []
    var_list = []
    for i in range(len(df_target)):
        target_value = df_target.loc[i,f"{target_column}"]
        opt_value = df_target.loc[i,f"{target_opt_column}"]
        words = word_tokenize(target_value)   
        temp_list = [word for word in words]
        temp_list = temp_list + var_list
        # word count for max
        dict_list_count = Counter(temp_list)
        count_list = list(dict_list_count.values())
        # word count for min
        count_list2 = list(word_num_dict.values())
    
        if any([True if i>word_usage_max else False for i in count_list]) or not(any([True if j<word_usage_min else False for j in count_list2])):
            pass
        else:
            var_list = temp_list
            result_list_select.append([target_value,opt_value]) 
    
            for item2 in dict_list_count.items(): 
                word_num_dict[item2[0]] = item2[1]        
    df_result = pd.DataFrame(result_list_select, columns=[f"{target_column}",f"{target_opt_column}"])
    df_result.sort_values(by="frequency", ascending=False, inplace=True)
    df_result.reset_index(drop=True, inplace=True)
    
    return df_result

In [9]:
#path = f"/media/kurubal/SSD1/Data Scientist/Work/Modern Ways/Project/{lang_folder.capitalize()}/\
#Talk Time/Talk Time 1/Data/3-Talk Time Data Prepare"
#
#Path(path).mkdir(parents=True, exist_ok=True)

In [10]:
disable_video_id_list = ["H6E6N70jYqI","BoaYsdPtJYA","HUZINsU40Fk","Et3diPcEmfY"]

In [11]:
df_word_all = pd.read_excel(f"/media/kurubal/SSD1/Data Scientist/Work/Modern Ways/Project/{lang_folder.lower().capitalize()}/Deployment/Data/Word/Word_Merge_Preprocess.xlsx")
df_word_all

Unnamed: 0,word,frequency
0,ik,22539531
1,je,20769946
2,het,15696161
3,de,15258816
4,dat,13387137
...,...,...
331769,oorlogsgezichten,5
331770,opdrachtenlijst,5
331771,verlsaafde,5
331772,oxidatieproces,5


In [12]:
df_word_select = df_word_all.iloc[word_start:word_end,]

In [13]:
df_word_select

Unnamed: 0,word,frequency
0,ik,22539531
1,je,20769946
2,het,15696161
3,de,15258816
4,dat,13387137
...,...,...
195,genoeg,353128
196,wist,352948
197,oh,351988
198,klaar,350506


In [14]:
df_word_select.to_excel(f"{lang_folder.capitalize()}_{word_end}_Word.xlsx", index=False)

In [15]:
word_list = df_word_select["word"].to_list()
len(word_list)

200

In [16]:
if twogram_sentence_check:
    df_twogram_sent = pd.read_csv(f"/media/kurubal/SSD1/Data Scientist/Work/Modern Ways/Project/{lang_folder.capitalize()}/Result/N Gram/N Gram And Sentence/Two_Gram_And_Sentence.csv")  # ext. sentence and ngram
    df_twogram_sent.rename(columns={"two_gram":"twogram"}, inplace=True)  # ext. Not: Two_Gram_And_Sentence.csv convert to Two_Gram_And_Sentence_All.csv
    df_twogram = df_twogram_sent.loc[:,["twogram","frequency"]]
else:
    df_twogram = pd.read_csv(f"/media/kurubal/SSD1/Data Scientist/Work/Modern Ways/Project/{lang_folder.capitalize()}/Result/N Gram/Merge/Twogram_Merge.csv")  
    df_twogram = df_twogram.loc[:,["twogram","frequency"]]

df_twogram

Unnamed: 0,twogram,frequency
0,ik heb,1113451
1,het is,1102152
2,ik ben,1025494
3,dat is,939846
4,wat is,698111
...,...,...
3892596,schat euh,3
3892597,schat eten,3
3892598,levensgevaarlijk volgens,3
3892599,schat erg,3


In [17]:
# Twogram Func
d_list2  = df_twogram.iloc[:,0].values.tolist()

resultlist2 = []
manager = multiprocessing.Manager()
resultlist2 = manager.list()

def word_in_wordgroup(d_list2):
    mergelist2 = []
    try:
        word = d_list2.split()
    except:
        word = []
        #pass  disabled for non split value
    var1 = range(len(word))
    for j in var1:
        if word[j] in word_list:
            mergelist2.append(word[j])
            if len(mergelist2) == len(word):
                    resultlist2.append(d_list2)
                        
if __name__ == '__main__':
    # with Pool(16) as p:
    with Pool(nprocs) as p: # Pool number CPU sayısına eşit olursa tüm CPU lar çalışır
        p.map(word_in_wordgroup, d_list2) # string_word liste

result_list2 = list(resultlist2)
df_result2 = pd.DataFrame(result_list2, columns=[0])  # add columns parameter for empty result
df_result2 = df_result2.rename(columns = {0: "twogram"})
df_merge2 = pd.merge(df_result2, df_twogram, how="left", on="twogram")
df_merge_result2 = df_merge2.sort_values(by="frequency", ascending=False)
df_merge_result2.drop_duplicates(inplace=True)
df_merge_result2.reset_index(drop=True, inplace=True)
df_twogram_select = df_merge_result2
df_twogram_select

Unnamed: 0,twogram,frequency
0,ik heb,1113451
1,het is,1102152
2,ik ben,1025494
3,dat is,939846
4,wat is,698111
...,...,...
34489,waren zie,3
34490,waren zien,3
34491,zeker spijt,3
34492,eens keer,3


In [18]:
if threegram_sentence_check:
    df_threegram_sent = pd.read_csv(f"/media/kurubal/SSD1/Data Scientist/Work/Modern Ways/Project/{lang_folder.capitalize()}/Result/N Gram/N Gram And Sentence/Three_Gram_And_Sentence.csv")  # ext. sentence and ngram
    df_threegram_sent.rename(columns={"three_gram":"threegram"}, inplace=True)  # ext. Not: Two_Gram_And_Sentence.csv convert to Two_Gram_And_Sentence_All.csv
    df_threegram = df_threegram_sent.loc[:,["threegram","frequency"]]
else:
    df_threegram = pd.read_csv(f"/media/kurubal/SSD1/Data Scientist/Work/Modern Ways/Project/{lang_folder.capitalize()}/Result/N Gram/Merge/Threegram_Merge.csv")  
    df_threegram = df_threegram.loc[:,["threegram","frequency"]]

df_threegram

Unnamed: 0,threegram,frequency
0,wat is er,364876
1,het spijt me,258840
2,ik weet het,237218
3,wat doe je,137651
4,het is een,135216
...,...,...
9257115,leven iets voorstelt,3
9257116,leven iets voor,3
9257117,leven iets slecht,3
9257118,leven iets onnatuurlijks,3


In [19]:
# Threegram Func
d_list3  = df_threegram.iloc[:,0].values.tolist()

resultlist3 = []
manager = multiprocessing.Manager()
resultlist3 = manager.list()

def word_in_wordgroup(d_list3):
    mergelist3 = []
    try:
        word = d_list3.split()
    except:
        word = []
        #pass  disabled for non split value
    var1 = range(len(word))
    for j in var1:
        if word[j] in word_list:
            mergelist3.append(word[j])
            if len(mergelist3) == len(word):
                    resultlist3.append(d_list3)
                        
if __name__ == '__main__':
    # with Pool(16) as p:
    with Pool(nprocs) as p: # Pool number CPU sayısına eşit olursa tüm CPU lar çalışır
        p.map(word_in_wordgroup, d_list3) # string_word liste

result_list3 = list(resultlist3)
df_result3 = pd.DataFrame(result_list3, columns=[0])  # add columns parameter for empty result
df_result3 = df_result3.rename(columns = {0: "threegram"})
df_merge3 = pd.merge(df_result3, df_threegram, how="left", on="threegram")
df_merge_result3 = df_merge3.sort_values(by="frequency", ascending=False)
df_merge_result3.drop_duplicates(inplace=True)
df_merge_result3.reset_index(drop=True, inplace=True)
df_threegram_select = df_merge_result3
df_threegram_select

Unnamed: 0,threegram,frequency
0,wat is er,364876
1,het spijt me,258840
2,ik weet het,237218
3,wat doe je,137651
4,het is een,135216
...,...,...
549820,een moeder maar,3
549821,u toch zie,3
549822,gaan nu of,3
549823,ze uit anders,3


In [20]:
df_youtube_sentence = pd.read_csv(f"/media/kurubal/SSD1/Data Scientist/Work/Modern Ways/Project/Youtube/Result/{lang_folder.capitalize()}/Sentence Clean Merge/Clean_Youtube_Sentence_Merge_Result.csv")
df_youtube_sentence

Unnamed: 0,start_time,end_time,sentence,video_id
0,00:00:00.000,00:00:02.501,dit is de pangale v,BoaYsdPtJYA
1,00:00:04.501,00:00:05.434,unreal,BoaYsdPtJYA
2,00:00:08.518,00:00:13.066,dit is de eerste keer ik rijd in een supersport,BoaYsdPtJYA
3,00:00:13.066,00:00:14.316,ik heb dit in mijn hele leven,BoaYsdPtJYA
4,00:00:14.316,00:00:15.889,nog nooit eerder gedaan,BoaYsdPtJYA
...,...,...,...,...
252485,00:01:37.135,00:01:47.040,soms zitten de volwassenen met natte ogen te k...,MZUDbceIw4E
252486,00:01:47.040,00:01:49.340,dit ziet er al was super indrukwekkend uit en,MZUDbceIw4E
252487,00:01:49.340,00:01:53.600,het is nog maar een voorproefje dus kom zeker ...,MZUDbceIw4E
252488,00:01:53.600,00:01:57.914,tijdsblok te kiezen en zeker ook op tijd te ko...,MZUDbceIw4E


In [21]:
df_youtube_sentence = df_youtube_sentence[~df_youtube_sentence["video_id"].isin(disable_video_id_list)]
df_youtube_sentence

Unnamed: 0,start_time,end_time,sentence,video_id
938,00:00:00.000,00:00:04.001,voet op de rem voet op de rem,qp0deztLqgU
939,00:00:07.002,00:00:08.986,ja geweldig dat op camera vastgelegd,qp0deztLqgU
940,00:00:21.005,00:00:22.989,man dit is zo cool,qp0deztLqgU
941,00:00:42.993,00:00:45.676,goedemorgen internet het is,qp0deztLqgU
942,00:00:45.676,00:00:47.165,en welkom terug op het kanaal,qp0deztLqgU
...,...,...,...,...
252485,00:01:37.135,00:01:47.040,soms zitten de volwassenen met natte ogen te k...,MZUDbceIw4E
252486,00:01:47.040,00:01:49.340,dit ziet er al was super indrukwekkend uit en,MZUDbceIw4E
252487,00:01:49.340,00:01:53.600,het is nog maar een voorproefje dus kom zeker ...,MZUDbceIw4E
252488,00:01:53.600,00:01:57.914,tijdsblok te kiezen en zeker ook op tijd te ko...,MZUDbceIw4E


In [22]:
df_youtube_sentence['start_time'] = pd.to_timedelta(df_youtube_sentence['start_time']) # data type converted timedelta for second 
df_youtube_sentence['end_time'] = pd.to_timedelta(df_youtube_sentence['end_time'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_youtube_sentence['start_time'] = pd.to_timedelta(df_youtube_sentence['start_time']) # data type converted timedelta for second
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_youtube_sentence['end_time'] = pd.to_timedelta(df_youtube_sentence['end_time'])


In [23]:
df_youtube_sentence['start_time'] = df_youtube_sentence['start_time'].apply(lambda x: x.total_seconds()) # convert seconds
df_youtube_sentence['end_time'] = df_youtube_sentence['end_time'].apply(lambda x: x.total_seconds())
df_youtube_sentence

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_youtube_sentence['start_time'] = df_youtube_sentence['start_time'].apply(lambda x: x.total_seconds()) # convert seconds
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_youtube_sentence['end_time'] = df_youtube_sentence['end_time'].apply(lambda x: x.total_seconds())


Unnamed: 0,start_time,end_time,sentence,video_id
938,0.000,4.001,voet op de rem voet op de rem,qp0deztLqgU
939,7.002,8.986,ja geweldig dat op camera vastgelegd,qp0deztLqgU
940,21.005,22.989,man dit is zo cool,qp0deztLqgU
941,42.993,45.676,goedemorgen internet het is,qp0deztLqgU
942,45.676,47.165,en welkom terug op het kanaal,qp0deztLqgU
...,...,...,...,...
252485,97.135,107.040,soms zitten de volwassenen met natte ogen te k...,MZUDbceIw4E
252486,107.040,109.340,dit ziet er al was super indrukwekkend uit en,MZUDbceIw4E
252487,109.340,113.600,het is nog maar een voorproefje dus kom zeker ...,MZUDbceIw4E
252488,113.600,117.914,tijdsblok te kiezen en zeker ook op tijd te ko...,MZUDbceIw4E


In [24]:
df_sentence_link = pd.read_excel(f"/media/kurubal/SSD1/Data Scientist/Work/Modern Ways/Project/{lang_folder.capitalize()}/\
Talk Time/Result/2-Adjust Word Group In Youtube Sentence Word Usage Analysis/{lang_folder.capitalize()}_{word_end}_Word_Group_In_Youtube_Sentence_Sample_Manuel.xlsx")
df_sentence_link = df_sentence_link.iloc[:,[0,1,2,3,4,5]]
df_sentence_link

Unnamed: 0,search_string,start_time,end_time,sentence,video_id,video_url
0,ga ik u,40,41,en zoals gezegd in de inleiding ga ik u vandaa...,Gokx4ZpGRC0,https://www.youtube.com/watch?v=Gokx4ZpGRC0&t=40s


In [25]:
twogram_set = set()
threegram_set = set()
for sent in df_sentence_link["search_string"]:
    twogram_zip = ngrams(sent.split(), 2)
    for two in twogram_zip:
        twogram = " ".join(two)
        twogram_set.add(f"{twogram}") 
    threegram_zip = ngrams(sent.split(), 3)
    for three in threegram_zip:
        threegram = " ".join(three)
        threegram_set.add(f"{threegram}")

In [26]:
len(twogram_set)

2

In [27]:
len(threegram_set)

1

In [28]:
twogram_list = list(twogram_set)
#twogram_list

In [29]:
threegram_list = list(threegram_set)
#threegram_list

In [30]:
df_twogram_for_link = df_twogram_select[df_twogram_select["twogram"].isin(twogram_list)]
df_twogram_for_link.reset_index(drop=True, inplace=True)
df_twogram_for_link

Unnamed: 0,twogram,frequency
0,ik u,44902
1,ga ik,31140


In [31]:
df_twogram_selected_for_link = word_usage_result(word_list, df_twogram_for_link, "twogram", "frequency", min_word_use_num, max_word_use_num)
df_twogram_selected_for_link 

Unnamed: 0,twogram,frequency
0,ik u,44902
1,ga ik,31140


In [32]:
df_threegram_for_link = df_threegram_select[df_threegram_select["threegram"].isin(threegram_list)]
df_threegram_for_link.reset_index(drop=True, inplace=True)
df_threegram_for_link

Unnamed: 0,threegram,frequency
0,ga ik u,38


In [33]:
df_threegram_selected_for_link = word_usage_result(word_list, df_threegram_for_link, "threegram", "frequency", min_word_use_num, max_word_use_num)
df_threegram_selected_for_link 

Unnamed: 0,threegram,frequency
0,ga ik u,38


In [34]:
df_twogram_search_result = word_group_youtube(df_youtube_sentence, df_twogram_selected_for_link["twogram"], "sentence", sample_num)
df_twogram_search_result

Unnamed: 0,search_string,start_time,end_time,sentence,video_id
0,ik u,10149.406,10152.105,is want dan wil ik u graag volgend jaar,R4Q1cNbNKU0
1,ik u,115.98,118.98,nu de sluizen die ik u in deze vlog ga laten b...,Gokx4ZpGRC0
2,ik u,1303.99,1307.56,verzoek ik u vriendelijk uw vingers in,R4Q1cNbNKU0
3,ik u,118.02,123.12,laat het ons dan weten hieronder in de reactie...,NQ49zrXMZa0
4,ik u,5399.51,5404.73,er echt super backdoor maar niet gered maar ma...,EmxRrMXp3UY
5,ga ik,360.479,366.029,op maar die ga ik even schoonmaken thuis voor ...,Grus9PScjRM
6,ga ik,3.81,9.96,zeeland en zuidholland mijn landbouw ons stran...,uuv1RPvM3Fk
7,ga ik,67.74,72.5,in deze serie ga ik langs jongeren die net als...,xp578J4RRk0
8,ga ik,421.918,426.792,dan moet je stappen gaan ondernemen hoe ga ik ...,5S5QX4ZK3Q0
9,ga ik,16.529,23.34,vind het gewoon weg de perfecte selfie stok ik...,TwEqmxmUcLg


In [35]:
df_word_group_time_loc_twogram_result = word_group_time_loc(df_twogram_search_result, "search_string", "start_time", "end_time", "sentence", "video_id")
df_word_group_time_loc_twogram_result

Unnamed: 0,search_string,start_time,end_time,sentence,video_id
0,ik u,10150.444077,10150.859308,is want dan wil ik u graag volgend jaar,R4Q1cNbNKU0
1,ik u,116.942264,117.281887,nu de sluizen die ik u in deze vlog ga laten b...,Gokx4ZpGRC0
2,ik u,1304.647632,1305.211316,verzoek ik u vriendelijk uw vingers in,R4Q1cNbNKU0
3,ik u,121.270549,121.606813,laat het ons dan weten hieronder in de reactie...,NQ49zrXMZa0
4,ik u,5403.531967,5404.04541,er echt super backdoor maar niet gered maar ma...,EmxRrMXp3UY
5,ga ik,361.531586,362.201414,op maar die ga ik even schoonmaken thuis voor ...,Grus9PScjRM
6,ga ik,8.628557,9.072371,zeeland en zuidholland mijn landbouw ons stran...,uuv1RPvM3Fk
7,ga ik,68.443182,68.821818,in deze serie ga ik langs jongeren die net als...,xp578J4RRk0
8,ga ik,423.694505,424.013364,dan moet je stappen gaan ondernemen hoe ga ik ...,5S5QX4ZK3Q0
9,ga ik,20.762865,21.407149,vind het gewoon weg de perfecte selfie stok ik...,TwEqmxmUcLg


In [36]:
df_word_group_time_loc_twogram_result.start_time = df_word_group_time_loc_twogram_result.start_time.apply(lambda x: (x-time_shift))
df_word_group_time_loc_twogram_result.end_time = df_word_group_time_loc_twogram_result.end_time.apply(lambda x: (x+time_shift))
df_word_group_time_loc_twogram_result

Unnamed: 0,search_string,start_time,end_time,sentence,video_id
0,ik u,10149.844077,10151.459308,is want dan wil ik u graag volgend jaar,R4Q1cNbNKU0
1,ik u,116.342264,117.881887,nu de sluizen die ik u in deze vlog ga laten b...,Gokx4ZpGRC0
2,ik u,1304.047632,1305.811316,verzoek ik u vriendelijk uw vingers in,R4Q1cNbNKU0
3,ik u,120.670549,122.206813,laat het ons dan weten hieronder in de reactie...,NQ49zrXMZa0
4,ik u,5402.931967,5404.64541,er echt super backdoor maar niet gered maar ma...,EmxRrMXp3UY
5,ga ik,360.931586,362.801414,op maar die ga ik even schoonmaken thuis voor ...,Grus9PScjRM
6,ga ik,8.028557,9.672371,zeeland en zuidholland mijn landbouw ons stran...,uuv1RPvM3Fk
7,ga ik,67.843182,69.421818,in deze serie ga ik langs jongeren die net als...,xp578J4RRk0
8,ga ik,423.094505,424.613364,dan moet je stappen gaan ondernemen hoe ga ik ...,5S5QX4ZK3Q0
9,ga ik,20.162865,22.007149,vind het gewoon weg de perfecte selfie stok ik...,TwEqmxmUcLg


In [37]:
df_word_group_time_loc_twogram_result.start_time = df_word_group_time_loc_twogram_result.start_time.apply(lambda x: round(x))
df_word_group_time_loc_twogram_result.end_time = df_word_group_time_loc_twogram_result.end_time.apply(lambda x: round(x))
df_word_group_time_loc_twogram_result 

Unnamed: 0,search_string,start_time,end_time,sentence,video_id
0,ik u,10150,10151,is want dan wil ik u graag volgend jaar,R4Q1cNbNKU0
1,ik u,116,118,nu de sluizen die ik u in deze vlog ga laten b...,Gokx4ZpGRC0
2,ik u,1304,1306,verzoek ik u vriendelijk uw vingers in,R4Q1cNbNKU0
3,ik u,121,122,laat het ons dan weten hieronder in de reactie...,NQ49zrXMZa0
4,ik u,5403,5405,er echt super backdoor maar niet gered maar ma...,EmxRrMXp3UY
5,ga ik,361,363,op maar die ga ik even schoonmaken thuis voor ...,Grus9PScjRM
6,ga ik,8,10,zeeland en zuidholland mijn landbouw ons stran...,uuv1RPvM3Fk
7,ga ik,68,69,in deze serie ga ik langs jongeren die net als...,xp578J4RRk0
8,ga ik,423,425,dan moet je stappen gaan ondernemen hoe ga ik ...,5S5QX4ZK3Q0
9,ga ik,20,22,vind het gewoon weg de perfecte selfie stok ik...,TwEqmxmUcLg


In [38]:
df_word_group_time_loc_twogram_result["video_url"] = "https://www.youtube.com/watch?v="+df_word_group_time_loc_twogram_result['video_id'].map(str)+"&t="+df_word_group_time_loc_twogram_result['start_time'].map(str)+"s"
df_word_group_time_loc_twogram_result

Unnamed: 0,search_string,start_time,end_time,sentence,video_id,video_url
0,ik u,10150,10151,is want dan wil ik u graag volgend jaar,R4Q1cNbNKU0,https://www.youtube.com/watch?v=R4Q1cNbNKU0&t=...
1,ik u,116,118,nu de sluizen die ik u in deze vlog ga laten b...,Gokx4ZpGRC0,https://www.youtube.com/watch?v=Gokx4ZpGRC0&t=...
2,ik u,1304,1306,verzoek ik u vriendelijk uw vingers in,R4Q1cNbNKU0,https://www.youtube.com/watch?v=R4Q1cNbNKU0&t=...
3,ik u,121,122,laat het ons dan weten hieronder in de reactie...,NQ49zrXMZa0,https://www.youtube.com/watch?v=NQ49zrXMZa0&t=...
4,ik u,5403,5405,er echt super backdoor maar niet gered maar ma...,EmxRrMXp3UY,https://www.youtube.com/watch?v=EmxRrMXp3UY&t=...
5,ga ik,361,363,op maar die ga ik even schoonmaken thuis voor ...,Grus9PScjRM,https://www.youtube.com/watch?v=Grus9PScjRM&t=...
6,ga ik,8,10,zeeland en zuidholland mijn landbouw ons stran...,uuv1RPvM3Fk,https://www.youtube.com/watch?v=uuv1RPvM3Fk&t=8s
7,ga ik,68,69,in deze serie ga ik langs jongeren die net als...,xp578J4RRk0,https://www.youtube.com/watch?v=xp578J4RRk0&t=68s
8,ga ik,423,425,dan moet je stappen gaan ondernemen hoe ga ik ...,5S5QX4ZK3Q0,https://www.youtube.com/watch?v=5S5QX4ZK3Q0&t=...
9,ga ik,20,22,vind het gewoon weg de perfecte selfie stok ik...,TwEqmxmUcLg,https://www.youtube.com/watch?v=TwEqmxmUcLg&t=20s


In [39]:
df_word_group_time_loc_twogram_result.to_excel(f"{lang_folder.capitalize()}_Twogram_With_{word_end}_Word_{sample_num}_Youtube_{time_shift}s_Timeshift_For_Talk_Time_Result.xlsx", index=False) 

In [40]:
df_threegram_search_result = word_group_youtube(df_youtube_sentence, df_threegram_selected_for_link["threegram"], "sentence", sample_num)
df_threegram_search_result

Unnamed: 0,search_string,start_time,end_time,sentence,video_id
0,ga ik u,3.066,5.915,neen wel dat ga ik u laten zien in deze vlog,S_rAt9tyk6Q
1,ga ik u,38.82,42.32,en zoals gezegd in de inleiding ga ik u vandaa...,Gokx4ZpGRC0
2,ga ik u,98.8,101.0,dat ga ik u vertellen,nASlwkh7EGA
3,ga ik u,82.108,89.391,zo beste volgers zoals gezegd ga ik u meenemen...,JqSppQf6wMU
4,ga ik u,1009.12,1012.66,ik wil u wel een bedrag noemen en dat ga ik u ...,ycB_yozQW6U


In [41]:
df_word_group_time_loc_threegram_result = word_group_time_loc(df_threegram_search_result, "search_string", "start_time", "end_time", "sentence", "video_id")
df_word_group_time_loc_threegram_result

Unnamed: 0,search_string,start_time,end_time,sentence,video_id
0,ga ik u,3.843,4.42575,neen wel dat ga ik u laten zien in deze vlog,S_rAt9tyk6Q
1,ga ik u,40.392464,40.848986,en zoals gezegd in de inleiding ga ik u vandaa...,Gokx4ZpGRC0
2,ga ik u,99.114286,100.057143,dat ga ik u vertellen,nASlwkh7EGA
3,ga ik u,83.944583,84.514557,zo beste volgers zoals gezegd ga ik u meenemen...,JqSppQf6wMU
4,ga ik u,1011.545556,1012.135556,ik wil u wel een bedrag noemen en dat ga ik u ...,ycB_yozQW6U


In [42]:
df_word_group_time_loc_threegram_result.start_time = df_word_group_time_loc_threegram_result.start_time.apply(lambda x: (x-time_shift))
df_word_group_time_loc_threegram_result.end_time = df_word_group_time_loc_threegram_result.end_time.apply(lambda x: (x+time_shift))
df_word_group_time_loc_threegram_result

Unnamed: 0,search_string,start_time,end_time,sentence,video_id
0,ga ik u,3.243,5.02575,neen wel dat ga ik u laten zien in deze vlog,S_rAt9tyk6Q
1,ga ik u,39.792464,41.448986,en zoals gezegd in de inleiding ga ik u vandaa...,Gokx4ZpGRC0
2,ga ik u,98.514286,100.657143,dat ga ik u vertellen,nASlwkh7EGA
3,ga ik u,83.344583,85.114557,zo beste volgers zoals gezegd ga ik u meenemen...,JqSppQf6wMU
4,ga ik u,1010.945556,1012.735556,ik wil u wel een bedrag noemen en dat ga ik u ...,ycB_yozQW6U


In [43]:
df_word_group_time_loc_threegram_result.start_time = df_word_group_time_loc_threegram_result.start_time.apply(lambda x: round(x))
df_word_group_time_loc_threegram_result.end_time = df_word_group_time_loc_threegram_result.end_time.apply(lambda x: round(x))
df_word_group_time_loc_threegram_result 

Unnamed: 0,search_string,start_time,end_time,sentence,video_id
0,ga ik u,3,5,neen wel dat ga ik u laten zien in deze vlog,S_rAt9tyk6Q
1,ga ik u,40,41,en zoals gezegd in de inleiding ga ik u vandaa...,Gokx4ZpGRC0
2,ga ik u,99,101,dat ga ik u vertellen,nASlwkh7EGA
3,ga ik u,83,85,zo beste volgers zoals gezegd ga ik u meenemen...,JqSppQf6wMU
4,ga ik u,1011,1013,ik wil u wel een bedrag noemen en dat ga ik u ...,ycB_yozQW6U


In [44]:
df_word_group_time_loc_threegram_result["video_url"] = "https://www.youtube.com/watch?v="+df_word_group_time_loc_threegram_result['video_id'].map(str)+"&t="+df_word_group_time_loc_threegram_result['start_time'].map(str)+"s"
df_word_group_time_loc_threegram_result

Unnamed: 0,search_string,start_time,end_time,sentence,video_id,video_url
0,ga ik u,3,5,neen wel dat ga ik u laten zien in deze vlog,S_rAt9tyk6Q,https://www.youtube.com/watch?v=S_rAt9tyk6Q&t=3s
1,ga ik u,40,41,en zoals gezegd in de inleiding ga ik u vandaa...,Gokx4ZpGRC0,https://www.youtube.com/watch?v=Gokx4ZpGRC0&t=40s
2,ga ik u,99,101,dat ga ik u vertellen,nASlwkh7EGA,https://www.youtube.com/watch?v=nASlwkh7EGA&t=99s
3,ga ik u,83,85,zo beste volgers zoals gezegd ga ik u meenemen...,JqSppQf6wMU,https://www.youtube.com/watch?v=JqSppQf6wMU&t=83s
4,ga ik u,1011,1013,ik wil u wel een bedrag noemen en dat ga ik u ...,ycB_yozQW6U,https://www.youtube.com/watch?v=ycB_yozQW6U&t=...


In [45]:
df_word_group_time_loc_threegram_result.to_excel(f"{lang_folder.capitalize()}_Threegram_With_{word_end}_Word_{sample_num}_Youtube_{time_shift}s_Timeshift_For_Talk_Time_Result.xlsx", index=False) 

In [46]:
df_word_search_result = word_group_youtube(df_youtube_sentence, word_list, "sentence", 5)  # (sample_num+1)
df_word_search_result

Unnamed: 0,search_string,start_time,end_time,sentence,video_id
0,ik,10506.100,10509.230,de voorzitter op verzoek van de heer bisschop ...,JRhFqrIDklA
1,ik,96.030,99.123,volgend product die ik jullie zal voorstellen ...,4yythXPrOLw
2,ik,915.330,923.400,uit haar beste intenties aan haar liefde zei d...,TzzUV7hTVWk
3,ik,2785.700,2792.750,van oké wat is dan dat is dan voldoende om zon...,iLJg6UP0y_8
4,ik,2.328,6.631,en dan natuurlijk ook als ik het verschil tuss...,PzqrzEue4Ec
...,...,...,...,...,...
995,neem,3618.850,3626.830,een punt nul diggle waar het voeren met uw raa...,idTEcq_P1WY
996,neem,63.760,69.780,wilt u meer weten over frothpack neem dan cont...,aEICbq3QzvU
997,neem,75.000,82.000,maar mijn vrouw en ik doen veel om de beurt du...,qqw7KwEImIs
998,neem,692.420,700.850,was je het oudste de jongste neem mijn vader w...,wehip3eUEi4


In [47]:
df_word_group_time_loc_word_result = word_group_time_loc(df_word_search_result, "search_string", "start_time", "end_time", "sentence", "video_id")
df_word_group_time_loc_word_result

Unnamed: 0,search_string,start_time,end_time,sentence,video_id
0,ik,10508.368116,10508.549565,de voorzitter op verzoek van de heer bisschop ...,JRhFqrIDklA
1,ik,96.881696,97.061000,volgend product die ik jullie zal voorstellen ...,4yythXPrOLw
2,ik,920.737732,921.070515,uit haar beste intenties aan haar liefde zei d...,TzzUV7hTVWk
3,ik,2791.742857,2792.030612,van oké wat is dan dat is dan voldoende om zon...,iLJg6UP0y_8
4,ik,3.608655,3.813560,en dan natuurlijk ook als ik het verschil tuss...,PzqrzEue4Ec
...,...,...,...,...,...
995,neem,3626.418660,3626.830000,een punt nul diggle waar het voeren met uw raa...,idTEcq_P1WY
996,neem,66.026353,66.451294,wilt u meer weten over frothpack neem dan cont...,aEICbq3QzvU
997,neem,78.932584,79.404494,maar mijn vrouw en ik doen veel om de beurt du...,qqw7KwEImIs
998,neem,695.013846,695.569670,was je het oudste de jongste neem mijn vader w...,wehip3eUEi4


In [48]:
df_word_group_time_loc_word_result.start_time = df_word_group_time_loc_word_result.start_time.apply(lambda x: (x-time_shift))
df_word_group_time_loc_word_result.end_time = df_word_group_time_loc_word_result.end_time.apply(lambda x: (x+time_shift))
df_word_group_time_loc_word_result

Unnamed: 0,search_string,start_time,end_time,sentence,video_id
0,ik,10507.768116,10509.149565,de voorzitter op verzoek van de heer bisschop ...,JRhFqrIDklA
1,ik,96.281696,97.661000,volgend product die ik jullie zal voorstellen ...,4yythXPrOLw
2,ik,920.137732,921.670515,uit haar beste intenties aan haar liefde zei d...,TzzUV7hTVWk
3,ik,2791.142857,2792.630612,van oké wat is dan dat is dan voldoende om zon...,iLJg6UP0y_8
4,ik,3.008655,4.413560,en dan natuurlijk ook als ik het verschil tuss...,PzqrzEue4Ec
...,...,...,...,...,...
995,neem,3625.818660,3627.430000,een punt nul diggle waar het voeren met uw raa...,idTEcq_P1WY
996,neem,65.426353,67.051294,wilt u meer weten over frothpack neem dan cont...,aEICbq3QzvU
997,neem,78.332584,80.004494,maar mijn vrouw en ik doen veel om de beurt du...,qqw7KwEImIs
998,neem,694.413846,696.169670,was je het oudste de jongste neem mijn vader w...,wehip3eUEi4


In [49]:
df_word_group_time_loc_word_result.start_time = df_word_group_time_loc_word_result.start_time.apply(lambda x: round(x))
df_word_group_time_loc_word_result.end_time = df_word_group_time_loc_word_result.end_time.apply(lambda x: round(x))
df_word_group_time_loc_word_result 

Unnamed: 0,search_string,start_time,end_time,sentence,video_id
0,ik,10508,10509,de voorzitter op verzoek van de heer bisschop ...,JRhFqrIDklA
1,ik,96,98,volgend product die ik jullie zal voorstellen ...,4yythXPrOLw
2,ik,920,922,uit haar beste intenties aan haar liefde zei d...,TzzUV7hTVWk
3,ik,2791,2793,van oké wat is dan dat is dan voldoende om zon...,iLJg6UP0y_8
4,ik,3,4,en dan natuurlijk ook als ik het verschil tuss...,PzqrzEue4Ec
...,...,...,...,...,...
995,neem,3626,3627,een punt nul diggle waar het voeren met uw raa...,idTEcq_P1WY
996,neem,65,67,wilt u meer weten over frothpack neem dan cont...,aEICbq3QzvU
997,neem,78,80,maar mijn vrouw en ik doen veel om de beurt du...,qqw7KwEImIs
998,neem,694,696,was je het oudste de jongste neem mijn vader w...,wehip3eUEi4


In [50]:
df_word_group_time_loc_word_result["video_url"] = "https://www.youtube.com/watch?v="+df_word_group_time_loc_word_result['video_id'].map(str)+"&t="+df_word_group_time_loc_word_result['start_time'].map(str)+"s"
df_word_group_time_loc_word_result

Unnamed: 0,search_string,start_time,end_time,sentence,video_id,video_url
0,ik,10508,10509,de voorzitter op verzoek van de heer bisschop ...,JRhFqrIDklA,https://www.youtube.com/watch?v=JRhFqrIDklA&t=...
1,ik,96,98,volgend product die ik jullie zal voorstellen ...,4yythXPrOLw,https://www.youtube.com/watch?v=4yythXPrOLw&t=96s
2,ik,920,922,uit haar beste intenties aan haar liefde zei d...,TzzUV7hTVWk,https://www.youtube.com/watch?v=TzzUV7hTVWk&t=...
3,ik,2791,2793,van oké wat is dan dat is dan voldoende om zon...,iLJg6UP0y_8,https://www.youtube.com/watch?v=iLJg6UP0y_8&t=...
4,ik,3,4,en dan natuurlijk ook als ik het verschil tuss...,PzqrzEue4Ec,https://www.youtube.com/watch?v=PzqrzEue4Ec&t=3s
...,...,...,...,...,...,...
995,neem,3626,3627,een punt nul diggle waar het voeren met uw raa...,idTEcq_P1WY,https://www.youtube.com/watch?v=idTEcq_P1WY&t=...
996,neem,65,67,wilt u meer weten over frothpack neem dan cont...,aEICbq3QzvU,https://www.youtube.com/watch?v=aEICbq3QzvU&t=65s
997,neem,78,80,maar mijn vrouw en ik doen veel om de beurt du...,qqw7KwEImIs,https://www.youtube.com/watch?v=qqw7KwEImIs&t=78s
998,neem,694,696,was je het oudste de jongste neem mijn vader w...,wehip3eUEi4,https://www.youtube.com/watch?v=wehip3eUEi4&t=...


In [51]:
df_word_group_time_loc_word_result.to_excel(f"{lang_folder.capitalize()}_{word_end}_Word_{sample_num}_Youtube_{time_shift}s_Timeshift_For_Talk_Time_Result.xlsx", index=False)

#### Copy Move And Delete

In [52]:
output_file = glob.glob(f"{lang_folder.capitalize()}_*_{sample_num}_Youtube_{time_shift}s_Timeshift_For_Talk_Time_Result.xlsx")
output_file

['Dutch_Twogram_With_200_Word_5_Youtube_0.6s_Timeshift_For_Talk_Time_Result.xlsx',
 'Dutch_Threegram_With_200_Word_5_Youtube_0.6s_Timeshift_For_Talk_Time_Result.xlsx',
 'Dutch_200_Word_5_Youtube_0.6s_Timeshift_For_Talk_Time_Result.xlsx']

In [53]:
for y in output_file:
    source = y # source directory
    destination = path
    shutil.copy2(source, destination)

In [54]:
for z in output_file:
    try:
        os.remove(z)
    except:
        pass