### Talk Time

In [1]:
import os
import multiprocessing
#import multiprocessing as mp
from multiprocessing import Process, Manager, Pool, Queue
from itertools import islice
from collections import Counter
import re
import pandas as pd
import numpy as np
import glob
import nltk
from nltk import word_tokenize
from nltk import ngrams
from functools import reduce
from pathlib import Path
import shutil

In [2]:
nprocs = multiprocessing.cpu_count()
print(f"Number of CPU cores: {nprocs}")

Number of CPU cores: 16


In [3]:
# --- Notebook configuration -------------------------------------------------
# Target language the learner studies; it is also interpolated into the data
# and result paths used by the cells below.
# Options listed by the author: Arabic, English, French, German, Turkish,
# Spanish, Portuguese, Dutch, Italian
lang_folder = "Dutch"
#lang_pair = "Intersect"  # (disabled) native language of the learner; same option list as above

# Slice bounds applied to the frequency-ranked word list (df_word_all.iloc[word_start:word_end]).
# word_end is also part of the input/output file names.
word_start = 0  # first row taken from the word list
word_end = 200  # row after the last one taken (exclusive)

# Parameters that identify which prepared YouTube link files are read
# (both values are interpolated into the file names).
# NOTE(review): sample_num is reassigned further down in this notebook, so
# re-running the loading cells after that point would build different file names.
sample_num = 1  # previously 6
time_shift = 0.6

In [4]:
def word_in_wordgroup_simple(source_word_list, df_target, target_column, word_sample_num, simple=False):
    '''Collect the first rows of ``df_target`` that contain each search word.

    Example:
        word_in_wordgroup_simple(not_in_sent_word_list, df_youtube_sent_select, "search_string", 5, simple=False)

    Parameters
    ----------
    source_word_list : iterable of str
        Words (or word groups) to look for. Each one is interpolated into a
        regular expression as-is, so regex metacharacters keep their meaning.
    df_target : pandas.DataFrame
        Frame that is searched.
    target_column : str
        Name of the text column of ``df_target`` that is searched.
    word_sample_num : int
        Maximum number of matching rows kept per search word (taken from the top).
    simple : bool, default False
        If True only ``target_column`` (with missing values dropped) is
        returned next to the search word, otherwise every column is kept.

    Returns
    -------
    pandas.DataFrame
        Matching rows with an extra leading ``search_string`` column and a
        fresh 0..n-1 index. Empty frame when nothing was searched.
    '''
    column = f"{target_column}"
    if simple:
        df_select = df_target[[column]].dropna()
    else:
        df_select = df_target

    matches = []
    for word in source_word_list:
        pattern = fr"(?:\s|^){word}(?:\s|$)"
        try:
            re.compile(pattern)
        except re.error:
            # The word does not form a valid pattern; skip it instead of
            # re-using the result of the previous word (or crashing).
            continue
        # na=True is kept from the original: rows with a missing value count as a match.
        mask = df_select[column].str.contains(pattern, na=True)
        # .copy() so that insert() works on an independent frame, not a slice.
        hits = df_select[mask].head(word_sample_num).copy()
        hits.insert(0, "search_string", word)
        matches.append(hits)

    if not matches:
        return pd.DataFrame()
    # Single concat instead of growing the frame inside the loop.
    return pd.concat(matches, axis=0).reset_index(drop=True)

In [5]:
def word_group_youtube(df, search_list, target_column, sample_num):
    '''Pick up to ``sample_num`` rows of ``df`` for every search string.

    Example:
        word_group_youtube(df_youtube_sentence, search_list, "sentence", 6)

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that is searched.
    search_list : iterable of str
        Words / word groups to look for as whole, whitespace-delimited tokens.
        They are interpolated into a regular expression unescaped.
    target_column : str
        Name of the text column that is searched.
    sample_num : int
        Number of rows wanted per search string. A random sample is drawn when
        enough rows match; otherwise the available rows are returned in their
        original order.

    Returns
    -------
    pandas.DataFrame
        Selected rows with an extra leading ``search_string`` column and a
        fresh 0..n-1 index. Empty frame when ``search_list`` is empty.
    '''
    column = f"{target_column}"
    selections = []
    for phrase in search_list:
        # Filter once; na=True is kept from the original (missing text counts as a match).
        mask = df[column].str.contains(fr"(?:\s|^){phrase}(?:\s|$)", na=True)
        df_match = df[mask]
        try:
            df_select = df_match.sample(sample_num)
        except ValueError:
            # sample() refuses to draw more rows than exist; fall back to
            # whatever is available, top rows first.
            df_select = df_match.head(sample_num).copy()
        df_select.insert(0, "search_string", phrase)
        selections.append(df_select)

    if not selections:
        return pd.DataFrame()
    return pd.concat(selections, axis=0).reset_index(drop=True)

In [6]:
def word_group_time_loc(df, search, start_sent, end_sent, sent, sent_video_id):
    '''Estimate when a search word is spoken inside its subtitle sentence.

    Example:
        word_group_time_loc(df_search_result, "search_string", "start_time", "end_time", "sentence", "video_id")

    The sentence duration is spread evenly over its characters and the
    character span of the first (case-insensitive) occurrence of the word is
    mapped onto that time line. The matched span includes the whitespace that
    delimits the word, exactly as the pattern captures it.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per (search word, sentence) pair. The frame is not modified.
    search, start_sent, end_sent, sent, sent_video_id : str
        Column names holding the search word, sentence start time, sentence
        end time, sentence text and video id.

    Returns
    -------
    pandas.DataFrame
        Columns ``[search, start_sent, end_sent, sent, sent_video_id]`` where
        the two time columns hold the estimated word start / end. When the
        word cannot be located in the sentence (or the sentence is empty or
        not text) the row keeps the sentence's own start and end time instead
        of inheriting the values of a previous row.
    '''
    columns = [f"{search}", f"{start_sent}", f"{end_sent}", f"{sent}", f"{sent_video_id}"]
    word_time_loc_list = []
    # Iterate by position so the caller's index does not need to be reset in place.
    for word, start_time, end_time, sentence, video_id in zip(*(df[col].tolist() for col in columns)):
        # Fallback: the whole sentence span.
        start_loc, end_loc = start_time, end_time
        if isinstance(sentence, str) and sentence:
            match = re.search(fr"(?:\s|^){word}(?:\s|$)", sentence, re.IGNORECASE | re.UNICODE)
            if match is not None:
                seconds_per_char = (end_time - start_time) / len(sentence)
                start_loc = start_time + (match.start() * seconds_per_char)
                end_loc = start_time + (match.end() * seconds_per_char)
        word_time_loc_list.append([word, start_loc, end_loc, sentence, video_id])

    return pd.DataFrame(word_time_loc_list, columns=columns)

In [7]:
def word_count_result(df, column_list, set_condition=False): # df is dataframe, column_list is list value
    '''Count how often every word occurs in the given text columns.

    Example:
        word_count_result(df, ["word", "twogram"])

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text columns.
    column_list : list of str
        Names of the columns whose values are tokenised and counted.
    set_condition : bool, default False
        If True each column is reduced to its distinct values before
        counting, so a value repeated in several rows is counted once.

    Returns
    -------
    pandas.DataFrame
        Columns ``word`` and ``word_count``, sorted by descending count with
        a 0..n-1 index. Empty (but with both columns) when there is no text.
    '''
    list_all = []
    for column in column_list:
        var_list = df[f"{column}"].dropna().tolist()
        if set_condition:
            var_list = set(var_list)
        # str() keeps the join working when a cell holds a number.
        list_all.extend(str(value) for value in var_list)

    word_list = re.findall(r"\w+", " ".join(list_all), re.UNICODE)
    # Counter gives the same result on every pandas version; the previous
    # implementation depended on how value_counts() names its result column.
    counted = Counter(word_list).most_common()
    return pd.DataFrame(counted, columns=["word", "word_count"])

In [8]:
path = f"/media/kurubal/SSD/Data Scientist/Work/Modern Ways/Project/{lang_folder.capitalize()}/\
Talk Time/Result/4-Talk Time Data Result"

Path(path).mkdir(parents=True, exist_ok=True)

In [9]:
df_word_all = pd.read_excel(f"/media/kurubal/SSD/Data Scientist/Work/Modern Ways/Project/{lang_folder.lower().capitalize()}/Deployment/Data/Word/Word_Merge_Preprocess.xlsx")
df_word_all

Unnamed: 0,word,frequency
0,ik,22539531
1,je,20769946
2,het,15696161
3,de,15258816
4,dat,13387137
...,...,...
331769,oorlogsgezichten,5
331770,opdrachtenlijst,5
331771,verlsaafde,5
331772,oxidatieproces,5


In [10]:
df_word_select = df_word_all.iloc[word_start:word_end,]

In [11]:
df_word_select

Unnamed: 0,word,frequency
0,ik,22539531
1,je,20769946
2,het,15696161
3,de,15258816
4,dat,13387137
...,...,...
195,genoeg,353128
196,wist,352948
197,oh,351988
198,klaar,350506


In [12]:
#df_word_select.to_excel(f"{lang_folder.capitalize()}_200_Word.xlsx", index=False)

In [13]:
word_list = df_word_select["word"].to_list()
len(word_list)

200

In [14]:
df_youtube_sentence = pd.read_csv(f"/media/kurubal/SSD/Data Scientist/Work/Modern Ways/Project/Youtube/Result/{lang_folder.capitalize()}/Sentence Clean Merge/Clean_Youtube_Sentence_Merge_Result.csv")
df_youtube_sentence

Unnamed: 0,start_time,end_time,sentence,video_id
0,00:00:00.000,00:00:02.501,dit is de pangale v,BoaYsdPtJYA
1,00:00:04.501,00:00:05.434,unreal,BoaYsdPtJYA
2,00:00:08.518,00:00:13.066,dit is de eerste keer ik rijd in een supersport,BoaYsdPtJYA
3,00:00:13.066,00:00:14.316,ik heb dit in mijn hele leven,BoaYsdPtJYA
4,00:00:14.316,00:00:15.889,nog nooit eerder gedaan,BoaYsdPtJYA
...,...,...,...,...
252485,00:01:37.135,00:01:47.040,soms zitten de volwassenen met natte ogen te k...,MZUDbceIw4E
252486,00:01:47.040,00:01:49.340,dit ziet er al was super indrukwekkend uit en,MZUDbceIw4E
252487,00:01:49.340,00:01:53.600,het is nog maar een voorproefje dus kom zeker ...,MZUDbceIw4E
252488,00:01:53.600,00:01:57.914,tijdsblok te kiezen en zeker ook op tijd te ko...,MZUDbceIw4E


In [15]:
df_youtube_sentence['start_time'] = pd.to_timedelta(df_youtube_sentence['start_time']) # data type converted timedelta for second 
df_youtube_sentence['end_time'] = pd.to_timedelta(df_youtube_sentence['end_time'])

In [16]:
# Turn the timedelta columns into plain seconds (float). The vectorised
# .dt accessor avoids calling a Python lambda once per row.
df_youtube_sentence['start_time'] = df_youtube_sentence['start_time'].dt.total_seconds()
df_youtube_sentence['end_time'] = df_youtube_sentence['end_time'].dt.total_seconds()
df_youtube_sentence

Unnamed: 0,start_time,end_time,sentence,video_id
0,0.000,2.501,dit is de pangale v,BoaYsdPtJYA
1,4.501,5.434,unreal,BoaYsdPtJYA
2,8.518,13.066,dit is de eerste keer ik rijd in een supersport,BoaYsdPtJYA
3,13.066,14.316,ik heb dit in mijn hele leven,BoaYsdPtJYA
4,14.316,15.889,nog nooit eerder gedaan,BoaYsdPtJYA
...,...,...,...,...
252485,97.135,107.040,soms zitten de volwassenen met natte ogen te k...,MZUDbceIw4E
252486,107.040,109.340,dit ziet er al was super indrukwekkend uit en,MZUDbceIw4E
252487,109.340,113.600,het is nog maar een voorproefje dus kom zeker ...,MZUDbceIw4E
252488,113.600,117.914,tijdsblok te kiezen en zeker ook op tijd te ko...,MZUDbceIw4E


In [17]:
# other option 
# mUf7VNqChac =>  black screen
# 0_CDMstFg7M => 10sn
# bj1JRuyYeco => 20sn
# cElhIDdGz7M => screensaver
default_video_id = "cElhIDdGz7M"
df_link_default = pd.DataFrame(data=[["repeat",5,7,"repeat_again",f"{default_video_id}",f"https://www.youtube.com/watch?v={default_video_id}&t=0s"]], columns=["search_string","start_time","end_time","sentence","video_id","video_url"])
df_link_default

Unnamed: 0,search_string,start_time,end_time,sentence,video_id,video_url
0,repeat,5,7,repeat_again,cElhIDdGz7M,https://www.youtube.com/watch?v=cElhIDdGz7M&t=0s


In [18]:
df_word_link = pd.read_excel(f"/media/kurubal/SSD/Data Scientist/Work/Modern Ways/Project/{lang_folder.capitalize()}/Talk Time/Data/3-Talk Time Data Prepare/{lang_folder.capitalize()}_{word_end}_Word_{sample_num}_Youtube_{time_shift}s_Timeshift_For_Talk_Time_Result_Manuel.xlsx")
df_word_link

Unnamed: 0,search_string,start_time,end_time,sentence,video_id,video_url
0,ik,310,311,zo cool bang is ga ik naar het toilet even lat...,fK39cuorTzE,https://www.youtube.com/watch?v=fK39cuorTzE&t=...
1,je,219,220,enbekijk of je dit soort bordjes in huis kunt ...,lO7BdU_5RR8,https://www.youtube.com/watch?v=lO7BdU_5RR8&t=...
2,het,22718,22719,het doen zoals ik het al deed,QAjt9liE_a4,https://www.youtube.com/watch?v=QAjt9liE_a4&t=...
3,de,202,203,want de wolf is in principe een luie jager en ...,s_MPfmMt7Rw,https://www.youtube.com/watch?v=s_MPfmMt7Rw&t=...
4,dat,707,709,bijvoorbeeld dat zijn emailmarketing service h...,loL9z89lf28,https://www.youtube.com/watch?v=loL9z89lf28&t=...
...,...,...,...,...,...,...
195,genoeg,382,384,dat weten ze niet weten ze dat over het algeme...,MAIVQd4EF20,https://www.youtube.com/watch?v=MAIVQd4EF20&t=...
196,wist,426,428,en ik wist dat ik niets nieuws wou creêren maa...,MtPxtHKn0CQ,https://www.youtube.com/watch?v=MtPxtHKn0CQ&t=...
197,oh,78,79,oh en we moeten op tijd zijn want als we te la...,dMd0RQ5lGYE,https://www.youtube.com/watch?v=dMd0RQ5lGYE&t=78s
198,klaar,649,650,ben je klaar met het aanmaken van je voorraad,b5b9Xs461jU,https://www.youtube.com/watch?v=b5b9Xs461jU&t=...


In [19]:
df_word_link[df_word_link["search_string"].duplicated()]

Unnamed: 0,search_string,start_time,end_time,sentence,video_id,video_url


In [20]:
df_twogram_link = pd.read_excel(f"/media/kurubal/SSD/Data Scientist/Work/Modern Ways/Project/{lang_folder.capitalize()}/Talk Time/Data/3-Talk Time Data Prepare/{lang_folder.capitalize()}_Twogram_With_{word_end}_Word_{sample_num}_Youtube_{time_shift}s_Timeshift_For_Talk_Time_Result_Manuel.xlsx")
df_twogram_link

Unnamed: 0,search_string,start_time,end_time,sentence,video_id,video_url
0,ik ga,888,889,ik ga alleen nog drie uur kijken en die voor m...,WlJ12LESGP0,https://www.youtube.com/watch?v=WlJ12LESGP0&t=...
1,niets anders,3273,3275,moeten makkelijk te manipuleren het is in weze...,vwUaEiSxeko,https://www.youtube.com/watch?v=vwUaEiSxeko&t=...
2,willen jullie,185,187,de film is nu bereikbaar en succes jullie eers...,bwcfbGYr12k,https://www.youtube.com/watch?v=bwcfbGYr12k&t=...
3,zoals jullie,194,196,leer jezelf gewoon aan om vanuit hier te werke...,IkZi2LmR_4c,https://www.youtube.com/watch?v=IkZi2LmR_4c&t=...
4,jullie hebben,345,347,wauw en zo komt dan hulp uit onverwachte hoek ...,_NQws_a77No,https://www.youtube.com/watch?v=_NQws_a77No&t=...
5,jullie weten,56,58,ja jullie weten allemaal beter hoe dat werkt d...,E7JNbhAGQpE,https://www.youtube.com/watch?v=E7JNbhAGQpE&t=56s
6,hebben hem,426,428,even kiezen om elkaar woont en hebben hem te g...,6QhQ4GVUhmI,https://www.youtube.com/watch?v=6QhQ4GVUhmI&t=...
7,wie iedereen,612,614,willen jullie weten wie iedereen roept ja,UbIX71sXnSY,https://www.youtube.com/watch?v=UbIX71sXnSY&t=...
8,ik u,106,108,burgemeester goed dames heren mag ik u uitnodi...,6b2CjjmooDA,https://www.youtube.com/watch?v=6b2CjjmooDA&t=...
9,dat u,4,6,ik kan mij voorstellen dat u dat zat wordt,ZNmu2g6lyC4,https://www.youtube.com/watch?v=ZNmu2g6lyC4&t=4s


In [21]:
df_threegram_link = pd.read_excel(f"/media/kurubal/SSD/Data Scientist/Work/Modern Ways/Project/{lang_folder.capitalize()}/Talk Time/Data/3-Talk Time Data Prepare/{lang_folder.capitalize()}_Threegram_With_{word_end}_Word_{sample_num}_Youtube_{time_shift}s_Timeshift_For_Talk_Time_Result_Manuel.xlsx")
df_threegram_link

Unnamed: 0,search_string,start_time,end_time,sentence,video_id,video_url


In [22]:
df_sentence_link = pd.read_excel(f"/media/kurubal/SSD/Data Scientist/Work/Modern Ways/Project/{lang_folder.capitalize()}/\
Talk Time/Result/2-Adjust Word Group In Youtube Sentence Word Usage Analysis/{lang_folder.capitalize()}_{word_end}_Word_Group_In_Youtube_Sentence_Sample_Manuel.xlsx")
df_sentence_link

Unnamed: 0,search_string,start_time,end_time,sentence,video_id,video_url
0,ik ga u in,30,31,ik ga u in drie stappen uitleggen hoe u de tec,VLM6zkuzvt0,https://www.youtube.com/watch?v=VLM6zkuzvt0&t=30s
1,ik u om uw,66,68,adviseer ik u om uw telefoon erbij te,F0uYZ4uI47w,https://www.youtube.com/watch?v=F0uYZ4uI47w&t=66s
2,u dat u de,292,294,denkt u dat u de enige bent,flv8b2UzJaY,https://www.youtube.com/watch?v=flv8b2UzJaY&t=...
3,ik ga u de,242,243,ik ga u de volgende keer weer zien,NGL5pzrcRKI,https://www.youtube.com/watch?v=NGL5pzrcRKI&t=...
4,hebben hem toen kunnen,51,53,gekomen en kwam de molen buiten werking mijn o...,0rYa2WvEAEo,https://www.youtube.com/watch?v=0rYa2WvEAEo&t=51s
5,gewoon geen genoeg geld,129,130,maar we krijgen gewoon geen genoeg geld,UvDi1HCeLpc,https://www.youtube.com/watch?v=UvDi1HCeLpc&t=...
6,waar mensen door kunnen,145,148,een abonneer button en drie videos waar mensen...,JSDirXX40cU,https://www.youtube.com/watch?v=JSDirXX40cU&t=...
7,mensen hebben echt geen,298,301,van ons want mensen hebben echt geen idee wat ...,PltmSE2Gjvg,https://www.youtube.com/watch?v=PltmSE2Gjvg&t=...
8,niets anders gedaan moet,460,462,ik heb echter niets anders gedaan moet geluk z...,4YSldeXmuXw,https://www.youtube.com/watch?v=4YSldeXmuXw&t=...
9,hebben moeten doen uit onze,86,89,dat we constant grepen hebben moeten doen uit ...,UvDi1HCeLpc,https://www.youtube.com/watch?v=UvDi1HCeLpc&t=86s


In [23]:
df_sentence_link_word_count = word_count_result(df_sentence_link, ["search_string"], set_condition=False)
df_sentence_link_word_count

Unnamed: 0,word,word_count
0,u,5
1,hebben,4
2,kunnen,3
3,ik,3
4,jullie,2
5,de,2
6,mensen,2
7,ga,2
8,geen,2
9,genoeg,1


In [24]:
set_sent_link_word = set(df_sentence_link_word_count["word"])
set_word_list = set(word_list)

In [25]:
set_word_list.difference(set_sent_link_word)

{'aan',
 'af',
 'al',
 'alleen',
 'allemaal',
 'alles',
 'als',
 'altijd',
 'andere',
 'bedankt',
 'ben',
 'bent',
 'beter',
 'bij',
 'binnen',
 'daar',
 'dacht',
 'dag',
 'dan',
 'dank',
 'denk',
 'deze',
 'die',
 'dit',
 'doe',
 'doet',
 'dood',
 'dus',
 'een',
 'eens',
 'en',
 'er',
 'erg',
 'even',
 'gaan',
 'gaat',
 'geef',
 'gezien',
 'goed',
 'h',
 'haar',
 'had',
 'heb',
 'hebt',
 'heeft',
 'heel',
 'helpen',
 'het',
 'hier',
 'hij',
 'hoe',
 'hou',
 'huis',
 'hun',
 'iemand',
 'iets',
 'is',
 'ja',
 'jaar',
 'je',
 'jij',
 'jou',
 'jouw',
 'kan',
 'keer',
 'kijk',
 'klaar',
 'kom',
 'komen',
 'komt',
 'kon',
 'kunt',
 'laat',
 'laten',
 'leven',
 'm',
 'maar',
 'mag',
 'maken',
 'man',
 'me',
 'mee',
 'meer',
 'met',
 'mij',
 'mijn',
 'misschien',
 'moeder',
 'mr',
 'n',
 'na',
 'naar',
 'nee',
 'neem',
 'net',
 'niemand',
 'niet',
 'niks',
 'nodig',
 'nog',
 'nooit',
 'nou',
 'nu',
 'of',
 'oh',
 'ok',
 'omdat',
 'ons',
 'ook',
 'op',
 'over',
 'praten',
 's',
 'spijt',
 'sta

In [26]:
# used for multi search result
# Builds the playlist frame `df_result`. For every target sentence it emits:
#   1. the sentence clip SENTENCE_INTRO_REPEATS times,
#   2. every word clip of the sentence, each followed by a "repeat" pause clip,
#   3. every matching twogram / threegram clip, each followed by a pause clip,
#   4. the sentence clip once more, followed by a pause clip.
SENTENCE_INTRO_REPEATS = 3
MIN_PAUSE_SECONDS = 4       # pause length used for clips shorter than this
PAUSE_EXTRA_SECONDS = 1.0   # added to the clip length for longer clips


def _clips_with_pause(df_clips, df_pause):
    """Return ``[clip, pause, clip, pause, ...]`` one-row frames for ``df_clips``.

    ``df_pause`` is a one-row frame describing the pause clip; its ``end_time``
    is rewritten for every clip (start_time + pause length) and a snapshot
    copy is stored, so the caller must pass a frame it owns.
    Rows whose times cannot be subtracted are reported and skipped.
    """
    frames = []
    df_clips = df_clips.reset_index(drop=True)
    for pos in range(len(df_clips)):
        try:
            clip_length = df_clips.loc[pos, "end_time"] - df_clips.loc[pos, "start_time"]
            if clip_length < MIN_PAUSE_SECONDS:
                pause_length = MIN_PAUSE_SECONDS
            else:
                pause_length = clip_length + PAUSE_EXTRA_SECONDS
            df_pause.loc[0, "end_time"] = df_pause.loc[0, "start_time"] + pause_length
        except (TypeError, ValueError) as err:
            print(f"Skipped clip {df_clips.loc[pos, 'search_string']!r}: {err}")
            continue
        frames.append(df_clips.iloc[[pos]])
        # Snapshot: df_pause is mutated again on the next iteration.
        frames.append(df_pause.copy())
    return frames


# Work on a private copy so the template frame df_link_default is left untouched.
df_pause_clip = df_link_default.copy()
result_frames = []
for sent in df_sentence_link["search_string"]:

    # words
    sent_words = word_tokenize(sent)

    # twogram / threegram strings contained in the sentence
    twogram_list = [" ".join(x) for x in ngrams(sent.split(), 2)]
    threegram_list = [" ".join(y) for y in ngrams(sent.split(), 3)]

    # begin sent repeat
    df_sent_search = df_sentence_link[df_sentence_link["search_string"] == sent]
    result_frames.extend([df_sent_search] * SENTENCE_INTRO_REPEATS)

    # word result
    for word in sent_words:
        df_word_search = df_word_link[df_word_link["search_string"] == word]
        result_frames.extend(_clips_with_pause(df_word_search, df_pause_clip))

    # twogram result
    df_twogram_search = df_twogram_link[df_twogram_link["search_string"].isin(twogram_list)]
    result_frames.extend(_clips_with_pause(df_twogram_search, df_pause_clip))

    # threegram result
    df_threegram_search = df_threegram_link[df_threegram_link["search_string"].isin(threegram_list)]
    result_frames.extend(_clips_with_pause(df_threegram_search, df_pause_clip))

    # sentence added
    result_frames.extend(_clips_with_pause(df_sent_search, df_pause_clip))

# One concat at the end instead of re-copying the growing frame on every append.
df_result = pd.concat(result_frames, axis=0) if result_frames else pd.DataFrame()
df_result.reset_index(drop=True, inplace=True)

In [27]:
df_result

Unnamed: 0,search_string,start_time,end_time,sentence,video_id,video_url
0,ik ga u in,30,31,ik ga u in drie stappen uitleggen hoe u de tec,VLM6zkuzvt0,https://www.youtube.com/watch?v=VLM6zkuzvt0&t=30s
1,ik ga u in,30,31,ik ga u in drie stappen uitleggen hoe u de tec,VLM6zkuzvt0,https://www.youtube.com/watch?v=VLM6zkuzvt0&t=30s
2,ik ga u in,30,31,ik ga u in drie stappen uitleggen hoe u de tec,VLM6zkuzvt0,https://www.youtube.com/watch?v=VLM6zkuzvt0&t=30s
3,ik,310,311,zo cool bang is ga ik naar het toilet even lat...,fK39cuorTzE,https://www.youtube.com/watch?v=fK39cuorTzE&t=...
4,repeat,5,9,repeat_again,cElhIDdGz7M,https://www.youtube.com/watch?v=cElhIDdGz7M&t=0s
...,...,...,...,...,...,...
183,repeat,5,9,repeat_again,cElhIDdGz7M,https://www.youtube.com/watch?v=cElhIDdGz7M&t=0s
184,wie iedereen,612,614,willen jullie weten wie iedereen roept ja,UbIX71sXnSY,https://www.youtube.com/watch?v=UbIX71sXnSY&t=...
185,repeat,5,9,repeat_again,cElhIDdGz7M,https://www.youtube.com/watch?v=cElhIDdGz7M&t=0s
186,willen jullie weten wie iedereen,610,614,willen jullie weten wie iedereen roept ja,UbIX71sXnSY,https://www.youtube.com/watch?v=UbIX71sXnSY&t=...


In [28]:
df_result[df_result["search_string"] == "repeat"]["end_time"].max()

10

In [29]:
# Number of "repeat" pause clips in the playlist.
# NOTE(review): this reuses the name `sample_num`, overwriting the configuration
# value that the input file names are built from. Re-running the loading cells
# after this one would therefore look for different files. A dedicated name
# (e.g. `repeat_count`) would avoid that, but the next cell reads `sample_num`
# and would have to be changed together with this one.
sample_num = df_result[df_result["search_string"] == "repeat"]["end_time"].count()
sample_num

76

In [30]:
((df_result[df_result["search_string"] == "repeat"]["end_time"].sum()*2)-sample_num)/60

21.566666666666666

In [31]:
df_result.to_excel(f"{lang_folder.capitalize()}_{word_end}_Word_Talk_Time1.xlsx", index=False)

In [32]:
# used for divide part
# Split the playlist into consecutive parts of `part_sample_num` rows and write
# one single-row workbook per part, holding the comma-joined video ids, start
# times and end times. The number of parts follows the size of df_result, so
# no empty workbooks are written and no rows are left out.
part_sample_num = 116
for part_no, part_start in enumerate(range(0, len(df_result), part_sample_num), start=1):
    df_var = df_result.iloc[part_start:part_start + part_sample_num]

    id_join = ",".join(str(value) for value in df_var["video_id"].to_list())
    start_join = ",".join(str(value) for value in df_var["start_time"].to_list())
    end_join = ",".join(str(value) for value in df_var["end_time"].to_list())

    df_result_for_embedded = pd.DataFrame(data=[[id_join,start_join,end_join]], columns=["id","start_time","end_time"])
    df_result_for_embedded.to_excel(f"{lang_folder.capitalize()}_{word_end}_Word_Talk_Time1_Join_Sample{part_no}.xlsx", index=False)

#### Copy Move And Delete

In [None]:
output_file = glob.glob(f"{lang_folder.capitalize()}_*_Word_Talk_Time*.xlsx")
output_file

In [None]:
# Copy every generated workbook (content and file metadata) into the result folder.
for workbook_name in output_file:
    shutil.copy2(workbook_name, path)

In [None]:
# Remove the local workbooks. A file that cannot be deleted is reported
# instead of being ignored silently, and the loop carries on with the rest.
for z in output_file:
    try:
        os.remove(z)
    except OSError as err:
        print(f"Could not delete {z}: {err}")

In [None]:
# used for one search result
# Alternative to the multi search cell: for every word of every target sentence
# append the word's clip rows followed by one "repeat" pause clip.
# NOTE: this cell assigns df_result, replacing whatever an earlier cell stored there.
df_pause_clip = df_link_default.copy()  # private copy; the template frame stays untouched
result_frames = []
for sent in df_sentence_link["search_string"]:
    for word in word_tokenize(sent):
        df_var = df_word_link[df_word_link["search_string"] == word].reset_index(drop=True)
        if df_var.empty:
            # no clip prepared for this word
            continue
        try:
            # The pause length is derived from the first clip of the word.
            var_time_diff = (df_var.loc[0,"end_time"] - df_var.loc[0,"start_time"])
            df_pause_clip.loc[0,"end_time"] = df_pause_clip.loc[0,"start_time"] + var_time_diff+1.0
        except (TypeError, ValueError) as err:
            print(f"Skipped word {word!r}: {err}")
            continue
        result_frames.append(df_var)
        # Snapshot: df_pause_clip is mutated again for the next word.
        result_frames.append(df_pause_clip.copy())
df_result = pd.concat(result_frames, axis=0) if result_frames else pd.DataFrame()
df_result.reset_index(drop=True, inplace=True)
df_result

In [33]:
df_result_select = df_result.head(120)
df_result_select

Unnamed: 0,search_string,start_time,end_time,sentence,video_id,video_url
0,ik ga u in,30,31,ik ga u in drie stappen uitleggen hoe u de tec,VLM6zkuzvt0,https://www.youtube.com/watch?v=VLM6zkuzvt0&t=30s
1,ik ga u in,30,31,ik ga u in drie stappen uitleggen hoe u de tec,VLM6zkuzvt0,https://www.youtube.com/watch?v=VLM6zkuzvt0&t=30s
2,ik ga u in,30,31,ik ga u in drie stappen uitleggen hoe u de tec,VLM6zkuzvt0,https://www.youtube.com/watch?v=VLM6zkuzvt0&t=30s
3,ik,310,311,zo cool bang is ga ik naar het toilet even lat...,fK39cuorTzE,https://www.youtube.com/watch?v=fK39cuorTzE&t=...
4,repeat,5,9,repeat_again,cElhIDdGz7M,https://www.youtube.com/watch?v=cElhIDdGz7M&t=0s
...,...,...,...,...,...,...
115,repeat,5,9,repeat_again,cElhIDdGz7M,https://www.youtube.com/watch?v=cElhIDdGz7M&t=0s
116,niets anders gedaan moet,460,462,ik heb echter niets anders gedaan moet geluk z...,4YSldeXmuXw,https://www.youtube.com/watch?v=4YSldeXmuXw&t=...
117,niets anders gedaan moet,460,462,ik heb echter niets anders gedaan moet geluk z...,4YSldeXmuXw,https://www.youtube.com/watch?v=4YSldeXmuXw&t=...
118,niets anders gedaan moet,460,462,ik heb echter niets anders gedaan moet geluk z...,4YSldeXmuXw,https://www.youtube.com/watch?v=4YSldeXmuXw&t=...


In [34]:
df_result_select.to_excel("Dutch_200_Word_Talk_Time1_Test.xlsx", index=False)

In [35]:
# Collapse the selected playlist rows into one row of comma-joined strings
# (video ids, start times, end times).
# Comprehensions avoid binding the builtin name `id` at notebook scope.
id_list = [str(value) for value in df_result_select["video_id"].to_list()]
start_list = [str(value) for value in df_result_select["start_time"].to_list()]
end_list = [str(value) for value in df_result_select["end_time"].to_list()]

id_join = ",".join(id_list)
start_join = ",".join(start_list)
end_join = ",".join(end_list)

df_result_for_embedded = pd.DataFrame(data=[[id_join,start_join,end_join]], columns=["id","start_time","end_time"])
df_result_for_embedded

Unnamed: 0,id,start_time,end_time
0,"VLM6zkuzvt0,VLM6zkuzvt0,VLM6zkuzvt0,fK39cuorTz...","30,30,30,310,5,3497,5,13876,5,3592,5,888,5,30,...","31,31,31,311,9,3499,9,13877,9,3593,9,889,9,31,..."


In [36]:
df_result_for_embedded.to_excel("Dutch_200_Word_Talk_Time1_Join_Test.xlsx", index=False)

#### Temp

In [None]:
def df_col_value_join_comma(df, df_columns_list):
    """Collapse the given columns of ``df`` into one row of comma-joined strings.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame.
    df_columns_list : list of str
        Columns to collapse; any number of columns is accepted.

    Returns
    -------
    pandas.DataFrame
        A single-row frame with the same column names, where every cell is
        the ``","``-joined string form of that column's values in row order.
    """
    column_value_list = [
        ",".join(str(value) for value in df[f"{column}"].to_list())
        for column in df_columns_list
    ]
    df_result = pd.DataFrame([column_value_list], columns=df_columns_list)

    return df_result

In [None]:
df_test = pd.read_excel("Dutch_200_Word_Talk_Time1_Test.xlsx")
df_test

In [None]:
df_test_join = df_col_value_join_comma(df_test, ["video_id","start_time","end_time"])
df_test_join

In [None]:
df_test_join.to_excel("Dutch_200_Word_Talk_Time1_Join_Test.xlsx", index=False)