### Talk Time

In [123]:
import os
import multiprocessing
#import multiprocessing as mp
from multiprocessing import Process, Manager, Pool, Queue
from itertools import islice
from collections import Counter
import re
import pandas as pd
import numpy as np
import glob
import nltk
from nltk import word_tokenize
from nltk import ngrams
from functools import reduce
from pathlib import Path
import shutil

In [124]:
# Ask multiprocessing how many CPUs the system reports and print the number.
nprocs = multiprocessing.cpu_count()
print(f"Number of CPU cores: {nprocs}")

Number of CPU cores: 16


In [125]:
# Language configuration
lang_folder = "Turkish"  # target language for the learner; one of: Arabic, English, French, German, Turkish, Spanish, Portuguese, Dutch, Italian
#lang_pair = "Intersect"  # native language (disabled); options listed: Arabic, English, French, German, Turkish, Spanish, Portuguese, Dutch, Italian

# Row range taken from the word table below via .iloc[word_start:word_end]
word_start = 0  # first row position (inclusive); original note: 0
word_end = 200  # end row position (exclusive); original note mentions 28 -- presumably an alternative value, TODO confirm

In [126]:
def word_in_wordgroup_simple(source_word_list, df_target, target_column, word_sample_num, simple=False):
    '''Collect up to ``word_sample_num`` matching rows of ``df_target`` for every search word.

    Example: word_in_wordgroup_simple(word_list, df_sentences, "sentence", 5, simple=False)

    A row matches when ``target_column`` contains the word delimited by
    whitespace or by the start/end of the string. The word is interpolated
    into the regular expression as-is (it is not escaped).

    Parameters
    ----------
    source_word_list : iterable of str
        Words (or regex fragments) to search for.
    df_target : pandas.DataFrame
        Frame to search. It must not already contain a "search_string"
        column, because that column is inserted into the result.
    target_column : str
        Name of the text column to search in.
    word_sample_num : int
        Maximum number of rows kept per word (the first ones in frame order).
    simple : bool
        If True only ``target_column`` is kept and its missing values are
        dropped first; otherwise every column of ``df_target`` is returned.

    Returns
    -------
    pandas.DataFrame
        Matching rows with a leading "search_string" column and a fresh
        0..n-1 index. Empty frame when nothing was searched.
    '''
    if simple:
        df_select = df_target[[target_column]].dropna()
    else:
        df_select = df_target

    frames = []
    for word in source_word_list:
        try:
            # NOTE(review): na=True counts rows with a missing value as a match
            # (only reachable when simple=False); kept for backward compatibility.
            mask = df_select[target_column].str.contains(fr"(?:\s|^){word}(?:\s|$)", na=True)
        except re.error:
            # The word is not a valid regex fragment: skip it. The previous bare
            # ``except: pass`` fell through and re-used the frame of the
            # preceding word (or raised NameError on the first word).
            continue
        # Copy so that the insert below never touches a slice of the caller's frame.
        matches = df_select[mask].head(word_sample_num).copy()
        matches.insert(0, "search_string", word)
        frames.append(matches)

    if not frames:
        return pd.DataFrame()
    # Concatenate once instead of growing a frame inside the loop.
    return pd.concat(frames, axis=0).reset_index(drop=True)

In [127]:
def word_group_youtube(df, search_list, target_column, sample_num, random_state=None):
    '''Pick up to ``sample_num`` rows of ``df`` for every search string.

    Example: word_group_youtube(df_youtube_sentence, search_list, "sentence", 6)

    A row matches when ``target_column`` contains the search string delimited
    by whitespace or by the start/end of the text (missing values count as a
    match because of ``na=True``). If at least ``sample_num`` rows match, a
    random sample of that size is taken; otherwise all matches are kept in
    frame order.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to search; must not already contain a "search_string" column.
    search_list : iterable of str
        Search strings, interpolated into the regex unescaped.
    target_column : str
        Name of the text column to search in.
    sample_num : int
        Number of rows wanted per search string.
    random_state : optional
        Forwarded to ``DataFrame.sample`` so the selection can be made
        reproducible. ``None`` (default) keeps the previous unseeded behaviour.

    Returns
    -------
    pandas.DataFrame
        Selected rows with a leading "search_string" column and a fresh index.
    '''
    frames = []
    for phrase in search_list:
        # Build the mask once; it was previously evaluated in both the try and except branch.
        mask = df[target_column].str.contains(fr"(?:\s|^){phrase}(?:\s|$)", na=True)
        matches = df[mask]
        # Explicit size check instead of catching the ValueError raised by sample().
        if 0 <= sample_num <= len(matches):
            picked = matches.sample(sample_num, random_state=random_state)
        else:
            picked = matches.head(sample_num)
        picked = picked.copy()
        picked.insert(0, "search_string", phrase)
        frames.append(picked)

    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, axis=0).reset_index(drop=True)

In [128]:
def word_group_time_loc(df, search, start_sent, end_sent, sent, sent_video_id):
    '''Estimate when each search string is spoken inside its sentence.

    Example: word_group_time_loc(df_search_result, "search_string", "start_time", "end_time", "sentence", "video_id")

    The sentence duration is spread evenly over its characters and the
    character span of the first match (case-insensitive, delimited by
    whitespace or the start/end of the sentence) is mapped onto that time
    line. The matched span includes the delimiting whitespace characters.

    Parameters
    ----------
    df : pandas.DataFrame
        Input rows; it is read only (the index is no longer reset in place).
    search, start_sent, end_sent, sent, sent_video_id : str
        Column names of the search string, sentence start time, sentence end
        time, sentence text and video id.

    Returns
    -------
    pandas.DataFrame
        One row per input row with the same five column names; the two time
        columns hold the estimated start/end of the search string. They are
        NaN when the search string is not found or the sentence is empty or
        not a string (previously the values of the preceding row were
        re-used, or a NameError was raised on the first row).
    '''
    records = []
    row_values = zip(df[search], df[start_sent], df[end_sent], df[sent], df[sent_video_id])
    for word, start_time, end_time, sentence, video_id in row_values:
        start_loc = end_loc = float("nan")
        # The non-empty check also protects the division by len(sentence) below.
        if isinstance(sentence, str) and sentence:
            hit = re.search(fr"(?:\s|^){word}(?:\s|$)", sentence, re.IGNORECASE | re.UNICODE)
            if hit is not None:
                seconds_per_char = (end_time - start_time) / len(sentence)
                start_loc = start_time + (hit.start() * seconds_per_char)
                end_loc = start_time + (hit.end() * seconds_per_char)
        records.append([word, start_loc, end_loc, sentence, video_id])

    return pd.DataFrame(records, columns=[search, start_sent, end_sent, sent, sent_video_id])

In [129]:
def word_count_result(df, column_list, set_condition=False): # df is dataframe, column_list is list value
    '''Count how often every word occurs in the given text columns.

    Example: word_count_result(df, ["word", "twogram"])

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text columns.
    column_list : list of str
        Columns whose non-missing values are tokenized with ``\\w+``.
    set_condition : bool
        If True each distinct cell value of a column is counted only once.

    Returns
    -------
    pandas.DataFrame
        Columns "word" and "word_count", ordered by descending count and, for
        equal counts, by ascending word so the order is deterministic.
    '''
    texts = []
    for column in column_list:
        values = df[column].dropna().tolist()
        if set_condition:
            values = set(values)
        # str() keeps the join below working for non-string cells.
        texts.extend(str(value) for value in values)

    words = re.findall(r"\w+", " ".join(texts), re.UNICODE)
    # Counting in plain Python avoids depending on the column name produced by
    # DataFrame.value_counts(): the former rename of column 0 to "word_count"
    # does nothing when that column is named "count", which made the
    # following sort raise KeyError.
    ranked = sorted(Counter(words).items(), key=lambda item: (-item[1], item[0]))

    return pd.DataFrame(ranked, columns=["word", "word_count"])

In [130]:
# Result directory for this notebook (absolute, machine-specific path built from lang_folder).
# The trailing backslash continues the string literal on the next line without adding a newline.
path = f"/media/kurubal/SSD/Data Scientist/Work/Modern Ways/Project/{lang_folder.capitalize()}/\
Talk Time/Result/1-Talk Time/{lang_folder.capitalize()}"

# Create the directory (and missing parents); do nothing if it already exists.
Path(path).mkdir(parents=True, exist_ok=True)

In [131]:
# Load the preprocessed word table (displayed columns: word, frequency).
df_word_all = pd.read_excel(f"/media/kurubal/SSD/Data Scientist/Work/Modern Ways/Project/{lang_folder.lower().capitalize()}/Deployment/Data/Word/Word_Merge_Preprocess.xlsx")
df_word_all

Unnamed: 0,word,frequency
0,bir,18835735
1,bu,11062659
2,ne,8025880
3,ve,7766036
4,için,5484109
...,...,...
987924,karneleme,5
987925,karnaya,5
987926,dörtlümüzün,5
987927,karnavalınız,5


In [132]:
# Keep the rows at positions word_start .. word_end-1 of the word table.
df_word_select = df_word_all.iloc[word_start:word_end,]

In [133]:
# Display the selected word rows.
df_word_select

Unnamed: 0,word,frequency
0,bir,18835735
1,bu,11062659
2,ne,8025880
3,ve,7766036
4,için,5484109
...,...,...
195,saat,399989
196,onunla,399330
197,yapıyorsun,398274
198,neler,397377


In [134]:
# Save the selected words to the working directory.
# NOTE(review): the "200" in the file name is fixed and does not follow word_start/word_end.
df_word_select.to_excel(f"{lang_folder.capitalize()}_200_Word.xlsx", index=False)

In [135]:
# Plain Python list of the selected words; the cell displays how many there are.
word_list = df_word_select["word"].to_list()
len(word_list)

200

In [136]:
# Load the merged YouTube sentence table (displayed columns: start_time, end_time, sentence, video_id).
df_youtube_sentence = pd.read_csv(f"/media/kurubal/SSD/Data Scientist/Work/Modern Ways/Project/Youtube/Result/{lang_folder.capitalize()}/Sentence Clean Merge/Clean_Youtube_Sentence_Merge_Result.csv")
df_youtube_sentence

Unnamed: 0,start_time,end_time,sentence,video_id
0,00:00:00.294,00:00:03.294,bu dizinin betimlemesi staff fi lm tarafından,fI1BI4d5KFU
1,00:00:03.375,00:00:06.375,sesli betimleme derneğine yaptırılmıştır,fI1BI4d5KFU
2,00:00:06.462,00:00:09.462,wwwsebederorg,fI1BI4d5KFU
3,00:00:10.267,00:00:11.394,mumu,fI1BI4d5KFU
4,00:00:11.835,00:00:14.061,bak şu an sinir katsayım hat safhada,fI1BI4d5KFU
...,...,...,...,...
3934202,00:10:00.240,00:10:07.818,videoyu bitirmeden şunu da söylemek isterim ki...,dRssR_apbR0
3934203,00:10:07.994,00:10:13.534,bu nedenle ulaşımda insanların yarısına yakını...,dRssR_apbR0
3934204,00:10:14.745,00:10:17.924,danimarkayla alakalı şimdilik söyleyeceklerim ...,dRssR_apbR0
3934205,00:10:18.047,00:10:22.398,video hoşunuza gittiyse beğenmeyi ve kanalıma ...,dRssR_apbR0


In [137]:
# Parse the "HH:MM:SS.mmm" time strings into timedeltas so they can be turned into seconds afterwards.
df_youtube_sentence['start_time'] = pd.to_timedelta(df_youtube_sentence['start_time'])
df_youtube_sentence['end_time'] = pd.to_timedelta(df_youtube_sentence['end_time'])

In [138]:
# Convert the timedelta columns to float seconds. The vectorized .dt accessor
# replaces a Python-level lambda that was called once per row.
df_youtube_sentence['start_time'] = df_youtube_sentence['start_time'].dt.total_seconds()
df_youtube_sentence['end_time'] = df_youtube_sentence['end_time'].dt.total_seconds()
df_youtube_sentence

Unnamed: 0,start_time,end_time,sentence,video_id
0,0.294,3.294,bu dizinin betimlemesi staff fi lm tarafından,fI1BI4d5KFU
1,3.375,6.375,sesli betimleme derneğine yaptırılmıştır,fI1BI4d5KFU
2,6.462,9.462,wwwsebederorg,fI1BI4d5KFU
3,10.267,11.394,mumu,fI1BI4d5KFU
4,11.835,14.061,bak şu an sinir katsayım hat safhada,fI1BI4d5KFU
...,...,...,...,...
3934202,600.240,607.818,videoyu bitirmeden şunu da söylemek isterim ki...,dRssR_apbR0
3934203,607.994,613.534,bu nedenle ulaşımda insanların yarısına yakını...,dRssR_apbR0
3934204,614.745,617.924,danimarkayla alakalı şimdilik söyleyeceklerim ...,dRssR_apbR0
3934205,618.047,622.398,video hoşunuza gittiyse beğenmeyi ve kanalıma ...,dRssR_apbR0


In [139]:
# Template row for the "repeat" filler clip; later cells overwrite its end_time for every clip.
# Other video ids noted as options:
# mUf7VNqChac => black screen
# 0_CDMstFg7M => 10 s
# bj1JRuyYeco => 20 s (the one used here)
df_link_default = pd.DataFrame(data=[["repeat",0,2,"bj1JRuyYeco","https://www.youtube.com/watch?v=bj1JRuyYeco&t=0s"]], columns=["search_string","start_time","end_time","video_id","video_url"])
df_link_default

Unnamed: 0,search_string,start_time,end_time,video_id,video_url
0,repeat,0,2,bj1JRuyYeco,https://www.youtube.com/watch?v=bj1JRuyYeco&t=0s


In [140]:
# Load the clip table for single words (displayed columns: search_string, start_time, end_time, video_id, video_url).
# Presumably hand-curated, judging by "Manuel" in the file name -- TODO confirm.
df_word_link = pd.read_excel(f"/media/kurubal/SSD/Data Scientist/Work/Modern Ways/Project/{lang_folder.capitalize()}/Talk Time/Data/1-Talk Time Data Prepare/{lang_folder.capitalize()}/{lang_folder.capitalize()}_200_Word_6_Youtube_0.6s_Timeshift_For_Talk_Time_Result_Manuel.xlsx")
df_word_link

Unnamed: 0,search_string,start_time,end_time,video_id,video_url
0,istiyorum,2732,2734,XWd602XfKh0,https://www.youtube.com/watch?v=XWd602XfKh0&t=...
1,olmaz,3223,3224,UDlsIakb9Kg,https://www.youtube.com/watch?v=UDlsIakb9Kg&t=...
2,gel,428,430,lYSwby37FwQ,https://www.youtube.com/watch?v=lYSwby37FwQ&t=...
3,aslında,6703,6705,Ba_ffkYzPds,https://www.youtube.com/watch?v=Ba_ffkYzPds&t=...
4,tekrar,11677,11678,1LSDvAXn2Ug,https://www.youtube.com/watch?v=1LSDvAXn2Ug&t=...
...,...,...,...,...,...
195,ister,9385,9387,WQzOYp3hhpM,https://www.youtube.com/watch?v=WQzOYp3hhpM&t=...
196,ver,792,793,RKMql8M5O5A,https://www.youtube.com/watch?v=RKMql8M5O5A&t=...
197,biliyorum,799,801,qJd2PBgNCwU,https://www.youtube.com/watch?v=qJd2PBgNCwU&t=...
198,söyle,391,392,g_t4wXG9o_s,https://www.youtube.com/watch?v=g_t4wXG9o_s&t=...


In [141]:
# Sanity check: list rows whose search_string repeats an earlier row (an empty result means every word is unique).
df_word_link[df_word_link["search_string"].duplicated()]

Unnamed: 0,search_string,start_time,end_time,video_id,video_url


In [142]:
# Load the clip table for two-word phrases (same displayed columns as the word clip table).
df_twogram_link = pd.read_excel(f"/media/kurubal/SSD/Data Scientist/Work/Modern Ways/Project/{lang_folder.capitalize()}/Talk Time/Data/1-Talk Time Data Prepare/{lang_folder.capitalize()}/{lang_folder.capitalize()}_Twogram_With_200_Word_6_Youtube_0.6s_Timeshift_For_Talk_Time_Result_Manuel.xlsx")
df_twogram_link

Unnamed: 0,search_string,start_time,end_time,video_id,video_url
0,teşekkür ederim,2127,2129,ry0hEicwuhQ,https://www.youtube.com/watch?v=ry0hEicwuhQ&t=...
1,evet efendim,977,979,SA3R1HniCoM,https://www.youtube.com/watch?v=SA3R1HniCoM&t=...
2,sorun değil,1922,1924,rXAAGktXqQ4,https://www.youtube.com/watch?v=rXAAGktXqQ4&t=...
3,çok güzel,324,326,uRvMnq_rk8k,https://www.youtube.com/watch?v=uRvMnq_rk8k&t=...
4,değil mi,674,676,sDR1xwXjMuE,https://www.youtube.com/watch?v=sDR1xwXjMuE&t=...
...,...,...,...,...,...
157,daha doğru,8090,8092,eFKv8712V4w,https://www.youtube.com/watch?v=eFKv8712V4w&t=...
158,senin gerçekten,4999,5001,g81mNCrBnO4,https://www.youtube.com/watch?v=g81mNCrBnO4&t=...
159,ama sen,1261,1263,hNpaCPqSxyg,https://www.youtube.com/watch?v=hNpaCPqSxyg&t=...
160,tamam haydi,2116,2117,6y3Z_Xx_Log,https://www.youtube.com/watch?v=6y3Z_Xx_Log&t=...


In [143]:
# Load the clip table for three-word phrases (same displayed columns as the word clip table).
df_threegram_link = pd.read_excel(f"/media/kurubal/SSD/Data Scientist/Work/Modern Ways/Project/{lang_folder.capitalize()}/Talk Time/Data/1-Talk Time Data Prepare/{lang_folder.capitalize()}/{lang_folder.capitalize()}_Threegram_With_200_Word_6_Youtube_0.6s_Timeshift_For_Talk_Time_Result_Manuel.xlsx")
df_threegram_link

Unnamed: 0,search_string,start_time,end_time,video_id,video_url
0,bir şey yok,1269,1272,rXq5yNXIAac,https://www.youtube.com/watch?v=rXq5yNXIAac&t=...
1,ben teşekkür ederim,3054,3056,7G29Odeyqpk,https://www.youtube.com/watch?v=7G29Odeyqpk&t=...
2,orada neler oluyor,6524,6526,pvsImT6E4yM,https://www.youtube.com/watch?v=pvsImT6E4yM&t=...
3,gerçekten çok güzel,377,380,Xz9hCrwDk0Y,https://www.youtube.com/watch?v=Xz9hCrwDk0Y&t=...
4,öyle bir şey,2641,2643,_IimozO_x50,https://www.youtube.com/watch?v=_IimozO_x50&t=...
5,ne oluyor ya,2445,2447,IxuS6zAE2Kc,https://www.youtube.com/watch?v=IxuS6zAE2Kc&t=...
6,tamam devam et,236,238,PwGdpbguE4w,https://www.youtube.com/watch?v=PwGdpbguE4w&t=...
7,ve bu da,143,145,En1VFf1cD50,https://www.youtube.com/watch?v=En1VFf1cD50&t=...
8,ne kadar büyük,10153,10155,d61_l7BtleU,https://www.youtube.com/watch?v=d61_l7BtleU&t=...
9,güzel çok güzel,42,45,KZFeDu9e5xI,https://www.youtube.com/watch?v=KZFeDu9e5xI&t=42s


In [144]:
# Load the clip table for whole sentences; its search_string column holds the sentence text.
df_sentence_link = pd.read_excel(f"/media/kurubal/SSD/Data Scientist/Work/Modern Ways/Project/{lang_folder.capitalize()}/Talk Time/Data/1-Talk Time Data Prepare/{lang_folder.capitalize()}/{lang_folder.capitalize()}_200_Word_Group_In_Youtube_Sentence_Sample_Selected_Manuel.xlsx")
df_sentence_link

Unnamed: 0,search_string,start_time,end_time,video_id,video_url
0,kendi şimdi sen mi ne oluyor biliyor musun bu ...,446,451,Yuu5VhJBIUg,https://www.youtube.com/watch?v=Yuu5VhJBIUg&t=...
1,gerçekten uzun ve bu da benim için çok önemli ...,623,627,r_9ngjGh2Ks,https://www.youtube.com/watch?v=r_9ngjGh2Ks&t=...
2,her zaman öyle değil herkes için de her zaman ...,559,562,cPoQ27hCNOU,https://www.youtube.com/watch?v=cPoQ27hCNOU&t=...
3,bile yok bu da benim için aslında önemli bir ş...,131,134,aqQ7IUho9pA,https://www.youtube.com/watch?v=aqQ7IUho9pA&t=...
4,için artık değil misin gerek yok bunun için si...,65,70,gX77n13Qj0o,https://www.youtube.com/watch?v=gX77n13Qj0o&t=65s
...,...,...,...,...,...
63,bir şey yok ki ya adam,2554,2556,MhHCQ_cb3T8,https://www.youtube.com/watch?v=MhHCQ_cb3T8&t=...
64,pekala pekala tamam,1409,1411,GdJVLA16hUc,https://www.youtube.com/watch?v=GdJVLA16hUc&t=...
65,yine aynı şeyler,1705,1708,L9bkiAADDIY,https://www.youtube.com/watch?v=L9bkiAADDIY&t=...
66,güzel kız,357,359,Dh0fabNXYCw,https://www.youtube.com/watch?v=Dh0fabNXYCw&t=...


In [145]:
# Count how often each word occurs across all sentence texts (duplicate sentences are counted every time).
df_sentence_link_word_count = word_count_result(df_sentence_link, ["search_string"], set_condition=False)
df_sentence_link_word_count

Unnamed: 0,word,word_count
0,bir,15
1,çok,11
2,için,7
3,ne,7
4,tamam,7
...,...,...
190,sizi,1
191,sorun,1
192,söyle,1
193,tabii,1


In [146]:
# Build sets of the words used in the sentences and of the selected words, for the coverage check below.
set_sent_link_word = set(df_sentence_link_word_count["word"])
set_word_list = set(word_list)

In [147]:
# Selected words that do not occur in any sentence text.
set_word_list.difference(set_sent_link_word)

{'benimle', 'bize', 'değilim', 'lazım', 'olacak'}

In [148]:
# used for multi search result
def _clips_with_repeat(df_matches, df_repeat_template):
    """Return one-row frames [clip, repeat, clip, repeat, ...] for ``df_matches``.

    Every matched row is followed by a copy of ``df_repeat_template`` whose
    end_time is set to start_time + (clip end_time - clip start_time) + 1.0.
    The template row is expected at index label 0.
    """
    frames = []
    for pos in range(len(df_matches)):
        clip = df_matches.iloc[[pos]]
        clip_length = clip["end_time"].iloc[0] - clip["start_time"].iloc[0]
        # Copy the template: it used to be aliased (plain assignment), so every
        # iteration silently overwrote end_time of the shared df_link_default.
        repeat = df_repeat_template.copy()
        repeat.loc[0, "end_time"] = repeat.loc[0, "start_time"] + clip_length + 1.0
        frames.extend([clip, repeat])
    return frames


# Frames are collected in a list and concatenated once at the end instead of
# growing df_result with pd.concat inside the loops.
result_frames = []
for sent in df_sentence_link["search_string"]:

    # words
    sent_words = word_tokenize(sent)

    # twogram / threegram strings of the whitespace-split sentence
    sent_tokens = sent.split()
    twogram_list = [" ".join(x) for x in ngrams(sent_tokens, 2)]
    threegram_list = [" ".join(y) for y in ngrams(sent_tokens, 3)]

    # word result: clips of every token, in sentence order
    for word in sent_words:
        df_word_search_var = df_word_link[df_word_link["search_string"] == word]
        result_frames += _clips_with_repeat(df_word_search_var, df_link_default)

    # twogram result: rows follow the order of df_twogram_link, not of the sentence
    df_twogram_search_var = df_twogram_link[df_twogram_link["search_string"].isin(twogram_list)]
    result_frames += _clips_with_repeat(df_twogram_search_var, df_link_default)

    # threegram result: rows follow the order of df_threegram_link
    df_threegram_search_var = df_threegram_link[df_threegram_link["search_string"].isin(threegram_list)]
    result_frames += _clips_with_repeat(df_threegram_search_var, df_link_default)

    # sentence added
    df_sent_search_var = df_sentence_link[df_sentence_link["search_string"] == sent]
    result_frames += _clips_with_repeat(df_sent_search_var, df_link_default)

# The bare ``except: pass`` blocks were removed: unexpected errors (for example
# non-numeric times) now surface instead of silently dropping clips.
if result_frames:
    df_result = pd.concat(result_frames, axis=0).reset_index(drop=True)
else:
    df_result = pd.DataFrame()

In [149]:
# Display the playlist: every clip row is followed by its "repeat" filler row.
df_result

Unnamed: 0,search_string,start_time,end_time,video_id,video_url
0,kendi,2551,2553,J3lK1h2j7Lo,https://www.youtube.com/watch?v=J3lK1h2j7Lo&t=...
1,repeat,0,3,bj1JRuyYeco,https://www.youtube.com/watch?v=bj1JRuyYeco&t=0s
2,şimdi,1773,1775,Isab2NO9jlk,https://www.youtube.com/watch?v=Isab2NO9jlk&t=...
3,repeat,0,3,bj1JRuyYeco,https://www.youtube.com/watch?v=bj1JRuyYeco&t=0s
4,sen,7791,7792,JvsqNlgYDFc,https://www.youtube.com/watch?v=JvsqNlgYDFc&t=...
...,...,...,...,...,...
1387,repeat,0,2,bj1JRuyYeco,https://www.youtube.com/watch?v=bj1JRuyYeco&t=0s
1388,aynı şeyler,918,920,rU3lNhDPo2U,https://www.youtube.com/watch?v=rU3lNhDPo2U&t=...
1389,repeat,0,3,bj1JRuyYeco,https://www.youtube.com/watch?v=bj1JRuyYeco&t=0s
1390,aynı şeyler,918,920,rU3lNhDPo2U,https://www.youtube.com/watch?v=rU3lNhDPo2U&t=...


In [150]:
# Longest repeat pause: the largest end_time among the "repeat" rows.
df_result[df_result["search_string"] == "repeat"]["end_time"].max()

13

In [151]:
# Number of "repeat" rows with a non-missing end_time (one repeat row is added per clip).
sample_num = df_result[df_result["search_string"] == "repeat"]["end_time"].count()
sample_num

696

In [152]:
# Estimated total playback time in minutes. A repeat row starts at 0 and ends at (clip length + 1),
# so clips + repeats together last 2 * sum(repeat end_time) - (number of repeats) seconds.
((df_result[df_result["search_string"] == "repeat"]["end_time"].sum()*2)-sample_num)/60

53.93333333333333

In [153]:
# Save the full playlist. The file name is built from lang_folder (it was hard-coded
# to "Turkish") so that the lang_folder-based glob in the copy/move step finds it
# for every language.
df_result.to_excel(f"{lang_folder.capitalize()}_200_Word_Talk_Time1.xlsx", index=False)

In [154]:
# Split the playlist into parts and write each part as one row of comma-joined
# video ids / start times / end times.
part_sample_num = 116  # rows per part; even, so a (clip, repeat) pair is never split across parts
# Derive the number of parts from the data (ceil division) instead of a fixed 12.
part_count = -(-len(df_result) // part_sample_num)
for i in range(part_count):
    sample_num_start = i * part_sample_num
    sample_num_end = sample_num_start + part_sample_num
    df_var = df_result.iloc[sample_num_start:sample_num_end,]

    id_join = ",".join(map(str, df_var["video_id"].to_list()))
    start_join = ",".join(map(str, df_var["start_time"].to_list()))
    end_join = ",".join(map(str, df_var["end_time"].to_list()))

    df_result_for_embedded = pd.DataFrame(data=[[id_join,start_join,end_join]], columns=["id","start_time","end_time"])
    # File name follows lang_folder (was hard-coded "Turkish") to stay in sync with the glob used for copying.
    df_result_for_embedded.to_excel(f"{lang_folder.capitalize()}_200_Word_Talk_Time1_Join_Sample{i+1}.xlsx", index=False)

#### Copy Move And Delete

In [155]:
# Collect the exported workbooks from the current working directory (glob returns them in no guaranteed order).
output_file = glob.glob(f"{lang_folder.capitalize()}_*_Word_Talk_Time*.xlsx")
output_file

['Turkish_200_Word_Talk_Time1.xlsx',
 'Turkish_200_Word_Talk_Time1_Join_Sample1.xlsx',
 'Turkish_200_Word_Talk_Time1_Join_Sample2.xlsx',
 'Turkish_200_Word_Talk_Time1_Join_Sample3.xlsx',
 'Turkish_200_Word_Talk_Time1_Join_Sample4.xlsx',
 'Turkish_200_Word_Talk_Time1_Join_Sample5.xlsx',
 'Turkish_200_Word_Talk_Time1_Join_Sample6.xlsx',
 'Turkish_200_Word_Talk_Time1_Join_Sample7.xlsx',
 'Turkish_200_Word_Talk_Time1_Join_Sample8.xlsx',
 'Turkish_200_Word_Talk_Time1_Join_Sample9.xlsx',
 'Turkish_200_Word_Talk_Time1_Join_Sample10.xlsx',
 'Turkish_200_Word_Talk_Time1_Join_Sample11.xlsx',
 'Turkish_200_Word_Talk_Time1_Join_Sample12.xlsx']

In [None]:
# Copy every exported workbook into the result directory; copy2 also tries to preserve file metadata.
for exported_file in output_file:
    shutil.copy2(exported_file, path)

In [None]:
# Remove the local copies after they were copied to the result directory.
for stale_file in output_file:
    try:
        os.remove(stale_file)
    except FileNotFoundError:
        # Already gone: nothing to clean up.
        pass
    except OSError as err:
        # Still best effort, but report the failure instead of hiding it.
        print(f"Could not remove {stale_file}: {err}")

In [None]:
# used for one search result
# For every token of every sentence: append the word's clip rows followed by one
# "repeat" row lasting (first clip length + 1.0).
result_frames = []
for sent in df_sentence_link["search_string"]:
    for word in word_tokenize(sent):
        df_var = df_word_link[df_word_link["search_string"] == word].reset_index(drop=True)
        if df_var.empty:
            # No clip for this word. This case used to be detected through a bare
            # ``except: pass`` around the failing .loc[0, ...] lookup.
            continue
        var_time_diff = df_var.loc[0, "end_time"] - df_var.loc[0, "start_time"]
        # Copy the template: it used to be aliased, so the shared df_link_default was overwritten.
        df_link_default_var = df_link_default.copy()
        df_link_default_var.loc[0, "end_time"] = df_link_default_var.loc[0, "start_time"] + var_time_diff + 1.0
        result_frames.extend([df_var, df_link_default_var])

# Concatenate once instead of growing df_result inside the loop.
if result_frames:
    df_result = pd.concat(result_frames, axis=0).reset_index(drop=True)
else:
    df_result = pd.DataFrame()
df_result

In [None]:
# Keep the first 84 rows for a test export.
# NOTE(review): 84 is a fixed number with no stated rationale -- confirm the intended size.
df_result_select = df_result.head(84)
df_result_select

In [None]:
# Turn the selected rows into three comma-separated strings (video ids, start
# times, end times) and wrap them in a one-row frame.
id_list = [str(video) for video in df_result_select["video_id"].to_list()]
start_list = [str(begin) for begin in df_result_select["start_time"].to_list()]
end_list = [str(finish) for finish in df_result_select["end_time"].to_list()]

id_join, start_join, end_join = (",".join(parts) for parts in (id_list, start_list, end_list))

df_result_for_embedded = pd.DataFrame(
    data=[[id_join, start_join, end_join]],
    columns=["id", "start_time", "end_time"],
)
df_result_for_embedded

In [None]:
# Save the joined test playlist. The file name is built from lang_folder (it was
# hard-coded to "Turkish"), matching how the other lang_folder-based names are built.
df_result_for_embedded.to_excel(f"{lang_folder.capitalize()}_200_Word_Talk_Time1_Join_Test.xlsx", index=False)