### Talk Time

In [1]:
import os
import multiprocessing
#import multiprocessing as mp
from multiprocessing import Process, Manager, Pool, Queue
from itertools import islice
from collections import Counter
import re
import pandas as pd
import numpy as np
import glob
import nltk
from nltk import word_tokenize
from nltk import ngrams
from functools import reduce
from pathlib import Path
import shutil

In [2]:
nprocs = multiprocessing.cpu_count()
print(f"Number of CPU cores: {nprocs}")

Number of CPU cores: 16


In [19]:
# language pair
lang_folder = "French"  # Arabic, English, French, German, Turkish, Spanish, Portuguese, Dutch, Italian ==> target language for learner
#lang_pair = "Intersect"  # Arabic, English, French, German, Turkish, Spanish, Portuguese, Dutch, Italian ==> native language

# adding native word to shared word
word_start = 0  # 0 native word start index
word_end = 200  # 28 native word end index

# youtube read data
sample_num = 1  # 6
time_shift = 0.6

In [4]:
def word_in_wordgroup_simple(source_word_list, df_target, target_column, word_sample_num, simple=False):

    '''word_in_wordgroup(not_in_sent_word_list, df_youtube_sent_select, "search_string", 5, simple=False)\n
       source_word_list is searching word list\n
       df_target is dataframe, target_column are dataframe column string name\n
       word_sample_num is searching sample number.
       simple use for all column row result or only target column result 
    '''
    if simple:
        df_select = df_target[[f"{target_column}"]].dropna()
    else:
        df_select = df_target
        
    df_result = pd.DataFrame()
    for i in source_word_list:
        try:
            word_in_word_cluster = df_select[df_select[f"{target_column}"].str.contains(fr"(?:\s|^){i}(?:\s|$)", na=True)].head(word_sample_num)    
        except:
            pass        
        word_in_word_cluster.insert(0,"search_string",i)
        df_result = pd.concat([df_result,word_in_word_cluster], axis=0)
    df_result.reset_index(drop=True, inplace=True)

    return df_result

In [5]:
def word_group_youtube(df, search_list, target_column, sample_num):
    '''
    word_group_youtube(df_youtube_sentence, search_list, "sentence", 6)
    ''' 
    df_search_result = pd.DataFrame()
    for j in search_list:
        try:
            df_select = df[df[f"{target_column}"].str.contains(fr"(?:\s|^){j}(?:\s|$)", na=True)].sample(sample_num)
        except:
            df_select = df[df[f"{target_column}"].str.contains(fr"(?:\s|^){j}(?:\s|$)", na=True)].head(sample_num)
        #df_result = df[df[f"{target_column}"].str.contains(fr"(?:\s|^){j}(?:\s|$)", na=True)]  # sentence length part
        #df_result.sort_values(f"{target_column}",key=lambda x:x.str.len(), inplace=True)
        #df_select = df_result.head(sample_num)
        df_select.insert(0,"search_string",j)
        df_search_result = pd.concat([df_search_result,df_select], axis=0)
    df_search_result.reset_index(inplace=True, drop=True)
    
    return df_search_result

In [6]:
def word_group_time_loc(df, search, start_sent, end_sent, sent, sent_video_id):
    '''
    word_group_time_loc(df_search_result, "search_string", "start_time", "end_time", "sentence", "video_id")
    '''
    df.reset_index(drop=True, inplace=True)
    word_time_loc_list = []
    for i in range(len(df)):
        word = df.loc[i,f"{search}"]
        start_time = df.loc[i,f"{start_sent}"]
        end_time = df.loc[i,f"{end_sent}"]
        sentence = df.loc[i,f"{sent}"]
        video_id = df.loc[i,f"{sent_video_id}"]
        time_length = end_time-start_time
        sentence_length = len(sentence)
        time_length_ratio = time_length/sentence_length
        loc_list = []
        for j in re.finditer(fr"(?:\s|^){word}(?:\s|$)", sentence, re.IGNORECASE|re.UNICODE):
            loc_list.append(j)
            start = loc_list[0].start()
            end = loc_list[0].end()
            start_loc = start_time+(start*time_length_ratio)
            end_loc = start_time+(end*time_length_ratio)
        word_time_loc_list.append([word,start_loc,end_loc,sentence,video_id])
    df_word_time_loc = pd.DataFrame(word_time_loc_list, columns=[f"{search}",f"{start_sent}",f"{end_sent}",f"{sent}",f"{sent_video_id}"])

    return df_word_time_loc

In [7]:
def word_count_result(df, column_list, set_condition=False): # df is dataframe, column_list is list value
    '''
    word_count_bool(df, column_list): df columns word count for word frequency\n
    df is dataframe, column_list is list value\n
    word_count_bool(df, ["word","twogram"]):
    '''
    list_all = []
    for i in df.loc[:,[x for x in column_list]].columns:
        if set_condition:
            var_list = set(df[f"{i}"].dropna().tolist())
        else:
            var_list = df[f"{i}"].dropna().tolist()
        for j in var_list:
            list_all.append(j)
    text = " ".join(list_all)
    word_list = re.findall(r"\w+",text, re.UNICODE)
    df_word_list = pd.DataFrame(word_list, columns=["word"])
    #df_word_list.rename(columns={0:"word"}, inplace=True)
    df_word_count = pd.DataFrame(df_word_list.value_counts())
    df_word_count.reset_index(inplace=True)
    df_word_count.rename(columns={0:"word_count"}, inplace=True)
    df_word_count.sort_values("word_count", ascending=False, inplace=True)
    df_word_count.reset_index(inplace=True, drop=True)
    
    return  df_word_count

In [8]:
path = f"/media/kurubal/SSD/Data Scientist/Work/Modern Ways/Project/{lang_folder.capitalize()}/\
Talk Time/Result/1-Talk Time/{lang_folder.capitalize()}"

Path(path).mkdir(parents=True, exist_ok=True)

In [9]:
df_word_all = pd.read_excel(f"/media/kurubal/SSD/Data Scientist/Work/Modern Ways/Project/{lang_folder.lower().capitalize()}/Deployment/Data/Word/Word_Merge_Preprocess.xlsx")
df_word_all

Unnamed: 0,word,frequency
0,de,16588988
1,je,16386475
2,pas,11547876
3,le,10592792
4,la,9939090
...,...,...
317922,puisetil,5
317923,shouzheng,5
317924,steinam,5
317925,cochezle,5


In [10]:
df_word_select = df_word_all.iloc[word_start:word_end,]

In [11]:
df_word_select

Unnamed: 0,word,frequency
0,de,16588988
1,je,16386475
2,pas,11547876
3,le,10592792
4,la,9939090
...,...,...
195,nos,332905
196,cela,330730
197,nom,329252
198,prendre,327639


In [134]:
#df_word_select.to_excel(f"{lang_folder.capitalize()}_200_Word.xlsx", index=False)

In [12]:
word_list = df_word_select["word"].to_list()
len(word_list)

200

In [13]:
df_youtube_sentence = pd.read_csv(f"/media/kurubal/SSD/Data Scientist/Work/Modern Ways/Project/Youtube/Result/{lang_folder.capitalize()}/Sentence Clean Merge/Clean_Youtube_Sentence_Merge_Result.csv")
df_youtube_sentence

Unnamed: 0,start_time,end_time,sentence,video_id
0,00:00:00.000,00:00:00.200,perrosguirec bretagne,BU7DYGMSUA4
1,00:00:30.972,00:00:33.609,located in the regional natural park of armorica,4nmWfnmVgo4
2,00:00:34.263,00:00:36.866,the huelgoat forest,4nmWfnmVgo4
3,00:00:37.315,00:00:39.315,has always inspired popular stories,4nmWfnmVgo4
4,00:00:40.132,00:00:43.640,places with mysterious names sometimes refer t...,4nmWfnmVgo4
...,...,...,...,...
477511,01:17:00.530,01:17:07.970,être remettre le lien si vous avez des questio...,gloQzChn1lk
477512,01:17:07.970,01:17:13.640,la crise sanitaire ou quils concernent autre c...,gloQzChn1lk
477513,01:17:13.640,01:17:20.960,contacter et en tout cas merci beaucoup et bon...,gloQzChn1lk
477514,01:17:20.960,01:17:27.140,si catherine merci à tout le monde à revoir me...,gloQzChn1lk


In [14]:
df_youtube_sentence['start_time'] = pd.to_timedelta(df_youtube_sentence['start_time']) # data type converted timedelta for second 
df_youtube_sentence['end_time'] = pd.to_timedelta(df_youtube_sentence['end_time'])

In [15]:
df_youtube_sentence['start_time'] = df_youtube_sentence['start_time'].apply(lambda x: x.total_seconds()) # convert seconds
df_youtube_sentence['end_time'] = df_youtube_sentence['end_time'].apply(lambda x: x.total_seconds())
df_youtube_sentence

Unnamed: 0,start_time,end_time,sentence,video_id
0,0.000,0.200,perrosguirec bretagne,BU7DYGMSUA4
1,30.972,33.609,located in the regional natural park of armorica,4nmWfnmVgo4
2,34.263,36.866,the huelgoat forest,4nmWfnmVgo4
3,37.315,39.315,has always inspired popular stories,4nmWfnmVgo4
4,40.132,43.640,places with mysterious names sometimes refer t...,4nmWfnmVgo4
...,...,...,...,...
477511,4620.530,4627.970,être remettre le lien si vous avez des questio...,gloQzChn1lk
477512,4627.970,4633.640,la crise sanitaire ou quils concernent autre c...,gloQzChn1lk
477513,4633.640,4640.960,contacter et en tout cas merci beaucoup et bon...,gloQzChn1lk
477514,4640.960,4647.140,si catherine merci à tout le monde à revoir me...,gloQzChn1lk


In [17]:
# other option 
# mUf7VNqChac =>  black screen
# 0_CDMstFg7M => 10sn
# bj1JRuyYeco => 20sn
# cElhIDdGz7M => screensaver
default_video_id = "cElhIDdGz7M"
df_link_default = pd.DataFrame(data=[["repeat",0,2,f"{default_video_id}",f"https://www.youtube.com/watch?v={default_video_id}&t=0s"]], columns=["search_string","start_time","end_time","video_id","video_url"])
df_link_default

Unnamed: 0,search_string,start_time,end_time,video_id,video_url
0,repeat,0,2,cElhIDdGz7M,https://www.youtube.com/watch?v=cElhIDdGz7M&t=0s


In [20]:
df_word_link = pd.read_excel(f"/media/kurubal/SSD/Data Scientist/Work/Modern Ways/Project/{lang_folder.capitalize()}/Talk Time/Data/1-Talk Time Data Prepare/{lang_folder.capitalize()}/{lang_folder.capitalize()}_200_Word_{sample_num}_Youtube_{time_shift}s_Timeshift_For_Talk_Time_Result_Manuel.xlsx")
df_word_link

Unnamed: 0,search_string,start_time,end_time,sentence,video_id,video_url
0,de,583,584,vient le moment où nous comprenons que ce que ...,AkoZt6NCQpA,https://www.youtube.com/watch?v=AkoZt6NCQpA&t=...
1,je,158,159,sont à biarritz ce weekend cest cool mais plut...,S8IncqDM3ic,https://www.youtube.com/watch?v=S8IncqDM3ic&t=...
2,pas,209,210,médecine parce que pour lui la médecine cétait...,LdZZKGXf5G4,https://www.youtube.com/watch?v=LdZZKGXf5G4&t=...
3,le,1538,1539,le règne de dieu est tout proche,N0x70T8aNVU,https://www.youtube.com/watch?v=N0x70T8aNVU&t=...
4,la,146,147,de consommer par jour posezvous la question et...,qvOl8x9jxjw,https://www.youtube.com/watch?v=qvOl8x9jxjw&t=...
...,...,...,...,...,...,...
195,nos,585,587,on va en reparler tout à lheure vous allez voi...,Tndgvn3FoZU,https://www.youtube.com/watch?v=Tndgvn3FoZU&t=...
196,cela,698,700,en chantant cela on imagine déjà,p6FqYFOoFvQ,https://www.youtube.com/watch?v=p6FqYFOoFvQ&t=...
197,nom,869,870,jécris ton nom sur la mousse des nuages sur le...,PXUG5yjcUh4,https://www.youtube.com/watch?v=PXUG5yjcUh4&t=...
198,prendre,137,138,au bout dun moment le serveur vint prendre sa ...,OiyQXzGpXZw,https://www.youtube.com/watch?v=OiyQXzGpXZw&t=...


In [21]:
df_word_link[df_word_link["search_string"].duplicated()]

Unnamed: 0,search_string,start_time,end_time,sentence,video_id,video_url


In [22]:
df_twogram_link = pd.read_excel(f"/media/kurubal/SSD/Data Scientist/Work/Modern Ways/Project/{lang_folder.capitalize()}/Talk Time/Data/1-Talk Time Data Prepare/{lang_folder.capitalize()}/{lang_folder.capitalize()}_Twogram_With_200_Word_{sample_num}_Youtube_{time_shift}s_Timeshift_For_Talk_Time_Result_Manuel.xlsx")
df_twogram_link

Unnamed: 0,search_string,start_time,end_time,sentence,video_id,video_url
0,comment ça,733,735,noël ma absolument oui bernois comment ça va a...,KTujQiqqtcs,https://www.youtube.com/watch?v=KTujQiqqtcs&t=...
1,et vous,1372,1374,chrétiens et vous savez si vous êtes un vrai,gr_2UsdqPWE,https://www.youtube.com/watch?v=gr_2UsdqPWE&t=...
2,mais non,196,197,jespère que je ne lui ai pas fait trop mal mai...,PfVqyogksgs,https://www.youtube.com/watch?v=PfVqyogksgs&t=...
3,de quoi,199,201,de quoi ont besoin les plantes pour être heure...,DhEqSOjwYq0,https://www.youtube.com/watch?v=DhEqSOjwYq0&t=...
4,comme ça,3989,3991,très bien comme ça parce que,H4NOeTTLAic,https://www.youtube.com/watch?v=H4NOeTTLAic&t=...
...,...,...,...,...,...,...
74,que voilà,1925,1926,ça qui apparaîtra je crois que voilà vraiment ...,hlQNYZMJz2c,https://www.youtube.com/watch?v=hlQNYZMJz2c&t=...
75,était mal,1199,1201,était mal positionné eh bien je vais en tirer ...,vknHaRqCPvk,https://www.youtube.com/watch?v=vknHaRqCPvk&t=...
76,moi elle,520,521,cette notion de enchantement pour moi elle est...,vQWSJoODS28,https://www.youtube.com/watch?v=vQWSJoODS28&t=...
77,est très,57,59,concernant ce master de psycho qui est très de...,1bSHE3sH0mE,https://www.youtube.com/watch?v=1bSHE3sH0mE&t=57s


In [23]:
df_threegram_link = pd.read_excel(f"/media/kurubal/SSD/Data Scientist/Work/Modern Ways/Project/{lang_folder.capitalize()}/Talk Time/Data/1-Talk Time Data Prepare/{lang_folder.capitalize()}/{lang_folder.capitalize()}_Threegram_With_200_Word_{sample_num}_Youtube_{time_shift}s_Timeshift_For_Talk_Time_Result_Manuel.xlsx")
df_threegram_link

Unnamed: 0,search_string,start_time,end_time,sentence,video_id,video_url
0,pas du tout,5907,5909,pas du tout été traités le bilan dramatique en...,LSZIjPaZTb4,https://www.youtube.com/watch?v=LSZIjPaZTb4&t=...
1,je suis là,28,31,je suis là pour faire une mise à leau,3QblW0a8IxI,https://www.youtube.com/watch?v=3QblW0a8IxI&t=28s
2,vous êtes là,673,675,ce que je fous là pourquoi je suis là et vous ...,jpwUNmNViuk,https://www.youtube.com/watch?v=jpwUNmNViuk&t=...
3,cest pas bien,905,906,cest pas bien grave,Qe51q3_WMWg,https://www.youtube.com/watch?v=Qe51q3_WMWg&t=...
4,je pense que,2232,2234,et je pense que tant quon a,seWzxrNRuZc,https://www.youtube.com/watch?v=seWzxrNRuZc&t=...
5,elle est bien,88,90,à deux places ou plus que lon peut facilement ...,MyWOr94A0b4,https://www.youtube.com/watch?v=MyWOr94A0b4&t=88s
6,voilà cest fait,186,189,juste à cliquer sur publish et publish to sele...,prldVLvuAJk,https://www.youtube.com/watch?v=prldVLvuAJk&t=...
7,comment ça on,260,262,comment ça on vous veut du mal pas du tout on ...,DFejViks2uY,https://www.youtube.com/watch?v=DFejViks2uY&t=...
8,dans le passé,362,364,jen ai beaucoup ris mais ils possèdent tous de...,yTcxoLGVzG8,https://www.youtube.com/watch?v=yTcxoLGVzG8&t=...
9,cest ce que,78,80,et bien cest ce que nous allons découvrir ou r...,WkEfYhisILE,https://www.youtube.com/watch?v=WkEfYhisILE&t=78s


In [26]:
df_sentence_link = pd.read_excel(f"/media/kurubal/SSD/Data Scientist/Work/Modern Ways/Project/{lang_folder.capitalize()}/Talk Time/Data/1-Talk Time Data Prepare/{lang_folder.capitalize()}/{lang_folder.capitalize()}_200_Word_Group_In_Youtube_Sentence_Sample_Selected_Manuel.xlsx")
df_sentence_link

Unnamed: 0,search_string,start_time,end_time,sentence,video_id,video_url,word_index
0,comment ça on vous veut du mal pas du tout on ...,261,266,comment ça on vous veut du mal pas du tout on ...,DFejViks2uY,https://www.youtube.com/watch?v=DFejViks2uY&t=...,"[74, 14, 19, 6, 152, 30, 156, 2, 30, 33, 19, 6..."
1,quand même quoi donc parce que voilà cest fait...,157,163,quand même quoi donc parce que voilà cest fait...,0_sv895JPlc,https://www.youtube.com/watch?v=0_sv895JPlc&t=...,"[65, 99, 47, 134, 131, 5, 182, 10, 40, 32, 24,..."
2,nai pas bien fait parce que cest pas bien de f...,591,596,nai pas bien fait parce que cest pas bien de f...,9RDA9RZLTSc,https://www.youtube.com/watch?v=9RDA9RZLTSc&t=...,"[155, 2, 29, 40, 131, 5, 10, 2, 29, 0, 45, 14,..."
3,dit il sest passé quelque chose il sest passé ...,257,261,dit il sest passé quelque chose il sest passé ...,hecMN45bbK4,https://www.youtube.com/watch?v=hecMN45bbK4&t=...,"[54, 12, 161, 193, 113, 79, 12, 161, 193, 113,..."
4,nom mais je pense que jétais en mieux et peut ...,135,144,nom mais je pense que jétais en mieux et peut ...,c1zhja4N8hw,https://www.youtube.com/watch?v=c1zhja4N8hw&t=...,"[197, 24, 1, 158, 5, 199, 20, 171, 8, 100, 2, ..."
5,là pourquoi je suis là et vous pourquoi vous ê...,672,676,ce que je fous là pourquoi je suis là et vous ...,jpwUNmNViuk,https://www.youtube.com/watch?v=jpwUNmNViuk&t=...,"[61, 58, 1, 31, 61, 8, 6, 58, 6, 110, 61, 41, ..."
6,comme ça mais je pense que cest une bonne chos...,566,571,démonter comme ça mais je pense que cest une b...,7rV0nsyIiUg,https://www.youtube.com/watch?v=7rV0nsyIiUg&t=...,"[49, 14, 24, 1, 158, 5, 10, 16, 172, 79, 49, 1..."
7,as été bien daccord avec moi elle était pas ét...,175,186,as été bien daccord avec moi elle était pas ét...,l7kuaPv5UvA,https://www.youtube.com/watch?v=l7kuaPv5UvA&t=...,"[72, 91, 29, 123, 34, 41, 37, 86, 2, 86, 2, 12..."
8,de dire dans le passé ce qui était bien et ce ...,34,37,de dire dans le passé ce qui était bien et ce ...,naM2K14xXqI,https://www.youtube.com/watch?v=naM2K14xXqI&t=34s,"[0, 69, 25, 3, 193, 15, 22, 86, 29, 8, 15, 22,..."
9,elle est bien elle est bien mais elle est très...,287,293,elle est bien elle est bien mais elle est très...,OS_T0FMHoBY,https://www.youtube.com/watch?v=OS_T0FMHoBY&t=...,"[37, 18, 29, 37, 18, 29, 24, 37, 18, 81, 81, 8..."


In [27]:
df_sentence_link_word_count = word_count_result(df_sentence_link, ["search_string"], set_condition=False)
df_sentence_link_word_count

Unnamed: 0,word,word_count
0,cest,7
1,pas,7
2,je,7
3,bien,7
4,que,7
...,...,...
73,veux,1
74,sur,1
75,voilà,1
76,été,1


In [28]:
set_sent_link_word = set(df_sentence_link_word_count["word"])
set_word_list = set(word_list)

In [29]:
set_word_list.difference(set_sent_link_word)

{'ai',
 'aller',
 'ans',
 'après',
 'au',
 'aussi',
 'autre',
 'aux',
 'avait',
 'avant',
 'avez',
 'bon',
 'ca',
 'cela',
 'ces',
 'cette',
 'chez',
 'crois',
 'cétait',
 'depuis',
 'des',
 'deux',
 'dieu',
 'dois',
 'doit',
 'déjà',
 'désolé',
 'encore',
 'es',
 'estce',
 'eu',
 'fais',
 'faut',
 'femme',
 'fille',
 'fois',
 'gens',
 'homme',
 'ici',
 'ils',
 'jai',
 'jamais',
 'jen',
 'juste',
 'la',
 'lai',
 'les',
 'leur',
 'ma',
 'maison',
 'me',
 'merci',
 'mes',
 'monde',
 'mort',
 'mère',
 'na',
 'ne',
 'nest',
 'nos',
 'notre',
 'nous',
 'ny',
 'oh',
 'ok',
 'ont',
 'ou',
 'ouais',
 'oui',
 'où',
 'par',
 'parler',
 'personne',
 'petit',
 'peu',
 'peutêtre',
 'peux',
 'prendre',
 'père',
 'quel',
 'quelle',
 'quelquun',
 'questce',
 'quil',
 'quils',
 'rien',
 'sa',
 'sais',
 'sans',
 'se',
 'sera',
 'ses',
 'sil',
 'soir',
 'soit',
 'son',
 'sont',
 'sûr',
 'ta',
 'tas',
 'te',
 'temps',
 'tes',
 'ton',
 'toujours',
 'tous',
 'toute',
 'trop',
 'tu',
 'va',
 'vais',
 'vas',


In [30]:
# used for multi search result
#twogram_link_list = df_twogram_link["search_string"].to_list()
#threegram_link_list = df_threegram_link["search_string"].to_list()
df_result = pd.DataFrame()
for sent in df_sentence_link["search_string"]:

    # words
    sent_words = word_tokenize(sent)

    # twogram
    twogram_zip = ngrams(sent.split(), 2)
    twogram_list = [" ".join(x) for x in twogram_zip]
    #df_twogram_var = pd.DataFrame(data=twogram, columns=["twogram"])
    
    # threegram
    threegram_zip = ngrams(sent.split(), 3)
    threegram_list = [" ".join(y) for y in threegram_zip]
    #df_threegram_var = pd.DataFrame(data=threegram, columns=["threegram"])
    
    # word result
    for word in sent_words:        
        df_word_search_var = df_word_link[df_word_link["search_string"] == word]
        df_word_search_var.reset_index(drop=True, inplace=True)
        for i in range(len(df_word_search_var)):
            df_link_default_var = df_link_default
            try:
                word_time_diff_var = df_word_search_var.loc[i,"end_time"] - df_word_search_var.loc[i,"start_time"]
                df_link_default_var.loc[0,"end_time"] = df_link_default_var.loc[0,"start_time"] + word_time_diff_var+1.0
                df_result = pd.concat([df_result,df_word_search_var.iloc[[i,]]], axis=0)
                df_result = pd.concat([df_result,df_link_default_var], axis=0)
            except:
                pass

    # twogram result
    df_twogram_search_var = df_twogram_link[df_twogram_link["search_string"].isin(twogram_list)]
    df_twogram_search_var.reset_index(drop=True, inplace=True)
    for j in range(len(df_twogram_search_var)):
        df_link_default_var = df_link_default
        try:
            twogram_time_diff_var = df_twogram_search_var.loc[j,"end_time"] - df_twogram_search_var.loc[j,"start_time"]
            df_link_default_var.loc[0,"end_time"] = df_link_default_var.loc[0,"start_time"] + twogram_time_diff_var+1.0
            df_result = pd.concat([df_result,df_twogram_search_var.iloc[[j,]]], axis=0)
            df_result = pd.concat([df_result,df_link_default_var], axis=0)
        except:
            pass

    # threegram result
    df_threegram_search_var = df_threegram_link[df_threegram_link["search_string"].isin(threegram_list)]
    df_threegram_search_var.reset_index(drop=True, inplace=True)
    for k in range(len(df_threegram_search_var)):
        df_link_default_var = df_link_default
        try:
            threegram_time_diff_var = df_threegram_search_var.loc[k,"end_time"] - df_threegram_search_var.loc[k,"start_time"]
            df_link_default_var.loc[0,"end_time"] = df_link_default_var.loc[0,"start_time"] + threegram_time_diff_var+1.0
            df_result = pd.concat([df_result,df_threegram_search_var.iloc[[k,]]], axis=0)
            df_result = pd.concat([df_result,df_link_default_var], axis=0)
        except:
            pass

    # sentence added
    df_sent_search_var = df_sentence_link[df_sentence_link["search_string"] == sent]
    df_sent_search_var.reset_index(drop=True, inplace=True)
    for l in range(len(df_sent_search_var)):
        df_link_default_var = df_link_default
        try:
            sent_time_diff_var = df_sent_search_var.loc[l,"end_time"] - df_sent_search_var.loc[l,"start_time"]
            df_link_default_var.loc[0,"end_time"] = df_link_default_var.loc[0,"start_time"] + sent_time_diff_var+1.0
            df_result = pd.concat([df_result,df_sent_search_var.iloc[[l,]]], axis=0)
            df_result = pd.concat([df_result,df_link_default_var], axis=0)
        except:
            pass

df_result.reset_index(drop=True, inplace=True)   

In [31]:
df_result

Unnamed: 0,search_string,start_time,end_time,sentence,video_id,video_url,word_index
0,comment,1112,1113,cette vidéo pour que je vous fasse une vidéo s...,RR9qtpQL0mY,https://www.youtube.com/watch?v=RR9qtpQL0mY&t=...,
1,repeat,0,2,,cElhIDdGz7M,https://www.youtube.com/watch?v=cElhIDdGz7M&t=0s,
2,ça,2224,2225,mais ça légitime dautant plus et je finirai là...,PL2eqHfPK3Q,https://www.youtube.com/watch?v=PL2eqHfPK3Q&t=...,
3,repeat,0,2,,cElhIDdGz7M,https://www.youtube.com/watch?v=cElhIDdGz7M&t=0s,
4,on,452,454,pas ce sinon on risque de casser feuilletage a...,V7B9B_D0jjg,https://www.youtube.com/watch?v=V7B9B_D0jjg&t=...,
...,...,...,...,...,...,...,...
677,repeat,0,3,,cElhIDdGz7M,https://www.youtube.com/watch?v=cElhIDdGz7M&t=0s,
678,de quoi vous,30,32,de se raspberry pi donc alors de quoi vous all...,XI_w1j9XE8c,https://www.youtube.com/watch?v=XI_w1j9XE8c&t=30s,
679,repeat,0,3,,cElhIDdGz7M,https://www.youtube.com/watch?v=cElhIDdGz7M&t=0s,
680,donc alors de quoi vous allez avoir besoin vou...,29,34,de se raspberry pi donc alors de quoi vous all...,XI_w1j9XE8c,https://www.youtube.com/watch?v=XI_w1j9XE8c&t=29s,"[134, 71, 0, 47, 6, 103, 126, 117, 6, 103, 126..."


In [32]:
df_result[df_result["search_string"] == "repeat"]["end_time"].max()

12

In [33]:
sample_num = df_result[df_result["search_string"] == "repeat"]["end_time"].count()
sample_num

341

In [34]:
((df_result[df_result["search_string"] == "repeat"]["end_time"].sum()*2)-sample_num)/60

26.816666666666666

In [35]:
df_result.to_excel("Turkish_200_Word_Talk_Time1.xlsx", index=False)

In [154]:
part_sample_num = 116
sample_num_start = 0
sample_num_end = part_sample_num
for i in range(12):
    df_var = df_result.iloc[sample_num_start:sample_num_end,]
    
    id_list = []
    start_list = []
    end_list = []
    for id, start, end in zip(df_var["video_id"].to_list(),df_var["start_time"].to_list(),df_var["end_time"].to_list()):
        id_list.append(str(id))
        start_list.append(str(start))
        end_list.append(str(end))

    id_join = ",".join(id_list)
    start_join = ",".join(start_list)
    end_join = ",".join(end_list)

    df_result_for_embedded = pd.DataFrame(data=[[id_join,start_join,end_join]], columns=["id","start_time","end_time"])
    df_result_for_embedded.to_excel(f"Turkish_200_Word_Talk_Time1_Join_Sample{i+1}.xlsx", index=False)
    #df_var.to_excel(f"Turkish_200_Word_Talk_Time1_Sample{i+1}.xlsx", index=False)
    sample_num_start += part_sample_num
    sample_num_end += part_sample_num

#### Copy Move And Delete

In [155]:
output_file = glob.glob(f"{lang_folder.capitalize()}_*_Word_Talk_Time*.xlsx")
output_file

['Turkish_200_Word_Talk_Time1.xlsx',
 'Turkish_200_Word_Talk_Time1_Join_Sample1.xlsx',
 'Turkish_200_Word_Talk_Time1_Join_Sample2.xlsx',
 'Turkish_200_Word_Talk_Time1_Join_Sample3.xlsx',
 'Turkish_200_Word_Talk_Time1_Join_Sample4.xlsx',
 'Turkish_200_Word_Talk_Time1_Join_Sample5.xlsx',
 'Turkish_200_Word_Talk_Time1_Join_Sample6.xlsx',
 'Turkish_200_Word_Talk_Time1_Join_Sample7.xlsx',
 'Turkish_200_Word_Talk_Time1_Join_Sample8.xlsx',
 'Turkish_200_Word_Talk_Time1_Join_Sample9.xlsx',
 'Turkish_200_Word_Talk_Time1_Join_Sample10.xlsx',
 'Turkish_200_Word_Talk_Time1_Join_Sample11.xlsx',
 'Turkish_200_Word_Talk_Time1_Join_Sample12.xlsx']

In [None]:
for y in output_file:
    source = y # source directory
    destination = path
    shutil.copy2(source, destination)

In [None]:
for z in output_file:
    try:
        os.remove(z)
    except:
        pass

In [None]:
# used for one search result
df_result = pd.DataFrame()
for sent in df_sentence_link["search_string"]:
    sent_words = word_tokenize(sent)
    for word in sent_words:
        df_link_default_var = df_link_default
        df_var = df_word_link[df_word_link["search_string"] == word]
        df_var.reset_index(drop=True, inplace=True)
        try:
            var_time_diff = (df_var.loc[0,"end_time"] - df_var.loc[0,"start_time"])
            df_link_default_var.loc[0,"end_time"] = df_link_default_var.loc[0,"start_time"] + var_time_diff+1.0
            df_result = pd.concat([df_result,df_var], axis=0)
            df_result = pd.concat([df_result,df_link_default_var], axis=0)
        except:
            pass
df_result.reset_index(drop=True, inplace=True)
df_result

In [36]:
df_result_select = df_result.head(120)
df_result_select

Unnamed: 0,search_string,start_time,end_time,sentence,video_id,video_url,word_index
0,comment,1112,1113,cette vidéo pour que je vous fasse une vidéo s...,RR9qtpQL0mY,https://www.youtube.com/watch?v=RR9qtpQL0mY&t=...,
1,repeat,0,2,,cElhIDdGz7M,https://www.youtube.com/watch?v=cElhIDdGz7M&t=0s,
2,ça,2224,2225,mais ça légitime dautant plus et je finirai là...,PL2eqHfPK3Q,https://www.youtube.com/watch?v=PL2eqHfPK3Q&t=...,
3,repeat,0,2,,cElhIDdGz7M,https://www.youtube.com/watch?v=cElhIDdGz7M&t=0s,
4,on,452,454,pas ce sinon on risque de casser feuilletage a...,V7B9B_D0jjg,https://www.youtube.com/watch?v=V7B9B_D0jjg&t=...,
...,...,...,...,...,...,...,...
115,repeat,0,3,,cElhIDdGz7M,https://www.youtube.com/watch?v=cElhIDdGz7M&t=0s,
116,parce que,585,587,mais le normandie il arrive à la suite dune pé...,hRF_qePrIUk,https://www.youtube.com/watch?v=hRF_qePrIUk&t=...,
117,repeat,0,3,,cElhIDdGz7M,https://www.youtube.com/watch?v=cElhIDdGz7M&t=0s,
118,non mais,129,131,non mais comme ça on respecte les gestes barri...,YatjUl6dPNI,https://www.youtube.com/watch?v=YatjUl6dPNI&t=...,


In [37]:
id_list = []
start_list = []
end_list = []
for id, start, end in zip(df_result_select["video_id"].to_list(),df_result_select["start_time"].to_list(),df_result_select["end_time"].to_list()):
    id_list.append(str(id))
    start_list.append(str(start))
    end_list.append(str(end))

id_join = ",".join(id_list)
start_join = ",".join(start_list)
end_join = ",".join(end_list)

df_result_for_embedded = pd.DataFrame(data=[[id_join,start_join,end_join]], columns=["id","start_time","end_time"])
df_result_for_embedded

Unnamed: 0,id,start_time,end_time
0,"RR9qtpQL0mY,cElhIDdGz7M,PL2eqHfPK3Q,cElhIDdGz7...","1112,0,2224,0,452,0,177,0,3073,0,22,0,61,0,209...","1113,2,2225,2,454,3,179,3,3076,4,23,2,63,3,210..."


In [38]:
df_result_for_embedded.to_excel("French_200_Word_Talk_Time1_Join_Test.xlsx", index=False)