### Kinetic Process

In [37]:
import os
import multiprocessing
#import multiprocessing as mp
from multiprocessing import Process, Manager, Pool, Queue
from itertools import islice
from collections import Counter
import re
import pandas as pd
import numpy as np
import glob
import nltk
from nltk import word_tokenize
from nltk import ngrams
from functools import reduce
from pathlib import Path
import shutil
import json

In [38]:
nprocs = multiprocessing.cpu_count()
print(f"Number of CPU cores: {nprocs}")

Number of CPU cores: 16


In [39]:
# language pair
lang_folder = "Turkish"  # Arabic, English, French, German, Turkish, Spanish, Portuguese, Dutch, Italian ==> target language for learner
#lang_pair = "English"  # Arabic, English, French, German, Turkish, Spanish, Portuguese, Dutch, Italian ==> native language

file_ext = 1000

# youtube
sample_num = 10  # 7
time_shift = 0.6

In [40]:
path = f"/media/kurubal/SSD/Data Scientist/Work/Modern Ways/Project/{lang_folder.capitalize()}/\
Lemma Stem POS/Result/5-0-Kinetic Process"

word_lemma_data_path = f"/media/kurubal/SSD/Data Scientist/Work/Modern Ways/Project/{lang_folder.capitalize()}/\
Lemma Stem POS/Result/3-2-Word In Visual Genome Merge"


#Path(path).mkdir(parents=True, exist_ok=True)

In [41]:
def word_group_youtube(df, search_list, target_column, sample_num):
    '''
    word_group_youtube(df_youtube_sentence, search_list, "sentence", 6)\n
    df_youtube_sentence is dataframe and "sentence" is its column for external searching_list
    ''' 
    df_search_result = pd.DataFrame()
    for j in search_list:
        try:
            df_select = df[df[f"{target_column}"].str.contains(fr"(?:\s|^){j}(?:\s|$)", na=False)].sample(sample_num)
            #df_select = df[df[f"{target_column}"].str.contains(fr"(?:\s|^){j}(?:\s|$)", na=True)].tail(sample_num)  # will test
        except:
            df_select = df[df[f"{target_column}"].str.contains(fr"(?:\s|^){j}(?:\s|$)", na=False)].head(sample_num)
        #df_result = df[df[f"{target_column}"].str.contains(fr"(?:\s|^){j}(?:\s|$)", na=True)]  # sentence length part
        #df_result.sort_values(f"{target_column}",key=lambda x:x.str.len(), inplace=True)
        #df_select = df_result.head(sample_num)
        df_select.insert(0,"search_string",j)
        df_search_result = pd.concat([df_search_result,df_select], axis=0)
    df_search_result.reset_index(inplace=True, drop=True)
    
    return df_search_result

In [42]:
def word_group_time_loc(df, search, start_sent, end_sent, sent, sent_video_id):
    '''
    word_group_time_loc(df_search_result, "search_string", "start_time", "end_time", "sentence", "video_id")\n
    df_search_result is dataframe and "search_string", "start_time", "end_time", "sentence", "video_id" are its columns
    '''
    df.reset_index(drop=True, inplace=True)
    word_time_loc_list = []
    for i in range(len(df)):
        word = df.loc[i,f"{search}"]
        start_time = df.loc[i,f"{start_sent}"]
        end_time = df.loc[i,f"{end_sent}"]
        sentence = df.loc[i,f"{sent}"]
        video_id = df.loc[i,f"{sent_video_id}"]
        time_length = end_time-start_time
        sentence_length = len(sentence)
        time_length_ratio = time_length/sentence_length
        loc_list = []
        for j in re.finditer(fr"(?:\s|^){word}(?:\s|$)", sentence, re.IGNORECASE|re.UNICODE):
            loc_list.append(j)
            start = loc_list[0].start()
            end = loc_list[0].end()
            start_loc = start_time+(start*time_length_ratio)
            end_loc = start_time+(end*time_length_ratio)
        word_time_loc_list.append([word,start_loc,end_loc,sentence,video_id])
    df_word_time_loc = pd.DataFrame(word_time_loc_list, columns=[f"{search}",f"{start_sent}",f"{end_sent}",f"{sent}",f"{sent_video_id}"])

    return df_word_time_loc

In [43]:
def word_count_result(df, column_list, set_condition=False): # df is dataframe, column_list is list value
    '''
    word_count_bool(df, column_list): df columns word count for word frequency\n
    df is dataframe, column_list is list value\n
    word_count_bool(df, ["word","twogram"]):
    '''
    list_all = []
    for i in df.loc[:,[x for x in column_list]].columns:
        if set_condition:
            var_list = set(df[f"{i}"].dropna().tolist())
        else:
            var_list = df[f"{i}"].dropna().tolist()
        for j in var_list:
            list_all.append(j)
    text = " ".join(list_all)
    word_list = re.findall(r"\w+",text, re.UNICODE)
    df_word_list = pd.DataFrame(word_list, columns=["word"])
    #df_word_list.rename(columns={0:"word"}, inplace=True)
    df_word_count = pd.DataFrame(df_word_list.value_counts())
    df_word_count.reset_index(inplace=True)
    df_word_count.rename(columns={0:"word_count"}, inplace=True)
    df_word_count.sort_values("word_count", ascending=False, inplace=True)
    df_word_count.reset_index(inplace=True, drop=True)
    
    return  df_word_count

In [44]:
def word_usage_result(word_list, df_target, target_column, target_opt_column, word_usage_min, word_usage_max):
    '''
    word_usage_result(word_list, df_ngram_pair, "threegram", "frequency", 1, 5) \n
    word_list is a list, df_target is a dateframe, target_column is df_target dataframe target column, \n
    target_opt_column is df_target dataframe opt_target column, \n
    word_usage_min and word_usage_max word usage condition.
    '''    
    word_num_dict = {}
    for i in word_list:
        word_num_dict[f"{i}"] = 0
    
    result_list_select = []
    var_list = []
    for i in range(len(df_target)):
        target_value = df_target.loc[i,f"{target_column}"]
        opt_value = df_target.loc[i,f"{target_opt_column}"]
        words = word_tokenize(target_value)   
        temp_list = [word for word in words]
        temp_list = temp_list + var_list
        # word count for max
        dict_list_count = Counter(temp_list)
        count_list = list(dict_list_count.values())
        # word count for min
        count_list2 = list(word_num_dict.values())
    
        if any([True if i>word_usage_max else False for i in count_list]) or not(any([True if j<word_usage_min else False for j in count_list2])):
            pass
        else:
            var_list = temp_list
            result_list_select.append([target_value,opt_value]) 
    
            for item2 in dict_list_count.items(): 
                word_num_dict[item2[0]] = item2[1]        
    df_result = pd.DataFrame(result_list_select, columns=[f"{target_column}",f"{target_opt_column}"])
    df_result.sort_values(by="frequency", ascending=False, inplace=True)
    df_result.reset_index(drop=True, inplace=True)
    
    return df_result

In [45]:
def lower_strip_func(x):
    try:
        var_low = str(x).lower()
        var_out = var_low.strip()
    except:
        var_out = x
    return var_out 

In [46]:
en = re.compile(r"[abcdefghıijklmnopqrstxuvwyz]+", re.IGNORECASE|re.UNICODE) # English
def clean_text(text):
    text_clean = re.findall(en, text)
    text_result = " ".join(text_clean)
    return text_result

#### Kinetic Data

In [24]:
df_kinetic_700_csv = pd.read_csv(f"/home/kurubal/Downloads/Kinetic/kinetic 700.csv")
df_kinetic_700_csv

Unnamed: 0,label,youtube_id,time_start,time_end,split
0,testifying,---QUuC4vJs,84,94,validate
1,washing feet,--GkrdYZ9Tc,0,10,validate
2,air drumming,--nQbRBEz2s,104,114,validate
3,pull ups,--rd8woSLiM,41,51,validate
4,building cabinet,--uGS0Y4D6k,9,19,validate
...,...,...,...,...,...
33324,trimming trees,zxdSPlGlSAQ,38,48,validate
33325,feeding goats,zxrvNwur1RE,194,204,validate
33326,country line dancing,zy7uvdwyK8k,3,13,validate
33327,playing paintball,zylVBFyoxZ0,94,104,validate


In [25]:
df_kinetic_train_csv = pd.read_csv(f"/home/kurubal/Downloads/Kinetic/train.csv")
df_kinetic_train_csv

Unnamed: 0,label,youtube_id,time_start,time_end,split
0,clay pottery making,---0dWlqevI,19,29,train
1,news anchoring,---aQ-tA5_A,9,19,train
2,using bagging machine,---j12rm3WI,14,24,train
3,javelin throw,--07WQ2iBlw,1,11,train
4,climbing a rope,--0NTAs-fA0,29,39,train
...,...,...,...,...,...
533058,washing dishes,zzz_3yWpTXo,0,10,train
533059,juggling fire,zzzkS3amkWE,124,134,train
533060,taking photo,zzzsd1R7H0E,6,16,train
533061,brush painting,zzzxltuPx2Q,84,94,train


In [26]:
df_kinetic_validate_csv = pd.read_csv(f"/home/kurubal/Downloads/Kinetic/validate.csv")
df_kinetic_validate_csv

Unnamed: 0,label,youtube_id,time_start,time_end,split
0,testifying,---QUuC4vJs,84,94,validate
1,washing feet,--GkrdYZ9Tc,0,10,validate
2,air drumming,--nQbRBEz2s,104,114,validate
3,pull ups,--rd8woSLiM,41,51,validate
4,building cabinet,--uGS0Y4D6k,9,19,validate
...,...,...,...,...,...
33324,trimming trees,zxdSPlGlSAQ,38,48,validate
33325,feeding goats,zxrvNwur1RE,194,204,validate
33326,country line dancing,zy7uvdwyK8k,3,13,validate
33327,playing paintball,zylVBFyoxZ0,94,104,validate


In [27]:
df_kinetic_test_csv = pd.read_csv(f"/home/kurubal/Downloads/Kinetic/test.csv")
df_kinetic_test_csv

Unnamed: 0,youtube_id,time_start,time_end,split
0,---v8pgm1eQ,0,10,test
1,--0kKuQu4Gs,0,10,test
2,--1f2DTKcwg,111,121,test
3,--2V_kDPfDg,192,202,test
4,--3X_T3dnAE,370,380,test
...,...,...,...,...
65999,zvInmNgphQU,27,37,test
66000,zvifz6eL30E,38,48,test
66001,zvkWcTn-MX4,225,235,test
66002,zvwkhXw_BqM,8,18,test


In [28]:
df_kinetic_concat = pd.concat([df_kinetic_700_csv,df_kinetic_train_csv,df_kinetic_validate_csv], axis=0)
df_kinetic_concat = df_kinetic_concat.drop("split", axis=1)
df_kinetic_concat.drop_duplicates(inplace=True)
df_kinetic_concat

Unnamed: 0,label,youtube_id,time_start,time_end
0,testifying,---QUuC4vJs,84,94
1,washing feet,--GkrdYZ9Tc,0,10
2,air drumming,--nQbRBEz2s,104,114
3,pull ups,--rd8woSLiM,41,51
4,building cabinet,--uGS0Y4D6k,9,19
...,...,...,...,...
533058,washing dishes,zzz_3yWpTXo,0,10
533059,juggling fire,zzzkS3amkWE,124,134
533060,taking photo,zzzsd1R7H0E,6,16
533061,brush painting,zzzxltuPx2Q,84,94


In [36]:
df_kinetic_concat["label"] = df_kinetic_concat["label"].apply(lambda x: lower_strip_func(x))
df_kinetic_concat["label"] = df_kinetic_concat["label"].apply(lambda x: clean_text(x))
df_kinetic_concat

Unnamed: 0,label,youtube_id,time_start,time_end
0,testifying,---QUuC4vJs,84,94
1,washing feet,--GkrdYZ9Tc,0,10
2,air drumming,--nQbRBEz2s,104,114
3,pull ups,--rd8woSLiM,41,51
4,building cabinet,--uGS0Y4D6k,9,19
...,...,...,...,...
533058,washing dishes,zzz_3yWpTXo,0,10
533059,juggling fire,zzzkS3amkWE,124,134
533060,taking photo,zzzsd1R7H0E,6,16
533061,brush painting,zzzxltuPx2Q,84,94


In [47]:
Pos_Tag = "VERB" # NOUN, VERB, ADJ, ADV, NUM, PRON, CCONJ, ADP, AUX

In [48]:
df_genome_word_lemma_concat = pd.read_csv(f"{word_lemma_data_path}/Visual_Genome_{file_ext}_Word_Lemma_Search_Result.csv")
df_genome_word_lemma_concat

Unnamed: 0,POS1,POS2,word,lemma.spacy,stem,word_en_translate,lemma_en_translate,frequency,search_text,image_id,num
0,NUM,,bir,bir,bir,a,a,18835735,a,2390994,2920126305
1,NUM,,bir,bir,bir,a,a,18835735,a,2348965,129190150113
2,NUM,,bir,bir,bir,a,a,18835735,a,2348965,123317182233
3,NUM,,bir,bir,bir,a,a,18835735,a,2349861,4615142168
4,NUM,,bir,bir,bir,a,a,18835735,a,2349866,54188097
...,...,...,...,...,...,...,...,...,...,...,...
192717,VERB,,çekilin,çek,çek,withdraw,check,69201,airport check in kiosks,2317616,276026737
192718,VERB,,çekilin,çek,çek,withdraw,check,69201,red check of tablecloth,2400604,376517459
192719,VERB,,çekilin,çek,çek,withdraw,check,69201,a check is on the table,2386272,95132301236
192720,VERB,,çekilin,çek,çek,withdraw,check,69201,cleats with white check,2371210,2211405194


In [49]:
df_genome_word_lemma_concat = df_genome_word_lemma_concat[df_genome_word_lemma_concat["POS1"] == Pos_Tag]
df_genome_word_lemma_concat

Unnamed: 0,POS1,POS2,word,lemma.spacy,stem,word_en_translate,lemma_en_translate,frequency,search_text,image_id,num
2320,VERB,,gerektiğini,gerek,gerek,what,necessary,120666,what,2321637,1308718190
2321,VERB,,gerektiğini,gerek,gerek,what,necessary,120666,what,2412863,2404701598
2322,VERB,,gerektiğini,gerek,gerek,what,necessary,120666,what this is,2388466,34576516
2323,VERB,,gerektiğini,gerek,gerek,what,necessary,120666,what is not allowed,2399091,553912620
2324,VERB,,gerektiğini,gerek,gerek,what,necessary,120666,"what , white",2408062,365513498
...,...,...,...,...,...,...,...,...,...,...,...
192717,VERB,,çekilin,çek,çek,withdraw,check,69201,airport check in kiosks,2317616,276026737
192718,VERB,,çekilin,çek,çek,withdraw,check,69201,red check of tablecloth,2400604,376517459
192719,VERB,,çekilin,çek,çek,withdraw,check,69201,a check is on the table,2386272,95132301236
192720,VERB,,çekilin,çek,çek,withdraw,check,69201,cleats with white check,2371210,2211405194


In [50]:
search_list = list(set(df_genome_word_lemma_concat["word"]))  # verb list
len(search_list)

279

In [51]:
disable_video_id_list = []

In [52]:
df_youtube_sentence = pd.read_csv(f"/media/kurubal/SSD/Data Scientist/Work/Modern Ways/Project/Youtube/Result/{lang_folder.capitalize()}/Sentence Clean Merge/Clean_Youtube_Sentence_Merge_Result.csv")
df_youtube_sentence

Unnamed: 0,start_time,end_time,sentence,video_id
0,00:00:00.294,00:00:03.294,bu dizinin betimlemesi staff fi lm tarafından,fI1BI4d5KFU
1,00:00:03.375,00:00:06.375,sesli betimleme derneğine yaptırılmıştır,fI1BI4d5KFU
2,00:00:06.462,00:00:09.462,wwwsebederorg,fI1BI4d5KFU
3,00:00:10.267,00:00:11.394,mumu,fI1BI4d5KFU
4,00:00:11.835,00:00:14.061,bak şu an sinir katsayım hat safhada,fI1BI4d5KFU
...,...,...,...,...
3934202,00:10:00.240,00:10:07.818,videoyu bitirmeden şunu da söylemek isterim ki...,dRssR_apbR0
3934203,00:10:07.994,00:10:13.534,bu nedenle ulaşımda insanların yarısına yakını...,dRssR_apbR0
3934204,00:10:14.745,00:10:17.924,danimarkayla alakalı şimdilik söyleyeceklerim ...,dRssR_apbR0
3934205,00:10:18.047,00:10:22.398,video hoşunuza gittiyse beğenmeyi ve kanalıma ...,dRssR_apbR0


In [53]:
df_youtube_sentence = df_youtube_sentence[~df_youtube_sentence["video_id"].isin(disable_video_id_list)]
df_youtube_sentence

Unnamed: 0,start_time,end_time,sentence,video_id
0,00:00:00.294,00:00:03.294,bu dizinin betimlemesi staff fi lm tarafından,fI1BI4d5KFU
1,00:00:03.375,00:00:06.375,sesli betimleme derneğine yaptırılmıştır,fI1BI4d5KFU
2,00:00:06.462,00:00:09.462,wwwsebederorg,fI1BI4d5KFU
3,00:00:10.267,00:00:11.394,mumu,fI1BI4d5KFU
4,00:00:11.835,00:00:14.061,bak şu an sinir katsayım hat safhada,fI1BI4d5KFU
...,...,...,...,...
3934202,00:10:00.240,00:10:07.818,videoyu bitirmeden şunu da söylemek isterim ki...,dRssR_apbR0
3934203,00:10:07.994,00:10:13.534,bu nedenle ulaşımda insanların yarısına yakını...,dRssR_apbR0
3934204,00:10:14.745,00:10:17.924,danimarkayla alakalı şimdilik söyleyeceklerim ...,dRssR_apbR0
3934205,00:10:18.047,00:10:22.398,video hoşunuza gittiyse beğenmeyi ve kanalıma ...,dRssR_apbR0


In [54]:
df_youtube_sentence['start_time'] = pd.to_timedelta(df_youtube_sentence['start_time']) # data type converted timedelta for second 
df_youtube_sentence['end_time'] = pd.to_timedelta(df_youtube_sentence['end_time'])

In [55]:
df_youtube_sentence['start_time'] = df_youtube_sentence['start_time'].apply(lambda x: x.total_seconds()) # convert seconds
df_youtube_sentence['end_time'] = df_youtube_sentence['end_time'].apply(lambda x: x.total_seconds())
df_youtube_sentence

Unnamed: 0,start_time,end_time,sentence,video_id
0,0.294,3.294,bu dizinin betimlemesi staff fi lm tarafından,fI1BI4d5KFU
1,3.375,6.375,sesli betimleme derneğine yaptırılmıştır,fI1BI4d5KFU
2,6.462,9.462,wwwsebederorg,fI1BI4d5KFU
3,10.267,11.394,mumu,fI1BI4d5KFU
4,11.835,14.061,bak şu an sinir katsayım hat safhada,fI1BI4d5KFU
...,...,...,...,...
3934202,600.240,607.818,videoyu bitirmeden şunu da söylemek isterim ki...,dRssR_apbR0
3934203,607.994,613.534,bu nedenle ulaşımda insanların yarısına yakını...,dRssR_apbR0
3934204,614.745,617.924,danimarkayla alakalı şimdilik söyleyeceklerim ...,dRssR_apbR0
3934205,618.047,622.398,video hoşunuza gittiyse beğenmeyi ve kanalıma ...,dRssR_apbR0


In [56]:
df_word_group = word_group_youtube(df_youtube_sentence, search_list, "sentence", sample_num)
df_word_group

Unnamed: 0,search_string,start_time,end_time,sentence,video_id
0,ediyorum,539.200,541.200,ben onlara yardım ediyorum yüce honos,q_h9tUmdYaU
1,ediyorum,2724.406,2725.930,sana ders vermeyi kabul ediyorum,UKVile_HpwQ
2,ediyorum,3644.904,3646.840,ben nasıl olduğunu merak ediyorum,OPzhVWt1D9A
3,ediyorum,242.770,248.560,denemenizi tavsiye ediyorum ben hamuru bu şeki...,cVRNSvEpSIY
4,ediyorum,426.210,431.760,gerçekten çok lezzetli gönül rahatlığı ile tav...,1p2oao1tjOo
...,...,...,...,...,...
2785,dön,7361.577,7364.054,tamam sola dön,aPwTtMXPv3o
2786,dön,1463.866,1466.053,dön bir sor istiyorsan bakalım kim mutlu ediyo...,pcAhoVDv2Gs
2787,dön,6173.440,6174.520,teyze dön böyle,agAUR7SzzH0
2788,dön,6978.008,6979.988,ulan dön şöyle be öf,MXciFLdLVSY


In [57]:
df_word_group_time_loc = word_group_time_loc(df_word_group, "search_string", "start_time", "end_time", "sentence", "video_id")
df_word_group_time_loc

Unnamed: 0,search_string,start_time,end_time,sentence,video_id
0,ediyorum,540.118919,540.659459,ben onlara yardım ediyorum yüce honos,q_h9tUmdYaU
1,ediyorum,2725.501375,2725.930000,sana ders vermeyi kabul ediyorum,UKVile_HpwQ
2,ediyorum,3646.312000,3646.840000,ben nasıl olduğunu merak ediyorum,OPzhVWt1D9A
3,ediyorum,244.025663,244.723253,denemenizi tavsiye ediyorum ben hamuru bu şeki...,cVRNSvEpSIY
4,ediyorum,429.960000,430.710000,gerçekten çok lezzetli gönül rahatlığı ile tav...,1p2oao1tjOo
...,...,...,...,...,...
2785,dön,7363.346286,7364.054000,tamam sola dön,aPwTtMXPv3o
2786,dön,1463.866000,1464.028000,dön bir sor istiyorsan bakalım kim mutlu ediyo...,pcAhoVDv2Gs
2787,dön,6173.800000,6174.160000,teyze dön böyle,agAUR7SzzH0
2788,dön,6978.404000,6978.899000,ulan dön şöyle be öf,MXciFLdLVSY


In [58]:
df_word_group_time_loc.start_time = df_word_group_time_loc.start_time.apply(lambda x: (x-time_shift))
df_word_group_time_loc.end_time = df_word_group_time_loc.end_time.apply(lambda x: (x+time_shift))
df_word_group_time_loc

Unnamed: 0,search_string,start_time,end_time,sentence,video_id
0,ediyorum,539.518919,541.259459,ben onlara yardım ediyorum yüce honos,q_h9tUmdYaU
1,ediyorum,2724.901375,2726.530000,sana ders vermeyi kabul ediyorum,UKVile_HpwQ
2,ediyorum,3645.712000,3647.440000,ben nasıl olduğunu merak ediyorum,OPzhVWt1D9A
3,ediyorum,243.425663,245.323253,denemenizi tavsiye ediyorum ben hamuru bu şeki...,cVRNSvEpSIY
4,ediyorum,429.360000,431.310000,gerçekten çok lezzetli gönül rahatlığı ile tav...,1p2oao1tjOo
...,...,...,...,...,...
2785,dön,7362.746286,7364.654000,tamam sola dön,aPwTtMXPv3o
2786,dön,1463.266000,1464.628000,dön bir sor istiyorsan bakalım kim mutlu ediyo...,pcAhoVDv2Gs
2787,dön,6173.200000,6174.760000,teyze dön böyle,agAUR7SzzH0
2788,dön,6977.804000,6979.499000,ulan dön şöyle be öf,MXciFLdLVSY


In [59]:
df_word_group_time_loc.start_time = df_word_group_time_loc.start_time.apply(lambda x: round(x))
df_word_group_time_loc.end_time = df_word_group_time_loc.end_time.apply(lambda x: round(x))
df_word_group_time_loc 

Unnamed: 0,search_string,start_time,end_time,sentence,video_id
0,ediyorum,540,541,ben onlara yardım ediyorum yüce honos,q_h9tUmdYaU
1,ediyorum,2725,2727,sana ders vermeyi kabul ediyorum,UKVile_HpwQ
2,ediyorum,3646,3647,ben nasıl olduğunu merak ediyorum,OPzhVWt1D9A
3,ediyorum,243,245,denemenizi tavsiye ediyorum ben hamuru bu şeki...,cVRNSvEpSIY
4,ediyorum,429,431,gerçekten çok lezzetli gönül rahatlığı ile tav...,1p2oao1tjOo
...,...,...,...,...,...
2785,dön,7363,7365,tamam sola dön,aPwTtMXPv3o
2786,dön,1463,1465,dön bir sor istiyorsan bakalım kim mutlu ediyo...,pcAhoVDv2Gs
2787,dön,6173,6175,teyze dön böyle,agAUR7SzzH0
2788,dön,6978,6979,ulan dön şöyle be öf,MXciFLdLVSY


In [60]:
df_word_group_time_loc["video_url"] = "https://www.youtube.com/watch?v="+df_word_group_time_loc['video_id'].map(str)+"&t="+df_word_group_time_loc['start_time'].map(str)+"s"
df_word_group_time_loc

Unnamed: 0,search_string,start_time,end_time,sentence,video_id,video_url
0,ediyorum,540,541,ben onlara yardım ediyorum yüce honos,q_h9tUmdYaU,https://www.youtube.com/watch?v=q_h9tUmdYaU&t=...
1,ediyorum,2725,2727,sana ders vermeyi kabul ediyorum,UKVile_HpwQ,https://www.youtube.com/watch?v=UKVile_HpwQ&t=...
2,ediyorum,3646,3647,ben nasıl olduğunu merak ediyorum,OPzhVWt1D9A,https://www.youtube.com/watch?v=OPzhVWt1D9A&t=...
3,ediyorum,243,245,denemenizi tavsiye ediyorum ben hamuru bu şeki...,cVRNSvEpSIY,https://www.youtube.com/watch?v=cVRNSvEpSIY&t=...
4,ediyorum,429,431,gerçekten çok lezzetli gönül rahatlığı ile tav...,1p2oao1tjOo,https://www.youtube.com/watch?v=1p2oao1tjOo&t=...
...,...,...,...,...,...,...
2785,dön,7363,7365,tamam sola dön,aPwTtMXPv3o,https://www.youtube.com/watch?v=aPwTtMXPv3o&t=...
2786,dön,1463,1465,dön bir sor istiyorsan bakalım kim mutlu ediyo...,pcAhoVDv2Gs,https://www.youtube.com/watch?v=pcAhoVDv2Gs&t=...
2787,dön,6173,6175,teyze dön böyle,agAUR7SzzH0,https://www.youtube.com/watch?v=agAUR7SzzH0&t=...
2788,dön,6978,6979,ulan dön şöyle be öf,MXciFLdLVSY,https://www.youtube.com/watch?v=MXciFLdLVSY&t=...


In [61]:
word_count_result(df_word_group_time_loc, ["search_string"], set_condition=True)

Unnamed: 0,word,word_count
0,al,1
1,bilemiyorum,1
2,alacağım,1
3,aldı,1
4,aldım,1
...,...,...
274,çık,1
275,çıktı,1
276,öldü,1
277,öldürmek,1


In [62]:
#df_word_group_time_loc.to_excel(f"{lang_folder.capitalize()}_{file_ext}_With_Verb_Youtube_Link.xlsx", index=False)  ########*****

In [9]:
df_kinetic_train_json = pd.read_json(f"/home/kurubal/Downloads/Kinetic/train.json")
df_kinetic_train_json

Unnamed: 0,---0dWlqevI,---aQ-tA5_A,---j12rm3WI,--07WQ2iBlw,--0NTAs-fA0,--0l35AkU34,--33Lscn6sk,--3OAstUWtU,--3lTx87ebQ,--3ouPhoy2A,...,zzy_artj1B8,zzyxMjfYpL0,zzz0-zDYts8,zzz0X4NoFyA,zzzZycxdZHk,zzz_3yWpTXo,zzzkS3amkWE,zzzsd1R7H0E,zzzxltuPx2Q,zzzzE0ncP1Y
annotations,"{'label': 'clay pottery making', 'segment': [1...","{'label': 'news anchoring', 'segment': [9.0, 1...","{'label': 'using bagging machine', 'segment': ...","{'label': 'javelin throw', 'segment': [1.0, 11...","{'label': 'climbing a rope', 'segment': [29.0,...","{'label': 'sipping cup', 'segment': [68.0, 78.0]}","{'label': 'flipping pancake', 'segment': [4.0,...","{'label': 'tickling', 'segment': [45.0, 55.0]}","{'label': 'watering plants', 'segment': [23.0,...","{'label': 'eating spaghetti', 'segment': [20.0...",...,"{'label': 'gargling', 'segment': [210.0, 220.0]}","{'label': 'blowing glass', 'segment': [8.0, 18...","{'label': 'ice fishing', 'segment': [80.0, 90.0]}","{'label': 'using bagging machine', 'segment': ...","{'label': 'making a cake', 'segment': [54.0, 6...","{'label': 'washing dishes', 'segment': [0.0, 1...","{'label': 'juggling fire', 'segment': [124.0, ...","{'label': 'taking photo', 'segment': [6.0, 16.0]}","{'label': 'brush painting', 'segment': [84.0, ...","{'label': 'changing oil', 'segment': [232.0, 2..."
duration,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,...,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0
subset,train,train,train,train,train,train,train,train,train,train,...,train,train,train,train,train,train,train,train,train,train
url,https://www.youtube.com/watch?v=---0dWlqevI,https://www.youtube.com/watch?v=---aQ-tA5_A,https://www.youtube.com/watch?v=---j12rm3WI,https://www.youtube.com/watch?v=--07WQ2iBlw,https://www.youtube.com/watch?v=--0NTAs-fA0,https://www.youtube.com/watch?v=--0l35AkU34,https://www.youtube.com/watch?v=--33Lscn6sk,https://www.youtube.com/watch?v=--3OAstUWtU,https://www.youtube.com/watch?v=--3lTx87ebQ,https://www.youtube.com/watch?v=--3ouPhoy2A,...,https://www.youtube.com/watch?v=zzy_artj1B8,https://www.youtube.com/watch?v=zzyxMjfYpL0,https://www.youtube.com/watch?v=zzz0-zDYts8,https://www.youtube.com/watch?v=zzz0X4NoFyA,https://www.youtube.com/watch?v=zzzZycxdZHk,https://www.youtube.com/watch?v=zzz_3yWpTXo,https://www.youtube.com/watch?v=zzzkS3amkWE,https://www.youtube.com/watch?v=zzzsd1R7H0E,https://www.youtube.com/watch?v=zzzxltuPx2Q,https://www.youtube.com/watch?v=zzzzE0ncP1Y


In [15]:
df_kinetic_validate_json = pd.read_json(f"/home/kurubal/Downloads/Kinetic/validate.json")
df_kinetic_validate_json

Unnamed: 0,---QUuC4vJs,--GkrdYZ9Tc,--nQbRBEz2s,--rd8woSLiM,--uGS0Y4D6k,-07WVYMSwac,-08En2aQklI,-0IErS_cisg,-0ML-FXomBw,-0MsYnGUrfE,...,zwk2hY12XAg,zwrku6rQ41s,zx2kZkM1LXA,zxYAOHRS4kI,zxaklajobzg,zxdSPlGlSAQ,zxrvNwur1RE,zy7uvdwyK8k,zylVBFyoxZ0,zyz4uOKGTzQ
annotations,"{'label': 'testifying', 'segment': [84.0, 94.0]}","{'label': 'washing feet', 'segment': [0.0, 10.0]}","{'label': 'air drumming', 'segment': [104.0, 1...","{'label': 'pull ups', 'segment': [41.0, 51.0]}","{'label': 'building cabinet', 'segment': [9.0,...","{'label': 'petting cat', 'segment': [20.0, 30.0]}","{'label': 'building cabinet', 'segment': [233....","{'label': 'marching', 'segment': [17.0, 27.0]}","{'label': 'playing paintball', 'segment': [1.0...","{'label': 'playing paintball', 'segment': [22....",...,"{'label': 'lifting hat', 'segment': [7.0, 17.0]}","{'label': 'cleaning shoes', 'segment': [20.0, ...","{'label': 'reading newspaper', 'segment': [11....","{'label': 'snowkiting', 'segment': [240.0, 250...","{'label': 'sled dog racing', 'segment': [455.0...","{'label': 'trimming trees', 'segment': [38.0, ...","{'label': 'feeding goats', 'segment': [194.0, ...","{'label': 'country line dancing', 'segment': [...","{'label': 'playing paintball', 'segment': [94....","{'label': 'washing hair', 'segment': [233.0, 2..."
duration,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,...,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0
subset,validate,validate,validate,validate,validate,validate,validate,validate,validate,validate,...,validate,validate,validate,validate,validate,validate,validate,validate,validate,validate
url,https://www.youtube.com/watch?v=---QUuC4vJs,https://www.youtube.com/watch?v=--GkrdYZ9Tc,https://www.youtube.com/watch?v=--nQbRBEz2s,https://www.youtube.com/watch?v=--rd8woSLiM,https://www.youtube.com/watch?v=--uGS0Y4D6k,https://www.youtube.com/watch?v=-07WVYMSwac,https://www.youtube.com/watch?v=-08En2aQklI,https://www.youtube.com/watch?v=-0IErS_cisg,https://www.youtube.com/watch?v=-0ML-FXomBw,https://www.youtube.com/watch?v=-0MsYnGUrfE,...,https://www.youtube.com/watch?v=zwk2hY12XAg,https://www.youtube.com/watch?v=zwrku6rQ41s,https://www.youtube.com/watch?v=zx2kZkM1LXA,https://www.youtube.com/watch?v=zxYAOHRS4kI,https://www.youtube.com/watch?v=zxaklajobzg,https://www.youtube.com/watch?v=zxdSPlGlSAQ,https://www.youtube.com/watch?v=zxrvNwur1RE,https://www.youtube.com/watch?v=zy7uvdwyK8k,https://www.youtube.com/watch?v=zylVBFyoxZ0,https://www.youtube.com/watch?v=zyz4uOKGTzQ


In [18]:
df_kinetic_train_json2 = json.load(open('/home/kurubal/Downloads/Kinetic/train.json'))
df_kinetic_train_json2

{'---0dWlqevI': {'annotations': {'label': 'clay pottery making',
   'segment': [19.0, 29.0]},
  'duration': 10.0,
  'subset': 'train',
  'url': 'https://www.youtube.com/watch?v=---0dWlqevI'},
 '---aQ-tA5_A': {'annotations': {'label': 'news anchoring',
   'segment': [9.0, 19.0]},
  'duration': 10.0,
  'subset': 'train',
  'url': 'https://www.youtube.com/watch?v=---aQ-tA5_A'},
 '---j12rm3WI': {'annotations': {'label': 'using bagging machine',
   'segment': [14.0, 24.0]},
  'duration': 10.0,
  'subset': 'train',
  'url': 'https://www.youtube.com/watch?v=---j12rm3WI'},
 '--07WQ2iBlw': {'annotations': {'label': 'javelin throw',
   'segment': [1.0, 11.0]},
  'duration': 10.0,
  'subset': 'train',
  'url': 'https://www.youtube.com/watch?v=--07WQ2iBlw'},
 '--0NTAs-fA0': {'annotations': {'label': 'climbing a rope',
   'segment': [29.0, 39.0]},
  'duration': 10.0,
  'subset': 'train',
  'url': 'https://www.youtube.com/watch?v=--0NTAs-fA0'},
 '--0l35AkU34': {'annotations': {'label': 'sipping cup'

#### Copy Move And Delete

In [60]:
output_file = glob.glob(f"Visual_Genome_*_Analysis.csv")
output_file

['Visual_Genome_Objects_Analysis.csv',
 'Visual_Genome_Attributes_Analysis.csv',
 'Visual_Genome_Relationships_Analysis.csv',
 'Visual_Genome_Question_Answers_Analysis.csv',
 'Visual_Genome_Region_Descriptions_Analysis.csv',
 'Visual_Genome_Images_Analysis.csv']

In [61]:
for l in output_file:
    source = l # source directory
    destination = path
    shutil.copy2(source, destination)

In [62]:
for j in output_file:
    try:
        os.remove(j)
    except:
        pass