### Find Word Images Mask Fade

In [39]:
import os
import sys
import pandas as pd
import numpy as np
import re
import glob
from pathlib import Path
import shutil
from os.path import isfile, join

In [40]:
# Target language for the learner; its capitalized form is used as the
# per-language project folder name in the paths built below.
lang_folder = "Turkish"  # one of: Arabic, English, French, German, Turkish, Spanish, Portuguese, Dutch, Italian
#lang_pair = "English"  # Arabic, English, French, German, Turkish, Spanish, Portuguese, Dutch, Italian ==> native language

In [41]:
# Input folder named after the "3-2" merge step. It is not referenced again in
# the cells visible here.
word_lemma_all_data_path = (
    f"/media/kurubal/SSD/Data Scientist/Work/Modern Ways/Project/{lang_folder.capitalize()}/"
    "Lemma Stem POS/Result/3-2-Word In Visual Genome Merge"
)

# Output folder of this notebook; created up front (including missing parents)
# so later copy steps can write into it.
path = (
    f"/media/kurubal/SSD/Data Scientist/Work/Modern Ways/Project/{lang_folder.capitalize()}/"
    "Lemma Stem POS/Result/3-4-Find Word Images Mask Fade"
)

Path(path).mkdir(parents=True, exist_ok=True)

In [42]:
def word_group_dataframe(df, search_list, target_column, sample_num):
    """Collect the shortest rows that contain each search string as a whole word.

    For every item of ``search_list`` the rows of ``df`` whose
    ``target_column`` text contains the item delimited by whitespace or the
    start/end of the text are selected, ordered by text length (shortest
    first) and cut to ``sample_num`` rows. The per-item results are stacked
    in ``search_list`` order.

    Args:
        df: Source dataframe; it is not modified.
        search_list: Iterable of search strings. Each one is interpolated
            into a regular expression unescaped, so regex metacharacters
            keep their special meaning.
        target_column: Name of the string column of ``df`` to search in.
            Missing values in that column never match.
        sample_num: Maximum number of rows kept per search string.

    Returns:
        A new dataframe with a fresh 0..n-1 index and a leading
        ``search_string`` column holding the item that selected the row.
        An empty dataframe when ``search_list`` is empty.

    Example:
        word_group_dataframe(df_youtube_sentence, search_list, "sentence", 6)
    """
    text = df[target_column]
    frames = []
    for search_string in search_list:
        pattern = fr"(?:\s|^){search_string}(?:\s|$)"
        df_select = df[text.str.contains(pattern, na=False, regex=True)]
        df_select = df_select.sort_values(
            target_column, key=lambda col: col.str.len()
        ).head(sample_num)
        df_select.insert(0, "search_string", search_string)
        frames.append(df_select)

    if not frames:
        return pd.DataFrame()

    # Concatenate once at the end: concatenating inside the loop re-copies the
    # whole accumulated result on every iteration.
    return pd.concat(frames, axis=0).reset_index(drop=True)

In [43]:
def word_group_dataframe_all(df, search_list, target_column):
    """Collect every row that contains each search string as a whole word.

    For every item of ``search_list`` the rows of ``df`` whose
    ``target_column`` text contains the item delimited by whitespace or the
    start/end of the text are selected and ordered by text length (shortest
    first). Unlike ``word_group_dataframe`` no per-item row limit is applied.
    The per-item results are stacked in ``search_list`` order.

    Args:
        df: Source dataframe; it is not modified.
        search_list: Iterable of search strings. Each one is interpolated
            into a regular expression unescaped, so regex metacharacters
            keep their special meaning.
        target_column: Name of the string column of ``df`` to search in.
            Missing values in that column never match.

    Returns:
        A new dataframe with a fresh 0..n-1 index and a leading
        ``search_string`` column holding the item that selected the row.
        An empty dataframe when ``search_list`` is empty.

    Example:
        word_group_dataframe_all(df_youtube_sentence, search_list, "sentence")
    """
    text = df[target_column]
    frames = []
    for search_string in search_list:
        pattern = fr"(?:\s|^){search_string}(?:\s|$)"
        df_select = df[text.str.contains(pattern, na=False, regex=True)]
        df_select = df_select.sort_values(
            target_column, key=lambda col: col.str.len()
        )
        df_select.insert(0, "search_string", search_string)
        frames.append(df_select)

    if not frames:
        return pd.DataFrame()

    # Concatenate once at the end: concatenating inside the loop re-copies the
    # whole accumulated result on every iteration.
    return pd.concat(frames, axis=0).reset_index(drop=True)

In [44]:
def take_dataframe_word_sample_from_sorting(df_source, word_list, word_source_column, sort_target_column, sort_ascending=True, sample_num=50):
    """Take up to ``sample_num`` rows per word, ranked by text length.

    For every word of ``word_list`` the rows of ``df_source`` whose
    ``word_source_column`` equals the word are selected, ordered by the
    string length of ``sort_target_column`` and cut to ``sample_num`` rows.
    The per-word results are stacked in ``word_list`` order.

    Args:
        df_source: Source dataframe; it is not modified.
        word_list: Iterable of values to look up in ``word_source_column``.
        word_source_column: Column compared for equality against each word.
        sort_target_column: String column whose text length is the sort key.
        sort_ascending: ``True`` puts the shortest text first, ``False`` the
            longest.
        sample_num: Maximum number of rows kept per word.

    Returns:
        A new dataframe with a fresh 0..n-1 index. An empty dataframe when
        ``word_list`` is empty.

    Example:
        take_dataframe_word_sample_from_sorting(
            df_genome_word_lemma_concat, word_list, "word", "search_text",
            sort_ascending=True, sample_num=50)
    """
    words = df_source[word_source_column]
    frames = []
    for word in word_list:
        df_select = df_source[words == word]
        df_select = df_select.sort_values(
            sort_target_column,
            key=lambda col: col.str.len(),
            ascending=sort_ascending,
        ).head(sample_num)
        frames.append(df_select)

    if not frames:
        return pd.DataFrame()

    # Concatenate once at the end: concatenating inside the loop re-copies the
    # whole accumulated result on every iteration.
    return pd.concat(frames, axis=0).reset_index(drop=True)

In [45]:
def create_word_folder_and_copy_image(df_source, word_list, word_source_column, image_id_column, image_folder_path, output_path_folder):
    """Create one folder per word and copy that word's images into it.

    For every word of ``word_list`` the folder ``output_path_folder/word`` is
    created (also when the word has no rows or no images). Each value in
    ``image_id_column`` of the rows whose ``word_source_column`` equals the
    word is then looked up as ``<image_id>.jpg`` exactly one directory level
    below ``image_folder_path``, and every match is copied into the word
    folder. Image ids without a matching file are skipped silently.

    Args:
        df_source: Dataframe holding the word and image id columns.
        word_list: Iterable of words to process.
        word_source_column: Column compared for equality against each word.
        image_id_column: Column holding the image file stems.
        image_folder_path: Folder whose direct sub-folders contain the
            ``.jpg`` files.
        output_path_folder: Folder under which the word folders are created.

    Returns:
        None.

    Example:
        image_path = f"/media/kurubal/SSD/Data Scientist/Work/Modern Ways/Project/{lang_folder.capitalize()}/Lemma Stem POS/Data/Visual Genome/images"
        output_path_folder = "/home/kurubal/Downloads/temp folder"
        create_word_folder_and_copy_image(df_sample_result, word_list, "word", "image_id", image_path, output_path_folder)
    """
    for word in word_list:
        word_folder = f"{output_path_folder}/{word}"
        Path(word_folder).mkdir(parents=True, exist_ok=True)

        df_select = df_source[df_source[word_source_column] == word]
        for image_id in df_select[image_id_column]:
            # The same id may exist in several sub-folders; copy every hit.
            for image_file in glob.glob(f"{image_folder_path}/*/{image_id}.jpg"):
                shutil.copy2(image_file, word_folder)

In [46]:
# Root folder of the Visual Genome images for the selected language project.
image_path = f"/media/kurubal/SSD/Data Scientist/Work/Modern Ways/Project/{lang_folder.capitalize()}/Lemma Stem POS/Data/Visual Genome/images"

In [47]:
# List the direct children of the image root (shown as the cell output).
image_folder_list = glob.glob(f"{image_path}/*")
image_folder_list

['/media/kurubal/SSD/Data Scientist/Work/Modern Ways/Project/Turkish/Lemma Stem POS/Data/Visual Genome/images/VG_100K',
 '/media/kurubal/SSD/Data Scientist/Work/Modern Ways/Project/Turkish/Lemma Stem POS/Data/Visual Genome/images/VG_100K_2']

In [48]:
# Destination root for per-word image folders; same value as in the
# create_word_folder_and_copy_image docstring example.
output_path_folder = "/home/kurubal/Downloads/temp folder"

#### Visual Genome Word Lemma All Category Data

In [49]:
# NOTE(review): the CSV path is an empty string, so this call cannot load
# anything as written. Presumably it should point at a file under
# word_lemma_all_data_path — TODO confirm the file name and restore it.
df_genome_word_lemma_all_category_concat = pd.read_csv(f"")
df_genome_word_lemma_all_category_concat

Unnamed: 0,POS1,POS2,word,lemma.spacy,stem,word_en_translate,lemma_en_translate,frequency,search_text,image_id
0,NUM,,bir,bir,bir,a,a,18835735,a,2390994
1,NUM,,bir,bir,bir,a,a,18835735,a,2348965
2,NUM,,bir,bir,bir,a,a,18835735,a,2349861
3,NUM,,bir,bir,bir,a,a,18835735,a,2349866
4,NUM,,bir,bir,bir,a,a,18835735,a,2349935
...,...,...,...,...,...,...,...,...,...,...
176316,VERB,,çekilin,çek,çek,withdraw,check,69201,airport check in kiosks,2317616
176317,VERB,,çekilin,çek,çek,withdraw,check,69201,red check of tablecloth,2400604
176318,VERB,,çekilin,çek,çek,withdraw,check,69201,a check is on the table,2386272
176319,VERB,,çekilin,çek,çek,withdraw,check,69201,cleats with white check,2371210


#### Copy Move And Delete

In [46]:
# NOTE(review): the glob pattern is an empty string, which matches nothing as
# written. The recorded cell output lists two result CSV files, so the
# original pattern presumably matched those — TODO restore the pattern.
output_file = glob.glob(f"")
output_file

['Visual_Genome_Question_Answers_Word_Result2.csv',
 'Visual_Genome_Question_Answers_Lemma_Result2.csv']

In [47]:
# Copy each listed result file (with metadata) into the notebook output folder.
for result_file in output_file:
    shutil.copy2(result_file, path)

In [48]:
# Remove the files listed in output_file on a best-effort basis: a file that
# cannot be removed is reported and skipped. Only filesystem errors are
# handled, so e.g. KeyboardInterrupt is no longer swallowed.
for j in output_file:
    try:
        os.remove(j)
    except OSError as err:
        print(f"Could not remove {j}: {err}")

#### Temp

In [19]:
# Scratch cell: load a workbook by relative path (resolved against the current
# working directory) and display it.
df_test = pd.read_excel("image_text.xlsx")
df_test

Unnamed: 0,POS1,POS2,word,lemma.spacy,stem,word_en_translate,lemma_en_translate,frequency,search_text,image_id
0,ADJ,,üstünde,üst,üst,above,top,86801,top,2371537
1,ADJ,,üstünde,üst,üst,above,top,86801,top,2328281
2,ADJ,,üstünde,üst,üst,above,top,86801,top,2328272
3,ADJ,,üstünde,üst,üst,above,top,86801,top,2328221
4,ADJ,,üstünde,üst,üst,above,top,86801,top,2328063
...,...,...,...,...,...,...,...,...,...,...
5395,NOUN,,konusunda,konu,konu,about,subject,167046,about half,2390601
5396,NOUN,,konusunda,konu,konu,about,subject,167046,about half,2402492
5397,NOUN,,konusunda,konu,konu,about,subject,167046,no subject,2318736
5398,NOUN,,konusunda,konu,konu,about,subject,167046,no subject,2356446
