### Find Word Images Mask Fade

In [1]:
import cv2
import PIL
from PIL import Image, ImageFilter, ImageOps
import matplotlib.pyplot as plt
import os
import sys
import pandas as pd
import numpy as np
import re
import glob
from pathlib import Path
import shutil
from os.path import isfile, join

In [2]:
# Target language for the learner; selects the project sub-folder in every path below.
lang_folder = "Turkish"  # Arabic, English, French, German, Turkish, Spanish, Portuguese, Dutch, Italian ==> target language for learner
#lang_pair = "English"  # Arabic, English, French, German, Turkish, Spanish, Portuguese, Dutch, Italian ==> native language

# Number embedded in the input/output file names (Visual_Genome_{file_ext}_...).
file_ext = 1000

# Minimum bounding-box size: rows whose "height" / "width" are below these values are filtered out.
x_height_min = 1
y_width_min = 1

In [3]:
#word_lemma_all_data_path = f"/media/kurubal/SSD/Data Scientist/Work/Modern Ways/Project/{lang_folder.capitalize()}/\
#Lemma Stem POS/Result/3-2-Word In Visual Genome Merge"

# Folder with the pre-processed Visual Genome CSV files that are read further down.
visual_genome_process_data_path = (
    f"/media/kurubal/SSD/Data Scientist/Work/Modern Ways/Project/{lang_folder.capitalize()}/"
    "Lemma Stem POS/Result/3-0-Visual Genome Process"
)

# Result folder of this notebook; the final CSV is copied here.
path = (
    f"/media/kurubal/SSD/Data Scientist/Work/Modern Ways/Project/{lang_folder.capitalize()}/"
    "Lemma Stem POS/Result/3-3-Find Word Images Mask Fade"
)

# Create the result folder (and any missing parents); an existing folder is fine.
os.makedirs(path, exist_ok=True)

In [4]:
def image_crop_blur(image_file, x_koor, y_koor, height, width, blur_radius=5, blur_part="outside", border_color="orange", border = (3, 3, 3, 3)):
    '''Highlight a rectangular region of a picture by blurring inside or outside of it.

    The region is cropped with the PIL box
    (x_koor, y_koor, x_koor + width, y_koor + height), framed with a coloured
    border and pasted back so that the frame sits just outside the region.

    Parameters
    ----------
    image_file : path of the picture to open.
    x_koor, y_koor : left and top pixel coordinates of the region.
    height, width : size of the region in pixels.
    blur_radius : radius handed to ImageFilter.GaussianBlur; larger values blur more.
    blur_part : "outside" blurs the whole picture and pastes the sharp framed
        region on top; "inside" blurs the framed region (frame included) and
        pastes it onto the sharp picture; any other value returns the picture
        unchanged.
    border_color : fill colour of the frame.
    border : frame thickness, either one int, a (horizontal, vertical) pair or
        a (left, top, right, bottom) 4-tuple, the same forms ImageOps.expand accepts.

    Returns
    -------
    A PIL image held in memory; the source file is closed before returning.

    Example
    -------
    image_crop_blur("/media/ssd/sample/895152896.jpg", x_koor=25, y_koor=10,
                    height=150, width=120, blur_radius=7, blur_part="outside")
    '''
    # Normalise the border to (left, top, right, bottom) so the paste offset
    # below can always be derived from it.
    if isinstance(border, int):
        left = top = right = bottom = border
    elif len(border) == 2:
        left, top = border
        right, bottom = left, top
    else:
        left, top, right, bottom = border

    # Image.open is lazy; copy the pixels while the file is open so the handle
    # is released right away, even if a later step raises.
    with Image.open(f"{image_file}") as source_image:
        img = source_image.copy()

    if blur_part not in ("outside", "inside"):
        return img

    # Box order is left, top, right, bottom.
    cropped_image = img.crop((x_koor, y_koor, x_koor+width, y_koor+height))
    cropped_image_border = ImageOps.expand(cropped_image, border=(left, top, right, bottom), fill=border_color)
    # Upper-left corner of the framed region; the pasted size follows from the image itself.
    paste_corner = (x_koor-left, y_koor-top)

    if blur_part == "outside":
        blurred_img = img.filter(ImageFilter.GaussianBlur(radius=blur_radius))
        blurred_img.paste(cropped_image_border, paste_corner)
        return blurred_img

    blurred_img = cropped_image_border.filter(ImageFilter.GaussianBlur(radius=blur_radius))
    img.paste(blurred_img, paste_corner)
    return img

In [5]:
def image_id_koor_path_crop_blur(df, image_id_column, image_id, image_path, output_path, image_path_addition="*", xheight="height", ywidth="width",
                                xkoor="x_koor", ykoor="y_koor", num_col="num", blur_radius=4, blur_part="outside"):
    '''Render one highlighted picture per bounding box of a single image.

    Rows of df whose image_id_column equals image_id and whose four coordinate
    columns are all present are processed. For each such row the source picture
    is passed to image_crop_blur and the result is saved as
    "{output_path}/{image_id}_{num}.jpg".

    Parameters
    ----------
    df : DataFrame holding the image id, coordinate and num columns.
    image_id_column : name of the column that stores the image id.
    image_id : id to process; the source file is looked up as
        "{image_path}/{image_path_addition}/{image_id}.jpg" with glob.glob.
    image_path : root folder of the source pictures.
    output_path : folder the rendered pictures are written to.
    image_path_addition : glob pattern for the sub-folder level below image_path.
    xheight, ywidth, xkoor, ykoor : names of the height, width, x and y columns.
    num_col : name of the column whose value is appended to the output file name.
    blur_radius, blur_part : forwarded to image_crop_blur.

    Raises
    ------
    IndexError : rows exist for image_id but no matching source file was found.

    Example
    -------
    image_id_koor_path_crop_blur(df_genome_word_lemma_all_category_concat_dropna_select, "image_id", image_id, image_path, mask_blur_image_path,
                                image_path_addition="*", xheight="height", ywidth="width", xkoor="x_koor", ykoor="y_koor", num_col="num", blur_radius=4, blur_part="outside")
    '''
    coordinate_columns = [xheight, ywidth, xkoor, ykoor]
    df_dropna = df.dropna(subset=coordinate_columns)
    df_var_result = df_dropna[df_dropna[image_id_column] == image_id]
    if df_var_result.empty:
        # Nothing to render; the source file is not looked up in this case.
        return

    # The source file is the same for every row, so resolve it once.
    matching_files = glob.glob(f"{image_path}/{image_path_addition}/{image_id}.jpg")
    if not matching_files:
        raise IndexError(f"no image file found for image_id {image_id!r} under {image_path!r}")
    image_path_file_result = matching_files[0]

    # Plain tuples give scalar values per row; int() on a one-element Series is
    # deprecated in recent pandas versions.
    boxes = df_var_result[[num_col] + coordinate_columns]
    for num, height, width, x_koor, y_koor in boxes.itertuples(index=False, name=None):
        image_out = image_crop_blur(image_path_file_result, int(x_koor), int(y_koor), int(height), int(width),
                                    blur_radius=blur_radius, blur_part=blur_part)
        image_out.save(f"{output_path}/{image_id}_{int(num)}.jpg")

##### Image Path

In [6]:
# Root folder of the Visual Genome pictures and the sub-folder for the rendered results.
image_path = f"/media/kurubal/SSD/Data Scientist/Work/Modern Ways/Project/{lang_folder.capitalize()}/Lemma Stem POS/Data/Visual Genome/images"
mask_blur_image_path = image_path + "/VG_Mask_Blur"

# Create the output folder (and any missing parents); an existing folder is fine.
os.makedirs(mask_blur_image_path, exist_ok=True)

In [7]:
# List every entry directly below image_path (glob does not sort its result).
image_folder_list = glob.glob(f"{image_path}/*")
image_folder_list

['/media/kurubal/SSD/Data Scientist/Work/Modern Ways/Project/Turkish/Lemma Stem POS/Data/Visual Genome/images/VG_100K',
 '/media/kurubal/SSD/Data Scientist/Work/Modern Ways/Project/Turkish/Lemma Stem POS/Data/Visual Genome/images/VG_100K_2',
 '/media/kurubal/SSD/Data Scientist/Work/Modern Ways/Project/Turkish/Lemma Stem POS/Data/Visual Genome/images/VG_Mask_Blur',
 '/media/kurubal/SSD/Data Scientist/Work/Modern Ways/Project/Turkish/Lemma Stem POS/Data/Visual Genome/images/VG_Mask_Blur2']

#### Visual Genome Word Lemma All Category Data

In [8]:
# Word/lemma table that was merged with the Visual Genome data in the previous step.
df_genome_word_lemma_all_category_concat = pd.read_excel(
    f"/media/kurubal/SSD/Data Scientist/Work/Modern Ways/Project/{lang_folder.capitalize()}/"
    "Lemma Stem POS/Result/3-2-Word In Visual Genome Merge/"
    f"Visual_Genome_{file_ext}_Word_Lemma_All_Category_Concat_Result.xlsx"
)
df_genome_word_lemma_all_category_concat

Unnamed: 0,POS1,POS2,word,lemma.spacy,stem,word_en_translate,lemma_en_translate,frequency,object_id,names,...,y_koor,num,attributes,sub_name,sub_object_id,questions,que_and_ans_id,answers,phrases,region_id
0,NUM,,bir,bir,bir,a,a,18835735,4446407.0,a,...,305.0,2920126305,,,,,,,,
1,NUM,,bir,bir,bir,a,a,18835735,3904298.0,a,...,113.0,129190150113,,,,,,,,
2,NUM,,bir,bir,bir,a,a,18835735,3904300.0,a,...,233.0,123317182233,,,,,,,,
3,NUM,,bir,bir,bir,a,a,18835735,3902259.0,a,...,168.0,4615142168,,,,,,,,
4,NUM,,bir,bir,bir,a,a,18835735,3902256.0,a,...,97.0,54188097,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
197127,VERB,,çekilin,çek,çek,withdraw,check,69201,,,...,37.0,276026737,,,,,,,airport check in kiosks,4492506.0
197128,VERB,,çekilin,çek,çek,withdraw,check,69201,,,...,459.0,376517459,,,,,,,red check of tablecloth,726657.0
197129,VERB,,çekilin,çek,çek,withdraw,check,69201,,,...,236.0,95132301236,,,,,,,a check is on the table,1814370.0
197130,VERB,,çekilin,çek,çek,withdraw,check,69201,,,...,194.0,2211405194,,,,,,,cleats with white check,2132765.0


In [9]:
# Distinct image ids and "num" keys of the word/lemma table; used below as isin() filters.
select_image_id_list = [*set(df_genome_word_lemma_all_category_concat.loc[:, "image_id"])]
select_num_list = [*set(df_genome_word_lemma_all_category_concat.loc[:, "num"])]

In [10]:
# Number of rows per value of the POS1 column.
df_genome_word_lemma_all_category_concat["POS1"].value_counts()

NOUN     87123
VERB     51815
ADJ      24838
PRON     16993
ADV       7351
NUM       3024
CCONJ     2945
ADP       1606
AUX       1437
Name: POS1, dtype: int64

##### Visual Genome Coordinate Data For Crop Blur

In [11]:
# Load only the image id, bounding box and "num" key of every object.
# The recorded run emitted a pandas warning on this read (presumably the
# mixed-type DtypeWarning for the text columns), and those columns were dropped
# right after loading anyway, so they are no longer parsed at all.
df_objects = pd.read_csv(
    f"{visual_genome_process_data_path}/Visual_Genome_Objects_Analysis.csv",
    usecols=["image_id", "height", "width", "x_koor", "y_koor", "num"],
)
df_objects

  df_objects = pd.read_csv(f"{visual_genome_process_data_path}/Visual_Genome_Objects_Analysis.csv")


Unnamed: 0,image_id,height,width,x_koor,y_koor,num
0,1,557,799,0,0,55779900
1,1,290,722,78,308,29072278308
2,1,538,222,1,0,53822210
3,1,258,359,439,283,258359439283
4,1,535,135,0,1,53513501
...,...,...,...,...,...,...
2516934,2417997,24,28,188,228,2428188228
2516935,2417997,54,89,238,233,5489238233
2516936,2417997,83,90,312,247,8390312247
2516937,2417997,121,495,2,208,1214952208


In [12]:
# Attribute boxes: keep only the image id, bounding box and "num" key.
df_attributes = (
    pd.read_csv(f"{visual_genome_process_data_path}/Visual_Genome_Attributes_Analysis.csv")
    .drop(columns=['object_id', 'names', 'attributes', 'synsets'])
)
df_attributes

Unnamed: 0,image_id,height,width,x_koor,y_koor,num
0,1,339,79,421,91,3397942191
1,1,262,714,77,328,26271477328
2,1,192,274,119,338,192274119338
3,1,262,60,238,249,26260238249
4,1,26,52,243,489,2652243489
...,...,...,...,...,...,...
3802369,2417997,170,497,1,2,17049712
3802370,2417997,83,90,312,247,8390312247
3802371,2417997,144,50,98,127,1445098127
3802372,2417997,155,225,29,109,15522529109


In [13]:
# Relationship boxes: drop the object side and the text columns, then rename the
# subject box columns so they line up with the other coordinate tables.
df_relationships = (
    pd.read_csv(f"{visual_genome_process_data_path}/Visual_Genome_Relationships_Analysis.csv")
    .drop(columns=[
        'obj_names', 'obj_object_id', 'obj_synsets', 'obj_merged_object_ids',
        'obj_height', 'obj_width', 'obj_x_koor', 'obj_y_koor',
        'relationships_id', 'synsets', 'sub_name', 'sub_object_id', 'sub_synsets', 'predicate',
    ])
    .rename(columns={"sub_height": "height", "sub_width": "width", "sub_x_koor": "x_koor", "sub_y_koor": "y_koor"})
)
df_relationships

Unnamed: 0,image_id,height,width,x_koor,y_koor,num
0,1,192,274,119,338,192274119338
1,1,262,60,238,249,26260238249
2,1,98,74,479,315,9874479315
3,1,182,88,118,13,1828811813
4,1,327,87,622,234,32787622234
...,...,...,...,...,...,...
2316099,2417997,54,89,238,233,5489238233
2316100,2417997,182,287,14,110,18228714110
2316101,2417997,193,289,14,110,19328914110
2316102,2417997,193,289,14,110,19328914110


In [14]:
#df_question_answers = pd.read_csv(f"{visual_genome_process_data_path}/Visual_Genome_Question_Answers_Analysis.csv") # Not Include x and y coor
#df_question_answers = df_question_answers.drop(['questions','que_and_ans_id','answers'], axis=1)
#df_question_answers

In [15]:
# Load only the image id, bounding box and "num" key of every region description.
# The recorded run emitted a pandas warning on this read (presumably the
# mixed-type DtypeWarning for the text columns), and those columns were dropped
# right after loading anyway, so they are no longer parsed at all.
df_region_descriptions = pd.read_csv(
    f"{visual_genome_process_data_path}/Visual_Genome_Region_Descriptions_Analysis.csv",
    usecols=["image_id", "height", "width", "x_koor", "y_koor", "num"],
)
df_region_descriptions

  df_region_descriptions = pd.read_csv(f"{visual_genome_process_data_path}/Visual_Genome_Region_Descriptions_Analysis.csv")


Unnamed: 0,image_id,height,width,x_koor,y_koor,num
0,1,139,82,421,57,1398242157
1,1,109,182,194,372,109182194372
2,1,30,61,241,491,3061241491
3,1,36,36,617,377,3636617377
4,1,49,41,322,298,4941322298
...,...,...,...,...,...,...
5408684,2417997,181,303,0,111,1813030111
5408685,2417997,15,23,191,231,1523191231
5408686,2417997,13,17,259,247,1317259247
5408687,2417997,112,317,180,218,112317180218


In [16]:
# Stack the four coordinate tables, remove exact duplicate rows and renumber the index.
df_genome_all_coor_data_concat = (
    pd.concat([df_objects, df_attributes, df_relationships, df_region_descriptions], axis=0)
    .drop_duplicates()
    .reset_index(drop=True)
)
df_genome_all_coor_data_concat

Unnamed: 0,image_id,height,width,x_koor,y_koor,num
0,1,557,799,0,0,55779900
1,1,290,722,78,308,29072278308
2,1,538,222,1,0,53822210
3,1,258,359,439,283,258359439283
4,1,535,135,0,1,53513501
...,...,...,...,...,...,...
11123953,2417997,181,303,0,111,1813030111
11123954,2417997,15,23,191,231,1523191231
11123955,2417997,13,17,259,247,1317259247
11123956,2417997,112,317,180,218,112317180218


In [17]:
# Keep only rows in which all four box columns are present.
df_genome_all_coor_data_concat_dropna = df_genome_all_coor_data_concat.dropna(
    axis=0,
    how="any",
    subset=["height", "width", "x_koor", "y_koor"],
)
df_genome_all_coor_data_concat_dropna

Unnamed: 0,image_id,height,width,x_koor,y_koor,num
0,1,557,799,0,0,55779900
1,1,290,722,78,308,29072278308
2,1,538,222,1,0,53822210
3,1,258,359,439,283,258359439283
4,1,535,135,0,1,53513501
...,...,...,...,...,...,...
11123953,2417997,181,303,0,111,1813030111
11123954,2417997,15,23,191,231,1523191231
11123955,2417997,13,17,259,247,1317259247
11123956,2417997,112,317,180,218,112317180218


In [18]:
# Keep boxes that reach the configured minimum height and width.
df_genome_all_coor_data_concat_dropna_limit = df_genome_all_coor_data_concat_dropna.loc[
    df_genome_all_coor_data_concat_dropna["height"].ge(x_height_min)
    & df_genome_all_coor_data_concat_dropna["width"].ge(y_width_min)
]
df_genome_all_coor_data_concat_dropna_limit

Unnamed: 0,image_id,height,width,x_koor,y_koor,num
0,1,557,799,0,0,55779900
1,1,290,722,78,308,29072278308
2,1,538,222,1,0,53822210
3,1,258,359,439,283,258359439283
4,1,535,135,0,1,53513501
...,...,...,...,...,...,...
11123953,2417997,181,303,0,111,1813030111
11123954,2417997,15,23,191,231,1523191231
11123955,2417997,13,17,259,247,1317259247
11123956,2417997,112,317,180,218,112317180218


In [19]:
# Keep boxes whose "num" key and image id both occur in the word/lemma table.
df_genome_all_coor_data_concat_dropna_select = df_genome_all_coor_data_concat_dropna_limit.loc[
    df_genome_all_coor_data_concat_dropna_limit["num"].isin(select_num_list)
    & df_genome_all_coor_data_concat_dropna_limit["image_id"].isin(select_image_id_list)
]
df_genome_all_coor_data_concat_dropna_select

Unnamed: 0,image_id,height,width,x_koor,y_koor,num
11,1,248,82,367,264,24882367264
15,1,164,80,719,342,16480719342
16,1,164,70,716,345,16470716345
59,3,245,506,131,233,245506131233
67,3,138,113,525,95,13811352595
...,...,...,...,...,...,...
11122144,2417961,60,25,286,41,602528641
11122530,2417969,81,40,289,213,8140289213
11122585,2417970,13,4,64,226,13464226
11122630,2417971,134,497,0,197,1344970197


In [20]:
# Distinct image ids that still have at least one selected box.
image_id_list = [*set(df_genome_all_coor_data_concat_dropna_select.loc[:, "image_id"])]
len(image_id_list)

32283

In [132]:
# Render the highlighted pictures for every selected image. A failure on one
# image must not stop the run, but it is recorded instead of silently dropped.
# Catching Exception (not a bare except) keeps Ctrl-C / kernel interrupt working.
failed_image_ids = {}
for image_id in image_id_list:
    try:
        image_id_koor_path_crop_blur(df_genome_all_coor_data_concat_dropna_select,"image_id",image_id,image_path,mask_blur_image_path,
                                image_path_addition="*",xheight="height",ywidth="width",xkoor="x_koor",ykoor="y_koor",num_col="num",blur_radius=6,blur_part="outside")
    except Exception as error:
        failed_image_ids[image_id] = repr(error)

print(f"{len(failed_image_ids)} of {len(image_id_list)} images could not be processed")

##### Word Find In Coordinate Data

In [20]:
# Show the word/lemma table again before matching it against the coordinate data.
df_genome_word_lemma_all_category_concat

Unnamed: 0,POS1,POS2,word,lemma.spacy,stem,word_en_translate,lemma_en_translate,frequency,object_id,names,...,y_koor,num,attributes,sub_name,sub_object_id,questions,que_and_ans_id,answers,phrases,region_id
0,NUM,,bir,bir,bir,a,a,18835735,4446407.0,a,...,305.0,2920126305,,,,,,,,
1,NUM,,bir,bir,bir,a,a,18835735,3904298.0,a,...,113.0,129190150113,,,,,,,,
2,NUM,,bir,bir,bir,a,a,18835735,3904300.0,a,...,233.0,123317182233,,,,,,,,
3,NUM,,bir,bir,bir,a,a,18835735,3902259.0,a,...,168.0,4615142168,,,,,,,,
4,NUM,,bir,bir,bir,a,a,18835735,3902256.0,a,...,97.0,54188097,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
197127,VERB,,çekilin,çek,çek,withdraw,check,69201,,,...,37.0,276026737,,,,,,,airport check in kiosks,4492506.0
197128,VERB,,çekilin,çek,çek,withdraw,check,69201,,,...,459.0,376517459,,,,,,,red check of tablecloth,726657.0
197129,VERB,,çekilin,çek,çek,withdraw,check,69201,,,...,236.0,95132301236,,,,,,,a check is on the table,1814370.0
197130,VERB,,çekilin,çek,çek,withdraw,check,69201,,,...,194.0,2211405194,,,,,,,cleats with white check,2132765.0


In [21]:
# Size of the set built from the image_id column of the word/lemma table.
len(set(df_genome_word_lemma_all_category_concat["image_id"]))

48828

In [22]:
# Keep rows with a complete bounding box and reduce the table to the columns needed for the merge.
df_genome_word_lemma_all_category_concat_dropna = (
    df_genome_word_lemma_all_category_concat
    .dropna(subset=["height", "width", "x_koor", "y_koor"])
    .loc[:, ["num", "word", "lemma.spacy", "stem", "word_en_translate", "lemma_en_translate",
             "image_id", "height", "width", "x_koor", "y_koor"]]
)
df_genome_word_lemma_all_category_concat_dropna

Unnamed: 0,num,word,lemma.spacy,stem,word_en_translate,lemma_en_translate,image_id,height,width,x_koor,y_koor
0,2920126305,bir,bir,bir,a,a,2390994,29.0,20.0,126.0,305.0
1,129190150113,bir,bir,bir,a,a,2348965,129.0,190.0,150.0,113.0
2,123317182233,bir,bir,bir,a,a,2348965,123.0,317.0,182.0,233.0
3,4615142168,bir,bir,bir,a,a,2349861,46.0,15.0,142.0,168.0
4,54188097,bir,bir,bir,a,a,2349866,54.0,188.0,0.0,97.0
...,...,...,...,...,...,...,...,...,...,...,...
197127,276026737,çekilin,çek,çek,withdraw,check,2317616,27.0,60.0,267.0,37.0
197128,376517459,çekilin,çek,çek,withdraw,check,2400604,37.0,65.0,17.0,459.0
197129,95132301236,çekilin,çek,çek,withdraw,check,2386272,95.0,132.0,301.0,236.0
197130,2211405194,çekilin,çek,çek,withdraw,check,2371210,22.0,11.0,405.0,194.0


In [23]:
# Keep word rows whose box reaches the configured minimum height and width.
df_genome_word_lemma_all_category_concat_dropna_select = df_genome_word_lemma_all_category_concat_dropna.loc[
    df_genome_word_lemma_all_category_concat_dropna["height"].ge(x_height_min)
    & df_genome_word_lemma_all_category_concat_dropna["width"].ge(y_width_min)
]
df_genome_word_lemma_all_category_concat_dropna_select

Unnamed: 0,num,word,lemma.spacy,stem,word_en_translate,lemma_en_translate,image_id,height,width,x_koor,y_koor
0,2920126305,bir,bir,bir,a,a,2390994,29.0,20.0,126.0,305.0
1,129190150113,bir,bir,bir,a,a,2348965,129.0,190.0,150.0,113.0
2,123317182233,bir,bir,bir,a,a,2348965,123.0,317.0,182.0,233.0
3,4615142168,bir,bir,bir,a,a,2349861,46.0,15.0,142.0,168.0
4,54188097,bir,bir,bir,a,a,2349866,54.0,188.0,0.0,97.0
...,...,...,...,...,...,...,...,...,...,...,...
197127,276026737,çekilin,çek,çek,withdraw,check,2317616,27.0,60.0,267.0,37.0
197128,376517459,çekilin,çek,çek,withdraw,check,2400604,37.0,65.0,17.0,459.0
197129,95132301236,çekilin,çek,çek,withdraw,check,2386272,95.0,132.0,301.0,236.0
197130,2211405194,çekilin,çek,çek,withdraw,check,2371210,22.0,11.0,405.0,194.0


In [24]:
# Inner-join the word rows with the selected coordinate rows on image id, box and "num",
# then remove exact duplicate rows.
df_genome_word_lemma_all_category_concat_dropna_select_merge = (
    df_genome_word_lemma_all_category_concat_dropna_select
    .merge(
        df_genome_all_coor_data_concat_dropna_select,
        how="inner",
        on=["image_id", "height", "width", "x_koor", "y_koor", "num"],
    )
    .drop_duplicates()
)
df_genome_word_lemma_all_category_concat_dropna_select_merge

Unnamed: 0,num,word,lemma.spacy,stem,word_en_translate,lemma_en_translate,image_id,height,width,x_koor,y_koor
0,2920126305,bir,bir,bir,a,a,2390994,29.0,20.0,126.0,305.0
1,129190150113,bir,bir,bir,a,a,2348965,129.0,190.0,150.0,113.0
2,123317182233,bir,bir,bir,a,a,2348965,123.0,317.0,182.0,233.0
3,4615142168,bir,bir,bir,a,a,2349861,46.0,15.0,142.0,168.0
4,54188097,bir,bir,bir,a,a,2349866,54.0,188.0,0.0,97.0
...,...,...,...,...,...,...,...,...,...,...,...
97131,2034269302,çekilin,çek,çek,withdraw,check,2318219,20.0,34.0,269.0,302.0
97132,4612198435,çekilin,çek,çek,withdraw,check,2333148,461.0,219.0,84.0,35.0
97133,276026737,çekilin,çek,çek,withdraw,check,2317616,27.0,60.0,267.0,37.0
97134,376517459,çekilin,çek,çek,withdraw,check,2400604,37.0,65.0,17.0,459.0


In [25]:
# Write the merged table, without the index, to the current working directory;
# the cells below copy it to `path` and then delete this local file.
df_genome_word_lemma_all_category_concat_dropna_select_merge.to_csv(f"Visual_Genome_{file_ext}_Word_Lemma_Coordinate_Search_Result.csv", index=False)

#### Copy Move And Delete

In [26]:
# Look up the result CSV in the current working directory; the list is empty if it is not there.
output_file = glob.glob(f"Visual_Genome_{file_ext}_Word_Lemma_Coordinate_Search_Result.csv")
output_file

['Visual_Genome_1000_Word_Lemma_Coordinate_Search_Result.csv']

In [27]:
# Copy every result file (metadata included) into the result folder.
for result_file in output_file:
    shutil.copy2(result_file, path)

In [28]:
# Delete the local copies now that they are stored in the result folder.
for j in output_file:
    try:
        os.remove(j)
    except FileNotFoundError:
        # Already gone, e.g. the cell was run twice; nothing left to clean up.
        pass
    except OSError as error:
        # Keep the clean-up best-effort, but do not hide real problems such as permissions.
        print(f"Could not delete {j}: {error}")