In [2]:
# Mount Google Drive at /content/gdrive; the annotation files used below are
# read from /content/gdrive/MyDrive. `google.colab` is only available on Colab.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [3]:
import numpy as np
import pandas as pd
from PIL import Image as pilImg
import os 
import cv2
from datetime import datetime
import matplotlib.pyplot as plt
import itertools

#**PREPARING TRAIN DATA**

In [4]:
def Extract_image_names(file_path,number):
    """Collect up to ``number`` annotation entries whose label has 4 to 12 characters.

    Every non-blank line of ``file_path`` is expected to contain exactly two
    underscores with the word label between them, e.g.
    ``./2425/1/115_Lube_45484.jpg 45484``. Blank lines are ignored.

    Parameters
    ----------
    file_path : str
        Path of the annotation text file (one entry per line).
    number : int
        Maximum number of entries to return. Values <= 0 give an empty list.

    Returns
    -------
    list of str
        The selected lines in file order, stripped of surrounding whitespace
        and prefixed with ``'SynthImageDataset'``.

    Raises
    ------
    ValueError
        If a non-blank line does not split into exactly three parts on ``'_'``.
    """
    images_names=[]
    if number<=0:
        return images_names

    # Iterate over the file lazily: only the prefix needed to reach `number`
    # accepted entries is read, instead of loading every line into memory.
    with open(file_path) as f:
        for line_no,line in enumerate(f,start=1):
            entry=line.strip()
            if not entry:
                continue
            parts=entry.split('_')
            if len(parts)!=3:
                raise ValueError(
                    'Line %d of %s is not of the form <prefix>_<label>_<suffix>: %r'
                    % (line_no,file_path,entry))
            label=parts[1]
            if 4<=len(label)<=12:
                images_names.append('SynthImageDataset'+entry)
                if len(images_names)==number:
                    break
    return images_names

In [5]:

# Select up to 200000 entries (label length 4-12) from the training annotation list.
train_images= Extract_image_names('/content/gdrive/MyDrive/annotation_train.txt',200000)

In [6]:
# Show the first five raw entries; each still carries the './' path prefix and a trailing number.
train_images[:5]

['SynthImageDataset./2425/1/115_Lube_45484.jpg 45484',
 'SynthImageDataset./2425/1/114_Spencerian_73323.jpg 73323',
 'SynthImageDataset./2425/1/112_CARPENTER_11682.jpg 11682',
 'SynthImageDataset./2425/1/110_savannas_67969.jpg 67969',
 'SynthImageDataset./2425/1/109_unfix_82473.jpg 82473']

In [7]:
def clean_file_names(file_names):
    """Turn raw annotation entries into plain image paths.

    Each entry must contain exactly two dots and, after the second dot,
    exactly one space (e.g. ``'SynthImageDataset./2425/1/115_Lube_45484.jpg 45484'``).
    The first dot and everything from the space onwards are dropped, giving
    ``'SynthImageDataset/2425/1/115_Lube_45484.jpg'``.

    Parameters
    ----------
    file_names : iterable of str
        Entries as returned by ``Extract_image_names``.

    Returns
    -------
    list of str
        One cleaned path per entry, in the same order.

    Raises
    ------
    ValueError
        If an entry does not have the expected dot/space layout.
    """
    def _to_path(entry):
        # '<root>' '.' '<location>' '.' '<ext> <trailing token>'
        root,location,tail=entry.split('.')
        ext,_trailing=tail.split(' ')
        return root+location+'.'+ext

    return [_to_path(entry) for entry in file_names]

In [8]:
# Reduce the raw training entries to plain image paths.
train_cleaned=clean_file_names(train_images)

In [9]:
# Show the first five cleaned training paths.
train_cleaned[:5]

['SynthImageDataset/2425/1/115_Lube_45484.jpg',
 'SynthImageDataset/2425/1/114_Spencerian_73323.jpg',
 'SynthImageDataset/2425/1/112_CARPENTER_11682.jpg',
 'SynthImageDataset/2425/1/110_savannas_67969.jpg',
 'SynthImageDataset/2425/1/109_unfix_82473.jpg']

In [10]:
# Start the training table with a single 'ImageName' column holding the cleaned paths.
train_data=pd.DataFrame({'ImageName':train_cleaned})

In [11]:
# Preview the first rows of the training table.
train_data.head()

Unnamed: 0,ImageName
0,SynthImageDataset/2425/1/115_Lube_45484.jpg
1,SynthImageDataset/2425/1/114_Spencerian_73323.jpg
2,SynthImageDataset/2425/1/112_CARPENTER_11682.jpg
3,SynthImageDataset/2425/1/110_savannas_67969.jpg
4,SynthImageDataset/2425/1/109_unfix_82473.jpg


#**EXTRACTING GROUND TRUTH VALUES**

In [12]:
def extract_ground_truth(files):
    """Return the upper-cased word label embedded in each image path.

    Every path must contain exactly two underscores; the text between them
    is the label (``'.../115_Lube_45484.jpg'`` -> ``'LUBE'``).

    Parameters
    ----------
    files : iterable of str
        Image paths, e.g. the output of ``clean_file_names``.

    Returns
    -------
    list of str
        One upper-cased label per path, in the same order.

    Raises
    ------
    ValueError
        If a path does not split into exactly three parts on ``'_'``.
    """
    def _label_of(path):
        _before,word,_after=path.split('_')
        return word.upper()

    return list(map(_label_of,files))

In [13]:
# Upper-cased word label for every cleaned training path (same order as train_cleaned).
Train_ground_truths=extract_ground_truth(train_cleaned)

In [14]:
# Attach the labels as a new 'Labels' column; this mutates train_data in place.
train_data['Labels']=Train_ground_truths

In [15]:
# Preview the training table now that it has both 'ImageName' and 'Labels'.
train_data.head()

Unnamed: 0,ImageName,Labels
0,SynthImageDataset/2425/1/115_Lube_45484.jpg,LUBE
1,SynthImageDataset/2425/1/114_Spencerian_73323.jpg,SPENCERIAN
2,SynthImageDataset/2425/1/112_CARPENTER_11682.jpg,CARPENTER
3,SynthImageDataset/2425/1/110_savannas_67969.jpg,SAVANNAS
4,SynthImageDataset/2425/1/109_unfix_82473.jpg,UNFIX


In [16]:
# Save the training table to the current working directory.
# NOTE: to_csv also writes the row index by default, so reading this file back
# gives an extra 'Unnamed: 0' column unless index_col=0 is passed to read_csv.
train_data.to_csv('Train_data.csv')

#**VALIDATION DATA CREATION**

```
# This is formatted as code
```



In [18]:
# Select up to 12000 entries (label length 4-12) for the validation set.
# NOTE(review): this reads annotation_train.txt, the same file and in the same
# order as train_images, so these entries are the first 12000 training entries
# and the validation set overlaps the training set. Presumably a separate
# validation annotation file was intended -- confirm and change the path.
Validation_images=Extract_image_names('/content/gdrive/MyDrive/annotation_train.txt',12000)

In [19]:
# Reduce the raw validation entries to plain image paths.
val_cleaned=clean_file_names(Validation_images)

In [20]:
# Start the validation table with a single 'ImageName' column.
val_data=pd.DataFrame({'ImageName':val_cleaned})

In [21]:
# Preview the first rows of the validation table.
val_data.head()

Unnamed: 0,ImageName
0,SynthImageDataset/2425/1/115_Lube_45484.jpg
1,SynthImageDataset/2425/1/114_Spencerian_73323.jpg
2,SynthImageDataset/2425/1/112_CARPENTER_11682.jpg
3,SynthImageDataset/2425/1/110_savannas_67969.jpg
4,SynthImageDataset/2425/1/109_unfix_82473.jpg


In [22]:
# Upper-cased word label for every cleaned validation path (same order as val_cleaned).
Val_ground_truths=extract_ground_truth(val_cleaned)

In [23]:
# Attach the labels as a new 'Labels' column; this mutates val_data in place.
val_data['Labels']=Val_ground_truths

In [24]:
# Preview the validation table now that it has both 'ImageName' and 'Labels'.
val_data.head()

Unnamed: 0,ImageName,Labels
0,SynthImageDataset/2425/1/115_Lube_45484.jpg,LUBE
1,SynthImageDataset/2425/1/114_Spencerian_73323.jpg,SPENCERIAN
2,SynthImageDataset/2425/1/112_CARPENTER_11682.jpg,CARPENTER
3,SynthImageDataset/2425/1/110_savannas_67969.jpg,SAVANNAS
4,SynthImageDataset/2425/1/109_unfix_82473.jpg,UNFIX


In [25]:
# Save the validation table to the current working directory.
# NOTE: to_csv also writes the row index by default, so reading this file back
# gives an extra 'Unnamed: 0' column unless index_col=0 is passed to read_csv.
val_data.to_csv('Validation_data.csv')

#**TEST DATA CREATION**

In [26]:
# Select up to 15000 entries (label length 4-12) from the test annotation list.
test_images=Extract_image_names('/content/gdrive/MyDrive/annotation_test.txt',15000)

In [27]:
# Reduce the raw test entries to plain image paths.
test_cleaned=clean_file_names(test_images)

In [28]:
# Start the test table with a single 'ImageName' column.
test_data=pd.DataFrame({'ImageName':test_cleaned})

In [29]:
# Preview the first rows of the test table.
test_data.head()

Unnamed: 0,ImageName
0,SynthImageDataset/3000/7/182_slinking_71711.jpg
1,SynthImageDataset/3000/7/181_REMODELERS_64541.jpg
2,SynthImageDataset/3000/7/180_Chronographs_1353...
3,SynthImageDataset/3000/7/179_Impeaching_38222.jpg
4,SynthImageDataset/3000/7/177_Loots_45256.jpg


In [30]:
# Upper-cased word label for every cleaned test path (same order as test_cleaned).
test_ground_truths=extract_ground_truth(test_cleaned)

In [31]:
# Attach the labels as a new 'Labels' column; this mutates test_data in place.
test_data['Labels']=test_ground_truths

In [32]:
# Preview the test table now that it has both 'ImageName' and 'Labels'.
test_data.head()

Unnamed: 0,ImageName,Labels
0,SynthImageDataset/3000/7/182_slinking_71711.jpg,SLINKING
1,SynthImageDataset/3000/7/181_REMODELERS_64541.jpg,REMODELERS
2,SynthImageDataset/3000/7/180_Chronographs_1353...,CHRONOGRAPHS
3,SynthImageDataset/3000/7/179_Impeaching_38222.jpg,IMPEACHING
4,SynthImageDataset/3000/7/177_Loots_45256.jpg,LOOTS


In [33]:
# Save the test table to the current working directory.
# NOTE: to_csv also writes the row index by default, so reading this file back
# gives an extra 'Unnamed: 0' column unless index_col=0 is passed to read_csv.
test_data.to_csv('Test_data.csv')

**Converting Images to Grayscale, i.e. a Single Channel**

In [48]:
def img_store_single_channel(destination_folder,files):
    """Write a single-channel copy of every image and return the new paths.

    For each source path the word between the two underscores of the file
    name is used to build the output name
    ``destination_folder + '<running number>_<word>.jpg'`` (numbering starts
    at 1). ``destination_folder`` is concatenated as-is, so it must already
    end with a path separator. The image is read with ``cv2.imread`` and
    channel index 1 of the decoded array is written out at the original
    size; no resizing is performed.

    Parameters
    ----------
    destination_folder : str
        Prefix (normally a directory ending in '/') for the output files.
    files : iterable of str
        Source image paths whose file name has the form
        ``<prefix>_<word>_<suffix>``.

    Returns
    -------
    list of str
        The written file paths, in the same order as ``files``.

    Raises
    ------
    ValueError
        If a file name does not split into exactly three parts on ``'_'``.
    FileNotFoundError
        If ``cv2.imread`` cannot load a source image (it returns None).
    IOError
        If ``cv2.imwrite`` reports that the output could not be written.
    """
    start=datetime.now()
    destination_list=[]
    for count,file in enumerate(files,start=1):
        # Keep only the file name, dropping the folder structure
        Name=file.rsplit('/',1)[-1]
        _,img,_=Name.split('_')
        destination=destination_folder+str(count)+'_'+img+'.jpg'
        cv_img=cv2.imread(file)
        # imread signals a missing/corrupt file by returning None instead of raising
        if cv_img is None:
            raise FileNotFoundError('Could not read image: '+file)
        # Keep one channel (index 1) of the decoded image
        cv_img_sc=cv_img[:,:,1]
        # imwrite returns False on failure (e.g. the destination folder does not exist)
        if not cv2.imwrite(destination,cv_img_sc):
            raise IOError('Could not write image: '+destination)
        destination_list.append(destination)
    print('Time Taken for Processing: ',datetime.now() - start)
    return destination_list