# Image Dimensions

In [1]:
import numpy as np
import pandas as pd
from PIL import Image
import cv2
import os
import math
import matplotlib.pyplot as plt

In [2]:

# Load the test, train and validation CSVs of the im2latex archive.
test_data, train_data, val_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        'archive/im2latex_test.csv',
        'archive/im2latex_train.csv',
        'archive/im2latex_validate.csv',
    )
)

In [3]:
def Write_Image_Sizes(filenames,storage_file,image_dir="archive/formula_images_processed/formula_images_processed/"):
    """
    Takes the file names and writes the height and width of each image to a csv
    along with the file name.

    Parameters:
        filenames: iterable of image file names (strings), relative to image_dir.
        storage_file: path of the csv to create; an existing file is overwritten.
            The csv has the columns ImageName,Height,Width.
        image_dir: prefix prepended to every file name before reading it.
            Defaults to the processed formula image folder.

    Raises:
        FileNotFoundError: if an image cannot be read (cv2.imread returns None
            instead of raising for missing or unreadable files).
    """
    # Create the target folder up front so a fresh checkout can run the cell.
    parent_dir=os.path.dirname(storage_file)
    if parent_dir:
        os.makedirs(parent_dir,exist_ok=True)

    # The context manager closes the csv even if an image fails part-way through.
    with open(storage_file,'w') as store_file:
        store_file.write("ImageName,Height,Width\n")
        for cnt,file in enumerate(filenames,start=1):
            image_path=image_dir + file
            cv_img=cv2.imread(image_path)
            if cv_img is None:
                raise FileNotFoundError("Could not read image: "+image_path)
            #img.shape gives (img_height,img_width,img_channel)
            img_height,img_width=cv_img.shape[0],cv_img.shape[1]
            store_file.write(str(file)+","+str(img_height)+","+str(img_width)+"\n")
            if cnt%10000==0:
                print("Processed Images: ",cnt)

In [4]:
# Pull the 'image' column of every split into a plain Python list of file names.
train_image_names, val_image_names, test_image_names = (
    list(split_table['image'].values)
    for split_table in (train_data, val_data, test_data)
)

In [5]:
# Record the height/width of every training image.
Write_Image_Sizes(
    filenames=train_image_names,
    storage_file='data/data_analysis/Train_image_sizes.csv',
)

Processed Images:  10000
Processed Images:  20000
Processed Images:  30000
Processed Images:  40000
Processed Images:  50000
Processed Images:  60000
Processed Images:  70000


In [6]:
# Record the height/width of every validation image.
Write_Image_Sizes(
    filenames=val_image_names,
    storage_file='data/data_analysis/Val_image_sizes.csv',
)

In [7]:
# Record the height/width of every test image.
Write_Image_Sizes(
    filenames=test_image_names,
    storage_file='data/data_analysis/Test_image_sizes.csv',
)

Processed Images:  10000


In [8]:

# Read the per-split size tables (ImageName, Height, Width) written above.
train_img_size, val_img_size, test_img_size = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/data_analysis/Train_image_sizes.csv',
        'data/data_analysis/Val_image_sizes.csv',
        'data/data_analysis/Test_image_sizes.csv',
    )
)

In [9]:
# Fraction of test images wider than 480 px (reported once per column).
print(test_img_size.loc[test_img_size["Width"]>480].count() / len(test_img_size))

ImageName    0.076678
Height       0.076678
Width        0.076678
dtype: float64


In [10]:
# Print the 90th..100th width percentiles of every split, with a rule between splits.
for split_no, (prefix, size_table) in enumerate((
        ("Train Images Width ", train_img_size),
        ("Validation Images Width ", val_img_size),
        ("Test Images Width ", test_img_size))):
    if split_no > 0:
        print("="*60)
    widths = size_table['Width'].values
    for i in range(11):
        print(prefix+str(90+i)+ " percentile :",np.percentile(widths,90+i))
   

Train Images Width 90 percentile : 480.0
Train Images Width 91 percentile : 480.0
Train Images Width 92 percentile : 480.0
Train Images Width 93 percentile : 480.0
Train Images Width 94 percentile : 480.0
Train Images Width 95 percentile : 480.0
Train Images Width 96 percentile : 480.0
Train Images Width 97 percentile : 480.0
Train Images Width 98 percentile : 480.0
Train Images Width 99 percentile : 480.0
Train Images Width 100 percentile : 480.0
Validation Images Width 90 percentile : 480.0
Validation Images Width 91 percentile : 480.0
Validation Images Width 92 percentile : 480.0
Validation Images Width 93 percentile : 480.0
Validation Images Width 94 percentile : 480.0
Validation Images Width 95 percentile : 480.0
Validation Images Width 96 percentile : 480.0
Validation Images Width 97 percentile : 480.0
Validation Images Width 98 percentile : 480.0
Validation Images Width 99 percentile : 480.0
Validation Images Width 100 percentile : 480.0
Test Images Width 90 percentile : 480.0
T

In [11]:
# Print the 90th..100th height percentiles of every split, with a rule between splits.
for split_no, (prefix, size_table) in enumerate((
        ("Train Images Height ", train_img_size),
        ("Validation Images Height ", val_img_size),
        ("Test Images Height ", test_img_size))):
    if split_no > 0:
        print("="*60)
    heights = size_table['Height'].values
    for i in range(11):
        print(prefix+str(90+i)+ " percentile :",np.percentile(heights,90+i))
   

Train Images Height 90 percentile : 64.0
Train Images Height 91 percentile : 64.0
Train Images Height 92 percentile : 64.0
Train Images Height 93 percentile : 64.0
Train Images Height 94 percentile : 64.0
Train Images Height 95 percentile : 64.0
Train Images Height 96 percentile : 64.0
Train Images Height 97 percentile : 64.0
Train Images Height 98 percentile : 64.0
Train Images Height 99 percentile : 96.0
Train Images Height 100 percentile : 160.0
Validation Images Height 90 percentile : 64.0
Validation Images Height 91 percentile : 64.0
Validation Images Height 92 percentile : 64.0
Validation Images Height 93 percentile : 64.0
Validation Images Height 94 percentile : 64.0
Validation Images Height 95 percentile : 64.0
Validation Images Height 96 percentile : 64.0
Validation Images Height 97 percentile : 64.0
Validation Images Height 98 percentile : 64.0
Validation Images Height 99 percentile : 96.0
Validation Images Height 100 percentile : 160.0
Test Images Height 90 percentile : 96.0

val/train:
=> 98% of images have a height of 64 or less and a width of 480 or less
=> the smallest dimensions are (32, 128) and the largest are (160, 480), given as (height, width)


In [12]:
# Fraction of train images wider than 480 px.
# The denominator must be the train set size (it previously used the test set size).
train_img_size[train_img_size["Width"]>480].count() / train_img_size.shape[0]

ImageName    0.0
Height       0.0
Width        0.0
dtype: float64

In [13]:
# Summary statistics (count, mean, std, min, quartiles, max) of the validation image heights and widths.
val_img_size.describe()

Unnamed: 0,Height,Width
count,8370.0,8370.0
mean,52.656631,283.075747
std,17.190752,108.656045
min,32.0,128.0
25%,32.0,192.0
50%,64.0,256.0
75%,64.0,384.0
max,160.0,480.0


In [14]:
# Summary statistics (count, mean, std, min, quartiles, max) of the train image heights and widths.
train_img_size.describe()

Unnamed: 0,Height,Width
count,75275.0,75275.0
mean,52.868549,281.247586
std,17.024796,108.426687
min,32.0,128.0
25%,32.0,192.0
50%,64.0,256.0
75%,64.0,384.0
max,160.0,480.0


In [15]:
# Fraction of test images taller than 96 px (reported once per column).
test_img_size.loc[test_img_size["Height"]>96].count() / len(test_img_size)

ImageName    0.087011
Height       0.087011
Width        0.087011
dtype: float64

In [16]:
# Fraction of test images wider than 480 px (reported once per column).
test_img_size.loc[test_img_size["Width"]>480].count() / len(test_img_size)

ImageName    0.076678
Height       0.076678
Width        0.076678
dtype: float64

test:
=> 1% of the images are larger than (800, 128) (only 0.4%)
=> 8.7% of the images have a height of more than 96
=> 7.7% of the images have a width of more than 480

In [17]:
# Summary statistics (count, mean, std, min, quartiles, max) of the test image heights and widths.
test_img_size.describe()

Unnamed: 0,Height,Width
count,10355.0,10355.0
mean,60.64703,318.621729
std,41.887919,159.238145
min,32.0,128.0
25%,32.0,192.0
50%,64.0,320.0
75%,64.0,384.0
max,800.0,800.0


=> a good target image size is (480, 96), given as (width, height); with this we would have to crop (or drop) about 10% of the test images
=> from looking at the pictures it becomes evident that almost all of them have too much padding on the bottom or on the right-hand side of the image

In [18]:
# Target size in pixels used when padding and resizing the images.
WIDTH, HEIGHT = 480, 96

In [21]:
def _aspect_padding(img_height,img_width,target_width,target_height):
    """
    Returns the (top, bottom, left, right) border sizes that bring an image of
    the given size to the target_width:target_height aspect ratio.
    """
    # Cross-multiplied integer comparison: no float rounding on exact-ratio images.
    if img_height * target_width < target_height * img_width:
        # Image is too wide for the target ratio: grow the height, split top/bottom.
        pad = (img_width * target_height / target_width - img_height) / 2
        return math.ceil(pad), math.floor(pad), 0, 0
    if img_height * target_width > target_height * img_width:
        # Image is too tall for the target ratio: grow the width, split left/right.
        pad = (img_height * target_width / target_height - img_width) / 2
        return 0, 0, math.ceil(pad), math.floor(pad)
    # Aspect ratio already matches: nothing to add.
    return 0, 0, 0, 0


def save_new_image(dataset,save_dir,image_dir="archive/formula_images_processed/formula_images_processed/"):
    """
    Takes the image names in dataset['image'], pads each image with white borders
    to the WIDTH:HEIGHT aspect ratio, resizes it to (WIDTH, HEIGHT) and saves it
    under the same name in save_dir.

    Parameters:
        dataset: table whose 'image' column holds the image file names.
        save_dir: prefix prepended to every file name when writing
            (e.g. 'data/train_images/'); its folder is created if missing.
        image_dir: prefix prepended to every file name when reading.
            Defaults to the processed formula image folder.

    Raises:
        FileNotFoundError: if an image cannot be read (cv2.imread returns None).
        OSError: if cv2.imwrite reports that the image could not be written.
    """
    for image_name in dataset['image'].values:
        image_path=image_dir + image_name
        img=cv2.imread(image_path)
        if img is None:
            raise FileNotFoundError("Could not read image: "+image_path)
        img_height, img_width=img.shape[0],img.shape[1]

        top,bottom,left,right=_aspect_padding(img_height,img_width,WIDTH,HEIGHT)
        if top or bottom or left or right:
            img_padded=cv2.copyMakeBorder(img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=[255, 255, 255])
        else:
            # Ratio already matches; previously img_padded was left unset (or stale) here.
            img_padded=img

        # cv2.resize expects dsize as (width, height).
        img_resize=cv2.resize(img_padded, (WIDTH, HEIGHT))

        out_path=save_dir + image_name
        out_parent=os.path.dirname(out_path)
        if out_parent:
            os.makedirs(out_parent,exist_ok=True)
        # cv2.imwrite signals failure through its return value rather than raising.
        if not cv2.imwrite(out_path, img_resize):
            raise OSError("Could not write image: "+out_path)

        


In [22]:
# Pad, resize and store every training image.
save_new_image(
    dataset=train_data,
    save_dir='data/train_images/',
)
