In [None]:
import pandas as pd
from pathlib import Path

In [None]:
import keras, os
import tensorflow as tf
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

from tqdm import tqdm
from tensorflow.keras.preprocessing.image import load_img

from keras.layers import Input, Dense, Activation, BatchNormalization, Flatten, Conv2D
from keras.layers import MaxPooling2D, Dropout, UpSampling2D


In [None]:
# Scratch directory into which every archive below is unpacked.
extracting_path = '/kaggle/working'

# Noisy training scans and their cleaned counterparts (the regression targets).
train_zip_path = '/kaggle/input/denoising-dirty-documents/train.zip'
trainclean_zip_path = '/kaggle/input/denoising-dirty-documents/train_cleaned.zip'

# Noisy test scans and the submission template that supplies the row ids.
test_zip_path = '/kaggle/input/denoising-dirty-documents/test.zip'
sample_zip_path = '/kaggle/input/denoising-dirty-documents/sampleSubmission.csv.zip'


In [None]:
import zipfile

# Unpack every competition archive into the working directory, in the same
# order as before: noisy train set, test set, sample submission, clean targets.
for archive_path in (train_zip_path, test_zip_path, sample_zip_path, trainclean_zip_path):
    with zipfile.ZipFile(archive_path, 'r') as zip_ref:
        zip_ref.extractall(extracting_path)


In [None]:
# Probe a single training image: its (height, width) become the module-level
# h and w that images_to_array uses as its default img_size.
sample_image_path = extracting_path + '/train/107.png'
img_arr = mpimg.imread(sample_image_path)
h, w = img_arr.shape
print('Height: ', h,'- Width: ',w)
print(img_arr.dtype)


In [None]:
# Survey the training set: record the (height, width) of every image so the
# distinct sizes present in the data can be listed.
train_dir = extracting_path + '/train'
image_names = os.listdir(train_dir)
data_size = len(image_names)

# One row per image: column 0 holds the height, column 1 the width.
X = np.zeros([data_size, 2], dtype=np.uint16)
for row, file_name in enumerate(tqdm(image_names)):
    X[row] = mpimg.imread(os.path.join(train_dir, file_name)).shape

print('Number of training images:', data_size)
print('Differnet image hights: {}'.format(set(X[:,0])))
print('Differnet image widths: {}'.format(set(X[:,1])))


In [None]:
def _load_grayscale_stack(image_dir, img_size):
    '''
    Load every file in `image_dir` as a grayscale image resized to `img_size`.

    Files are visited in sorted file-name order, so two directories that hold
    identically named files produce index-aligned stacks.

    Returns a uint8 array of shape (n_files, height, width, 1).
    '''
    height, width = img_size
    file_names = sorted(os.listdir(image_dir))
    stack = np.zeros([len(file_names), height, width], dtype=np.uint8)
    for i, file_name in enumerate(tqdm(file_names)):
        img_path = os.path.join(image_dir, file_name)
        stack[i] = load_img(img_path, color_mode='grayscale', target_size=(height, width))
    # add the trailing channel axis expected by Conv2D
    return stack.reshape(len(file_names), height, width, 1)


def images_to_array(data_dir, label_dir=None, img_size=(h, w)):
    '''
    Read a directory of images (and optionally their label images) into arrays.

    1- Read image samples from `data_dir`, resized to `img_size`.
    2- Stack them into one 4-d numpy array (n, height, width, 1).
    -- And if `label_dir` is given ..
    3- Read the label images from `label_dir` the same way.
    4- Shuffle data and labels with one shared random permutation.

    Both directories are read in sorted file-name order, so sample i and
    label i belong together provided the two directories use the same file
    names (assumed, not checked -- only the counts are compared).

    Parameters:
        data_dir:  directory containing the input images.
        label_dir: optional directory containing the target images.
        img_size:  (height, width) every image is resized to.

    Returns:
        X / 255. when `label_dir` is falsy, otherwise (X / 255., y / 255.);
        arrays are float with values in [0, 1].

    Raises:
        ValueError: if `label_dir` holds a different number of files
                    than `data_dir`.
    '''
    # img_size (not the notebook-level h / w, which later cells reassign)
    # decides the size of every loaded image.
    X = _load_grayscale_stack(data_dir, img_size)

    if label_dir:
        y = _load_grayscale_stack(label_dir, img_size)
        if len(y) != len(X):
            raise ValueError(
                'Found {} samples in {} but {} labels in {}'.format(
                    len(X), data_dir, len(y), label_dir))
        #shuffle samples and labels together so pairs stay aligned
        ind = np.random.permutation(len(X))
        X = X[ind]
        y = y[ind]
        print('Ouptut Data Size: ', X.shape)
        print('Ouptut Label Size: ', y.shape)
        return X/255., y/255.

    print('Ouptut Data Size: ', X.shape)
    return X/255.


In [None]:
# Load the noisy scans (X) together with their cleaned targets (y).
noisy_dir = extracting_path + '/train'
clean_dir = extracting_path + '/train_cleaned'
X, y = images_to_array(noisy_dir, label_dir=clean_dir)


In [None]:
# Hold out the first 30% of the samples for validation (images_to_array has
# already shuffled them). The split size is derived from X itself rather than
# from the notebook-level data_size, which the test-loading cell overwrites.
val_split = int(.3 * len(X))
X_val, y_val = X[:val_split], y[:val_split]
X_train, y_train = X[val_split:], y[val_split:]
print('Train data shape: ', X_train.shape)
# This slice is the validation set, so label it as such.
print('Validation data shape: ', X_val.shape)


In [None]:
# Top row: three noisy training images; bottom row: their cleaned targets.
samples = np.concatenate((X_train[:3], y_train[:3]), axis=0)

f, ax = plt.subplots(2, 3, figsize=(20,10))
# ax.ravel() walks the grid row by row, matching the order of `samples`.
for panel, img in zip(ax.ravel(), samples):
    panel.imshow(img[:,:,0], cmap='gray')
    panel.axis('off')
plt.show()


In [None]:
# Small fully-convolutional autoencoder. The input has no fixed height/width,
# so images of different sizes can be fed one at a time.
# NOTE(review): one 2x pooling followed by one 2x upsampling restores the
# input's spatial size only when height and width are both even.
conv_kwargs = dict(activation='relu', padding='same')

input_layer = Input(shape=(None, None, 1))

# encoder: two 3x3 convolutions, then halve the resolution
encoded = Conv2D(32, (3, 3), **conv_kwargs)(input_layer)
encoded = Conv2D(64, (3, 3), **conv_kwargs)(encoded)
encoded = MaxPooling2D((2, 2), padding='same')(encoded)

# decoder: mirror of the encoder, then double the resolution
decoded = Conv2D(64, (3, 3), **conv_kwargs)(encoded)
decoded = Conv2D(32, (3, 3), **conv_kwargs)(decoded)
decoded = UpSampling2D((2, 2))(decoded)

# single-channel output squashed to [0, 1] like the normalised targets
output_layer = Conv2D(1, (3, 3), activation='sigmoid', padding='same')(decoded)
model = keras.models.Model(inputs=[input_layer], outputs=[output_layer])

# Alternative optimizers kept for experimentation; compile() below uses 'adam'.
sgd = keras.optimizers.SGD(learning_rate=0.01, momentum=0.9, nesterov=True)
rms = keras.optimizers.RMSprop(learning_rate=0.001, rho=0.9)
ada = keras.optimizers.Adagrad(learning_rate=0.01)

model.compile(optimizer='adam', loss="mean_squared_error")


In [None]:
# Learning-rate schedule keyed on the validation loss.
# NOTE(review): this only has an effect when it is passed to model.fit via
# callbacks= and validation data is supplied, so that val_loss exists.
LR_callback = keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss',
    factor=.4,
    patience=4,
    min_lr=.00001,
    verbose=10,
)


In [None]:
# Train on the training split only and validate on the held-out split.
# Fitting on the full X / y would include X_val, making the later
# model.evaluate(X_val, y_val) meaningless. Supplying validation data also
# gives LR_callback the 'val_loss' metric it monitors.
history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=200,
    batch_size=16,
    callbacks=[LR_callback],
)


In [None]:
# Report the loss on the validation slice.
model.evaluate(x=X_val, y=y_val)


In [None]:
# Take three validation images, their clean targets and the model's output.
test_samples = X_val[:3]
test_labels = y_val[:3]
test_pred = model.predict(test_samples)

# Row 1: noisy inputs, row 2: clean targets, row 3: model predictions.
samples = np.concatenate((test_samples, test_labels, test_pred), axis=0)

f, ax = plt.subplots(3, 3, figsize=(25,15))
# ax.ravel() walks the grid row by row, matching the order of `samples`.
for panel, img in zip(ax.ravel(), samples):
    panel.imshow(img[:,:,0], cmap='gray')
    panel.axis('off')
plt.show()


In [None]:
# Load the test images at their native resolution. They are kept in a list
# because their sizes differ and so cannot share one array.
# NOTE(review): the submission cell pairs predictions with the ids of
# sampleSubmission.csv purely by position, so this sorted file order must
# match the image order used in that file -- confirm.
test_dir = extracting_path + '/test'
test_image_names = sorted(os.listdir(test_dir))

X_test = []
for image_name in tqdm(test_image_names):
    img_pixels = load_img(os.path.join(test_dir, image_name), color_mode='grayscale')
    # (height, width) -> (1, height, width, 1), scaled to [0, 1].
    # Indexing with np.newaxis avoids reassigning the notebook-level h / w
    # (read by images_to_array) and data_size (read by the validation split).
    X_test.append(np.array(img_pixels)[np.newaxis, :, :, np.newaxis] / 255.)

print('Test sample shape: ', X_test[0].shape)
print('Test sample dtype: ', X_test[0].dtype)


In [None]:
# Predict each test image separately (their sizes differ) and keep only the
# 2-d (height, width) map by dropping the batch and channel axes.
yh_test = [model.predict(img)[0, :, :, 0] for img in X_test]


In [None]:
# Left column: noisy test image; right column: the model's cleaned output.
f, ax = plt.subplots(3,2, figsize=(20,10))
for (noisy_ax, clean_ax), img, lbl in zip(ax, X_test[:3], yh_test[:3]):
    noisy_ax.imshow(img[0,:,:,0], cmap='gray')
    noisy_ax.axis('off')

    clean_ax.imshow(lbl, cmap='gray')
    clean_ax.axis('off')
plt.show()


In [None]:
# Flatten every predicted image into one long list of pixel values.
# order='F' walks each image column by column (row index varying fastest),
# which is exactly the order the nested "for column / for row" loops produced,
# but without a Python-level loop per pixel and without reassigning the
# notebook-level h / w.
submit_vector = []
for img in yh_test:
    submit_vector.extend(img.flatten(order='F'))
print(len(submit_vector))


In [None]:
# Load the submission template; its 'id' column supplies the row ids.
sample_csv_path = extracting_path + '/sampleSubmission.csv'
sample_csv = pd.read_csv(sample_csv_path)
sample_csv.head(n=10)


In [None]:
# Total number of predicted pixels across all test images.
c = sum(hi * wi for hi, wi in (img.shape for img in yh_test))


In [None]:
# Pair the template ids with the predicted pixel values, purely by position.
id_col = sample_csv['id']

# pd.concat silently pads the shorter column with NaN, which would produce an
# invalid submission, so fail loudly when the lengths disagree.
if len(submit_vector) != len(id_col):
    raise ValueError(
        'Predicted {} pixel values but the sample submission has {} ids'.format(
            len(submit_vector), len(id_col)))

value_col = pd.Series(submit_vector, name='value')
submission = pd.concat([id_col, value_col], axis=1)
submission.head(10)


In [None]:
# Write the submission file without the DataFrame index column.
submission.to_csv(path_or_buf='Cleared.csv', index=False)


In [None]:
import shutil

# Remove the extracted image folders from the working directory.
for extracted_subdir in ('/train', '/test', '/train_cleaned'):
    shutil.rmtree(extracting_path + extracted_subdir)
