# File Processing

In [0]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# Any results you write to the current directory are saved as output.

In [0]:
!unzip /kaggle/input/denoising-dirty-documents/train.zip > /dev/null
!unzip /kaggle/input/denoising-dirty-documents/train_cleaned.zip > /dev/null
!unzip /kaggle/input/denoising-dirty-documents/sampleSubmission.csv.zip > /dev/null
!unzip /kaggle/input/denoising-dirty-documents/test.zip > /dev/null

In [0]:
# List the current working directory (the archives above are extracted here).
!ls

# Import Modules

In [0]:
%matplotlib inline
from keras.layers import Input, Dense, Conv2D, MaxPooling2D, UpSampling2D
from keras.models import Model
from keras.optimizers import Adam, RMSprop, Adagrad, Adadelta
from sklearn.model_selection import train_test_split
import os
import cv2
import matplotlib.pyplot as plt

In [0]:
# Load the sample submission file that was extracted from sampleSubmission.csv.zip.
df = pd.read_csv('sampleSubmission.csv')

In [0]:
# Preview the first 10 rows of the sample submission.
df.head(10)

In [0]:
# Preview one noisy training page, read as a single-channel grayscale image.
img = cv2.imread('train/167.png', cv2.IMREAD_GRAYSCALE)
plt.imshow(img, cmap="gray")

In [0]:
# Shape of the grayscale image loaded in the previous cell, as (rows, columns).
img.shape

In [0]:
# Preview one cleaned (target) training page, read as a grayscale image.
img = cv2.imread('train_cleaned/101.png', cv2.IMREAD_GRAYSCALE)
plt.imshow(img, cmap="gray")

# Load Dataset

In [0]:
# Target array size that every image is brought to before it enters the network.
# NOTE: despite their names, img_w is used as the number of rows (height) and
# img_h as the number of columns (width): image arrays in this notebook are
# shaped (img_w, img_h, 1). The names are kept because later cells refer to them.
img_w, img_h = (258, 540)
print('Image height: ', img_w)
print('Image width: ', img_h)

In [0]:
def load_images(path):
    '''Read every image under the directory ``path`` as grayscale.

    Files are visited in sorted filename order. ``os.listdir`` returns names in
    arbitrary order, so sorting is what keeps two directories that contain the
    same filenames (e.g. noisy inputs and their cleaned targets) aligned
    index-by-index.

    Each image is copied into the top-left corner of an ``(img_w, img_h)``
    canvas: rows/columns beyond the canvas are cropped and any uncovered area
    stays zero. No interpolation is performed.

    Parameters
    ----------
    path : str
        Directory containing the image files.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(num_files, img_w, img_h, 1)`` holding the raw
        (unnormalised) pixel values.

    Raises
    ------
    ValueError
        If a file in ``path`` cannot be decoded as an image.
    '''
    filename_list = sorted(os.listdir(path))
    imgs = np.zeros((len(filename_list), img_w, img_h, 1))

    for idx, filename in enumerate(filename_list):
        file_path = os.path.join(path, filename)
        img = cv2.imread(file_path, 0)
        if img is None:
            # cv2.imread signals failure by returning None instead of raising.
            raise ValueError('Could not read image: ' + file_path)
        # Explicit crop/zero-pad instead of the in-place ndarray.resize, which
        # works on the flattened buffer and scrambles rows when widths differ.
        rows = min(img.shape[0], img_w)
        cols = min(img.shape[1], img_h)
        imgs[idx, :rows, :cols, 0] = img[:rows, :cols]

    return imgs

In [0]:
# Read the noisy inputs, their cleaned targets, and the test pages (in that order).
x_train, x_train_cleaned, x_test = [
    load_images(folder) for folder in ('./train/', './train_cleaned/', './test/')
]

In [0]:
# Report how many images each array holds (size of its first axis).
print('Number of train images: ', x_train.shape[0])
print('Number of train cleaned images: ', x_train_cleaned.shape[0])
print('Number of test images: ', x_test.shape[0])

# Preprocessing

In [0]:
def normalization(imgs):
    '''Return ``imgs`` divided by 255.0.

    The input is not modified; values in 0..255 are mapped onto 0..1.
    '''
    max_intensity = 255.0
    return imgs / max_intensity

In [0]:
# Divide every pixel value in all three datasets by 255.
# NOTE: the arrays are rebound to the same names, so running this cell a
# second time would divide the already-scaled data again.
x_train, x_train_cleaned, x_test = [
    normalization(batch) for batch in (x_train, x_train_cleaned, x_test)
]

# Define Network Architecture

In [0]:
class Autoencoder():
    '''Small convolutional autoencoder with one pooling/upsampling stage.

    The input dimensions are taken from the module-level ``img_w`` and
    ``img_h`` values; images have a single channel.
    '''

    def __init__(self, optimizer=Adam, lr=0.001):
        '''Store the input dimensions and instantiate the optimizer.

        optimizer: optimizer class (not an instance); it is called with
            ``learning_rate=lr``.
        lr: learning rate passed to the optimizer.
        '''
        self.img_width, self.img_height, self.img_channel = img_w, img_h, 1
        self.optimizer = optimizer(learning_rate=lr)
        self.lr = lr

    def build_model(self):
        '''Assemble and compile the network, returning the Keras ``Model``.

        The model is compiled with the optimizer created in ``__init__`` and
        mean-squared-error loss.
        '''
        input_shape = (self.img_width, self.img_height, self.img_channel)
        inputs = Input(shape=input_shape)

        # Encoder: one 3x3 convolution followed by 2x2 max pooling.
        encoded = Conv2D(64, (3, 3), activation='relu', padding='same')(inputs)
        encoded = MaxPooling2D((2, 2), padding='same')(encoded)

        # Decoder: one 3x3 convolution followed by 2x2 upsampling.
        decoded = Conv2D(64, (3, 3), activation='relu', padding='same')(encoded)
        decoded = UpSampling2D((2, 2))(decoded)

        # Single-channel output squashed into (0, 1) by the sigmoid.
        outputs = Conv2D(1, (3, 3), activation='sigmoid', padding='same')(decoded)

        net = Model(inputs, outputs)
        net.compile(optimizer=self.optimizer, loss='mse')

        return net

# Model Training

In [0]:
# Build and compile the network with the class defaults (Adam optimizer, lr=0.001).
autoencoder = Autoencoder()
model = autoencoder.build_model()

In [0]:
# Train the autoencoder to map the noisy inputs (x_train) onto their cleaned
# counterparts (x_train_cleaned); the returned history feeds the loss plot below.
# NOTE(review): validation_split=0.15 holds out 15% of the samples for val_loss.
# Keras is documented to take that slice from the end of the arrays before
# shuffling — confirm that a non-random hold-out is acceptable here.
history = model.fit(x_train, x_train_cleaned,
                    batch_size=20,
                    epochs=500,
                    validation_split=0.15)

In [0]:
# Plot training and validation loss per epoch; the labels and legend make the
# two curves distinguishable.
plt.plot(history.history['loss'], label='Training loss')
plt.plot(history.history['val_loss'], label='Validation loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend()

In [0]:
# Run the trained model on the normalised test images.
result = model.predict(x_test)

In [0]:
# Show the first test image as it was fed to the model.
plt.imshow(x_test[0], cmap="gray")

In [0]:
# Show the model's prediction for the first test image (result[0] corresponds to x_test[0]).
plt.imshow(result[0], cmap="gray")

# Output File

In [0]:
# Build the submission: one row per pixel of every test image.
# Ids use the competition's "<image>_<row>_<col>" scheme with 1-based row and
# column numbers (compare with sampleSubmission.csv); values are intensities
# in [0, 1].
ids = []
vals = []

test_dir = './test/'
for filename in sorted(os.listdir(test_dir)):
    img = cv2.imread(os.path.join(test_dir, filename), 0)
    if img is None:
        continue  # skip anything that is not a readable image

    noisy = img / 255.0  # same scaling the training data received
    n_rows, n_cols = noisy.shape
    # Start from the noisy input so pixels the model cannot reach still get a value.
    denoised = noisy.copy()

    # The network only accepts (img_w, img_h) inputs, so pages with more rows
    # are covered by two windows: one anchored at the top, one at the bottom.
    used_cols = min(n_cols, img_h)
    for top in sorted({0, max(n_rows - img_w, 0)}):
        used_rows = min(n_rows - top, img_w)
        window = np.zeros((1, img_w, img_h, 1))
        window[0, :used_rows, :used_cols, 0] = noisy[top:top + used_rows, :used_cols]
        pred = model.predict(window, verbose=0)[0, :, :, 0]
        denoised[top:top + used_rows, :used_cols] = pred[:used_rows, :used_cols]

    # Emit pixels column by column (row index varies fastest); the transpose
    # makes the flattened values follow the same order as the generated ids.
    image_id = os.path.splitext(filename)[0]
    ids.extend('{}_{}_{}'.format(image_id, row, col)
               for col in range(1, n_cols + 1)
               for row in range(1, n_rows + 1))
    vals.extend(denoised.T.ravel().tolist())

pd.DataFrame({'id': ids, 'value': vals}).to_csv('submission.csv', index=False)