## Connect to drive

In [0]:
# Load the Drive helper and mount
from google.colab import drive

# Mount the drive folder. This will prompt for authorization.
drive.mount('/content/drive', force_remount=True)


# Opens the project folder.
%cd 'drive/My Drive/SpaceApps'

Mounted at /content/drive
/content/drive/My Drive/SpaceApps


## Import dependencies

In [0]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt


from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import Conv2D
from keras.layers import Flatten

from keras.models import Model, Sequential
from keras.datasets import mnist
from keras.layers.advanced_activations import LeakyReLU
from keras.optimizers import Adam

Using TensorFlow backend.


## Data preparation helpers and model definitions

In [0]:
def normalize_data(filename):
    """
    Normalize the data from a .csv file

    Loads the .csv file named `filename`, drops the columns discarded by
    `remove_characteristics`, and standardizes every remaining column as
    (value - mean) / std. NaN entries are ignored when the statistics are
    computed and stay NaN in the normalized output.

    Parameters
    ----------
    filename : name of the .csv file to use the dataset.

    Returns
    -------
    np_data : 2-D float `numpy.ndarray` with the normalized values, one
    column per remaining feature.

    mean : `np.array` with the mean of each column.

    std : `np.array` with the (population) standard deviation of each
    column, exactly as computed. Constant columns report a std of 0.
    """

    # Read data from file 'filename.csv'
    data = pd.read_csv(filename)

    # Removes unnecessary data
    data = remove_characteristics(data)

    # From pandas to numpy. Force a float array: with an integer dtype the
    # normalized values would be truncated when stored back.
    np_data = data.to_numpy(dtype=float)

    # Per-column statistics, skipping NaN entries
    mean = np.nanmean(np_data, axis=0)
    std = np.nanstd(np_data, axis=0)

    # A constant column has std == 0; divide by 1 instead so it normalizes
    # to 0 rather than NaN/inf. The returned std keeps the real value, so
    # multiplying by it and adding the mean still restores the constant.
    safe_std = np.where(std == 0, 1.0, std)

    # Normalize the data (broadcast over the rows)
    np_data = (np_data - mean) / safe_std

    # TODO: save the mean and std in a file to continue training in other program run
    return np_data, mean, std


def denormalize_data(np_data, mean, std):
    """
    Undo a per-column mean/std normalization

    Every column is multiplied by its standard deviation and then shifted
    by its mean. The values are written back into `np_data` itself, so the
    caller's array is modified in place; the same array is also returned.

    Parameters
    ----------
    np_data : 2-D numpy array with the normalized dataset

    mean : `np.array` with the mean of each column

    std : `np.array` with the standard deviation of each column

    Returns
    -------
    `np_data`, now holding the denormalized values
    """

    # Walk over the columns by position
    for col, _ in enumerate(np_data.T):
        # Scale back by the column's standard deviation
        scaled = np_data[:, col] * std[col]
        np_data[:, col] = scaled

        # Shift back by the column's mean
        shifted = np_data[:, col] + mean[col]
        np_data[:, col] = shifted

    return np_data


def remove_characteristics(data):
    """
    Drop the columns that are not used as features.

    Two fixed groups of columns are removed, so the result has fewer
    columns than the input. Edit the two lists below to change which
    characteristics are discarded. The input frame is not modified.

    Parameters
    ----------
    data : pandas DataFrame with the dataset

    Returns
    -------
    pandas DataFrame with m columns. If n is the original amount of
    features, then m <= n.
    """

    # Columns considered unnecessary, like strings
    unnecessary_columns = ["id", "wsid", "wsnm", "inme", "city",
                           "prov", "date", "yr", "mo", "da", "hr"]

    # Columns removed to simulate the IMN records
    simulated_missing_columns = ["mdct", "smax", "smin", "dmax",
                                 "dmin", "hmax", "hmin", "gust"]

    trimmed = data.drop(columns=unnecessary_columns)
    trimmed = trimmed.drop(columns=simulated_missing_columns)
    return trimmed


def generate_trainset(np_data):
    """
    Build a supervised training pair from the rows that contain no NaN.

    The complete rows are the training target. The training input is a copy
    of those rows in which one value per row is erased (set to 0); the
    erased column advances by one on every row and wraps around after the
    last column, so the erased positions are spread over all features.

    The input array is never modified, and the complete set never receives
    the erased values.

    Parameters
    ----------
    np_data : 2-D numpy array with the dataset (rows x features)

    Returns
    -------
    complete_dataset : numpy array holding only the rows of `np_data`
    without NaN. It works as the output to check and validate the results
    of the predictions.

    incomplete_dataset : float numpy array with the same shape as
    `complete_dataset`, with one value per row set to 0, to work as an
    input for training.

    Raises
    ------
    ValueError : if `np_data` is not 2-dimensional.
    """

    data = np.asarray(np_data)
    if data.ndim != 2:
        raise ValueError("np_data must be a 2-D array (rows x features)")

    # Keep only the rows without NaN. Boolean indexing returns a new array,
    # so the caller's data is left untouched.
    has_no_nan = ~np.isnan(data).any(axis=1)
    complete_dataset = data[has_no_nan]

    # Independent float copy to erase values from, so the complete set
    # keeps its original values.
    incomplete_dataset = complete_dataset.astype(
        np.result_type(complete_dataset.dtype, np.float64))

    n_rows, n_cols = incomplete_dataset.shape
    if n_rows and n_cols:
        # Row i loses the value in column i % n_cols
        row_ids = np.arange(n_rows)
        incomplete_dataset[row_ids, row_ids % n_cols] = 0

    return complete_dataset, incomplete_dataset

In [0]:
def create_unet(n_features=13):
    """
    Build and compile the dense encoder/decoder network.

    The model is a `Sequential` stack of `Dense` layers that narrows from
    the input down to 3 units (10 -> 7 -> 5 -> 3) and widens back
    (5 -> 7 -> `n_features`), with `LeakyReLU` activations between the
    layers. No activation layer follows the 3-unit layer, and the output
    layer is linear.

    Parameters
    ----------
    n_features : number of values per sample, used for both the input shape
    and the output layer. Defaults to 13.

    Returns
    -------
    Compiled Keras `Sequential` model (loss 'logcosh', Adam optimizer).
    """

    # Create sequential model
    model = Sequential()

    # Input layer
    model.add(Dense(10, input_shape=(n_features,)))
    model.add(LeakyReLU())

    # Compression layers
    model.add(Dense(7))
    model.add(LeakyReLU())
    model.add(Dense(5))
    model.add(LeakyReLU())
    model.add(Dense(3))

    # Decompression layers
    model.add(Dense(5))
    model.add(LeakyReLU())
    model.add(Dense(7))
    model.add(LeakyReLU())

    # Output layer: one linear unit per feature
    model.add(Dense(n_features, activation='linear'))

    # Compile model
    # NOTE(review): 'accuracy' is normally a classification metric; for this
    # linear multi-value output a metric such as 'mae' may be more
    # informative. Confirm before changing, since it alters the values that
    # fit/evaluate report.
    model.compile(loss='logcosh', optimizer=Adam(lr=0.0002, beta_1=0.5), metrics=['accuracy'])
    return model

In [0]:
def create_discriminator():
    """
    Build and compile the dense discriminator network.

    The model takes 13 input values and stacks `Dense` layers of 32, 64,
    128 and 64 units, each followed by `Dropout(0.3)` and `LeakyReLU`,
    ending in a linear `Dense` layer of 13 units.

    Returns
    -------
    Compiled Keras `Sequential` model (loss 'mean_squared_error', Adam
    optimizer).
    """
    dropout_rate = 0.3
    hidden_widths = (64, 128, 64)

    net = Sequential()

    # First block declares the input shape
    net.add(Dense(32, input_shape=(13,)))
    net.add(Dropout(dropout_rate))
    net.add(LeakyReLU())

    # Remaining hidden blocks share the same Dense -> Dropout -> LeakyReLU shape
    for width in hidden_widths:
        net.add(Dense(width))
        net.add(Dropout(dropout_rate))
        net.add(LeakyReLU())

    # Linear output layer
    net.add(Dense(13, activation='linear'))

    optimizer = Adam(lr=0.0002, beta_1=0.5)
    net.compile(loss='mean_squared_error', optimizer=optimizer, metrics=['accuracy'])
    return net

## Training code

In [0]:
def training(model, epochs, batch_size, validation_fraction, test_fraction,
             train_path='train_norm.csv', target_path='target_norm.csv',
             model_path='unet.h5'):
    """
    Train `model`, evaluate it on a held-out split and save it.

    The input and target tables are read from .csv files. The first
    `test_fraction` of the rows is held out for the final evaluation; the
    remaining rows are passed to `model.fit`, which sets aside
    `validation_fraction` of them through `validation_split`. The evaluation
    result is printed and the model is saved to `model_path`.

    Parameters
    ----------
    model : compiled model exposing `fit`, `evaluate` and `save`

    epochs : number of training epochs

    batch_size : batch size used for training and evaluation

    validation_fraction : fraction of the training rows used for validation

    test_fraction : fraction of all rows held out for evaluation

    train_path : .csv file with the training inputs

    target_path : .csv file with the training targets

    model_path : file the trained model is saved to

    Returns
    -------
    The object returned by `model.fit`.

    Raises
    ------
    ValueError : if the two .csv files do not have the same number of rows.
    """

    # Load training and target data
    training_data = pd.read_csv(train_path)
    target_data = pd.read_csv(target_path)

    # Inputs and targets are paired row by row, so they must line up
    if len(training_data) != len(target_data):
        raise ValueError(
            'Training and target data must have the same number of rows '
            '({} != {})'.format(len(training_data), len(target_data)))

    # Divide test and train data: the first rows become the test split
    len_test = int(training_data.shape[0] * test_fraction)
    test_data = training_data[:len_test]
    test_target = target_data[:len_test]

    # Remove test from training
    training_data = training_data[len_test:]
    target_data = target_data[len_test:]

    # Train model
    print('Training started\n')
    history = model.fit(training_data, target_data, batch_size, epochs,
                        verbose=2, validation_split=validation_fraction)

    # Evaluate the model
    print('Evaluation started\n')
    print(model.evaluate(test_data, test_target, batch_size, verbose=1))

    # Save model
    model.save(model_path)

    return history

## Call the functions

In [0]:
# Build the compiled network
model = create_unet()

# Train it: 2000 epochs, batches of 256 rows, 30% of the rows held out for
# testing and 30% of the remaining rows used for validation
training(model, epochs=2000, batch_size=256,
         validation_fraction=0.3, test_fraction=0.3)

Training started

Train on 18714 samples, validate on 8021 samples
Epoch 1/2000
 - 1s - loss: 0.6861 - acc: 0.0189 - val_loss: 0.6890 - val_acc: 0.0192
Epoch 2/2000
 - 1s - loss: 0.6646 - acc: 0.0211 - val_loss: 0.6669 - val_acc: 0.0229
Epoch 3/2000
 - 1s - loss: 0.6358 - acc: 0.0260 - val_loss: 0.6228 - val_acc: 0.0295
Epoch 4/2000
 - 1s - loss: 0.5669 - acc: 0.0312 - val_loss: 0.5237 - val_acc: 0.0312
Epoch 5/2000
 - 1s - loss: 0.4781 - acc: 0.0309 - val_loss: 0.4523 - val_acc: 0.0282
Epoch 6/2000
 - 1s - loss: 0.4175 - acc: 0.0291 - val_loss: 0.3936 - val_acc: 0.0294
Epoch 7/2000
 - 1s - loss: 0.3677 - acc: 0.0309 - val_loss: 0.3506 - val_acc: 0.0339
Epoch 8/2000
 - 1s - loss: 0.3283 - acc: 0.0389 - val_loss: 0.3105 - val_acc: 0.0420
Epoch 9/2000
 - 1s - loss: 0.2895 - acc: 0.0463 - val_loss: 0.2701 - val_acc: 0.0468
Epoch 10/2000
 - 1s - loss: 0.2527 - acc: 0.0563 - val_loss: 0.2365 - val_acc: 0.0588
Epoch 11/2000
 - 1s - loss: 0.2282 - acc: 0.0714 - val_loss: 0.2208 - val_acc: 0.0