In [1]:
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

In [2]:
# pip install keras

In [3]:
from keras.callbacks import ModelCheckpoint
from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error 
from matplotlib import pyplot as plt
import seaborn as sb
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import warnings 
# Install a blanket 'ignore' filter (no category restriction), followed by an
# 'ignore' filter specifically for DeprecationWarning.
# NOTE(review): the blanket filter presumably already covers DeprecationWarning,
# so the second call looks redundant - confirm before removing.
warnings.filterwarnings('ignore')
warnings.filterwarnings('ignore', category=DeprecationWarning)

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [4]:
# Read the three CSV files into pandas DataFrames:
# the sample submission, the training set and the test set (in that order).

sample_submit, train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('sample_submit.csv', 'train.csv', 'test.csv')
)

In [5]:
# sample_submit.info()

In [6]:
# train.info()

In [7]:
# test.info()

In [8]:
# The 'y' column of 'train' appears to hold the target values,
# so it is pulled out here as 'y_train'.

y_train = train['y']

In [9]:
# 'galaxy' is an object-dtype (categorical) column in both 'train' and 'test'.
# A regression model needs numeric inputs, so the column is one-hot encoded
# with get_dummies and kept in separate tables for now.
#
# The test encoding is re-indexed onto the training columns: encoding the two
# tables independently would give them different columns (and column order)
# whenever a galaxy occurs in only one of the files, and the model could then
# not be applied to the test rows. Galaxies missing from 'test' become
# all-zero columns; galaxies unseen in 'train' are dropped.

train_dummies = pd.get_dummies(train['galaxy'])
test_dummies = (
    pd.get_dummies(test['galaxy'])
    .reindex(columns=train_dummies.columns, fill_value=0)
)

In [10]:
# Remove the target 'y' and the raw 'galaxy' column from 'train', and the raw
# 'galaxy' column from 'test', leaving only the columns that will be scaled.
#
# The frames are rebound instead of mutated in place, and missing labels are
# ignored, so re-running this cell is harmless (an in-place drop raises
# KeyError on the second run because the columns are already gone).

train = train.drop(columns=['y', 'galaxy'], errors='ignore')
test = test.drop(columns=['galaxy'], errors='ignore')

In [11]:
def to_standard_normal_value(data, mean, std):
    """Standardise ``data`` with the z-score formula ``z = (data - mean) / std``.

    When ``mean`` and ``std`` are the statistics of ``data`` itself, the result
    has mean 0 and standard deviation 1.

    Parameters
    ----------
    data : scalar, numpy array, pandas Series or DataFrame
        Values to standardise.
    mean : scalar or array-like
        Value(s) subtracted from ``data``.
    std : scalar or array-like
        Value(s) the centred data is divided by. Entries equal to zero
        (constant features) are treated as 1, so those features are only
        centred instead of producing NaN / inf or a ZeroDivisionError.

    Returns
    -------
    Same kind of object as ``(data - mean) / std`` would produce.
    """
    # Replace zero standard deviations by 1 while keeping the container type,
    # so pandas label alignment between `data` and `std` still applies.
    if isinstance(std, (pd.Series, pd.DataFrame)):
        safe_std = std.replace(0, 1)
    elif np.ndim(std) > 0:
        std_values = np.asarray(std)
        safe_std = np.where(std_values == 0, 1, std_values)
    else:
        safe_std = std if std != 0 else 1

    z = (data - mean) / safe_std
    return z

In [12]:
# The neural network used below performs best on normalised inputs.
# Both tables are standardised with the statistics of the TRAINING table
# (mean = train.mean(), std = train.std()) so that they are scaled identically.

train_mean = train.mean()
train_std = train.std()

train_scaled = to_standard_normal_value(train, train_mean, train_std)
test_scaled = to_standard_normal_value(test, train_mean, train_std)

In [13]:
# Join the normalised numeric columns and the one-hot 'galaxy' columns
# side by side, for the train and the test table alike.

train_dummed_scaled = pd.concat(objs=[train_scaled, train_dummies], axis='columns')
test_dummed_scaled = pd.concat(objs=[test_scaled, test_dummies], axis='columns')

In [14]:
# Replace every NaN with the median of its column.
#
# The medians are learned from the TRAINING table only and then applied to the
# test table. Re-fitting the imputer on the test table would fill the two
# tables with different statistics and let test data shape the preprocessing.

# transform() needs exactly the columns seen in fit(), in the same order.
# While both tables are still DataFrames (first run of this cell), line the
# test columns up with the training ones; columns absent from test become 0.
if isinstance(train_dummed_scaled, pd.DataFrame) and isinstance(test_dummed_scaled, pd.DataFrame):
    test_dummed_scaled = test_dummed_scaled.reindex(
        columns=train_dummed_scaled.columns, fill_value=0
    )

imputer = SimpleImputer(missing_values=np.nan, strategy='median')

train_dummed_scaled = imputer.fit_transform(train_dummed_scaled)
test_dummed_scaled = imputer.transform(test_dummed_scaled)

In [15]:
# Split the prepared training table into a training part and a held-out
# validation part (30% of the rows), with a fixed random_state.

train_X, val_X, train_y, val_y = train_test_split(
    train_dummed_scaled,
    y_train,
    test_size=0.3,
    random_state=14,
)

In [16]:
# Fully connected network for the regression problem: an input-width layer,
# three hidden layers that shrink step by step, and one linear output unit.

n_features = train_X.shape[1]

NN_model = Sequential()

# Input layer: one ReLU unit per feature.
NN_model.add(Dense(n_features, input_dim=n_features,
                   kernel_initializer='normal', activation='relu'))

# Hidden layers: n_features / 2, / 4 and / 8 ReLU units (rounded down).
# A further layer of n_features / 16 units is intentionally not added.
for shrink in (2, 4, 8):
    NN_model.add(Dense(n_features // shrink,
                       kernel_initializer='normal', activation='relu'))

# Output layer: a single linear unit.
NN_model.add(Dense(1, kernel_initializer='normal', activation='linear'))

# Compile with the Adam optimiser; mean squared error is both the loss and
# the reported metric ('mean_absolute_error' is a possible alternative metric).
NN_model.compile(loss='mean_squared_error', optimizer='adam',
                 metrics=['mean_squared_error'])
NN_model.summary()

Instructions for updating:
Colocations handled automatically by placer.
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 259)               67340     
_________________________________________________________________
dense_2 (Dense)              (None, 129)               33540     
_________________________________________________________________
dense_3 (Dense)              (None, 64)                8320      
_________________________________________________________________
dense_4 (Dense)              (None, 32)                2080      
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 33        
Total params: 111,313
Trainable params: 111,313
Non-trainable params: 0
_________________________________________________________________


In [17]:
# Checkpoint callback that monitors the validation loss and only saves when it
# improves; the epoch number and val_loss are embedded in the file name.
checkpoint_name = 'Weights-{epoch:03d}--{val_loss:.5f}.hdf5'
checkpoint = ModelCheckpoint(
    filepath=checkpoint_name,
    monitor='val_loss',
    verbose=1,
    save_best_only=True,
    mode='auto',
)
callbacks_list = [checkpoint]

In [18]:
# Train for 25 epochs with batches of 10 samples. 20% of train_X / train_y is
# set aside as validation data, which provides the 'val_loss' monitored by the
# checkpoint callback.
#
# `workers` and `use_multiprocessing` are not passed: the Keras documentation
# describes them as applying to generator / Sequence input only, so they had no
# effect on these in-memory arrays, and newer Keras releases reject them in fit().
NN_model.fit(
    train_X,
    train_y,
    epochs=25,
    batch_size=10,
    validation_split=0.2,
    callbacks=callbacks_list,
)

Instructions for updating:
Use tf.cast instead.
Train on 2164 samples, validate on 541 samples
Epoch 1/25

Epoch 00001: val_loss improved from inf to 0.00088, saving model to Weights-001--0.00088.hdf5
Epoch 2/25

Epoch 00002: val_loss improved from 0.00088 to 0.00054, saving model to Weights-002--0.00054.hdf5
Epoch 3/25

Epoch 00003: val_loss improved from 0.00054 to 0.00039, saving model to Weights-003--0.00039.hdf5
Epoch 4/25

Epoch 00004: val_loss improved from 0.00039 to 0.00027, saving model to Weights-004--0.00027.hdf5
Epoch 5/25

Epoch 00005: val_loss did not improve from 0.00027
Epoch 6/25

Epoch 00006: val_loss did not improve from 0.00027
Epoch 7/25

Epoch 00007: val_loss did not improve from 0.00027
Epoch 8/25

Epoch 00008: val_loss did not improve from 0.00027
Epoch 9/25

Epoch 00009: val_loss did not improve from 0.00027
Epoch 10/25

Epoch 00010: val_loss improved from 0.00027 to 0.00026, saving model to Weights-010--0.00026.hdf5
Epoch 11/25

Epoch 00011: val_loss did not 

<keras.callbacks.callbacks.History at 0x7f9afeea3950>

In [19]:
# Run the trained network on the held-out validation features from the split above.
predictions = NN_model.predict(val_X)

In [20]:
# Root-mean-squared error of the predictions on the validation split.
val_mse = mean_squared_error(val_y, predictions)
np.sqrt(val_mse)

0.011951943930397734