In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
from sklearn.feature_selection import SelectKBest, f_regression

#auxiliary
import auxilary

#KERAS
import keras
from keras.models import load_model
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import BatchNormalization
from keras.layers import Activation
from keras.layers import Dropout
from keras.callbacks import EarlyStopping
from keras.callbacks import ModelCheckpoint
from keras import optimizers
from keras import backend as K

  from numpy.core.umath_tests import inner1d


ModuleNotFoundError: No module named 'keras'

In [None]:
# Import Training Set

X_train = pd.read_csv("X_train.csv")
y_train = pd.read_csv(r"y_train.csv")

display('X_train:', X_train.shape, X_train.head())
display('y_train:', y_train.shape, y_train.head())

In [None]:
# Drop id column for X_train and y_train

X_train = X_train.drop(columns='id', axis=1)
display("X_train.head():", X_train.head())
print("X_train.shape:", X_train.shape)

print(" ")

y_train = y_train.drop(columns='id', axis=1)
display("y_train.head():", y_train.head())
print("y_train.shape:", y_train.shape)

### FILL IN NaNs

In [None]:
# Fill in data with mean

X_train.fillna(np.nanmean(X_train), inplace=True)
display(X_train.head())

# and look at NaNs

display("Total Number of NaN in X_train:", X_train.isna().sum(axis=1).sum(axis=0))
print(" ")
print("X_train.shape:", X_train.shape)

### OUTLIER DETECTION

In [None]:
# Outlier detection

X_train, y_train = auxilary.OutlierDetectionIsolationForest(X_train, y_train, percentageOutlier = 'auto')
print("Shape after outlier detection: ", X_train.shape)

### FEATURE SELECTION

In [6]:
# Select best features

# -> PROVISIONAL MANUALLY; THIS WILL BE DEFINED BY ITERATING
inputDim = 200

featureSelection = SelectKBest(f_regression, k = inputDim)
X_train = featureSelection.fit_transform(X_train, y_train)

print("Shape after feature selection: ", X_train.shape)

Shape after feature selection:  (1212, 200)


  y = column_or_1d(y, warn=True)


### TRAIN-TEST-SPLIT

In [7]:
# Split in Training & Test for Cross validation

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=14)

In [8]:
# Inspect current X_train and X_test

print("X_train.shape:", X_train.shape)
print("X_val.shape:", X_val.shape)

X_train.shape: (1090, 200)
X_val.shape: (122, 200)


### STANDARDIZE

In [9]:
total_mean_non_standardized = X_train.mean().mean()
display("This is the mean of the non-standardized dataset:", total_mean_non_standardized)

# Standardize

scaler = StandardScaler()

X_train = scaler.fit_transform(X_train)
print("Mean X_train after standardized:", X_train.mean(axis=0).mean(axis=0))

y_train = y_train.values

'This is the mean of the non-standardized dataset:'

-1.1084244930971182e+18

Mean X_train after standardized: 3.559865613457897e-16


### NEURAL NETWORK

In [10]:
# Get dimension of input to create the NN accordingly

dimensionOfInput = X_train.shape[1]
print("dimensionOfInput:", dimensionOfInput)

dimensionOfInput: 200


In [11]:
# Deep NN definition

#Create model
model = Sequential()
dropout = 0.2

#First layer        
model.add(Dense(512, input_dim = dimensionOfInput))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(Dropout(dropout))

model.add(Dense(512, use_bias=False))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(Dropout(dropout))

model.add(Dense(512, use_bias=False))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(Dropout(dropout))

model.add(Dense(1, activation = 'linear'))

# other definitions
es = EarlyStopping(monitor='val_coefficientofdetermination',mode='max',verbose=1,patience=400)
mc = ModelCheckpoint('best_model.h5',monitor='val_coefficientofdetermination',mode='max',verbose=1,save_best_only=True)

In [12]:
# Train

model.compile(loss = 'mse', optimizer = keras.optimizers.Adam(lr = 0.0005), metrics=[auxilary.coefficientofdetermination])
model.fit(X_train, y_train, validation_data = (X_val, y_val), epochs = 10, batch_size = 32, callbacks = [es, mc])
bestModel = load_model('best_model.h5', custom_objects = {'coefficientofdetermination' : auxilary.coefficientofdetermination})

Train on 1090 samples, validate on 122 samples
Epoch 1/10

Epoch 00001: val_coefficientofdetermination did not improve from -inf
Epoch 2/10

Epoch 00002: val_coefficientofdetermination did not improve from -inf
Epoch 3/10

Epoch 00003: val_coefficientofdetermination did not improve from -inf
Epoch 4/10

Epoch 00004: val_coefficientofdetermination did not improve from -inf
Epoch 5/10

Epoch 00005: val_coefficientofdetermination did not improve from -inf
Epoch 6/10

Epoch 00006: val_coefficientofdetermination did not improve from -inf
Epoch 7/10

Epoch 00007: val_coefficientofdetermination did not improve from -inf
Epoch 8/10

Epoch 00008: val_coefficientofdetermination did not improve from -inf
Epoch 9/10

Epoch 00009: val_coefficientofdetermination did not improve from -inf
Epoch 10/10

Epoch 00010: val_coefficientofdetermination did not improve from -inf


### FINAL SUBMISSION

In [13]:
# Berkays submission function

X_test = pd.read_csv(r"X_test.csv")

del X_test['id']

X_test = scaler.fit_transform(X_test)
X_test = np.nan_to_num(X_test)
X_test = featureSelection.transform(X_test)


y_predictions = bestModel.predict(X_test)
y_predictions = np.reshape(y_predictions, y_predictions.shape[0])

auxilary.createSubmissionFiles(y_predictions)

ValueError: Error when checking input: expected dense_1_input to have shape (150,) but got array with shape (200,)

In [None]:
X_test = np.nan_to_num(X_test)
print(X_test)

In [16]:
# For Final Submission

# Import Testing Set

X_test = pd.read_csv(r"X_test.csv")
print("X_test.shape:", X_test.shape)


# Drop id column from X_test

X_test = X_test.drop(columns='id', axis=1)
display(X_test.isna().sum(axis=0).sum(axis=0))

# Predict y values for X_test

y_pred = model.predict(X_test)

y_pred = pd.DataFrame(y_pred)

display(y_pred.head())
print("y_pred.shape:", y_pred.shape)

# Store values in submission format

y_submission = pd.DataFrame(columns=['id','y'])
y_submission.id = np.arange(0,776)
y_submission.y = y_pred

y_submission.to_csv("y_submission1.csv", index=False)

X_test.shape: (776, 833)


40637

ValueError: Error when checking input: expected dense_1_input to have shape (200,) but got array with shape (832,)