In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_absolute_error

from keras.models import Sequential
from keras.layers import Dense
from keras_tuner import RandomSearch
from keras_tuner import Objective

from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import mean_absolute_error, MAE
from tensorflow.keras.regularizers import L2

# Data Loading

In [2]:
# Train and test share the same layout and are both keyed by the 'id' column.
train, test = (pd.read_csv(csv_name, index_col='id') for csv_name in ('train.csv', 'test.csv'))
# The sample submission is read with its default integer index.
ss = pd.read_csv('sample_submission.csv')

# EDA

# Combine Train and Test Data

In [3]:
# Feature-only copy of the training frame (target column 'yield' removed).
train1 = train.drop(columns='yield').copy()

In [4]:
# Stack the training features on top of the test features.
# Built directly from `train` and `test` (not from the previous value of
# `train1`) so that re-running this cell cannot append the test rows twice.
train1 = pd.concat([train.drop('yield', axis=1), test])

# Feature Engineering

In [5]:
# Independent working copy for the feature-engineering stage.
train2 = train1.copy(deep=True)

In [6]:
# Author's rationale: based on the correlation matrix (not shown in this
# notebook), these columns are highly correlated with each other and have the
# same correlation with the target, so they are removed.
# NOTE(review): 'fruitmass' and 'seeds' appear to have been considered for
# removal as well, but are currently kept.
redundant_columns = [
    'MinOfUpperTRange',
    'AverageOfUpperTRange',
    'MaxOfLowerTRange',
    'MinOfLowerTRange',
    'AverageOfLowerTRange',
    'RainingDays',
]
# Derive from `train1` rather than overwriting `train2` with itself, so the
# cell can be re-run without a KeyError for already-dropped columns.
train2 = train1.drop(columns=redundant_columns)

# Feature Scaling

In [7]:
# Independent working copy for the scaling stage.
train3 = train2.copy(deep=True)

In [8]:
# Standardise every feature column.
# The scaler is fitted on the training rows only (the first len(train) rows of
# the concatenated frame) so that no test-set statistics leak into the
# preprocessing; the same fitted transform is then applied to all rows.
sc = StandardScaler()
sc.fit(train3.iloc[:len(train)])
# Keep the original 'id' index: rebuilding the frame without `index=` would
# replace it with a positional 0..n-1 index.
train3 = pd.DataFrame(sc.transform(train3), columns=train3.columns, index=train3.index)

# Target Extraction (no transformation is applied)

In [9]:
# Regression target: the 'yield' column of the training frame.
y = train.loc[:, 'yield']

# Split the scaled data back into train and test sets

In [10]:
# The scaled frame is the training rows followed by the test rows, so split it
# by position using the number of training rows. This does not depend on the
# 'id' values being contiguous or starting at 0.
n_train = len(train)
assert len(train3) == n_train + len(test), "scaled frame must hold exactly train + test rows"
X = train3.iloc[:n_train, :].copy()
test_transformed = train3.iloc[n_train:, :]

# Define MLP model

In [11]:
def build_model(hp):
    """Build and compile an MLP regressor from a set of tunable hyperparameters.

    The network has one ReLU input layer whose input size is the number of
    columns of the global ``X``, followed by ``num_hidden_layers`` (0 to 3)
    hidden layers with tunable width and activation, and a single-unit output
    layer with no activation specified.

    Args:
        hp: Hyperparameter container providing ``Int``, ``Choice`` and
            ``Float``; the tuner passes it in.

    Returns:
        The compiled ``Sequential`` model, using mean absolute error as both
        loss and reported metric and Adam with a tunable learning rate.
    """
    n_features = X.shape[1]
    model = Sequential()

    # Input layer: width searched from 32 to 512 in steps of 32.
    first_width = hp.Int('units_input', min_value=32, max_value=512, step=32)
    model.add(Dense(units=first_width, activation='relu', input_dim=n_features))

    # Optional hidden layers; each layer index gets its own width and activation.
    depth = hp.Int('num_hidden_layers', min_value=0, max_value=3)
    for i in range(depth):
        width = hp.Int(f'units_{i}', min_value=32, max_value=512, step=32)
        act = hp.Choice(f'activation_{i}', values=['relu', 'sigmoid', 'tanh'])
        model.add(Dense(units=width, activation=act))

    model.add(Dense(1))

    # Learning rate is sampled on a log scale between 1e-4 and 1e-2.
    lr = hp.Float('learning_rate', min_value=1e-4, max_value=1e-2, sampling="log")
    model.compile(
        optimizer=Adam(learning_rate=lr),
        loss='mean_absolute_error',
        metrics=['mae'],
    )
    return model

# Zero-filled float array with the same shape as the target.
# NOTE(review): no later visible cell reads this; it is kept in case cells
# outside this view still use it.
ensemble_predictions = np.zeros_like(y, dtype=float)

# Hold out 20% of the labelled rows; the tuner scores each trial on this part.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Random search over the space defined in build_model, minimising validation MAE.
# - seed: fixes the sequence of sampled hyperparameter sets so the search can
#   be repeated (it does not by itself fix the network weight initialisation).
# - directory: relative to the working directory instead of a user-specific
#   absolute path, so the notebook runs on other machines.
tuner = RandomSearch(build_model,
                     objective=Objective("val_mae", direction="min"),
                     max_trials=100,
                     executions_per_trial=1,
                     seed=0,
                     directory='keras_tuner_dir_fold',
                     project_name='my_hyperparameter_search_fold'
                    )

tuner.search(X_train, y_train,
             epochs=20,
             batch_size=32,
             validation_data=(X_test, y_test))

Trial 100 Complete [00h 00m 09s]
val_mae: 364.314697265625

Best val_mae So Far: 355.8360900878906
Total elapsed time: 00h 24m 02s
INFO:tensorflow:Oracle triggered exit


In [12]:
# Ask the tuner for a single result and keep its only entry: the
# hyperparameter set of the best-scoring trial.
best_hyperparameters = tuner.get_best_hyperparameters(1)[0]

# Training

In [13]:
# Build a fresh model from the best hyperparameters and train it on every
# labelled row; this is the model used for the submission.
# No validation_data is passed: X_test / y_test are a subset of X / y, so a
# "validation" score on them would be measured on rows the model is trained on.
best_model = tuner.hypermodel.build(best_hyperparameters)
# Keep the History object so the training curve can be inspected later.
history = best_model.fit(X, y, epochs=20, batch_size=32)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x13af2f6c448>

In [14]:
# Show the hyperparameter values of the best trial.
# Entries for layer indices >= 'num_hidden_layers' can be listed here as well;
# build_model only reads indices below 'num_hidden_layers', so such extra
# entries are not used by the built model.
best_hyperparameters.values

{'units_input': 416,
 'num_hidden_layers': 2,
 'learning_rate': 0.0006591520561523958,
 'units_0': 512,
 'activation_0': 'tanh',
 'units_1': 256,
 'activation_1': 'relu',
 'units_2': 224,
 'activation_2': 'relu'}

# Prediction

In [28]:
# Predict on the scaled test rows and flatten the model output
# into a 1-D array of predictions.
y_pred = (
    best_model
    .predict(test_transformed)
    .flatten()
)



# Submission

In [29]:
# Pair every prediction with its original test id.
# The ids are taken from `test` itself: the index of `test_transformed` is
# rebuilt during scaling and only equals the real ids when those happen to
# match row positions. The constructor raises if the number of predictions
# differs from the number of test rows.
submission = pd.DataFrame({'id': test.index, 'yield': y_pred})

In [30]:
# Write the submission file without the DataFrame's own index column.
submission.to_csv(path_or_buf='mlp_model.csv', index=False)