### Loading Data

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, LabelEncoder

## Data load
# Training inputs (weather series, other per-sample attributes) and the yield
# targets. Pickle loading is disabled explicitly for every training/test array.
weather_data = np.load('inputs_weather_train.npy', allow_pickle=False, fix_imports=True)
other_data = np.load('inputs_others_train.npy', allow_pickle=False, fix_imports=True)
yield_data = np.load('yield_train.npy', fix_imports=True, allow_pickle=False)
clusterID_genotype = np.load('clusterID_genotype.npy')

# Inputs of the hold-out set for which predictions are produced later on.
weather_test_data = np.load('inputs_weather_test.npy', allow_pickle=False, fix_imports=True)
other_test_data = np.load('inputs_others_test.npy', allow_pickle=False, fix_imports=True)


encoder = LabelEncoder()
scaler = MinMaxScaler(feature_range=(0, 1))
# Replace column 2 of the training attributes with integer label codes.
# LabelEncoder works on a 1-D label array, so the column is passed as-is;
# reshaping it to (-1, 1) only makes scikit-learn emit a DataConversionWarning
# before flattening it again.
other_data[:, 2] = encoder.fit_transform(other_data[:, 2])

  return f(*args, **kwargs)


In [2]:
# Show the first row of the non-weather training inputs to check the column
# order; index 2 is the column that was label-encoded in the loading cell.
other_data[0]  ## To check the indexing of the loaded data.

array(['3.0', '243.0', '7', '2009.0', '98.0'], dtype='<U32')

In [3]:
[length, days, prop_num] = weather_data.shape

# Number of complete 7-day weeks per sample; trailing days that do not fill a
# whole week are dropped.
weeks = days // 7

# main data part
# Sum the daily records week by week:
#   (length, days, prop_num) -> (length, weeks, 7, prop_num) -> (length, weeks, prop_num)
# The property axis is taken from the data (prop_num) instead of a literal 7,
# so the cell keeps working if the number of weather properties changes.
# The result is stored as float64, matching the previous np.zeros buffer.
weather_data1 = (
    weather_data[:, :weeks * 7, :]
    .reshape(length, weeks, 7, prop_num)
    .sum(axis=2)
    .astype(np.float64, copy=False)
)

# Flatten each sample to one row of weeks * prop_num weekly features.
weather_data3 = weather_data1.reshape(length, weeks * prop_num)
weather_df = pd.DataFrame(weather_data3)
other_df = pd.DataFrame(other_data, columns=['MG', 'Genotype_ID', 'State', 'Year', 'Location'])
yield_df = pd.DataFrame(yield_data, columns=['Yield'])

# Column order: weekly weather features, the five attribute columns, then Yield.
combined_df = pd.concat([weather_df, other_df, yield_df], axis = 1, join = 'inner')


In [4]:
# Keep the label-encoded State column aside; a later cell writes it back into
# the feature matrix after scaling.
state = combined_df['State'].values

In [5]:
# Sanity check: the container type of the extracted State values.
type(state)

numpy.ndarray

In [6]:
# Display the extracted State codes.
state

array(['7', '0', '25', ..., '6', '9', '3'], dtype=object)

In [7]:
# Features: every column but the last one. Target: the last column (Yield).
X = combined_df.iloc[:, :-1].to_numpy()
Y_real = combined_df.iloc[:, -1].to_numpy()

# Scale the yield by its maximum; max_val is reused later to convert
# predictions back to the original units.
max_val = Y_real.max()
Y = Y_real / max_val


In [9]:
# Min-max scale every feature column to [0, 1] and store the result as float32.
# NOTE(review): the scaler is fitted on all rows here, i.e. before the
# train/validation split further down, so validation rows influence the
# scaling — confirm this is intended.
X = np.asarray(scaler.fit_transform(X)).astype(np.float32)

In [10]:
# Overwrite the third-from-last feature column with the label-encoded State
# codes saved earlier. With the attribute columns ordered MG, Genotype_ID,
# State, Year, Location, index -3 is State, so this replaces the min-max
# scaled values of that column with the unscaled codes.
X[:, -3] = state

In [12]:
# 9:1 split in row order: the first 90% of rows train the model, the remaining
# rows are held out for validation.
training_size = int(0.90 * len(X))
test_size = len(X) - training_size
X_train, Y_train = X[:training_size], Y[:training_size]
X_test, Y_test = X[training_size:], Y[training_size:]

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Activation, Dropout, BatchNormalization,Bidirectional
from tensorflow.keras import optimizers
from tensorflow.keras.callbacks import ModelCheckpoint
import tensorflow as tf

def create_model(timesteps=1, n_features=215):
    """Build the uncompiled stacked-LSTM regression network.

    The network stacks three LSTM layers (512, 256 and 128 units), each
    followed by batch normalisation, then two sigmoid Dense layers (64 and 32
    units) and a single sigmoid output unit. Dropout (rate 0.1) is applied
    after the second and third LSTM stages and after both hidden Dense layers.

    Args:
        timesteps: Length of the time axis of each input sample. Defaults to 1,
            the value previously hard-coded in the first layer.
        n_features: Number of features per time step. Defaults to 215, the
            value previously hard-coded in the first layer.

    Returns:
        A ``Sequential`` model expecting inputs of shape
        ``(batch, timesteps, n_features)``; the caller compiles it.
    """
    model = Sequential()
    # First recurrent layer; it also declares the input shape.
    model.add(LSTM(512, return_sequences=True, input_shape=(timesteps, n_features), activation='tanh'))
    model.add(BatchNormalization())    # to normalize the hidden layer of neurons
    model.add(LSTM(256, return_sequences=True))
    model.add(BatchNormalization())
    model.add(Dropout(0.1))          # Dropout layer to prevent overfitting.
    # Last recurrent layer returns only its final state, feeding the Dense head.
    model.add(LSTM(128, return_sequences=False))
    model.add(BatchNormalization())
    model.add(Dropout(0.1))
    model.add(Dense(64, activation='sigmoid'))
    model.add(Dropout(0.1))
    model.add(Dense(32, activation='sigmoid'))
    model.add(Dropout(0.1))
    # Sigmoid output keeps predictions in (0, 1).
    model.add(Dense(1, activation='sigmoid'))
    # Identity activation: leaves the output unchanged. Kept so the layer
    # stack stays the same as before.
    model.add(Activation('linear'))
    return model

# Build the network and compile it with Adam (small learning rate) and MSE loss.
model = create_model()
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4, epsilon=1e-08)
model.compile(optimizer=optimizer, loss='mse')

# Save the weights to model.h5 whenever the validation loss reaches a new minimum.
checkpoint = ModelCheckpoint(
    filepath="model.h5",
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    save_weights_only=True,
    verbose=1,
)

# Insert a time axis of length 1: (samples, features) -> (samples, 1, features).
X_train = X_train.reshape(X_train.shape[0], 1, X_train.shape[1])
X_test = X_test.reshape(X_test.shape[0], 1, X_test.shape[1])

# Train for 200 epochs, validating on the held-out rows after each epoch.
model.fit(
    X_train,
    Y_train,
    validation_data=(X_test, Y_test),
    epochs=200,
    batch_size=60,
    callbacks=[checkpoint],
    shuffle=True,
)


Epoch 1/200

Epoch 00001: val_loss improved from inf to 0.00863, saving model to model.h5
Epoch 2/200

Epoch 00002: val_loss improved from 0.00863 to 0.00665, saving model to model.h5
Epoch 3/200

Epoch 00003: val_loss improved from 0.00665 to 0.00613, saving model to model.h5
Epoch 4/200

Epoch 00004: val_loss improved from 0.00613 to 0.00563, saving model to model.h5
Epoch 5/200

Epoch 00005: val_loss did not improve from 0.00563
Epoch 6/200

Epoch 00006: val_loss did not improve from 0.00563
Epoch 7/200

Epoch 00007: val_loss improved from 0.00563 to 0.00549, saving model to model.h5
Epoch 8/200

Epoch 00008: val_loss improved from 0.00549 to 0.00530, saving model to model.h5
Epoch 9/200

Epoch 00009: val_loss improved from 0.00530 to 0.00494, saving model to model.h5
Epoch 10/200

Epoch 00010: val_loss did not improve from 0.00494
Epoch 11/200

Epoch 00011: val_loss did not improve from 0.00494
Epoch 12/200

Epoch 00012: val_loss did not improve from 0.00494
Epoch 13/200

Epoch 000


Epoch 00050: val_loss improved from 0.00410 to 0.00407, saving model to model.h5
Epoch 51/200

Epoch 00051: val_loss did not improve from 0.00407
Epoch 52/200

Epoch 00052: val_loss did not improve from 0.00407
Epoch 53/200

Epoch 00053: val_loss did not improve from 0.00407
Epoch 54/200

Epoch 00054: val_loss did not improve from 0.00407
Epoch 55/200

Epoch 00055: val_loss did not improve from 0.00407
Epoch 56/200

Epoch 00056: val_loss did not improve from 0.00407
Epoch 57/200

Epoch 00057: val_loss did not improve from 0.00407
Epoch 58/200

Epoch 00058: val_loss did not improve from 0.00407
Epoch 59/200

Epoch 00059: val_loss improved from 0.00407 to 0.00406, saving model to model.h5
Epoch 60/200

Epoch 00060: val_loss did not improve from 0.00406
Epoch 61/200

Epoch 00061: val_loss improved from 0.00406 to 0.00406, saving model to model.h5
Epoch 62/200

Epoch 00062: val_loss did not improve from 0.00406
Epoch 63/200

Epoch 00063: val_loss improved from 0.00406 to 0.00405, saving m


Epoch 00100: val_loss did not improve from 0.00401
Epoch 101/200

Epoch 00101: val_loss did not improve from 0.00401
Epoch 102/200

Epoch 00102: val_loss did not improve from 0.00401
Epoch 103/200

Epoch 00103: val_loss did not improve from 0.00401
Epoch 104/200

Epoch 00104: val_loss did not improve from 0.00401
Epoch 105/200

Epoch 00105: val_loss did not improve from 0.00401
Epoch 106/200

Epoch 00106: val_loss did not improve from 0.00401
Epoch 107/200

Epoch 00107: val_loss did not improve from 0.00401
Epoch 108/200

Epoch 00108: val_loss did not improve from 0.00401
Epoch 109/200

Epoch 00109: val_loss did not improve from 0.00401
Epoch 110/200

Epoch 00110: val_loss did not improve from 0.00401
Epoch 111/200

Epoch 00111: val_loss did not improve from 0.00401
Epoch 112/200

Epoch 00112: val_loss did not improve from 0.00401
Epoch 113/200

Epoch 00113: val_loss did not improve from 0.00401
Epoch 114/200

Epoch 00114: val_loss did not improve from 0.00401
Epoch 115/200

Epoch 001

<keras.callbacks.History at 0x7f80db3be2b0>

In [13]:
import tensorflow as tf
from sklearn.metrics import mean_squared_error

# Rebuild the architecture and restore the best weights written by the
# checkpoint callback. create_model() declares its input shape in the first
# layer, so the model is already built and no explicit build() call is needed
# (the previous build(input_shape=(1, 215)) also lacked the batch dimension).
# compile() is likewise not required for predict().
model = create_model()
model.load_weights("model.h5")

# Predict on the validation rows and compare in the original yield units by
# undoing the division by max_val on both sides.
test_predict = model.predict(X_test)
# The square root of the MSE is an RMSE, so it is named and reported as such.
test_rmse = np.sqrt(mean_squared_error(Y_test * max_val, test_predict * max_val))
trainScore = test_rmse  # legacy name, kept in case other cells still read it
print('Testing RMSE', test_rmse)

Testing MSE 7.10935642240515


### Testing Dataset 

In [14]:
[length_test, days_test, prop_num_test] = weather_test_data.shape

# Number of complete 7-day weeks in the test series. Everything below is
# derived from the test array's own shape; the earlier version mixed in the
# training `days` and a literal 30 weeks.
weeks_test = days_test // 7

# Sum the daily records week by week:
#   (length_test, days_test, props) -> (length_test, weeks_test, 7, props)
#   -> (length_test, weeks_test, props)
# Trailing days that do not fill a whole week are dropped. The result is
# stored as float64, matching the previous np.zeros buffer.
weather_test_data1 = (
    weather_test_data[:, :weeks_test * 7, :]
    .reshape(length_test, weeks_test, 7, prop_num_test)
    .sum(axis=2)
    .astype(np.float64, copy=False)
)

# Flatten each sample to one row of weekly features. A test-specific name is
# used so the training array `weather_data3` is no longer overwritten.
weather_test_data3 = weather_test_data1.reshape(length_test, prop_num_test * weeks_test)
weather_test_df = pd.DataFrame(weather_test_data3)


other_test_df = pd.DataFrame(other_test_data, columns=['MG', 'Genotype_ID', 'State', 'Year', 'Location'])

In [15]:
# Display the test attribute table; at this point State still holds the raw
# state labels.
other_test_df

Unnamed: 0,MG,Genotype_ID,State,Year,Location
0,3.0,3069.0,"""IA""",2010.0,41.0
1,4.0,2526.0,"""IN""",2004.0,154.0
2,3.0,636.0,"""IA""",2014.0,41.0
3,5.0,1350.0,"""MD""",2005.0,113.0
4,3.0,2983.0,"""IL""",2006.0,148.0
...,...,...,...,...,...
10332,1.0,4511.0,"""MI""",2013.0,64.0
10333,3.0,5106.0,"""NE""",2007.0,136.0
10334,2.0,5826.0,"""IL""",2008.0,148.0
10335,1.0,5466.0,"""SD""",2005.0,10.0


In [16]:
# Map the test State labels to integer codes with the encoder fitted on the
# training states, so both sets share one label -> code mapping.
# NOTE(review): this overwrites the column in place, so re-running the cell
# would feed already-encoded values to the encoder — presumably an error;
# confirm before re-executing.
other_test_df["State"] = encoder.transform(other_test_df["State"])

In [17]:
# Keep the encoded test State codes aside; a later cell writes them back into
# the test feature matrix after scaling. The bare name displays the array.
test_states = other_test_df["State"].values
test_states

array([ 4,  6,  4, ...,  5, 24, 17])

In [18]:
# Number of test samples with an encoded state.
len(test_states)

10337

In [19]:
# Assemble the test feature table in the same column order as the training
# features: weekly weather columns first, then the five attribute columns.
combined_test_df = pd.concat(
    [weather_test_df, other_test_df],
    axis=1,
    join='inner',
)
# NOTE(review): from here on X_test refers to the hold-out feature matrix and
# no longer to the validation split created earlier in the notebook.
X_test = combined_test_df.to_numpy()


In [20]:
# Row count of the assembled test feature matrix.
len(X_test)

10337

In [21]:
from sklearn import preprocessing

# Apply the already-fitted scaler (transform only, no refit) to the test
# features and store the result as float32.
X_test = np.asarray(scaler.transform(X_test)).astype(np.float32)


In [22]:
# Put the unscaled State codes back into the third-from-last column, mirroring
# what was done to the training features.
X_test[:, -3] = test_states

# Insert a time axis of length 1: (samples, features) -> (samples, 1, features).
n_rows, n_cols = X_test.shape[0], X_test.shape[1]
X_test = X_test.reshape(n_rows, 1, n_cols)

# Predict, convert back to the original yield units, and write the result to disk.
scaled_predict = model.predict(X_test)
test_predict = scaled_predict * max_val
np.save("test_predict_new.npy", test_predict)  # save the new

In [23]:
# Writes the predictions to test_predict_new.npy; the previous cell already
# performs the same save, so this call repeats it.
np.save("test_predict_new.npy",  test_predict)  # save the new