In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from keras.models import Model, Input
from keras.layers import Dense, Dropout
from keras.utils.np_utils import to_categorical

# Read the train and test splits from disk
train_data = pd.read_csv('sample_data/train.csv')
test_data = pd.read_csv('sample_data/test.csv')

# Tag every row with its origin so the two frames can be separated again
# after the merge; test rows receive a placeholder label of 0 so that both
# frames expose the same set of columns.
train_data['Train_Flag'] = 1
test_data['Train_Flag'] = 0
test_data['Crop_Damage'] = 0

merged_dataset = pd.concat([train_data, test_data])

# Numeric version of the ID (any leading/trailing 'F' characters removed),
# then order the merged frame by it and rebuild a clean 0..n-1 index.
id_without_f = merged_dataset['ID'].str.strip('F')
merged_dataset['ID_Value'] = id_without_f.astype('int')
merged_dataset = merged_dataset.sort_values('ID_Value').reset_index(drop=True)

In [8]:
# Add more columns
def _lagged_group_damage(frame, group_col, lag=1, window=5, fill=-999):
    """Rolling mean of earlier `Crop_Damage` values within each `group_col` group.

    Rows are ordered by `ID_Value`; inside every group the `Crop_Damage`
    series is shifted by `lag` rows (so a row never sees its own label) and
    averaged over a trailing window of up to `window` rows
    (`min_periods=1`). Positions with no earlier value are set to `fill`.

    `groupby.transform` is used instead of `groupby.apply(...).values`:
    transform always returns a Series aligned to the input index, whereas
    the layout of a transform-like `apply` result depends on the pandas
    version (group-ordered with a key level on pandas >= 2.0), which makes a
    positional `.values` assignment write values to the wrong rows.

    Returns a Series indexed like `frame`.
    """
    ordered = frame.sort_values(['ID_Value'])
    rolled = ordered.groupby(group_col)['Crop_Damage'].transform(
        lambda s: s.shift(lag).rolling(window, min_periods=1).mean()
    )
    return rolled.fillna(fill)


# NOTE(review): at this point test rows still carry the placeholder
# Crop_Damage value assigned when the frames were merged, so they take part
# in the rolling means below - confirm this is intended.
damage_group_columns = [
    'Soil_Type',
    'Estimated_Insects_Count',
    'Crop_Type',
    'Pesticide_Use_Category',
    'Season',
]

# Same column names and insertion order as before: all "<col>_Damage"
# (lag 1) columns first, then all "<col>_Damage_c2" (lag 2) columns.
for lag, suffix in ((1, '_Damage'), (2, '_Damage_c2')):
    for group_col in damage_group_columns:
        # Assignment is index-aligned, so each value lands on its own row.
        merged_dataset[group_col + suffix] = _lagged_group_damage(
            merged_dataset, group_col, lag=lag
        )

# Replace the placeholder label on test rows with the missing-value sentinel
merged_dataset.loc[merged_dataset['Train_Flag'] == 0, 'Crop_Damage'] = -999

In [9]:
# Lag features: for every base column, copy the value from the previous row
# ("_c1") and from two rows back ("_c2"). Rows with no predecessor receive
# the -999 sentinel. The loops add the columns in the order: all "_c1"
# columns first, then all "_c2" columns.
lag_source_columns = [
    'Crop_Damage',
    'Estimated_Insects_Count',
    'Crop_Type',
    'Soil_Type',
    'Pesticide_Use_Category',
    'Number_Doses_Week',
    'Number_Weeks_Used',
    'Number_Weeks_Quit',
    'Season',
]

for n_rows_back, suffix in ((1, '_c1'), (2, '_c2')):
    for source_column in lag_source_columns:
        lagged = merged_dataset[source_column].shift(periods=n_rows_back, fill_value=-999)
        merged_dataset[source_column + suffix] = lagged

In [10]:
# Split data into train and test datasets using the origin flag
train_dataset = merged_dataset[merged_dataset.Train_Flag == 1]
test_dataset = merged_dataset[merged_dataset.Train_Flag == 0]

# `columns=` already selects the axis; the former `axis=11` / `axis=10`
# arguments were not valid axis values and were only tolerated because
# pandas ignores `axis` when `columns` is given.
train_dt = train_dataset.drop(columns=['Train_Flag'])
test_dt_1 = test_dataset.drop(columns=['Train_Flag'])
test_dt = test_dt_1.drop(columns=['Crop_Damage'])

# Delete the merged dataset to save memory
del merged_dataset

missing_value = -999

# Assign the sentinel to empty cells. The lagged copies are included because
# shifting only fills the rows that have no predecessor; NaNs carried over
# from the source column are still present in them.
for column in ['Number_Weeks_Used', 'Number_Weeks_Used_c1', 'Number_Weeks_Used_c2']:
    train_dt[column] = train_dt[column].fillna(missing_value)
    test_dt[column] = test_dt[column].fillna(missing_value)

# Feature matrix and label vector for training (identifiers are not features)
dataset_X = train_dt.drop(columns=['Crop_Damage', 'ID', 'ID_Value'])
dataset_y = train_dt['Crop_Damage'].values

# Remove the ID and ID_Value columns from test dataset
test_dt = test_dt.drop(columns=['ID', 'ID_Value'])

In [11]:
# Number of output classes: (0=alive, 1=Damage due to other causes, 2=Damage due to Pesticides)
num_classes = 3

# One-hot encode the labels. The dimensionality guard keeps the cell safe to
# re-run: encoding an already one-hot array a second time would corrupt it.
if np.ndim(dataset_y) == 1:
    dataset_y = to_categorical(dataset_y, num_classes=num_classes)

# Split data into a training part and a 30% hold-out part
train_X, test_X, train_y, test_y = train_test_split(
    dataset_X, dataset_y, random_state=42, test_size=0.3, shuffle=True
)

# Build a fully connected network. The input width is taken from the feature
# matrix so the model keeps working if feature columns are added or removed.
visible_layer = Input(shape=(train_X.shape[1],))
fc_layer_1 = Dense(360, activation='relu')(visible_layer)
fc_layer_2 = Dense(247, activation='relu')(fc_layer_1)
fc_layer_3 = Dense(210, activation='relu')(fc_layer_2)
fc_layer_4 = Dense(172, activation='relu')(fc_layer_3)
dropout_layer = Dropout(rate=0.2)(fc_layer_4)
# Feed the next layer from the dropout output; previously it was connected
# to fc_layer_4 directly, which left the dropout layer out of the graph.
fc_layer_5 = Dense(120, activation='relu')(dropout_layer)
fc_layer_6 = Dense(82, activation='relu')(fc_layer_5)
fc_layer_7 = Dense(51, activation='relu')(fc_layer_6)
fc_layer_8 = Dense(28, activation='relu')(fc_layer_7)
output_layer = Dense(num_classes, activation='softmax')(fc_layer_8)

model = Model(visible_layer, output_layer)

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Fit Model
model.fit(train_X, train_y, epochs=70, batch_size=700, verbose=2)

# Evaluation on the data the model was trained on
loss_function, accuracy = model.evaluate(train_X, train_y, verbose=2)
print(f'\nLoss Function: {loss_function}')
print(f'Training Accuracy: {accuracy * 100}')

# Evaluation on the hold-out split, which the model has not seen during
# fitting; this is the figure to use when judging generalisation.
val_loss, val_accuracy = model.evaluate(test_X, test_y, verbose=2)
print(f'Validation Loss: {val_loss}')
print(f'Validation Accuracy: {val_accuracy * 100}')

# Actual prediction (run once; the class is the index of the largest probability)
prediction = model.predict(test_dt)
classes_X = np.argmax(prediction, axis=1)

# NOTE(review): predictions are written positionally; this assumes the rows
# of sample_submission.csv are in the same order as test_dt (which follows
# the ID_Value sort of the merged frame) - confirm before submitting.
submission = pd.read_csv('sample_data/sample_submission.csv')
submission['Crop_Damage'] = classes_X
# index=False keeps the DataFrame index from being written as an extra
# unnamed column in the output file.
submission.to_csv('sample_data/submit_prediction.csv', index=False)

Epoch 1/70
89/89 - 3s - loss: 1.2959 - accuracy: 0.7955 - 3s/epoch - 30ms/step
Epoch 2/70
89/89 - 2s - loss: 0.5384 - accuracy: 0.8314 - 2s/epoch - 22ms/step
Epoch 3/70
89/89 - 2s - loss: 0.5002 - accuracy: 0.8339 - 2s/epoch - 22ms/step
Epoch 4/70
89/89 - 2s - loss: 0.4865 - accuracy: 0.8316 - 2s/epoch - 22ms/step
Epoch 5/70
89/89 - 2s - loss: 0.4492 - accuracy: 0.8363 - 2s/epoch - 22ms/step
Epoch 6/70
89/89 - 2s - loss: 0.4411 - accuracy: 0.8395 - 2s/epoch - 22ms/step
Epoch 7/70
89/89 - 2s - loss: 0.4255 - accuracy: 0.8436 - 2s/epoch - 22ms/step
Epoch 8/70
89/89 - 2s - loss: 0.4175 - accuracy: 0.8478 - 2s/epoch - 22ms/step
Epoch 9/70
89/89 - 2s - loss: 0.4097 - accuracy: 0.8501 - 2s/epoch - 22ms/step
Epoch 10/70
89/89 - 2s - loss: 0.4021 - accuracy: 0.8541 - 2s/epoch - 22ms/step
Epoch 11/70
89/89 - 2s - loss: 0.3992 - accuracy: 0.8563 - 2s/epoch - 22ms/step
Epoch 12/70
89/89 - 2s - loss: 0.3941 - accuracy: 0.8593 - 2s/epoch - 22ms/step
Epoch 13/70
89/89 - 2s - loss: 0.3740 - accuracy: