# Deep Learning Project

Create three datasets: training, validation, and test. Save the newly created sets in a tensor-friendly format (e.g. *.npz).
Since we are dealing with real-life data, it needs some preprocessing. The dataset loaded below has already been preprocessed, so only label encoding, standardization, and splitting remain.

In [71]:
import numpy as np
import pandas as pd

# We will use the sklearn preprocessing library, as it will be easier to standardize the data.
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder

# Load the preprocessed yield data from CSV.
data = pd.read_csv('yield_preprocessed.csv')

# Encode the categorical "Item" column as numeric labels.
l = LabelEncoder()
Item = l.fit_transform(data["Item"])

# Replace the text column with its encoded version. `assign` appends the new
# column at the end, so "Item" becomes the last column of the frame.
data_preprocessed = data.drop(columns=['Item']).assign(Item=Item)

# Inputs are every column except the last; the target is the last column
# (kept two-dimensional by slicing with -1: rather than indexing with -1).
# NOTE(review): the last column is the encoded "Item", so that is what gets
# used as the target -- confirm this is intended.
unscaled_inputs_all = data_preprocessed.iloc[:, :-1]
targets_all = data_preprocessed.iloc[:, -1:]

# Standardize the inputs

In [72]:
# Standardize the inputs with sklearn's preprocessing module.
# The default arguments are spelled out: scale column-wise (axis=0), centre
# each column on its mean and divide by its standard deviation.
scaled_inputs = preprocessing.scale(
    unscaled_inputs_all,
    axis=0,
    with_mean=True,
    with_std=True,
)

### Split the data into train, validation, and test

In [73]:
# Count the total number of samples.
samples_count = scaled_inputs.shape[0]

# Shuffle before splitting. The subsets below are contiguous slices, so they
# are only representative if the row order is random, and nothing earlier in
# this notebook shuffles the data. The same permutation is applied to inputs
# and targets so that rows stay aligned; the fixed seed keeps the split
# reproducible between runs.
SHUFFLE_SEED = 42
shuffled_indices = np.random.default_rng(SHUFFLE_SEED).permutation(samples_count)
shuffled_inputs = scaled_inputs[shuffled_indices]
shuffled_targets = targets_all.iloc[shuffled_indices]

# Count the samples in each subset for an 80-10-10 distribution of training,
# validation, and test. int() truncates, so the counts are whole numbers.
train_samples_count = int(0.8 * samples_count)
validation_samples_count = int(0.1 * samples_count)

# The test subset takes all remaining samples, which also absorbs whatever
# the two truncations above dropped.
test_samples_count = samples_count - train_samples_count - validation_samples_count

# Row position at which the validation subset ends and the test subset begins.
validation_end = train_samples_count + validation_samples_count

# Training: the first `train_samples_count` shuffled observations.
train_inputs = shuffled_inputs[:train_samples_count]
train_targets = shuffled_targets.iloc[:train_samples_count]

# Validation: the next `validation_samples_count` shuffled observations.
validation_inputs = shuffled_inputs[train_samples_count:validation_end]
validation_targets = shuffled_targets.iloc[train_samples_count:validation_end]

# Test: everything that remains.
test_inputs = shuffled_inputs[validation_end:]
test_targets = shuffled_targets.iloc[validation_end:]


# Save the datasets as .npz

In [74]:
# Write each subset to its own *.npz archive (np.savez appends the ".npz"
# extension to the given name). Every archive stores its arrays under the
# same two keys, "inputs" and "targets", and the file names follow one
# pattern, so the three files can be loaded the same way.
for _file_name, _split_inputs, _split_targets in (
    ('data_train', train_inputs, train_targets),
    ('data_validation', validation_inputs, validation_targets),
    ('data_test', test_inputs, test_targets),
):
    np.savez(_file_name, inputs=_split_inputs, targets=_split_targets)