# Structured data classification 0

### The dataset

Here's the description of each feature:

## Setup

In [2]:
import os

# TensorFlow is the only backend that supports string inputs.
os.environ["KERAS_BACKEND"] = "tensorflow"

import tensorflow as tf
import pandas as pd
import keras
from keras import layers

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [4]:
import random
SEED = 2
random.seed(SEED)
np.random.seed(SEED)

## Preparing the data

Let's download the data and load it into a Pandas dataframe:

In [7]:
Kenya = pd.read_csv("./Data/Kenya_training.csv")
Spain =  pd.read_csv("./Data/Spain_training.csv")
VNM =  pd.read_csv("./Data/VNM_training.csv")
VNM.rename(columns={'Lon': 'lon', 'Lat': 'lat'}, inplace=True) # It's to allign with the other two sources.

dataframe = pd.concat([Kenya, Spain, VNM], axis=0)

dataframe = dataframe.drop(['ID'], axis=1)
dataframe['TARGET'] = dataframe['TARGET']-1


In [9]:
cols = dataframe.columns
y = dataframe.pop('TARGET')
X = dataframe
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)
dataframe_test = pd.concat([X_test, y_test], axis=1)
dataframe_test.columns = cols
dataframe = pd.concat([X_train, y_train], axis=1)
dataframe.columns = cols

The dataset includes 2825 samples with 18 columns per sample (13 features, plus the target
label):

In [12]:
print(dataframe.shape)
print(dataframe.columns)

(1892, 15)
Index(['lon', 'lat', 'blue_p50', 'green_p50', 'nir_p50', 'nira_p50', 're1_p50',
       're2_p50', 're3_p50', 'red_p50', 'swir1_p50', 'swir2_p50', 'VV_p50',
       'VH_p50', 'TARGET'],
      dtype='object')


Here's a preview of a few samples:

In [15]:
dataframe.head()

Unnamed: 0,lon,lat,blue_p50,green_p50,nir_p50,nira_p50,re1_p50,re2_p50,re3_p50,red_p50,swir1_p50,swir2_p50,VV_p50,VH_p50,TARGET
696,108.33076,11.93596,1509.5,1681.0,3345.5,3373.0,2079.0,2881.5,3179.0,1748.0,3360.0,2589.0,-11.054607,-17.094818,1
780,-2.211877,36.955029,2279.0,2705.0,3957.0,3860.0,3410.5,3571.0,3757.5,3250.0,4274.5,3644.5,-7.130405,-13.848156,1
598,-3.421818,36.801776,2114.0,2631.0,4142.0,4120.5,3617.5,3965.0,4075.5,3243.0,4090.5,3516.5,-3.006588,-7.232254,1
756,37.237998,0.075863,2762.0,3089.0,3540.5,3812.5,3477.5,3813.0,3904.5,3074.0,4099.5,3963.0,-9.193412,-15.262289,1
217,108.407296,12.022917,2240.0,2358.0,4227.5,4432.0,2690.5,3833.5,4339.0,2258.0,3858.0,2919.0,-5.666233,-10.101684,0


In [17]:
val_dataframe = dataframe.sample(frac=0.2, random_state=2)
train_dataframe = dataframe.drop(val_dataframe.index)

print(
    f"Using {len(train_dataframe)} samples for training "
    f"and {len(val_dataframe)} for validation"
)

Using 1163 samples for training and 378 for validation


Let's generate `tf.data.Dataset` objects for each dataframe:

In [20]:

def dataframe_to_dataset(dataframe):
    dataframe = dataframe.copy()
    labels = dataframe.pop("TARGET")
    ds = tf.data.Dataset.from_tensor_slices((dict(dataframe), labels))
    ds = ds.shuffle(buffer_size=len(dataframe))
    return ds


train_ds = dataframe_to_dataset(train_dataframe)
val_ds = dataframe_to_dataset(val_dataframe)

Each `Dataset` yields a tuple `(input, target)` where `input` is a dictionary of features
and `target` is the value `0` or `1`:

In [23]:
for x, y in train_ds.take(1):
    print("Input:", x)
    print("Target:", y)

Input: {'lon': <tf.Tensor: shape=(), dtype=float64, numpy=-3.000238302>, 'lat': <tf.Tensor: shape=(), dtype=float64, numpy=36.76225045>, 'blue_p50': <tf.Tensor: shape=(), dtype=float64, numpy=2568.0>, 'green_p50': <tf.Tensor: shape=(), dtype=float64, numpy=2918.0>, 'nir_p50': <tf.Tensor: shape=(), dtype=float64, numpy=5516.0>, 'nira_p50': <tf.Tensor: shape=(), dtype=float64, numpy=5707.0>, 're1_p50': <tf.Tensor: shape=(), dtype=float64, numpy=3325.0>, 're2_p50': <tf.Tensor: shape=(), dtype=float64, numpy=5076.0>, 're3_p50': <tf.Tensor: shape=(), dtype=float64, numpy=5609.0>, 'red_p50': <tf.Tensor: shape=(), dtype=float64, numpy=2902.0>, 'swir1_p50': <tf.Tensor: shape=(), dtype=float64, numpy=3671.0>, 'swir2_p50': <tf.Tensor: shape=(), dtype=float64, numpy=2688.0>, 'VV_p50': <tf.Tensor: shape=(), dtype=float64, numpy=-5.508221>, 'VH_p50': <tf.Tensor: shape=(), dtype=float64, numpy=-12.995374>}
Target: tf.Tensor(0, shape=(), dtype=int64)


2024-07-31 14:36:23.182952: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


Let's batch the datasets:

In [26]:
train_ds = train_ds.batch(32)
val_ds = val_ds.batch(32)

## Feature preprocessing with Keras layers


All features are continuous numerical features:

For each of these features, we will use a `Normalization()` layer to make sure the mean
of each feature is 0 and its standard deviation is 1.

Below, we define a utility function to do that operation:

- `encode_numerical_feature` to apply featurewise normalization to numerical features.

In [29]:

def encode_numerical_feature(feature, name, dataset):
    # Create a Normalization layer for our feature
    normalizer = layers.Normalization()

    # Prepare a Dataset that only yields our feature
    feature_ds = dataset.map(lambda x, y: x[name])
    feature_ds = feature_ds.map(lambda x: tf.expand_dims(x, -1))

    # Learn the statistics of the data
    normalizer.adapt(feature_ds)

    # Normalize the input feature
    encoded_feature = normalizer(feature)
    return encoded_feature

## Build a model

With this done, we can create our end-to-end model:

### Adaptative Learning Rate

In [37]:
class AdaptiveLearningRateScheduler(tf.keras.optimizers.schedules.LearningRateSchedule):
    def __init__(self, initial_learning_rate, decay_steps, decay_rate):
        self.initial_learning_rate = tf.cast(initial_learning_rate, tf.float32)
        self.decay_steps = tf.cast(decay_steps, tf.float32)
        self.decay_rate = tf.cast(decay_rate, tf.float32)

    def __call__(self, step):
        step = tf.cast(step, tf.float32)  # Ensure step is also float32
        return self.initial_learning_rate * tf.math.pow(self.decay_rate, (step / self.decay_steps))


# Example usage
initial_learning_rate = 0.001
decay_steps = 1000
decay_rate = 0.96

lr_schedule = AdaptiveLearningRateScheduler(initial_learning_rate, decay_steps, decay_rate)

early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',    # Metric to monitor
    patience=1000,           # Number of epochs with no improvement after which training will be stopped
    restore_best_weights=True # Restore model weights from the epoch with the best value of the monitored quantity
)


In [39]:
 # Numerical features
lon = keras.Input(shape=(1,), name="lon")
lat = keras.Input(shape=(1,), name="lat")
blue_p50 = keras.Input(shape=(1,), name="blue_p50")
green_p50 = keras.Input(shape=(1,), name="green_p50")
nir_p50 = keras.Input(shape=(1,), name="nir_p50")
nira_p50 = keras.Input(shape=(1,), name="nira_p50")
re1_p50 = keras.Input(shape=(1,), name="re1_p50")
re2_p50 = keras.Input(shape=(1,), name="re2_p50")
re3_p50 = keras.Input(shape=(1,), name="re3_p50")
red_p50 = keras.Input(shape=(1,), name="red_p50")
swir1_p50 = keras.Input(shape=(1,), name="swir1_p50")
swir2_p50 = keras.Input(shape=(1,), name="swir2_p50")
VV_p50 = keras.Input(shape=(1,), name="VV_p50")
VH_p50 = keras.Input(shape=(1,), name="VH_p50")

all_inputs = [
    lon,
    lat,
    blue_p50,
    green_p50,
    nir_p50,
    nira_p50,
    re1_p50,
    re2_p50,
    re3_p50,
    red_p50,
    swir1_p50,
    swir2_p50,
    VV_p50,
    VH_p50
]

# Integer categorical features
lon_encoded = encode_numerical_feature(lon, "lon", train_ds)
lat_encoded = encode_numerical_feature(lat, "lat", train_ds)
blue_p50_encoded = encode_numerical_feature(blue_p50, "blue_p50", train_ds)
green_p50_encoded = encode_numerical_feature(green_p50, "green_p50", train_ds)
nir_p50_encoded = encode_numerical_feature(nir_p50, "nir_p50", train_ds)
nira_p50_encoded = encode_numerical_feature(nira_p50, "nira_p50", train_ds)
re1_p50_encoded = encode_numerical_feature(re1_p50, "re1_p50", train_ds)
re2_p50_encoded = encode_numerical_feature(re2_p50, "re2_p50", train_ds)
re3_p50_encoded = encode_numerical_feature(re3_p50, "re3_p50", train_ds)
red_p50_encoded = encode_numerical_feature(red_p50, "red_p50", train_ds)
swir1_p50_encoded = encode_numerical_feature(swir1_p50, "swir1_p50", train_ds)
swir2_p50_encoded = encode_numerical_feature(swir2_p50, "swir2_p50", train_ds)
VV_p50_encoded = encode_numerical_feature(VV_p50, "VV_p50", train_ds)
VH_p50_encoded = encode_numerical_feature(VH_p50, "VH_p50", train_ds)

all_features = layers.concatenate(
    [
        lon_encoded,
        lat_encoded,
        blue_p50_encoded,
        green_p50_encoded,
        nir_p50_encoded,
        nira_p50_encoded,
        re1_p50_encoded,
        re2_p50_encoded,
        re3_p50_encoded,
        red_p50_encoded,
        swir1_p50_encoded,
        swir2_p50_encoded,
        VV_p50_encoded,
        VH_p50
    ]
)
x1 = layers.Dense(64, activation="relu")(all_features)
x1 = layers.Dropout(0.5)(x1)
x2 = layers.Dense(64, activation="relu")(all_features)
x2 = layers.Dropout(0.5)(x2)
x3 = layers.Dense(64, activation="relu")(all_features)
x3 = layers.Dropout(0.5)(x3)
x4 = layers.Dense(64, activation="relu")(all_features)
x4 = layers.Dropout(0.5)(x4)
x5 = layers.Dense(64, activation="relu")(all_features)
x5 = layers.Dropout(0.5)(x5)
x6 = layers.Dense(64, activation="relu")(all_features)
x6 = layers.Dropout(0.5)(x6)
x7 = layers.Dense(64, activation="relu")(all_features)
x7 = layers.Dropout(0.5)(x7)
x8 = layers.Dense(64, activation="relu")(all_features)
x8 = layers.Dropout(0.5)(x3)

x1 = layers.concatenate((x1,x2))
x2 = layers.concatenate((x3,x4))
x3 = layers.concatenate((x5,x6))
x4 = layers.concatenate((x6,x7))

x1 = layers.Dense(128, activation="relu")(x1)
x1 = layers.Dropout(0.5)(x1)
x2 = layers.Dense(128, activation="relu")(x2)
x2 = layers.Dropout(0.5)(x2)
x3 = layers.Dense(128, activation="relu")(x3)
x3 = layers.Dropout(0.5)(x3)
x4 = layers.Dense(128, activation="relu")(x4)
x4 = layers.Dropout(0.5)(x4)

x1 = layers.concatenate((x1,x2))
x2 = layers.concatenate((x3,x4))

x1 = layers.Dense(256, activation="relu")(x1)
x1 = layers.Dropout(0.5)(x1)
x2 = layers.Dense(256, activation="relu")(x2)
x2 = layers.Dropout(0.5)(x2)

x = layers.concatenate((x1,x2))

x = layers.Dense(512, activation="relu")(x)
x = layers.Dropout(0.5)(x)

x = layers.Dense(128, activation="relu")(x)
x = layers.Dropout(0.5)(x)

output = layers.Dense(1, activation="sigmoid")(x)
model = keras.Model(all_inputs, output)

2024-07-31 13:50:26.666407: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2024-07-31 13:50:26.700524: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2024-07-31 13:50:26.732850: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2024-07-31 13:50:26.764712: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2024-07-31 13:50:26.796204: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2024-07-31 13:50:26.827322: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2024-07-31 13:50:26.858635: W tensorflow/core/framework/local_rendezvous.cc:404] L

In [41]:
model.compile(tf.keras.optimizers.Adam(),"binary_crossentropy", metrics=["accuracy"])

Let's visualize our connectivity graph:

## Train the model

In [None]:
model.fit(train_ds, epochs=10, validation_data=val_ds, callbacks=[early_stopping], verbose=2)

Epoch 1/10


## Inference on new data

To get a prediction for a new sample, you can simply call `model.predict()`. There are
just two things you need to do:

1. wrap scalars into a list so as to have a batch dimension (models only process batches
of data, not single samples)
2. Call `convert_to_tensor` on each feature

In [34]:
from tensorflow.keras.models import load_model
model = load_model('StructData_Model0.h5')

  super().__init__(**kwargs)


sample = {
    "age": 60,
    "sex": 1,
    "cp": 1,
    "trestbps": 145,
    "chol": 233,
    "fbs": 1,
    "restecg": 2,
    "thalach": 150,
    "exang": 0,
    "oldpeak": 2.3,
    "slope": 3,
    "ca": 0,
    "thal": "fixed",
}

input_dict = {name: tf.convert_to_tensor([value]) for name, value in sample.items()}
predictions = model.predict(input_dict)



### Test Eval

In [None]:
from sklearn.metrics import accuracy_score

y = dataframe_test['TARGET']
X = dict(dataframe_test.drop(['TARGET'], axis=1))

pred = model.predict(X).astype(int)
pred = pred.reshape(-1)
accuracy_score(y, pred)

In [None]:
Kenya = pd.read_csv("Kenya_testing.csv")
Spain =  pd.read_csv("Spain_validation.csv")
VNM =  pd.read_csv("VNM_testing.csv")
VNM.rename(columns={'Lon': 'lon', 'Lat': 'lat'}, inplace=True) # It's to allign with the other two sources.

dataframe = pd.concat([Kenya, Spain, VNM], axis=0)


In [None]:
#    dataframe = dataframe.copy()
#    ds = tf.data.Dataset.from_tensor_slices((dict(dataframe)))
#    ds = ds.shuffle(buffer_size=len(dataframe))
#    return ds
ds = dict(dataframe)
#

In [None]:
predictions = model.predict(ds).astype(int)



In [None]:
predictions[0][0]

1

In [None]:
predictions[0]

array([1.], dtype=float32)