In [None]:
pip install tensorflow==2.12

In [2]:
import keras
from keras import layers
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
import os
import urllib.request
import gzip

In [3]:
def load_covtype_dataset():
    '''Downloads the Cover Type dataset from UCI repository, returning a file handle'''
    CURRENT_DIR = os.getcwd()
    COVTYPE_FILENAME = 'covtype.data'
    COVTYPE_DATA_PATH = os.path.join(CURRENT_DIR, COVTYPE_FILENAME)
    COVTYPE_URL = 'https://archive.ics.uci.edu/ml/machine-learning-databases/covtype/covtype.data.gz'
    if os.path.isfile(COVTYPE_DATA_PATH):
        print('Using local cached copy in', COVTYPE_DATA_PATH)
    else:
        print('Dataset not found locally. Downloading in', COVTYPE_DATA_PATH)
        with urllib.request.urlopen(COVTYPE_URL) as response:
            with gzip.GzipFile(fileobj=response) as uncompressed, open(COVTYPE_DATA_PATH, 'wb') as out_file:
                file_header = uncompressed.read()
                out_file.write(file_header)
    return COVTYPE_DATA_PATH

In [4]:
covtype_file = load_covtype_dataset()

Using local cached copy in /content/covtype.data


In [5]:
df_covtype = pd.read_csv(covtype_file, header=None)

In [6]:
def features_renaming(df_covtype):
    '''Rename each column to meaningful labels'''
    first_fourteen_old_feature_names = df_covtype.columns[np.arange(0,14)]
    first_fourteen_new_feature_names = ['Elevation', 'Aspect', 'Slope',
                                        'Horizontal_Distance_To_Hydrology',
                                        'Vertical_Distance_To_Hydrology',
                                        'Horizontal_Distance_To_Roadways',
                                        'Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm',
                                        'Horizontal_Distance_To_Fire_Points',
                                        'Wilderness_Area_1', 'Wilderness_Area_2',
                                        'Wilderness_Area_3', 'Wilderness_Area_4']
    old_to_new_name_mapping = dict(zip(first_fourteen_old_feature_names, first_fourteen_new_feature_names))
    df_covtype.rename(columns=old_to_new_name_mapping, inplace=True)
    soil_type_old_feature_names = df_covtype.columns[np.arange(14,54)]
    soil_type_new_feature_names = ['Soil_Type_' + str(i) for i in np.arange(1,41)]
    old_to_new_name_mapping = dict(zip(soil_type_old_feature_names, soil_type_new_feature_names))
    df_covtype.rename(columns=old_to_new_name_mapping, inplace=True)
    df_covtype.rename(columns={54: 'Cover_Type'}, inplace=True)
    return df_covtype

In [7]:
df_covtype = features_renaming(df_covtype)
df_covtype.head()

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,...,Soil_Type_32,Soil_Type_33,Soil_Type_34,Soil_Type_35,Soil_Type_36,Soil_Type_37,Soil_Type_38,Soil_Type_39,Soil_Type_40,Cover_Type
0,2596,51,3,258,0,510,221,232,148,6279,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
1,2590,56,2,212,-6,390,220,235,151,6225,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
2,2804,139,9,268,65,3180,234,238,135,6121,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
3,2785,155,18,242,118,3090,238,238,122,6211,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
4,2595,45,2,153,-1,391,220,234,150,6172,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0


In [8]:
val_dataframe = df_covtype.sample(frac=0.2, random_state=1337)
train_dataframe = df_covtype.drop(val_dataframe.index)

print(
    "Using %d samples for training and %d for validation"
    % (len(train_dataframe), len(val_dataframe))
)

Using 45395 samples for training and 11349 for validation


In [9]:
def dataframe_to_dataset(df_covtype):
    df_covtype = df_covtype.copy()
    labels = df_covtype.pop("Cover_Type")
    ds = tf.data.Dataset.from_tensor_slices((dict(df_covtype), labels))
    ds = ds.shuffle(buffer_size=len(df_covtype))
    return ds


train_ds = dataframe_to_dataset(train_dataframe)
val_ds = dataframe_to_dataset(val_dataframe)

In [10]:
for x, y in train_ds.take(1):
    print("Input:", x)
    print("Target:", y)

Input: {'Elevation': <tf.Tensor: shape=(), dtype=int64, numpy=2559>, 'Aspect': <tf.Tensor: shape=(), dtype=int64, numpy=351>, 'Slope': <tf.Tensor: shape=(), dtype=int64, numpy=20>, 'Horizontal_Distance_To_Hydrology': <tf.Tensor: shape=(), dtype=int64, numpy=95>, 'Vertical_Distance_To_Hydrology': <tf.Tensor: shape=(), dtype=int64, numpy=23>, 'Horizontal_Distance_To_Roadways': <tf.Tensor: shape=(), dtype=int64, numpy=1357>, 'Hillshade_9am': <tf.Tensor: shape=(), dtype=int64, numpy=179>, 'Hillshade_Noon': <tf.Tensor: shape=(), dtype=int64, numpy=201>, 'Hillshade_3pm': <tf.Tensor: shape=(), dtype=int64, numpy=156>, 'Horizontal_Distance_To_Fire_Points': <tf.Tensor: shape=(), dtype=int64, numpy=1499>, 'Wilderness_Area_1': <tf.Tensor: shape=(), dtype=int64, numpy=0>, 'Wilderness_Area_2': <tf.Tensor: shape=(), dtype=int64, numpy=0>, 'Wilderness_Area_3': <tf.Tensor: shape=(), dtype=int64, numpy=1>, 'Wilderness_Area_4': <tf.Tensor: shape=(), dtype=int64, numpy=0>, 'Soil_Type_1': <tf.Tensor: shap

In [11]:
train_ds = train_ds.batch(32)
val_ds = val_ds.batch(32)

In [12]:
from keras.utils import FeatureSpace

feature_space = FeatureSpace(
    features={
        "Elevation": "integer_categorical",
        "Aspect": "integer_categorical",
        "Slope": "integer_categorical",
        "Horizontal_Distance_To_Hydrology": "integer_categorical",
        "Vertical_Distance_To_Hydrology": "integer_categorical",
        "Horizontal_Distance_To_Roadways": "integer_categorical",
        "Hillshade_9am": "integer_categorical",
        "Hillshade_Noon": "integer_categorical",
        "Hillshade_3pm": "integer_categorical",
        "Horizontal_Distance_To_Fire_Points": "integer_categorical",
        "Wilderness_Area_1": "integer_categorical",
        "Wilderness_Area_2": "integer_categorical",
        "Wilderness_Area_3": "integer_categorical",
        "Wilderness_Area_4": "integer_categorical",
        "Soil_Type_1": "integer_categorical",
        "Soil_Type_2": "integer_categorical",
        "Soil_Type_3": "integer_categorical",
        "Soil_Type_4": "integer_categorical",
        "Soil_Type_5": "integer_categorical",
        "Soil_Type_6": "integer_categorical",
        "Soil_Type_7": "integer_categorical",
        "Soil_Type_8": "integer_categorical",
        "Soil_Type_9": "integer_categorical",
        "Soil_Type_10": "integer_categorical",
        "Soil_Type_11": "integer_categorical",
        "Soil_Type_12": "integer_categorical",
        "Soil_Type_13": "integer_categorical",
        "Soil_Type_14": "integer_categorical",
        "Soil_Type_15": "integer_categorical",
        "Soil_Type_16": "integer_categorical",
        "Soil_Type_17": "integer_categorical",
        "Soil_Type_18": "integer_categorical",
        "Soil_Type_19": "integer_categorical",
        "Soil_Type_20": "integer_categorical",
        "Soil_Type_21": "integer_categorical",
        "Soil_Type_22": "integer_categorical",
        "Soil_Type_23": "integer_categorical",
        "Soil_Type_24": "integer_categorical",
        "Soil_Type_25": "integer_categorical",
        "Soil_Type_26": "integer_categorical",
        "Soil_Type_27": "integer_categorical",
        "Soil_Type_28": "integer_categorical",
        "Soil_Type_29": "integer_categorical",
        "Soil_Type_30": "integer_categorical",
        "Soil_Type_31": "integer_categorical",
        "Soil_Type_32": "integer_categorical",
        "Soil_Type_33": "integer_categorical",
        "Soil_Type_34": "integer_categorical",
        "Soil_Type_35": "integer_categorical",
        "Soil_Type_36": "integer_categorical",
        "Soil_Type_37": "integer_categorical",
        "Soil_Type_38": "integer_categorical",
        "Soil_Type_39": "integer_categorical",
        "Soil_Type_40": "integer_categorical",
    },
    crossing_dim=32,
    output_mode="concat",
)

In [15]:
feature_space = FeatureSpace(
    features={
        "Elevation": FeatureSpace.integer_categorical(num_oov_indices=0),
        "Aspect": FeatureSpace.integer_categorical(num_oov_indices=0),
        "Slope": FeatureSpace.integer_categorical(num_oov_indices=0),
        "Horizontal_Distance_To_Hydrology": FeatureSpace.integer_categorical(num_oov_indices=0),
        "Vertical_Distance_To_Hydrology": FeatureSpace.integer_categorical(num_oov_indices=0),
        "Horizontal_Distance_To_Roadways": FeatureSpace.integer_categorical(num_oov_indices=0),
        "Hillshade_9am": FeatureSpace.integer_categorical(num_oov_indices=0),
        "Hillshade_Noon": FeatureSpace.integer_categorical(num_oov_indices=0),
        "Hillshade_3pm": FeatureSpace.integer_categorical(num_oov_indices=0),
        "Horizontal_Distance_To_Fire_Points": FeatureSpace.integer_categorical(num_oov_indices=0),
        "Wilderness_Area_1": FeatureSpace.integer_categorical(num_oov_indices=0),
        "Wilderness_Area_2": FeatureSpace.integer_categorical(num_oov_indices=0),
        "Wilderness_Area_3": FeatureSpace.integer_categorical(num_oov_indices=0),
        "Wilderness_Area_4": FeatureSpace.integer_categorical(num_oov_indices=0),
        "Soil_Type_1": FeatureSpace.integer_categorical(num_oov_indices=0),
        "Soil_Type_2": FeatureSpace.integer_categorical(num_oov_indices=0),
        "Soil_Type_3": FeatureSpace.integer_categorical(num_oov_indices=0),
        "Soil_Type_4": FeatureSpace.integer_categorical(num_oov_indices=0),
        "Soil_Type_5": FeatureSpace.integer_categorical(num_oov_indices=0),
        "Soil_Type_6": FeatureSpace.integer_categorical(num_oov_indices=0),
        "Soil_Type_7": FeatureSpace.integer_categorical(num_oov_indices=0),
        "Soil_Type_8": FeatureSpace.integer_categorical(num_oov_indices=0),
        "Soil_Type_9": FeatureSpace.integer_categorical(num_oov_indices=0),
        "Soil_Type_10": FeatureSpace.integer_categorical(num_oov_indices=0),
        "Soil_Type_11": FeatureSpace.integer_categorical(num_oov_indices=0),
        "Soil_Type_12": FeatureSpace.integer_categorical(num_oov_indices=0),
        "Soil_Type_13": FeatureSpace.integer_categorical(num_oov_indices=0),
        "Soil_Type_14": FeatureSpace.integer_categorical(num_oov_indices=0),
        "Soil_Type_15": FeatureSpace.integer_categorical(num_oov_indices=0),
        "Soil_Type_16": FeatureSpace.integer_categorical(num_oov_indices=0),
        "Soil_Type_17": FeatureSpace.integer_categorical(num_oov_indices=0),
        "Soil_Type_18": FeatureSpace.integer_categorical(num_oov_indices=0),
        "Soil_Type_19": FeatureSpace.integer_categorical(num_oov_indices=0),
        "Soil_Type_20": FeatureSpace.integer_categorical(num_oov_indices=0),
        "Soil_Type_21": FeatureSpace.integer_categorical(num_oov_indices=0),
        "Soil_Type_22": FeatureSpace.integer_categorical(num_oov_indices=0),
        "Soil_Type_23": FeatureSpace.integer_categorical(num_oov_indices=0),
        "Soil_Type_24": FeatureSpace.integer_categorical(num_oov_indices=0),
        "Soil_Type_25": FeatureSpace.integer_categorical(num_oov_indices=0),
        "Soil_Type_26": FeatureSpace.integer_categorical(num_oov_indices=0),
        "Soil_Type_27": FeatureSpace.integer_categorical(num_oov_indices=0),
        "Soil_Type_28": FeatureSpace.integer_categorical(num_oov_indices=0),
        "Soil_Type_29": FeatureSpace.integer_categorical(num_oov_indices=0),
        "Soil_Type_30": FeatureSpace.integer_categorical(num_oov_indices=0),
        "Soil_Type_31": FeatureSpace.integer_categorical(num_oov_indices=0),
        "Soil_Type_32": FeatureSpace.integer_categorical(num_oov_indices=0),
        "Soil_Type_33": FeatureSpace.integer_categorical(num_oov_indices=0),
        "Soil_Type_34": FeatureSpace.integer_categorical(num_oov_indices=0),
        "Soil_Type_35": FeatureSpace.integer_categorical(num_oov_indices=0),
        "Soil_Type_36": FeatureSpace.integer_categorical(num_oov_indices=0),
        "Soil_Type_37": FeatureSpace.integer_categorical(num_oov_indices=0),
        "Soil_Type_38": FeatureSpace.integer_categorical(num_oov_indices=0),
        "Soil_Type_39": FeatureSpace.integer_categorical(num_oov_indices=0),
        "Soil_Type_40": FeatureSpace.integer_categorical(num_oov_indices=0),

    },
    output_mode="concat",
)

In [16]:
train_ds_with_no_labels = train_ds.map(lambda x, _: x)
feature_space.adapt(train_ds_with_no_labels)

In [17]:
for x, _ in train_ds.take(1):
    preprocessed_x = feature_space(x)
    print("preprocessed_x.shape:", preprocessed_x.shape)
    print("preprocessed_x.dtype:", preprocessed_x.dtype)

preprocessed_x.shape: (32, 14993)
preprocessed_x.dtype: <dtype: 'float32'>


In [18]:
preprocessed_train_ds = train_ds.map(
    lambda x, y: (feature_space(x), y), num_parallel_calls=tf.data.AUTOTUNE
)
preprocessed_train_ds = preprocessed_train_ds.prefetch(tf.data.AUTOTUNE)

preprocessed_val_ds = val_ds.map(
    lambda x, y: (feature_space(x), y), num_parallel_calls=tf.data.AUTOTUNE
)
preprocessed_val_ds = preprocessed_val_ds.prefetch(tf.data.AUTOTUNE)

In [37]:
dict_inputs = feature_space.get_inputs()
encoded_features = feature_space.get_encoded_features()

x = keras.layers.Dense(32, activation="relu")(encoded_features)
x = keras.layers.Dropout(0.5)(x)
predictions = keras.layers.Dense(1, activation="sigmoid")(x)

training_model = keras.Model(inputs=encoded_features, outputs=predictions)
training_model.compile(
    optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"]
)

inference_model = keras.Model(inputs=dict_inputs, outputs=predictions)

In [None]:
training_model.fit(
    preprocessed_train_ds, epochs=10, validation_data=preprocessed_val_ds
)