In [None]:
!pip install tensorflow tensorflow_data_validation tensorflow_transform ml-metadata

Collecting tensorflow_data_validation
  Downloading tensorflow_data_validation-1.16.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (19 kB)
Collecting tensorflow_transform
  Downloading tensorflow_transform-1.16.0-py3-none-any.whl.metadata (13 kB)
Collecting ml-metadata
  Downloading ml_metadata-1.16.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.0 kB)
Collecting pandas<2,>=1.0 (from tensorflow_data_validation)
  Downloading pandas-1.5.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting pyarrow<11,>=10 (from tensorflow_data_validation)
  Downloading pyarrow-10.0.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.0 kB)
Collecting pyfarmhash<0.4,>=0.2.2 (from tensorflow_data_validation)
  Downloading pyfarmhash-0.3.2.tar.gz (99 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.9/99.9 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup

In [None]:
import os
import requests
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.preprocessing import StandardScaler
from tensorflow import keras
import tensorflow as tf
import tensorflow_data_validation as tfdv
import tensorflow_transform as tft
from ml_metadata.metadata_store import metadata_store


In [None]:
# Paths of the covertype CSV variants, all relative to the parent directory.
# NOTE(review): none of these constants is referenced by the cells visible in
# this notebook — confirm they are still needed.
FULL_DATASET = '../covertype.csv'
SMALL_DATASET = '../covertype_small.csv'
TRAINING_DATASET = '../covertype_training.csv'
TRAINING_DATASET_WITH_MISSING = '../covertype_training_missing.csv'
EVALUATION_DATASET = '../covertype_evaluation.csv'
EVALUATION_DATASET_WITH_ANOMALIES = '../covertype_evaluation_anomalies.csv'
SERVING_DATASET = '../covertype_serving.csv'

# gs:// URI of the original covertype data file.
ORIGINAL_DATASET_PATH = 'gs://workshop-datasets/covertype/orig/covtype.data'

In [None]:
# Local directory that holds the raw covertype data files.
# Nothing is downloaded here; this cell only names the directory.
_data_root = './data/covertype'

In [None]:
# Path of the raw training CSV inside the `_data_root` directory.
_data_filepath = os.path.join(_data_root, 'covertype_train.csv')

In [None]:
# Download the training CSV once; the request is skipped when the file already exists.
os.makedirs(_data_root, exist_ok=True)
if not os.path.isfile(_data_filepath):
    # https://archive.ics.uci.edu/ml/machine-learning-databases/covtype/
    # (dataset reference left by the original author)
    #
    # The URL is built from adjacent string literals: a backslash continuation
    # inside the quotes would embed the next line's indentation in the query string.
    # NOTE(review): `{{VALUE}}` is sent literally (this is not an f-string) —
    # confirm that is the intended value of `confirm`.
    url = ('https://docs.google.com/uc?export=download'
           '&confirm={{VALUE}}&id=1lVF1BCWLH4eXXV_YOJzjR7xZjj-wAGj9')
    # Write to a temporary name first so an interrupted download is never
    # mistaken for a complete file by the isfile() guard on the next run.
    _partial_path = _data_filepath + '.part'
    with requests.get(url, allow_redirects=True, stream=True, timeout=60) as r:
        # Fail loudly instead of saving an HTTP error page as the CSV.
        r.raise_for_status()
        with open(_partial_path, 'wb') as f:
            # Stream the body in 1 MiB chunks rather than buffering it all in memory.
            for chunk in r.iter_content(chunk_size=1 << 20):
                f.write(chunk)
    os.replace(_partial_path, _data_filepath)

In [None]:
# Load the training CSV into a DataFrame. The path is taken from `_data_filepath`
# (the download target) so the file that is read is always the file that was fetched.
file_path = _data_filepath
df = pd.read_csv(file_path)

In [None]:
# Column names of the loaded frame as a plain Python list, echoed as the cell output.
columns = df.columns.to_list()
columns

['Elevation',
 'Aspect',
 'Slope',
 'Horizontal_Distance_To_Hydrology',
 'Vertical_Distance_To_Hydrology',
 'Horizontal_Distance_To_Roadways',
 'Hillshade_9am',
 'Hillshade_Noon',
 'Hillshade_3pm',
 'Horizontal_Distance_To_Fire_Points',
 'Wilderness_Area',
 'Soil_Type',
 'Cover_Type']

In [None]:
# Feature selection: ten predictor columns plus the label column.
# 'Wilderness_Area' and 'Soil_Type' are not part of the list.
# The order of `features` fixes the column order of X.
target = 'Cover_Type'
features = [
    'Elevation',
    'Aspect',
    'Hillshade_3pm',
    'Slope',
    'Horizontal_Distance_To_Hydrology',
    'Vertical_Distance_To_Hydrology',
    'Horizontal_Distance_To_Roadways',
    'Hillshade_9am',
    'Hillshade_Noon',
    'Horizontal_Distance_To_Fire_Points',
]

X, y = df[features], df[target]

In [None]:
# One-hot encode any categorical columns of X, dropping the first level of each.
# NOTE(review): the displayed result has the same ten columns as the `features`
# list, so this call currently appears to change nothing — confirm whether the
# categorical columns were meant to be included in X.
X = pd.get_dummies(data=X, drop_first=True)
X

Unnamed: 0,Elevation,Aspect,Hillshade_3pm,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Horizontal_Distance_To_Fire_Points
0,2991,119,133,7,67,11,1015,233,234,1570
1,2876,3,144,18,485,71,2495,192,202,1557
2,3171,315,162,2,277,9,4374,213,237,1052
3,3087,342,166,13,190,31,4774,193,221,752
4,2835,158,141,10,212,41,3596,231,242,3280
...,...,...,...,...,...,...,...,...,...,...
116198,3150,220,187,16,285,47,2275,200,253,866
116199,3125,47,120,13,234,2,2430,224,212,1426
116200,3166,152,136,11,67,0,1275,234,240,2404
116201,3154,285,198,14,738,46,6012,181,239,1320


In [None]:
# Score every column of X against y with the ANOVA F-test and keep the best 10.
# NOTE(review): the printed index lists all ten input columns, i.e. k matches
# the number of features and nothing is filtered out — confirm the intended k.
selector = SelectKBest(score_func=f_classif, k=10).fit(X, y)
X_selected = selector.transform(X)

# Map the surviving column positions back to their names.
selected_feature_indices = selector.get_support(indices=True)
selected_features = X.columns[selected_feature_indices]
print("Mejores características seleccionadas:", selected_features)

Mejores características seleccionadas: Index(['Elevation', 'Aspect', 'Hillshade_3pm', 'Slope',
       'Horizontal_Distance_To_Hydrology', 'Vertical_Distance_To_Hydrology',
       'Horizontal_Distance_To_Roadways', 'Hillshade_9am', 'Hillshade_Noon',
       'Horizontal_Distance_To_Fire_Points'],
      dtype='object')


In [None]:
# Data normalisation: fit a StandardScaler on the selected features, then apply it.
scaler = StandardScaler().fit(X_selected)
X_scaled = scaler.transform(X_selected)

In [None]:
# Compute TFDV statistics over the whole DataFrame and render them.
# No pipeline object is built in this cell; it only calls TFDV directly.
stats = tfdv.generate_statistics_from_dataframe(dataframe=df)
tfdv.visualize_statistics(stats)

In [None]:
# Infer a schema from the statistics computed above and display it.
schema = tfdv.infer_schema(statistics=stats)
tfdv.display_schema(schema)

Unnamed: 0_level_0,Type,Presence,Valency,Domain
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
'Elevation',INT,required,,-
'Aspect',INT,required,,-
'Slope',INT,required,,-
'Horizontal_Distance_To_Hydrology',INT,required,,-
'Vertical_Distance_To_Hydrology',INT,required,,-
'Horizontal_Distance_To_Roadways',INT,required,,-
'Hillshade_9am',INT,required,,-
'Hillshade_Noon',INT,required,,-
'Hillshade_3pm',INT,required,,-
'Horizontal_Distance_To_Fire_Points',INT,required,,-


Unnamed: 0_level_0,Values
Domain,Unnamed: 1_level_1
'Wilderness_Area',"'Cache', 'Commanche', 'Neota', 'Rawah'"
'Soil_Type',"'C2702', 'C2703', 'C2704', 'C2705', 'C2706', 'C2717', 'C3501', 'C3502', 'C4201', 'C4703', 'C4704', 'C4744', 'C4758', 'C5101', 'C5151', 'C6101', 'C6102', 'C6731', 'C7101', 'C7102', 'C7103', 'C7201', 'C7202', 'C7700', 'C7701', 'C7702', 'C7709', 'C7710', 'C7745', 'C7746', 'C7755', 'C7756', 'C7757', 'C7790', 'C8703', 'C8707', 'C8708', 'C8771', 'C8772', 'C8776'"


In [None]:
# Validate the statistics against the schema and display any anomalies.
# NOTE(review): `schema` was inferred from these same `stats`, so this check is
# presumably expected to report nothing — confirm whether a separate
# evaluation/serving dataset was meant to be validated here.
dataset_anomalies = tfdv.validate_statistics(statistics=stats, schema=schema)
tfdv.display_anomalies(dataset_anomalies)

In [None]:
# Data transformation
def preprocessing_fn(inputs):
    """Build the transformed feature dict for TensorFlow Transform.

    Args:
        inputs: Mapping from raw feature name to its value. It must contain
            the keys 'Elevation', 'Slope' and 'Cover_Type'.

    Returns:
        A dict with three entries: 'Elevation' passed through
        `tft.scale_to_z_score`, 'Slope' passed through `tft.scale_to_0_1`,
        and 'Cover_Type' copied over unchanged. Any other key of `inputs`
        is not included in the result.
    """
    return {
        'Elevation': tft.scale_to_z_score(inputs['Elevation']),
        'Slope': tft.scale_to_0_1(inputs['Slope']),
        'Cover_Type': inputs['Cover_Type'],
    }

In [None]:
# Helper that assigns an integer domain to a feature that does not have a domain yet.
def set_domain_if_missing(schema, feature_name, min_val, max_val):
    """Give a schema feature an int domain unless it already has some domain.

    Args:
        schema: TFDV schema; the matching feature is modified in place.
        feature_name: Name of the feature, looked up with `tfdv.get_feature`.
        min_val: Value stored in the feature's `int_domain.min`.
        max_val: Value stored in the feature's `int_domain.max`.

    Returns:
        None.
    """
    feature = tfdv.get_feature(schema, feature_name)
    # `feature.domain` is a proto string field that reads as '' when unset (it is
    # never None), so ask the `domain_info` oneof whether any kind of domain is set.
    if feature.WhichOneof('domain_info') is None:
        # Assigning through the sub-message selects `int_domain` in the oneof and
        # needs no schema_pb2 import.
        feature.int_domain.min = min_val
        feature.int_domain.max = max_val

# Expected value ranges to request for each feature: name -> (min, max).
# The helper applies a range only where it decides the feature has no domain.
_expected_int_ranges = {
    'Hillshade_9am': (0, 255),
    'Hillshade_Noon': (0, 255),
    'Slope': (0, 90),
    'Cover_Type': (0, 6),
}
for _name, (_low, _high) in _expected_int_ranges.items():
    set_domain_if_missing(schema, _name, _low, _high)

# Display the curated schema
tfdv.display_schema(schema)
print("Esquema curado correctamente.")

Unnamed: 0_level_0,Type,Presence,Valency,Domain
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
'Elevation',INT,required,,-
'Aspect',INT,required,,-
'Slope',INT,required,,-
'Horizontal_Distance_To_Hydrology',INT,required,,-
'Vertical_Distance_To_Hydrology',INT,required,,-
'Horizontal_Distance_To_Roadways',INT,required,,-
'Hillshade_9am',INT,required,,-
'Hillshade_Noon',INT,required,,-
'Hillshade_3pm',INT,required,,-
'Horizontal_Distance_To_Fire_Points',INT,required,,-


Unnamed: 0_level_0,Values
Domain,Unnamed: 1_level_1
'Wilderness_Area',"'Cache', 'Commanche', 'Neota', 'Rawah'"
'Soil_Type',"'C2702', 'C2703', 'C2704', 'C2705', 'C2706', 'C2717', 'C3501', 'C3502', 'C4201', 'C4703', 'C4704', 'C4744', 'C4758', 'C5101', 'C5151', 'C6101', 'C6102', 'C6731', 'C7101', 'C7102', 'C7103', 'C7201', 'C7202', 'C7700', 'C7701', 'C7702', 'C7709', 'C7710', 'C7745', 'C7746', 'C7755', 'C7756', 'C7757', 'C7790', 'C8703', 'C8707', 'C8708', 'C8771', 'C8772', 'C8776'"


Esquema curado correctamente.


In [None]:


# Final cell: prints a completion message only; no processing step runs here.
print("Pipeline completado exitosamente.")

Pipeline completado exitosamente.
