In [43]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [44]:
FULL_DATASET = '../covertype.csv'
SMALL_DATASET= '../covertype_small.csv'
TRAINING_DATASET='../covertype_training.csv'
TRAINING_DATASET_WITH_MISSING = '../covertype_training_missing.csv'
EVALUATION_DATASET='../covertype_evaluation.csv'
EVALUATION_DATASET_WITH_ANOMALIES='../covertype_evaluation_anomalies.csv'
SERVING_DATASET='../covertype_serving.csv'

ORIGINAL_DATASET_PATH = 'gs://workshop-datasets/covertype/orig/covtype.data'

In [19]:
## download the dataset
# Directory of the raw data files
_data_root = './data/covertype'

In [16]:
# Path to the raw training data
_data_filepath = os.path.join(_data_root, 'covertype_train.csv')

In [17]:
# Download data
os.makedirs(_data_root, exist_ok=True)
if not os.path.isfile(_data_filepath):
    #https://archive.ics.uci.edu/ml/machine-learning-databases/covtype/
    url = 'https://docs.google.com/uc?export= \
    download&confirm={{VALUE}}&id=1lVF1BCWLH4eXXV_YOJzjR7xZjj-wAGj9'
    r = requests.get(url, allow_redirects=True, stream=True)
    open(_data_filepath, 'wb').write(r.content)

In [22]:
file_path = "./data/covertype/covertype_train.csv"  # Reemplazar con la ruta correcta
df = pd.read_csv(file_path)

In [24]:
columns = df.columns.tolist()
columns

['Elevation',
 'Aspect',
 'Slope',
 'Horizontal_Distance_To_Hydrology',
 'Vertical_Distance_To_Hydrology',
 'Horizontal_Distance_To_Roadways',
 'Hillshade_9am',
 'Hillshade_Noon',
 'Hillshade_3pm',
 'Horizontal_Distance_To_Fire_Points',
 'Wilderness_Area',
 'Soil_Type',
 'Cover_Type']

In [25]:
# Separar características y etiquetas
X = df.drop(columns=["Cover_Type"])
y = df["Cover_Type"]

In [31]:
# Convertir variables categóricas a numéricas con One-Hot Encoding
X = pd.get_dummies(X, drop_first=True)
X

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,...,Soil_Type_C7755,Soil_Type_C7756,Soil_Type_C7757,Soil_Type_C7790,Soil_Type_C8703,Soil_Type_C8707,Soil_Type_C8708,Soil_Type_C8771,Soil_Type_C8772,Soil_Type_C8776
0,2991,119,7,67,11,1015,233,234,133,1570,...,False,False,False,False,False,False,False,False,False,False
1,2876,3,18,485,71,2495,192,202,144,1557,...,False,False,True,False,False,False,False,False,False,False
2,3171,315,2,277,9,4374,213,237,162,1052,...,False,False,False,False,False,False,False,False,False,False
3,3087,342,13,190,31,4774,193,221,166,752,...,False,False,False,False,False,False,False,False,False,False
4,2835,158,10,212,41,3596,231,242,141,3280,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
116198,3150,220,16,285,47,2275,200,253,187,866,...,False,True,False,False,False,False,False,False,False,False
116199,3125,47,13,234,2,2430,224,212,120,1426,...,False,False,False,False,False,False,False,False,False,False
116200,3166,152,11,67,0,1275,234,240,136,2404,...,False,False,False,False,False,False,False,False,False,False
116201,3154,285,14,738,46,6012,181,239,198,1320,...,False,False,False,False,False,False,False,False,False,False


In [32]:
# Normalización
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [33]:
X_scaled

array([[ 0.1161673 , -0.32740029, -0.9479552 , ..., -0.16591187,
        -0.15481714, -0.12289779],
       [-0.29416093, -1.36472602,  0.51523184, ..., -0.16591187,
        -0.15481714, -0.12289779],
       [ 0.75842018,  1.4253225 , -1.61304022, ..., -0.16591187,
        -0.15481714, -0.12289779],
       ...,
       [ 0.74057982, -0.032299  , -0.41588719, ..., -0.16591187,
        -0.15481714, -0.12289779],
       [ 0.69776296,  1.1570486 , -0.01683618, ..., -0.16591187,
        -0.15481714, -0.12289779],
       [ 0.25889016, -0.26480305, -1.34700622, ..., -0.16591187,
        -0.15481714, -0.12289779]])

In [34]:
# Reducción de características con PCA
pca = PCA(n_components=10)  # Seleccionamos 10 componentes principales
X_reduced = pca.fit_transform(X_scaled)


In [35]:
# División en conjuntos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X_reduced, y, test_size=0.2, random_state=42)

In [37]:
# Entrenamiento del modelo
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

In [42]:
# Predicción y evaluación
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print(f"Precisión del modelo: {accuracy:.4f}")
print(f"Precisión: {precision:.4f}")
print(f"Sensibilidad (Recall): {recall:.4f}")
print(f"Puntaje F1: {f1:.4f}")


Precisión del modelo: 0.8605
Precisión: 0.8602
Sensibilidad (Recall): 0.8605
Puntaje F1: 0.8581


In [40]:
# Guardar el modelo y los preprocesadores
joblib.dump(model, "./models/covertype_model.pkl")
joblib.dump(scaler, "./models/scaler.pkl")
joblib.dump(pca, "./models/pca.pkl")

['./models/pca.pkl']