In [5]:
import os
import shutil
import urllib.request

import joblib
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import accuracy_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [6]:
# Directory that holds the Covertype CSV used by this notebook.
_data_root = './data/covertype'

In [7]:
# Full path of the training CSV inside the data directory.
_data_filepath = os.path.join(_data_root, 'covertype_train.csv')

In [17]:
# Download the training CSV once; the download is skipped when a local copy exists.
os.makedirs(_data_root, exist_ok=True)
if not os.path.isfile(_data_filepath):
    # Reference URL kept from the original cell (presumably the upstream dataset):
    # https://archive.ics.uci.edu/ml/machine-learning-databases/covtype/
    # The URL is built from adjacent literals so that no stray whitespace ends up
    # inside it (a backslash continuation within a string keeps the indentation).
    # NOTE(review): '{{VALUE}}' looks like an unfilled template placeholder for the
    # confirm token; it is kept as-is — confirm the intended value.
    url = (
        'https://docs.google.com/uc?export=download'
        '&confirm={{VALUE}}'
        '&id=1lVF1BCWLH4eXXV_YOJzjR7xZjj-wAGj9'
    )
    # Stream into a temporary file and rename it only after a complete download,
    # so an interrupted transfer is never mistaken for the finished CSV.
    partial_path = _data_filepath + '.part'
    try:
        # urlopen follows redirects and raises urllib.error.HTTPError on error statuses.
        with urllib.request.urlopen(url, timeout=60) as response, \
                open(partial_path, 'wb') as out_file:
            shutil.copyfileobj(response, out_file)
        os.replace(partial_path, _data_filepath)
    finally:
        # Only present when the download failed part-way through.
        if os.path.exists(partial_path):
            os.remove(partial_path)

In [8]:
# Read the training data from the same path the download step writes to,
# so the two locations cannot drift apart.
file_path = _data_filepath
df = pd.read_csv(file_path)

In [9]:
# Column names of the loaded frame as a plain Python list, shown as the cell output.
columns = list(df.columns)
columns

['Elevation',
 'Aspect',
 'Slope',
 'Horizontal_Distance_To_Hydrology',
 'Vertical_Distance_To_Hydrology',
 'Horizontal_Distance_To_Roadways',
 'Hillshade_9am',
 'Hillshade_Noon',
 'Hillshade_3pm',
 'Horizontal_Distance_To_Fire_Points',
 'Wilderness_Area',
 'Soil_Type',
 'Cover_Type']

In [10]:
# Split the frame into the target column (y) and the remaining feature columns (X).
y = df["Cover_Type"]
X = df.drop("Cover_Type", axis=1)

In [11]:
# One-hot encode the categorical feature columns; drop_first=True omits the
# first level of each encoded column.
X = pd.get_dummies(data=X, drop_first=True)
X

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,...,Soil_Type_C7755,Soil_Type_C7756,Soil_Type_C7757,Soil_Type_C7790,Soil_Type_C8703,Soil_Type_C8707,Soil_Type_C8708,Soil_Type_C8771,Soil_Type_C8772,Soil_Type_C8776
0,2991,119,7,67,11,1015,233,234,133,1570,...,0,0,0,0,0,0,0,0,0,0
1,2876,3,18,485,71,2495,192,202,144,1557,...,0,0,1,0,0,0,0,0,0,0
2,3171,315,2,277,9,4374,213,237,162,1052,...,0,0,0,0,0,0,0,0,0,0
3,3087,342,13,190,31,4774,193,221,166,752,...,0,0,0,0,0,0,0,0,0,0
4,2835,158,10,212,41,3596,231,242,141,3280,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
116198,3150,220,16,285,47,2275,200,253,187,866,...,0,1,0,0,0,0,0,0,0,0
116199,3125,47,13,234,2,2430,224,212,120,1426,...,0,0,0,0,0,0,0,0,0,0
116200,3166,152,11,67,0,1275,234,240,136,2404,...,0,0,0,0,0,0,0,0,0,0
116201,3154,285,14,738,46,6012,181,239,198,1320,...,0,0,0,0,0,0,0,0,0,0


In [12]:
# Standardize the encoded features; the fitted scaler is kept because it is
# persisted with the model at the end of the notebook.
# NOTE(review): the scaler is fit on every row, before the train/test split that
# happens later in this notebook, so test rows influence the preprocessing — consider
# splitting first.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [13]:
# Display the standardized feature array.
X_scaled

array([[ 0.1161673 , -0.32740029, -0.9479552 , ..., -0.16591187,
        -0.15481714, -0.12289779],
       [-0.29416093, -1.36472602,  0.51523184, ..., -0.16591187,
        -0.15481714, -0.12289779],
       [ 0.75842018,  1.4253225 , -1.61304022, ..., -0.16591187,
        -0.15481714, -0.12289779],
       ...,
       [ 0.74057982, -0.032299  , -0.41588719, ..., -0.16591187,
        -0.15481714, -0.12289779],
       [ 0.69776296,  1.1570486 , -0.01683618, ..., -0.16591187,
        -0.15481714, -0.12289779],
       [ 0.25889016, -0.26480305, -1.34700622, ..., -0.16591187,
        -0.15481714, -0.12289779]])

In [14]:
# Univariate feature selection: keep the 7 features with the highest f_classif score.
# NOTE(review): X_selected is only displayed here; the PCA step below is fit on
# X_scaled, not on this selection — confirm that is intended.
selector = SelectKBest(score_func=f_classif, k=7).fit(X_scaled, y)
X_selected = selector.transform(X_scaled)
X_selected

array([[ 0.1161673 , -0.90005564, -0.09208143, ..., -0.24422718,
        -0.16591187, -0.15481714],
       [-0.29416093, -0.90005564, -0.09208143, ..., -0.24422718,
        -0.16591187, -0.15481714],
       [ 0.75842018,  1.11104242, -0.09208143, ..., -0.24422718,
        -0.16591187, -0.15481714],
       ...,
       [ 0.74057982,  1.11104242, -0.09208143, ..., -0.24422718,
        -0.16591187, -0.15481714],
       [ 0.69776296,  1.11104242, -0.09208143, ..., -0.24422718,
        -0.16591187, -0.15481714],
       [ 0.25889016, -0.90005564, -0.09208143, ..., -0.24422718,
        -0.16591187, -0.15481714]])

In [15]:
# Names of the columns kept by the selector (boolean support mask applied to X's columns).
selected_features = X.columns.to_numpy()[selector.get_support()]
selected_features

array(['Elevation', 'Wilderness_Area_Rawah', 'Soil_Type_C2704',
       'Soil_Type_C2705', 'Soil_Type_C4703', 'Soil_Type_C8771',
       'Soil_Type_C8772'], dtype=object)

In [16]:
# Tabulate every encoded feature next to a flag telling whether the selector retained it.
feature_selection_result = pd.DataFrame(
    dict(Feature=X.columns, Retained=selector.get_support())
)
feature_selection_result

Unnamed: 0,Feature,Retained
0,Elevation,True
1,Aspect,False
2,Slope,False
3,Horizontal_Distance_To_Hydrology,False
4,Vertical_Distance_To_Hydrology,False
5,Horizontal_Distance_To_Roadways,False
6,Hillshade_9am,False
7,Hillshade_Noon,False
8,Hillshade_3pm,False
9,Horizontal_Distance_To_Fire_Points,False


In [17]:
# Plain-text dump of the feature/retained table, preceded by a heading line.
print("Características seleccionadas:", feature_selection_result, sep="\n")

Características seleccionadas:
                               Feature  Retained
0                            Elevation      True
1                               Aspect     False
2                                Slope     False
3     Horizontal_Distance_To_Hydrology     False
4       Vertical_Distance_To_Hydrology     False
5      Horizontal_Distance_To_Roadways     False
6                        Hillshade_9am     False
7                       Hillshade_Noon     False
8                        Hillshade_3pm     False
9   Horizontal_Distance_To_Fire_Points     False
10           Wilderness_Area_Commanche     False
11               Wilderness_Area_Neota     False
12               Wilderness_Area_Rawah      True
13                     Soil_Type_C2703     False
14                     Soil_Type_C2704      True
15                     Soil_Type_C2705      True
16                     Soil_Type_C2706     False
17                     Soil_Type_C2717     False
18                     Soil_Type_C3501

In [18]:
# Dimensionality reduction: project the standardized features onto 5 principal components.
# random_state pins the result whenever scikit-learn chooses a randomized SVD solver,
# in line with the fixed seeds used for the train/test split and the model.
# NOTE(review): PCA is fit on every row, before the train/test split in the next cell,
# so test rows influence the projection — consider splitting first.
pca = PCA(n_components=5, random_state=42)
X_reduced = pca.fit_transform(X_scaled)


In [19]:
# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_reduced,
    y,
    test_size=0.2,
    random_state=42,
)

In [20]:
# Train a 100-tree random forest on the training split (seeded for repeatability).
model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
)
model.fit(X_train, y_train)

In [21]:
# Predict on the held-out split and compute the evaluation metrics.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
# average='weighted' is passed so the multi-class scores are combined into one number.
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

# Accuracy is labelled "Exactitud" so it is not confused with the precision line below
# (it was previously printed as "Precisión del modelo").
print(f"Exactitud del modelo: {accuracy:.4f}")
print(f"Precisión: {precision:.4f}")
print(f"Sensibilidad (Recall): {recall:.4f}")
print(f"Puntaje F1: {f1:.4f}")


Precisión del modelo: 0.7957
Precisión: 0.7943
Sensibilidad (Recall): 0.7957
Puntaje F1: 0.7917


In [40]:
# Persist the trained model together with the fitted preprocessors (scaler and PCA).
# The target directory is created first so the dumps do not fail on a fresh checkout.
os.makedirs("./models", exist_ok=True)
joblib.dump(model, "./models/covertype_model.pkl")
joblib.dump(scaler, "./models/scaler.pkl")
joblib.dump(pca, "./models/pca.pkl")

['./models/pca.pkl']