### Identifica los valores discretos

In [1]:
import pandas as pd
# Small sample frame: two float columns, one integer column and one text column.
df = pd.DataFrame(
    {
        'atributo_1': [0.0, 0.5, 1.0, 0.5, 0.0],
        'atributo_2': [1.2, 2.3, 3.4, 4.5, 5.6],
        'atributo_3': [1, 2, 1, 2, 3],
        'atributo_4': ['A', 'B', 'A', 'C', 'B'],
    }
)

# A numeric column counts as discrete when it has fewer distinct values than this.
umbral_valores_unicos = 5

# Only float/int columns are candidates; the text column is never inspected.
columnas_numericas = df.select_dtypes(include=['float', 'int']).columns
atributos_discretos = [
    nombre
    for nombre in columnas_numericas
    if df[nombre].nunique() < umbral_valores_unicos
]


In [2]:
# Show the names of the numeric columns that were classified as discrete.
atributos_discretos

['atributo_1', 'atributo_3']

### Identifica instancias como outliers

In [4]:
import pandas as pd

# Toy data: each column contains one value far away from the rest.
df = pd.DataFrame(
    {
        'atributo_1': [1, 2, 3, 4, 100, 6, 7, 8, 9, 10],
        'atributo_2': [10, 12, 14, 16, 18, 20, 22, 24, 26, 500],
    }
)

# Per-column quartiles and interquartile range (IQR = Q3 - Q1).
Q1, Q3 = df.quantile(0.25), df.quantile(0.75)
IQR = Q3 - Q1

# 1.5 * IQR rule: values beyond either limit are flagged, column by column.
limite_inferior = Q1 - 1.5 * IQR
limite_superior = Q3 + 1.5 * IQR
outliers = df.lt(limite_inferior) | df.gt(limite_superior)

# Keep every row that has a flagged value in at least one column.
outliers_detectados = df.loc[outliers.any(axis=1)]

print("Outliers detectados:")
print(outliers_detectados)

Outliers detectados:
   atributo_1  atributo_2
4         100          18
9          10         500


### Imputer con KNN

In [12]:
import numpy as np

from sklearn.impute import KNNImputer

# Three numeric features; atributo_1 and atributo_3 each have one missing entry.
# Row-wise view of the same data: [[1, 2, nan], [3, 4, 3], [nan, 6, 5], [8, 8, 7]]
datos_incompletos = {
    'atributo_1': [1, 3, np.nan, 8],
    'atributo_2': [2, 4, 6, 8],
    'atributo_3': [np.nan, 3, 5, 7],
}
df = pd.DataFrame(datos_incompletos)

# Each gap is filled from the 2 nearest rows.
imputer = KNNImputer(n_neighbors=2)

# The imputed values come back as a NumPy array, shown as the cell output.
imputer.fit_transform(df)


array([[1. , 2. , 4. ],
       [3. , 4. , 3. ],
       [5.5, 6. , 5. ],
       [8. , 8. , 7. ]])

### Ajusta datos limpios a tu modelo
#### Regresión lineal

In [19]:

from sklearn.linear_model import LinearRegression 
# Fit a linear regression model on the prepared features and their labels.
# NOTE(review): `housing_prepared` and `housing_labels` are not created anywhere in
# the visible notebook, and the last recorded run of this cell ended in a NameError;
# the data-preparation step has to run before this cell.
lin_reg = LinearRegression()
lin_reg.fit(housing_prepared, housing_labels)

NameError: name 'housing_prepared' is not defined

In [17]:
from sklearn.metrics import mean_squared_error

In [None]:
# Predictions are made on the same data the model was fitted on,
# so the error computed here is an in-sample (training) error.
housing_predictions = lin_reg.predict(housing_prepared)
lin_mse = mean_squared_error(housing_labels,housing_predictions)
# The square root turns the MSE into an RMSE, expressed in the units of the labels.
lin_rmse = np.sqrt(lin_mse)
lin_rmse

## Árboles de decisión

In [15]:
from sklearn.tree import DecisionTreeRegressor

In [16]:
# Fit a decision-tree regressor on the prepared features and their labels.
# random_state is pinned so that re-running the notebook grows the same tree and
# reports the same scores; without it, ties between equally good splits may be
# broken differently on each run.
# NOTE(review): `housing_prepared` and `housing_labels` are not created anywhere in
# the visible notebook (the last recorded run ended in a NameError); the
# data-preparation step has to run before this cell.
tree_reg = DecisionTreeRegressor(random_state=42)
tree_reg.fit(housing_prepared, housing_labels)

NameError: name 'housing_prepared' is not defined

In [None]:
# Predictions are made on the same data the tree was fitted on,
# so the error computed here is an in-sample (training) error.
housing_predictions = tree_reg.predict(housing_prepared)
tree_mse = mean_squared_error(housing_labels, housing_predictions)
# The square root turns the MSE into an RMSE, expressed in the units of the labels.
tree_rmse = np.sqrt(tree_mse)
tree_rmse

### Validación Cruzada

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
# 10-fold cross-validation of the decision tree.
# The scorer returns *negative* MSE values, so the sign is flipped before taking
# the square root to obtain one RMSE per fold.
scores = cross_val_score(tree_reg, housing_prepared, housing_labels, scoring="neg_mean_squared_error", cv=10)
tree_rmse_scores = np.sqrt(-scores)

In [None]:
# Show the per-fold RMSE values of the decision tree.
tree_rmse_scores

In [None]:
def display_scores(scores):
    """Print a set of scores followed by their mean and standard deviation.

    Args:
        scores: Object exposing ``mean()`` and ``std()`` methods, such as a
            NumPy array of per-fold scores.

    Returns:
        None. The three summary lines are written to standard output.
    """
    print("Scores:", scores)
    promedio = scores.mean()
    print("Mean:", promedio)
    desviacion = scores.std()
    print("Standard deviation:", desviacion)

In [None]:
# Summarize the decision tree's cross-validated RMSE (values, mean, standard deviation).
display_scores(tree_rmse_scores)

In [None]:
# 10-fold cross-validation of the linear regression model.
# The scorer returns negative MSE values, so the sign is flipped before the
# square root to obtain one RMSE per fold.
lin_scores = cross_val_score(lin_reg, housing_prepared, housing_labels, scoring="neg_mean_squared_error", cv=10)
lin_rmse_scores = np.sqrt(-lin_scores)
display_scores(lin_rmse_scores)

In [None]:
### Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Regression with a random forest, fitted on the prepared features and labels.
# random_state is pinned because the forest draws bootstrap samples and feature
# subsets at random; without a seed every run of the notebook would train a
# different model and report different scores.
forest_reg = RandomForestRegressor(random_state=42)
forest_reg.fit(housing_prepared, housing_labels)

In [None]:
# 10-fold cross-validation of the random forest.
# The scorer returns negative MSE values, so the sign is flipped before the
# square root to obtain one RMSE per fold.
forest_scores = cross_val_score(forest_reg, housing_prepared, housing_labels,
                                scoring="neg_mean_squared_error", cv=10)
forest_rmse_scores = np.sqrt(-forest_scores)
display_scores(forest_rmse_scores)

###  Guardar modelo

In [None]:
import joblib

In [None]:
# Persisting the trained forest is left disabled; uncomment to write it to disk.
#joblib.dump(forest_reg, "my_model.pkl")
# Load the model back.
# NOTE(review): joblib files are pickle-based; only load files from a trusted source.
#my_model_loaded = joblib.load("my_model.pkl")

### Búsqueda de mejores hiperparámetros

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Hyperparameter search space, expressed as two independent sub-grids.
# Sub-grid 1 leaves `bootstrap` at the estimator's default: 3 x 4 = 12 combinations.
rejilla_por_defecto = dict(
    n_estimators=[3, 10, 30],
    max_features=[2, 4, 6, 8],
)
# Sub-grid 2 switches bootstrapping off: 1 x 2 x 3 = 6 combinations.
rejilla_sin_bootstrap = dict(
    bootstrap=[False],
    n_estimators=[3, 10],
    max_features=[2, 3, 4],
)
param_grid = [rejilla_por_defecto, rejilla_sin_bootstrap]

In [None]:
# Base estimator for the search. random_state is pinned so that every candidate
# in the grid is trained with the same source of randomness; otherwise score
# differences between candidates would mix parameter effects with seed noise
# and the selected parameters could change from run to run.
forest_reg = RandomForestRegressor(random_state=42)
# 5-fold cross-validated grid search, ranked by negative mean squared error.
grid_search = GridSearchCV(forest_reg, param_grid, cv=5, scoring='neg_mean_squared_error')

In [None]:
# Run the search: every parameter combination in the grid is cross-validated
# on the prepared data, so this cell can take a while.
grid_search.fit(housing_prepared, housing_labels)

In [None]:
# Winning parameter combination and the estimator built with it.
print("Grid mejores parámetros: ", grid_search.best_params_)
print("Grid mejor estimador: ", grid_search.best_estimator_)

# One line per candidate: its RMSE followed by its parameters.
# The mean test scores are negative MSE values, hence the sign flip before the sqrt.
cvres = grid_search.cv_results_
print("Resultados de Grid search")
rmse_por_candidato = np.sqrt(-cvres["mean_test_score"])
for rmse, params in zip(rmse_por_candidato, cvres["params"]):
    print(rmse, params)