In [81]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer

In [82]:
df = pd.DataFrame(
    {'A': [1, np.nan, 3], 
     'B': [4, 5, np.nan],
     'C': ['madrid', 'bcn', np.nan],
     })
df.head()

Unnamed: 0,A,B,C
0,1.0,4.0,madrid
1,,5.0,bcn
2,3.0,,


In [83]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   A       2 non-null      float64
 1   B       2 non-null      float64
 2   C       2 non-null      object 
dtypes: float64(2), object(1)
memory usage: 204.0+ bytes


In [84]:
numerical_columns = df.select_dtypes(exclude=['object', 'category']).columns.to_list() # np.number alternativa
categorical_columns = df.select_dtypes(include=['object', 'category']).columns.to_list()

imputer_numerical = SimpleImputer(strategy='constant', fill_value=0)
np_numerical = imputer_numerical.fit_transform(df[numerical_columns]) # array de numpy con las codificaciones

imputer_categorical = SimpleImputer(strategy='constant', fill_value='others')
np_categorical = imputer_categorical.fit_transform(df[categorical_columns])

df_final = pd.concat(
    [
        pd.DataFrame(np_numerical, columns=numerical_columns).reset_index(drop=True), # categoricas
        pd.DataFrame(np_categorical, columns=categorical_columns).reset_index(drop=True) # numéricas
    ],
    axis=1
)
df_final.head()

Unnamed: 0,A,B,C
0,1.0,4.0,madrid
1,0.0,5.0,bcn
2,3.0,0.0,others


In [85]:
df = pd.read_csv('../../Data/AB_NYC_2019.csv').drop(['id','name','host_id','host_name','last_review'], axis=1)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48895 entries, 0 to 48894
Data columns (total 11 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   neighbourhood_group             48895 non-null  object 
 1   neighbourhood                   48895 non-null  object 
 2   latitude                        48895 non-null  float64
 3   longitude                       48895 non-null  float64
 4   room_type                       48895 non-null  object 
 5   price                           48895 non-null  int64  
 6   minimum_nights                  48895 non-null  int64  
 7   number_of_reviews               48895 non-null  int64  
 8   reviews_per_month               38843 non-null  float64
 9   calculated_host_listings_count  48895 non-null  int64  
 10  availability_365                48895 non-null  int64  
dtypes: float64(3), int64(5), object(3)
memory usage: 4.1+ MB


In [86]:
from sklearn.model_selection import train_test_split

X = df.drop('price', axis=1)
y = df['price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [87]:
df['neighbourhood'] = df['neighbourhood'].str.replace('-', '').str.replace('"', '').str.replace("'", '').str.replace(' ', '_')

KeyboardInterrupt: 

In [None]:
df.head()

Unnamed: 0,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
0,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,0.21,6,365
1,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,0.38,2,355
2,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,,1,365
3,Brooklyn,Clinton_Hill,40.68514,-73.95976,Entire home/apt,89,1,270,4.64,1,194
4,Manhattan,East_Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,0.1,1,0


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48895 entries, 0 to 48894
Data columns (total 11 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   neighbourhood_group             48895 non-null  object 
 1   neighbourhood                   48895 non-null  object 
 2   latitude                        48895 non-null  float64
 3   longitude                       48895 non-null  float64
 4   room_type                       48895 non-null  object 
 5   price                           48895 non-null  int64  
 6   minimum_nights                  48895 non-null  int64  
 7   number_of_reviews               48895 non-null  int64  
 8   reviews_per_month               38843 non-null  float64
 9   calculated_host_listings_count  48895 non-null  int64  
 10  availability_365                48895 non-null  int64  
dtypes: float64(3), int64(5), object(3)
memory usage: 4.1+ MB


In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

class OutlierToNaN(BaseEstimator, TransformerMixin):
    """
    Reemplaza valores outliers por NaN usando el método IQR de Tukey
    Funciona con DataFrames de pandas o arrays NumPy
    """
    def __init__(self, factor=1.5):
        self.factor = factor
        
    def fit(self, X, y=None):
        if isinstance(X, pd.DataFrame):
            self.is_df_ = True
            self.numeric_cols_ = X.select_dtypes(include=[np.number]).columns
            Q1 = X[self.numeric_cols_].quantile(0.25)
            Q3 = X[self.numeric_cols_].quantile(0.75)
        else:
            self.is_df_ = False
            Q1 = np.percentile(X, 25, axis=0)
            Q3 = np.percentile(X, 75, axis=0)
        
        IQR = Q3 - Q1
        self.lower_bounds_ = Q1 - (self.factor * IQR)
        self.upper_bounds_ = Q3 + (self.factor * IQR)
        return self
    
    def transform(self, X):
        X_copy = X.copy()
        if self.is_df_:
            numeric_data = X_copy[self.numeric_cols_]
            outlier_mask = (numeric_data < self.lower_bounds_) | (numeric_data > self.upper_bounds_)
            X_copy[self.numeric_cols_] = numeric_data.mask(outlier_mask, np.nan)
            return X_copy
        else:
            outlier_mask = (X_copy < self.lower_bounds_) | (X_copy > self.upper_bounds_)
            X_copy[outlier_mask] = np.nan
            return X_copy

In [None]:
# pipeline numéricas
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, OrdinalEncoder
from sklearn.metrics import r2_score

numerical_cols = df.select_dtypes(include=[np.number]).columns
print('numerical_cols', numerical_cols)
pipeline_numerical = Pipeline([
    ('outlier_nan', OutlierToNaN())
    ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
    # ('scaler', MinMaxScaler())
])

categorical_cols = df.select_dtypes(exclude=[np.number]).columns
print('categorical_cols', numerical_cols)
pipeline_categorical = Pipeline([
   ('imputer', SimpleImputer(strategy='constant', fill_value='others')),
   ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
   # ('encoder', OneHotEncoder(sparse_output=False, handle_unknown='ignore'))
])

pipeline_all = ColumnTransformer([
    ('numeric', pipeline_numerical, numerical_cols),
    ('categorical', pipeline_categorical, categorical_cols)
])

pipeline = make_pipeline(
    pipeline_all,
    # LinearRegression()
    RandomForestRegressor()
)
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
r2_score(y_test, y_pred)

numerical_cols Index(['latitude', 'longitude', 'minimum_nights', 'number_of_reviews',
       'reviews_per_month', 'calculated_host_listings_count',
       'availability_365'],
      dtype='object')
categorical_cols Index(['latitude', 'longitude', 'minimum_nights', 'number_of_reviews',
       'reviews_per_month', 'calculated_host_listings_count',
       'availability_365'],
      dtype='object')


KeyboardInterrupt: 