In [1]:
from feature_engine.outliers import OutlierTrimmer


def _make_trimmer(variables, capping_method, fold):
    """
    Build an unfitted ``OutlierTrimmer`` that trims both tails.

    Parameters
    ----------
    variables : list of str
        Columns inspected for outliers.
    capping_method : str
        Passed through to ``OutlierTrimmer`` (``'iqr'``, ``'gaussian'`` or
        ``'quantiles'`` are the values used in this notebook).
    fold : float
        Passed through to ``OutlierTrimmer``; its meaning depends on
        ``capping_method``.
    """
    return OutlierTrimmer(
        variables=variables,
        capping_method=capping_method,
        tail='both',
        fold=fold,
    )


def _build_trimmer(variables, capping_method, fold, data_frame):
    """
    Fit a two-tailed trimmer on ``data_frame`` and return the trimmed frame.

    The thresholds are learned from ``data_frame`` itself (``fit_transform``).
    """
    return _make_trimmer(variables, capping_method, fold).fit_transform(data_frame)


def _build_split_trimmer(variables, capping_method, fold, x_train, x_test, y_train, y_test):
    """
    Fit a two-tailed trimmer on ``x_train`` only and apply it to both splits.

    The targets are re-aligned to the surviving rows through the index labels
    of the trimmed feature frames, so ``y_train``/``y_test`` must share their
    index with ``x_train``/``x_test``.

    Returns
    -------
    tuple
        ``(x_train_trimmed, x_test_trimmed, y_train_trimmed, y_test_trimmed)``.
    """
    trimmer = _make_trimmer(variables, capping_method, fold)
    trimmer.fit(x_train)
    x_train_trimmed = trimmer.transform(x_train)
    x_test_trimmed = trimmer.transform(x_test)
    # `.loc` makes the label-based lookup explicit and also works when the
    # targets are DataFrames (plain `y[index]` would select columns there).
    y_train_trimmed = y_train.loc[x_train_trimmed.index]
    y_test_trimmed = y_test.loc[x_test_trimmed.index]
    return x_train_trimmed, x_test_trimmed, y_train_trimmed, y_test_trimmed


def trimmer_skewed_iqr(variables, data_frame):
    """
    Apply a default OutlierTrimmer for skewed distributions.
    - IQR method based (``capping_method='iqr'``, ``fold=1.5``).

    Returns ``data_frame`` after trimming on ``variables``.
    """
    return _build_trimmer(variables, 'iqr', 1.5, data_frame)


def trimmer_normal_gaussian(variables, data_frame):
    """
    Apply a default OutlierTrimmer for normal distributions.
    - Gaussian method based (``capping_method='gaussian'``, ``fold=3``).

    Returns ``data_frame`` after trimming on ``variables``.
    """
    return _build_trimmer(variables, 'gaussian', 3, data_frame)


def trimmer_normal_quantile(variables, data_frame):
    """
    Apply a default OutlierTrimmer for normal distributions.
    - Quantile range based (``capping_method='quantiles'``, ``fold=0.01``);
    - It's more aggressive than the Gaussian variant.

    Returns ``data_frame`` after trimming on ``variables``.
    """
    return _build_trimmer(variables, 'quantiles', 0.01, data_frame)


def trimmer_skewed_iqr_split(variables, x_train, x_test, y_train, y_test):
    """
    Train/test-aware counterpart of ``trimmer_skewed_iqr``.

    The trimmer is fitted on ``x_train`` only and then applied to both splits;
    returns ``(x_train, x_test, y_train, y_test)`` restricted to the kept rows.
    """
    return _build_split_trimmer(variables, 'iqr', 1.5, x_train, x_test, y_train, y_test)


def trimmer_normal_gaussian_split(variables, x_train, x_test, y_train, y_test):
    """
    Train/test-aware counterpart of ``trimmer_normal_gaussian``.

    The trimmer is fitted on ``x_train`` only and then applied to both splits;
    returns ``(x_train, x_test, y_train, y_test)`` restricted to the kept rows.
    """
    return _build_split_trimmer(variables, 'gaussian', 3, x_train, x_test, y_train, y_test)


def trimmer_normal_quantile_split(variables, x_train, x_test, y_train, y_test):
    """
    Train/test-aware counterpart of ``trimmer_normal_quantile``.

    The trimmer is fitted on ``x_train`` only and then applied to both splits;
    returns ``(x_train, x_test, y_train, y_test)`` restricted to the kept rows.
    """
    return _build_split_trimmer(variables, 'quantiles', 0.01, x_train, x_test, y_train, y_test)


In [7]:

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler

# Load the rental listings and rename the columns whose headers contain
# spaces or the "(R$)" currency suffix to plain snake_case identifiers.
data = pd.read_csv("datasets/houses_to_rent_v2.csv").rename(
    columns={
        "parking spaces": "parking_spaces",
        "hoa (R$)": "hoa",
        "rent amount (R$)": "rent",
        "property tax (R$)": "property_tax",
        "fire insurance (R$)": "fire_insurance",
        "total (R$)": "total",
    }
)
# Summary statistics of the numeric columns (rendered as the cell output).
data.describe()

 

Unnamed: 0,area,rooms,bathroom,parking_spaces,hoa,rent,property_tax,fire_insurance,total
count,10692.0,10692.0,10692.0,10692.0,10692.0,10692.0,10692.0,10692.0,10692.0
mean,149.21792,2.506079,2.236813,1.609147,1174.022,3896.247194,366.704358,53.300879,5490.487
std,537.016942,1.171266,1.407198,1.589521,15592.31,3408.545518,3107.832321,47.768031,16484.73
min,11.0,1.0,1.0,0.0,0.0,450.0,0.0,3.0,499.0
25%,56.0,2.0,1.0,0.0,170.0,1530.0,38.0,21.0,2061.75
50%,90.0,2.0,2.0,1.0,560.0,2661.0,125.0,36.0,3581.5
75%,182.0,3.0,3.0,2.0,1237.5,5000.0,375.0,68.0,6768.0
max,46335.0,13.0,10.0,12.0,1117000.0,45000.0,313700.0,677.0,1120000.0


In [10]:

# Pre-processing: drop the `total` and `floor` columns, then remove
# duplicated rows (chained so that `data` itself is left untouched).
df = data.drop(columns=["total", "floor"]).drop_duplicates()

# Outlier removal.
# NOTE(review): the trimming thresholds are learned from the whole dataset,
# i.e. before the train/test split performed later — confirm this is
# acceptable for the evaluation.
df = trimmer_skewed_iqr(
    variables=['area', 'hoa', 'property_tax', 'fire_insurance'],
    data_frame=df,
)
df = trimmer_normal_quantile(
    variables=['rooms', 'bathroom', 'parking_spaces'],
    data_frame=df,
)

# `info()` prints its report, so it can run first; the summary table goes
# last because only the final expression of a cell is rendered.
df.info()
df.describe()

<class 'pandas.core.frame.DataFrame'>
Index: 8464 entries, 0 to 10691
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   city            8464 non-null   object
 1   area            8464 non-null   int64 
 2   rooms           8464 non-null   int64 
 3   bathroom        8464 non-null   int64 
 4   parking_spaces  8464 non-null   int64 
 5   animal          8464 non-null   object
 6   furniture       8464 non-null   object
 7   hoa             8464 non-null   int64 
 8   rent            8464 non-null   int64 
 9   property_tax    8464 non-null   int64 
 10  fire_insurance  8464 non-null   int64 
dtypes: int64(8), object(3)
memory usage: 793.5+ KB


In [None]:
# Train/test split: `rent` is the target, every other column is a feature.
X = df.drop(columns="rent")
Y = df["rent"]
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=1
)

# Feature engineering: scale the numeric columns and one-hot encode the
# categorical ones; any column not listed is passed through unchanged.
ct = make_column_transformer(
    (
        MinMaxScaler(),  # or StandardScaler(), whichever performs better
        ['area', 'rooms', 'bathroom', 'parking_spaces', 'hoa', 'property_tax', 'fire_insurance'],
    ),
    (
        OneHotEncoder(
            categories="auto",
            drop="first",  # emit k-1 dummies per feature (drop=None keeps all k)
            sparse_output=False,
            handle_unknown="error",
        ),
        ['city', 'animal', 'furniture'],
    ),
    remainder="passthrough",
)
# Fit on the training split only, then reuse the fitted transformer on the
# test split.
X_train = ct.fit_transform(X_train)
X_test = ct.transform(X_test)