In [1]:
%load_ext autoreload
%autoreload 2

In [2]:

from matplotlib import pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

import json

In [3]:
df_raw = pd.read_csv('../data/raw/data_v1.csv', sep=';')
df_raw.head()

Unnamed: 0,WEB_PROPERTY_CODE,PRICE,PRICE_ADMIN_INCLUDED,AREA,LATITUDE,LONGITUDE,ANTIQUITY,CONSTRUCTION_YEAR,BUILT_AREA,PRIVATE_AREA,GARAGE,BATHROOMS,ROOMS,FLOOR,PROPERTY_TYPE,OPERATION_TYPE,STRATUM,BEDROOMS
0,192688170,2300000000,2302200000,268.0,4.702433,-74.035299,De 16 a 30 años,,268.0,268.0,3,4,0,6.0,Apartamento,Venta,6,3
1,4828374,199999000,199999000,49.0,4.570417,-74.099243,,,49.0,42.0,0,2,0,0.0,Apartamento,Venta,2,3
2,191798455,333900000,334200000,44.0,4.646693,-74.067,,,44.0,0.0,1,1,0,5.0,Apartamento,Venta,4,1
3,192594335,460000000,460477000,90.0,4.730525,-74.062372,Más de 30 años,,90.0,90.0,1,2,3,3.0,Apartamento,Venta,5,3
4,192668352,760000000,760915000,81.0,4.675249,-74.047173,Más de 30 años,,81.0,81.0,1,2,2,0.0,Apartamento,Venta,6,2


In [4]:
y_column = ['PRICE']

numeric_features = [
    'AREA',
    'BUILT_AREA',
    'PRIVATE_AREA',
    'LATITUDE',
    'LONGITUDE',
    'FLOOR',
    'ROOMS',
    'BATHROOMS',
    'GARAGE',
]

categorical_features = [
    'ANTIQUITY',
    'STRATUM',
    'BEDROOMS'
]
all_features = numeric_features + categorical_features

df = df_raw.loc[:,y_column + all_features]

In [None]:
with open('../src/config.json', 'r') as f:
    config_dict = json.load(f)

y_column = config_dict['y_column']
numeric_features = config_dict['numeric_features']
categorical_features = config_dict['categorical_features']
all_features = numeric_features + categorical_features

### Primera Inspección

In [5]:
# Data Analysis and Quality Check
print("Dataset shape:", df.shape)
print("\nData types:")
print(df.dtypes)
print("\nMissing values:")
print(df.isnull().sum())
print("\nBasic statistics:")
df.describe().T.map(lambda x: f"{x:,.0f}")

Dataset shape: (26150, 13)

Data types:
PRICE             int64
AREA            float64
BUILT_AREA      float64
PRIVATE_AREA    float64
LATITUDE        float64
LONGITUDE       float64
FLOOR           float64
ROOMS             int64
BATHROOMS         int64
GARAGE            int64
ANTIQUITY        object
STRATUM           int64
BEDROOMS          int64
dtype: object

Missing values:
PRICE             0
AREA              0
BUILT_AREA        1
PRIVATE_AREA      7
LATITUDE          0
LONGITUDE         0
FLOOR           640
ROOMS             0
BATHROOMS         0
GARAGE            0
ANTIQUITY       563
STRATUM           0
BEDROOMS          0
dtype: int64

Basic statistics:


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
PRICE,26150,979513228,9204973425,1800000,303000000,560000000,1100000000,998000000000
AREA,26150,121,104,0,60,87,168,9206
BUILT_AREA,26149,121,104,0,60,87,168,9206
PRIVATE_AREA,26143,1199,125860,0,42,70,137,14390000
LATITUDE,26150,5,0,4,5,5,5,5
LONGITUDE,26150,-74,0,-76,-74,-74,-74,-74
FLOOR,25510,4,4,0,1,3,5,202
ROOMS,26150,2,2,0,0,2,3,13
BATHROOMS,26150,3,1,0,2,2,3,11
GARAGE,26150,1,1,0,1,1,2,11


In [6]:
for c in categorical_features:
    print(df[c].value_counts(dropna=False))
    print('-'*10)
print()
print("="*40)
print("Variables continuas")
print("="*40)
print()
for c in numeric_features:
    n_nans = df[c].isna().sum()
    if n_nans > 0:
        print(f"Total NaN in {c}: {n_nans} and {n_nans * 100 / len(df):,.2f}%")

ANTIQUITY
De 16 a 30 años    7392
De 9 a 15 años     5756
Más de 30 años     5612
De 1 a 8 años      5040
Menor a 1 año      1787
NaN                 563
Name: count, dtype: int64
----------
STRATUM
6      8660
3      5822
4      5603
5      4210
2      1026
0       601
110     206
1        22
Name: count, dtype: int64
----------
BEDROOMS
3     14773
2      6323
1      2639
4      2075
8       170
5       127
0        20
6        13
7         4
9         2
19        1
11        1
10        1
13        1
Name: count, dtype: int64
----------

Variables continuas

Total NaN in BUILT_AREA: 1 and 0.00%
Total NaN in PRIVATE_AREA: 7 and 0.03%
Total NaN in FLOOR: 640 and 2.45%


In [None]:
# TODO: CAMBIAR POR FUNCIONES DEDICADAS E SU PROPIO MODULO EN SRC

fig, axes = plt.subplots(1, 2, figsize = (16,8))

sns.histplot(x = np.log(df['AREA']), ax = axes[0])
sns.histplot(x = np.log(df['PRICE']), ax = axes[1])

In [None]:
# TODO: CAMBIAR POR FUNCIONES DEDICADAS E SU PROPIO MODULO EN SRC

fig, axes = plt.subplots(len(categorical_features), 1, figsize = (16, 8 * len(categorical_features)))
for i, var in enumerate(categorical_features):
    sns.violinplot(df, x = var, y = np.log(df['PRICE']), ax = axes[i])

### Filtros Manuales

In [7]:
# Categorical features can"t have NaN
for c in categorical_features:

    # Default Value for Nan numeric
    if pd.api.types.is_numeric_dtype(df[c]):
        df[c] = df[c].fillna(999)

    # Devault Value for Nan String
    elif pd.api.types.is_object_dtype(df[c]):
        df[c] = df[c].fillna('No se sabe')

df[categorical_features] = df[categorical_features].astype('category')

# Drop erroneous data
df = df[df['FLOOR'] != 202]
df = df[df['STRATUM'] != 101]

# Drop remaining NaN
df.dropna(inplace=True)
print(df.dtypes)
print(len(df))
df.head(3)


PRICE              int64
AREA             float64
BUILT_AREA       float64
PRIVATE_AREA     float64
LATITUDE         float64
LONGITUDE        float64
FLOOR            float64
ROOMS              int64
BATHROOMS          int64
GARAGE             int64
ANTIQUITY       category
STRATUM         category
BEDROOMS        category
dtype: object
25508


Unnamed: 0,PRICE,AREA,BUILT_AREA,PRIVATE_AREA,LATITUDE,LONGITUDE,FLOOR,ROOMS,BATHROOMS,GARAGE,ANTIQUITY,STRATUM,BEDROOMS
0,2300000000,268.0,268.0,268.0,4.702433,-74.035299,6.0,0,4,3,De 16 a 30 años,6,3
1,199999000,49.0,49.0,42.0,4.570417,-74.099243,0.0,0,2,0,No se sabe,2,3
2,333900000,44.0,44.0,0.0,4.646693,-74.067,5.0,0,1,1,No se sabe,4,1


In [None]:
# TODO: CAMBIAR POR FUNCIONES DEDICADAS E SU PROPIO MODULO EN SRC

fig, axes = plt.subplots(1, 2, figsize = (16,8))

sns.histplot(x = np.log(df['AREA']), ax = axes[0])
sns.histplot(x = np.log(df['PRICE']), ax = axes[1])

In [None]:
# TODO: CAMBIAR POR FUNCIONES DEDICADAS E SU PROPIO MODULO EN SRC

fig, axes = plt.subplots(len(categorical_features), 1, figsize = (16, 8 * len(categorical_features)))
for i, var in enumerate(categorical_features):
    sns.violinplot(df, x = var, y = np.log(df['PRICE']), ax = axes[i])

In [8]:
df.to_pickle('../data/interim/data_v2.pkl')