In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error

%matplotlib inline

plt.style.use('default') # Make the graphs a bit prettier

plt.rcParams['figure.figsize'] = (12, 4)

In [2]:
#pip install -U statsmodels
import statsmodels.api as sm 

  from pandas.core import datetools


#  Preprocesamiento de Datos

El analisis exploratorio que se presenta a continuacion tiene como fin determinar que feathers influyen en mayor medida a la hora de determinar el precio de un inmueble en Capital Federal y Gran Buenos Aires, asi como tambien encontrar datos curiosos en el data set.

Los datos fueron provistos por [Properati](http://www.properati.com.ar) y reorganizo en un solo dataset de la siguiente manera : [GitHub | properati_argentina](https://github.com/BraianVicente/properati_argentina/blob/master/src/TP1%20-%20Properati%20-%20Parseo%20de%20datos%20por%20fecha.ipynb)

In [3]:
sell_properati = pd.read_csv('../data/properati_full_month.csv',low_memory=False)

In [15]:
proterati_testing_dataset =pd.read_csv('../data/properati_dataset_testing_noprice.csv',low_memory=False)

In [16]:
proterati_testing_dataset = proterati_testing_dataset[proterati_testing_dataset['operation'] == 'venta']

Transformando el formato del feature created_on para manejarlo como fecha

In [6]:
sell_properati['created_on'] = sell_properati['created_on'].apply(lambda x : pd.to_datetime(x))

In [7]:
sell_properati['year'] = sell_properati['created_on'].apply(lambda x : x.year)

In [None]:
def parse_year_month(date):
    if len(str(date.month)) == 1 :
        return str(date.year) + '-0' + str(date.month)
    return str(date.year) + '-' + str(date.month) 

In [None]:
sell_properati['year_month'] = sell_properati.created_on.apply(parse_year_month)

Completamos la columna 'state_name'

In [8]:
sell_properati['state_name'] = sell_properati.place_with_parent_names.apply(lambda x : x.split('|')[2])

Filtramos unicamente los que nos interesan que corresponde a capital federal y alrededores

In [9]:
sell_properati = sell_properati[
    (sell_properati['state_name'] == 'Bs.As. G.B.A. Zona Norte' ) |
    (sell_properati['state_name'] == 'Bs.As. G.B.A. Zona Sur' ) |
    (sell_properati['state_name'] == 'Bs.As. G.B.A. Zona Oeste' ) |
    (sell_properati['state_name'] == 'Capital Federal' ) 
    ]

In [17]:
proterati_testing_dataset = proterati_testing_dataset[
    (proterati_testing_dataset['state_name'] == 'Bs.As. G.B.A. Zona Norte' ) |
    (proterati_testing_dataset['state_name'] == 'Bs.As. G.B.A. Zona Sur' ) |
    (proterati_testing_dataset['state_name'] == 'Bs.As. G.B.A. Zona Oeste' ) |
    (proterati_testing_dataset['state_name'] == 'Capital Federal' ) 
    ]

In [18]:
proterati_testing_dataset[
    (proterati_testing_dataset['state_name'] == 'Bs.As. G.B.A. Zona Norte' ) |
    (proterati_testing_dataset['state_name'] == 'Bs.As. G.B.A. Zona Sur' ) |
    (proterati_testing_dataset['state_name'] == 'Bs.As. G.B.A. Zona Oeste' ) |
    (proterati_testing_dataset['state_name'] == 'Capital Federal' ) 
    ].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14164 entries, 0 to 14165
Data columns (total 17 columns):
id                         14164 non-null int64
created_on                 14164 non-null object
property_type              14164 non-null object
operation                  14164 non-null object
place_name                 14164 non-null object
place_with_parent_names    14164 non-null object
country_name               14164 non-null object
state_name                 14164 non-null object
lat-lon                    10486 non-null object
lat                        10486 non-null float64
lon                        10486 non-null float64
surface_total_in_m2        11851 non-null float64
surface_covered_in_m2      13004 non-null float64
floor                      1368 non-null float64
rooms                      7500 non-null float64
expenses                   2542 non-null object
description                14164 non-null object
dtypes: float64(6), int64(1), object(10)
memory usage: 1

In [19]:
testing_data_id = proterati_testing_dataset[['id']]

In [20]:
sell_properati['zone'] = sell_properati.place_with_parent_names.apply(lambda x : x.split('|')[3])

Filtramos por los datos que son reelevantes para nuestro analisis, obteniendo unicamente las propiedades de GBA y capital federal

In [21]:
sell_properati.drop(inplace=True,\
    labels=['country_name','description','extra','id', u'image_thumbnail','operation'\
            ,'place_with_parent_names_l1','place_with_parent_names_l2', 'properati_url','surface_in_m2','title',\
           ],axis=1)

In [22]:
proterati_testing_dataset.columns

Index([u'id', u'created_on', u'property_type', u'operation', u'place_name',
       u'place_with_parent_names', u'country_name', u'state_name', u'lat-lon',
       u'lat', u'lon', u'surface_total_in_m2', u'surface_covered_in_m2',
       u'floor', u'rooms', u'expenses', u'description'],
      dtype='object')

In [23]:
proterati_testing_dataset.drop(inplace=True,\
                              labels=['created_on', 'property_type', 'operation', 'place_name', \
       'place_with_parent_names', 'country_name', 'state_name', 'lat-lon', 'description'],axis=1)

Podemos seguir trabajando con los datos que no cuentan con la informacion de Zona utilizando la ubicacion para identificarlos segun el barrio al que pertenecen pero en esta ocacion decidimos dejarlos de lado pues no son una cantidad significativa de datos

#### Las propiedades del tigre son las unicas que no contienen informacion en 'place_name'

In [24]:
sell_properati.loc[pd.isnull(sell_properati['place_name']),'place_name'] = sell_properati.place_name.apply(lambda x : x if pd.notnull(x) else 'Tigre' )

In [25]:
sell_properati = sell_properati[sell_properati['zone'] != '']

In [26]:
#Calcula el precio la incognita (incognita = divisor/dividendo) para todo divisor mayor a cero y todo dividendo mayor a cero.

def dataframe_calc_value(df,incognita,divisor,dividendo):
    df.loc[ (pd.isnull(df[incognita])) |  (df[incognita] == 0 ) & (pd.notnull(df[divisor]) ) & (df[divisor] > 0) & \
            (pd.notnull(df[dividendo]) & (df[dividendo] > 0)) ,incognita] = \
    df.loc[ (pd.isnull(df[incognita])) |  (df[incognita] == 0 ) & (pd.notnull(df[divisor]) ) & (df[divisor] > 0) & \
            (pd.notnull(df[dividendo]) & (df[dividendo] > 0))][divisor] / \
    df.loc[ (pd.isnull(df[incognita])) |  (df[incognita] == 0 ) & (pd.notnull(df[divisor]) ) & (df[divisor] > 0) & \
            (pd.notnull(df[dividendo]) & (df[dividendo] > 0))][dividendo]

# Continuaremos trabajando con el set de datos completo para disponer de toda la informacion.

### Se agrega la superficie total de los inmbuebles que disponen del precio por metro cuadrado en dolares


In [27]:
sell_properati_filter = sell_properati

In [28]:
dataframe_calc_value(sell_properati_filter,'surface_total_in_m2','price_aprox_usd','price_usd_per_m2')

### Calculamos la superficie total en metros cuadrados de las propiedades que no cuentan con este dato

In [29]:
dataframe_calc_value(sell_properati_filter,'surface_total_in_m2','price','price_per_m2')

### Calculamos el precio por metro cuadrado en USD de los inmuebles que no cuentan con dicha informacion

In [30]:
dataframe_calc_value(sell_properati_filter,'price_usd_per_m2','price_aprox_usd','surface_total_in_m2')

### Se agrega el precio por metro cuadrado en la moneda actual en cada unos de las pripiedades

In [31]:
dataframe_calc_value(sell_properati_filter,'price_per_m2','price','surface_total_in_m2')

In [32]:
sell_properati_filter = sell_properati_filter[~((sell_properati_filter['surface_total_in_m2'] == 0 ) | \
                     (sell_properati_filter['price'] == 0 ))]

In [33]:
sell_properati_filter[['surface_total_in_m2','price_usd_per_m2','price_per_m2']].mean()

surface_total_in_m2    7402.093163
price_usd_per_m2       2268.710530
price_per_m2           3892.322905
dtype: float64

In [34]:
sell_properati_filter[(pd.notnull(sell_properati_filter['surface_total_in_m2']))]\
            [['surface_total_in_m2','price_usd_per_m2','price_per_m2','price_aprox_usd','price']].shape

(278094, 5)

Filtraremos los datos que tengan una superficie total en metros cuadrados entre 0 y 1000

In [35]:
filter_properati = sell_properati_filter[(sell_properati_filter['price_aprox_usd'] < 600000) &\
                                         (sell_properati_filter['price_usd_per_m2'] < 5000) & \
                                         (sell_properati_filter['surface_total_in_m2'] < 1000)]

In [36]:
float(filter_properati.shape[0]) / float(sell_properati.shape[0]) * 100

62.02927539329903

In [37]:
filter_properati.columns

Index([u'created_on', u'currency', u'expenses', u'floor', u'geonames_id',
       u'lat', u'lat-lon', u'lon', u'place_name', u'place_with_parent_names',
       u'price', u'price_aprox_local_currency', u'price_aprox_usd',
       u'price_per_m2', u'price_usd_per_m2', u'property_type', u'rooms',
       u'state_name', u'surface_covered_in_m2', u'surface_total_in_m2',
       u'year', u'zone'],
      dtype='object')

In [38]:
data = filter_properati[['floor','lat','lon','price_aprox_usd','rooms','surface_total_in_m2']]

testing_data_id = proterati_testing_dataset[['id']]

In [39]:
testing_data = proterati_testing_dataset[['floor','lat','lon','rooms','surface_total_in_m2']]

In [40]:
def devolverInt(x):
    return float(re.search(r'\d+', x).group())

In [41]:
import re
string1 = "$ 1500.- aprox results should get"
int(re.search(r'\d+', string1).group())


1500

In [42]:
devolverInt(string1)

1500.0

data['expenses'] = data.expenses.apply(lambda x : devolverInt(x) if pd.notnull(x) else 0 )

In [43]:
testing_data['floor'] = testing_data.floor.apply(lambda x : float(x) if pd.notnull(x) else 0 )

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [44]:
data['floor'] = data.floor.apply(lambda x : float(x) if pd.notnull(x) else 0 )

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [45]:
testing_data['lat'] = testing_data.lat.apply(lambda x : float(x) if pd.notnull(x) else 0 )

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [46]:
data['lat'] = data.lat.apply(lambda x : float(x) if pd.notnull(x) else 0 )

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [47]:
testing_data['lon'] = testing_data.lon.apply(lambda x :float(x) if pd.notnull(x) else 0 )

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [48]:
data['lon'] = data.lon.apply(lambda x :float(x) if pd.notnull(x) else 0 )

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [49]:
data['price_aprox_usd'] = data.price_aprox_usd.apply(lambda x : float(x) if pd.notnull(x) else 0 )

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


data['price_usd_per_m2'] = data.price_usd_per_m2.apply(lambda x : float(x) if pd.notnull(x) else 0 )

In [50]:
testing_data['rooms'] = testing_data.rooms.apply(lambda x :float(x) if pd.notnull(x) else 0 )

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [51]:
data['rooms'] = data.rooms.apply(lambda x : float(x) if pd.notnull(x) else 0 )

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [52]:
testing_data['surface_total_in_m2'] = testing_data.surface_total_in_m2.apply(lambda x :float(x) if pd.notnull(x) else 0 )

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [53]:
data['surface_total_in_m2'] = data.surface_total_in_m2.apply(lambda x : float(x) if pd.notnull(x) else 0 )

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [54]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 243749 entries, 2 to 597227
Data columns (total 6 columns):
floor                  243749 non-null float64
lat                    243749 non-null float64
lon                    243749 non-null float64
price_aprox_usd        243749 non-null float64
rooms                  243749 non-null float64
surface_total_in_m2    243749 non-null float64
dtypes: float64(6)
memory usage: 13.0 MB


In [55]:
testing_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14164 entries, 0 to 14165
Data columns (total 5 columns):
floor                  14164 non-null float64
lat                    14164 non-null float64
lon                    14164 non-null float64
rooms                  14164 non-null float64
surface_total_in_m2    14164 non-null float64
dtypes: float64(5)
memory usage: 663.9 KB


In [56]:
train = data

In [57]:
test = testing_data

# Machine Learning


In [58]:
from sklearn import neighbors

In [59]:
knn = neighbors.KNeighborsRegressor(n_neighbors=12,n_jobs=-1,p=1)

In [60]:
train_target = train[['price_aprox_usd']]

In [61]:
train.drop(inplace=True,labels=['price_aprox_usd'],axis=1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


test_target = test[['price_aprox_usd']]

test.drop(inplace=True,labels=['price_aprox_usd','price_usd_per_m2'],axis=1)

In [79]:
knn.fit(train,train_target)

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=-1, n_neighbors=12, p=1,
          weights='uniform')

In [80]:
test_prediction = knn.predict(test)

In [81]:
test_prediction.shape

(14164, 1)

mean_squared_error(test_prediction, test_target)

In [82]:
predictionDF = pd.DataFrame(test_prediction,columns={"prediction"})


In [88]:
testing_data_id.reset_index(inplace=True)

In [89]:
predictionDF['id'] = testing_data_id['id']

In [90]:
testing_data_id.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14164 entries, 0 to 14163
Data columns (total 2 columns):
index    14164 non-null int64
id       14164 non-null int64
dtypes: int64(2)
memory usage: 221.4 KB


In [91]:
predictionDF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14164 entries, 0 to 14163
Data columns (total 2 columns):
prediction    14164 non-null float64
id            14164 non-null int64
dtypes: float64(1), int64(1)
memory usage: 221.4 KB



def predictError(predict,spectate): 
    if (spectate > predict) : 
        return ((spectate-predict)/spectate)*100 
    else : return ((predict-spectate)/predict )*100


predictionDF['errorValue'] = predictionDF.apply(axis=1,func=lambda x : predictError(x[0],x[1]))


predictionDF.sample(5)


print 'El error promedio que comete KNN Regression al predecir es de', float(predictionDF['errorValue'].mean()),'%'


print "El algoritmo KNN Regresion predice un ",\
        (float(predictionDF[predictionDF['errorValue'] < 10].shape[0]) / float(predictionDF.shape[0] )* 100) ,\
        "% de los valores, con un error menor al 15%"

predictionDF.describe()

predictionDF.errorValue.plot.hist()