In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error

%matplotlib inline

plt.style.use('default') # Make the graphs a bit prettier

plt.rcParams['figure.figsize'] = (12, 4)

In [2]:
#pip install -U statsmodels
import statsmodels.api as sm 

  from pandas.core import datetools


#  Preprocesamiento de Datos

El análisis exploratorio que se presenta a continuación tiene como fin determinar qué features influyen en mayor medida a la hora de determinar el precio de un inmueble en Capital Federal y Gran Buenos Aires, así como también encontrar datos curiosos en el dataset.

Los datos fueron provistos por [Properati](http://www.properati.com.ar) y se reorganizaron en un solo dataset de la siguiente manera: [GitHub | properati_argentina](https://github.com/BraianVicente/properati_argentina/blob/master/src/TP1%20-%20Properati%20-%20Parseo%20de%20datos%20por%20fecha.ipynb)

In [3]:
sell_properati = pd.read_csv('../data/properati_full_month.csv',low_memory=False)

Transformando el formato del feature created_on para manejarlo como fecha

In [4]:
sell_properati['created_on'] = sell_properati['created_on'].apply(lambda x : pd.to_datetime(x))

In [5]:
sell_properati['year'] = sell_properati['created_on'].apply(lambda x : x.year)

In [6]:
def parse_year_month(date):
    """Format a date-like object as a 'YYYY-MM' string.

    `date` only needs `year` and `month` attributes. Single-character
    months are left-padded with a zero so the resulting strings sort
    chronologically.
    """
    year_text = str(date.year)
    month_text = str(date.month).zfill(2)
    return year_text + '-' + month_text

In [7]:
sell_properati['year_month'] = sell_properati.created_on.apply(parse_year_month)

Completamos la columna 'state_name'

In [8]:
sell_properati['state_name'] = sell_properati.place_with_parent_names.apply(lambda x : x.split('|')[2])

Filtramos unicamente los que nos interesan que corresponde a capital federal y alrededores

In [9]:
# Keep only the listings whose state is Capital Federal or one of the three
# G.B.A. zones.
sell_properati = sell_properati[
    sell_properati['state_name'].isin([
        'Bs.As. G.B.A. Zona Norte',
        'Bs.As. G.B.A. Zona Sur',
        'Bs.As. G.B.A. Zona Oeste',
        'Capital Federal',
    ])
]

In [10]:
sell_properati['zone'] = sell_properati.place_with_parent_names.apply(lambda x : x.split('|')[3])

Nos quedamos con los datos que son relevantes para nuestro análisis (descartando las columnas que no se utilizan), conservando únicamente las propiedades de GBA y Capital Federal

In [11]:
sell_properati.drop(inplace=True,\
    labels=['country_name','description','extra','id', u'image_thumbnail','operation'\
            ,'place_with_parent_names_l1','place_with_parent_names_l2', 'properati_url','surface_in_m2','title',\
           ],axis=1)

Podríamos seguir trabajando con los datos que no cuentan con la información de zona, utilizando la ubicación para identificarlos según el barrio al que pertenecen, pero en esta ocasión decidimos dejarlos de lado pues no son una cantidad significativa de datos

#### Las propiedades de Tigre son las únicas que no contienen información en 'place_name'

In [12]:
# Rows with no place_name are labelled 'Tigre'; rows that already have a
# value are left untouched.
sell_properati.loc[sell_properati['place_name'].isnull(), 'place_name'] = 'Tigre'

In [13]:
sell_properati = sell_properati[sell_properati['zone'] != '']

In [14]:
def dataframe_calc_value(df, incognita, divisor, dividendo):
    """Fill the missing values of column `incognita` in place with a quotient.

    For every row where `incognita` is null or zero AND both operand columns
    are strictly positive, sets ``df[incognita] = df[divisor] / df[dividendo]``.
    All other rows are left untouched.

    Note on naming: despite its name, `divisor` is the column used as the
    numerator and `dividendo` the one used as the denominator; the parameter
    names are kept as-is for existing callers.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to update; it is modified in place.
    incognita : str
        Name of the column to fill.
    divisor : str
        Name of the numerator column.
    dividendo : str
        Name of the denominator column.

    Returns
    -------
    None
    """
    # The unknown has to be recomputed only where it is absent or zero.
    needs_value = df[incognita].isnull() | (df[incognita] == 0)
    # NaN > 0 evaluates to False, so this also discards null operands.
    operands_ok = (df[divisor] > 0) & (df[dividendo] > 0)
    # Explicit grouping: the original expression relied on `&` binding tighter
    # than `|`, which filled every null row even when an operand was zero
    # (producing inf) or missing.
    fillable = needs_value & operands_ok
    df.loc[fillable, incognita] = df.loc[fillable, divisor] / df.loc[fillable, dividendo]

# Continuaremos trabajando con el set de datos completo para disponer de toda la informacion.

### Se agrega la superficie total de los inmuebles que disponen del precio por metro cuadrado en dólares


In [15]:
# Work on an independent copy: the dataframe_calc_value calls below write into
# this frame in place, and a plain alias would silently modify sell_properati
# as well.
sell_properati_filter = sell_properati.copy()

In [16]:
dataframe_calc_value(sell_properati_filter,'surface_total_in_m2','price_aprox_usd','price_usd_per_m2')

### Calculamos la superficie total en metros cuadrados de las propiedades que no cuentan con este dato

In [17]:
dataframe_calc_value(sell_properati_filter,'surface_total_in_m2','price','price_per_m2')

### Calculamos el precio por metro cuadrado en USD de los inmuebles que no cuentan con dicha informacion

In [18]:
dataframe_calc_value(sell_properati_filter,'price_usd_per_m2','price_aprox_usd','surface_total_in_m2')

### Se agrega el precio por metro cuadrado en la moneda actual en cada una de las propiedades

In [19]:
dataframe_calc_value(sell_properati_filter,'price_per_m2','price','surface_total_in_m2')

In [20]:
# Discard the rows whose total surface or price is exactly zero.
surface_is_nonzero = sell_properati_filter['surface_total_in_m2'] != 0
price_is_nonzero = sell_properati_filter['price'] != 0
sell_properati_filter = sell_properati_filter[surface_is_nonzero & price_is_nonzero]

In [21]:
sell_properati_filter[['surface_total_in_m2','price_usd_per_m2','price_per_m2']].mean()

surface_total_in_m2    7402.093163
price_usd_per_m2       2268.710530
price_per_m2           3892.322905
dtype: float64

In [22]:
sell_properati_filter[(pd.notnull(sell_properati_filter['surface_total_in_m2']))]\
            [['surface_total_in_m2','price_usd_per_m2','price_per_m2','price_aprox_usd','price']].shape

(278094, 5)

Filtraremos los datos que tengan un precio aproximado menor a 600000 USD, un precio por metro cuadrado menor a 5000 USD y una superficie total menor a 1000 metros cuadrados

In [23]:
filter_properati = sell_properati_filter[(sell_properati_filter['price_aprox_usd'] < 600000) &\
                                         (sell_properati_filter['price_usd_per_m2'] < 5000) & \
                                         (sell_properati_filter['surface_total_in_m2'] < 1000)]

In [24]:
float(filter_properati.shape[0]) / float(sell_properati.shape[0]) * 100

62.02927539329903

In [25]:
filter_properati.columns

Index([u'created_on', u'currency', u'expenses', u'floor', u'geonames_id',
       u'lat', u'lat-lon', u'lon', u'place_name', u'place_with_parent_names',
       u'price', u'price_aprox_local_currency', u'price_aprox_usd',
       u'price_per_m2', u'price_usd_per_m2', u'property_type', u'rooms',
       u'state_name', u'surface_covered_in_m2', u'surface_total_in_m2',
       u'year', u'year_month', u'zone'],
      dtype='object')

In [26]:
data = filter_properati[['floor','expenses','price_aprox_usd','rooms','surface_total_in_m2']].copy()

In [27]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 243749 entries, 2 to 597227
Data columns (total 5 columns):
floor                  47217 non-null float64
expenses               19623 non-null object
price_aprox_usd        243749 non-null float64
rooms                  160435 non-null float64
surface_total_in_m2    243749 non-null float64
dtypes: float64(4), object(1)
memory usage: 11.2+ MB


In [28]:
def devolverInt(x):
    """Return the first run of digits found in the text `x`, as a float.

    Only the first contiguous digit sequence is used, so '$ 1500.- aprox'
    yields 1500.0. When `x` contains no digits at all, 0.0 is returned
    instead of failing on the missing match; the caller in this notebook
    already uses 0 for rows with no expenses value.
    """
    match = re.search(r'\d+', x)
    if match is None:
        return 0.0
    return float(match.group())

In [29]:
import re
string1 = "$ 1500.- aprox results should get"
int(re.search(r'\d+', string1).group())


1500

In [30]:
devolverInt(string1)

1500.0

In [31]:
data['expenses'] = data.expenses.apply(lambda x : devolverInt(x) if pd.notnull(x) else 0 )

In [32]:
# Cast the columns to float, using 0 for missing values.
# `lat` and `lon` are only processed when present: `data` is built from
# floor/expenses/price_aprox_usd/rooms/surface_total_in_m2, so an
# unconditional `data.lat` / `data.lon` raises AttributeError.
for column in ('floor', 'lat', 'lon'):
    if column in data.columns:
        data[column] = data[column].apply(lambda x : float(x) if pd.notnull(x) else 0 )

In [33]:
# Cast the price columns to float, using 0 for missing values.
# `price_usd_per_m2` is only processed when present: it is not among the
# columns selected into `data`, so an unconditional access raises
# AttributeError.
for column in ('price_aprox_usd', 'price_usd_per_m2'):
    if column in data.columns:
        data[column] = data[column].apply(lambda x : float(x) if pd.notnull(x) else 0 )

In [34]:
data['rooms'] = data.rooms.apply(lambda x : float(x) if pd.notnull(x) else 0 )

In [35]:
data['surface_total_in_m2'] = data.surface_total_in_m2.apply(lambda x : float(x) if pd.notnull(x) else 0 )

In [36]:
data.tail()

Unnamed: 0,floor,expenses,price_aprox_usd,rooms,surface_total_in_m2
597200,0.0,0.0,130000.0,1.0,49.0
597213,0.0,0.0,160000.0,4.0,203.0
597221,0.0,0.0,65000.0,2.0,120.0
597225,0.0,8600.0,550000.0,3.0,324.0
597227,1.0,0.0,450000.0,3.0,268.0


In [67]:
# 80% of the rows for training; the fixed seed makes the sample reproducible
# across kernel restarts.
train = data.sample(int(float(data.shape[0]) * 0.8), random_state=42).copy()

In [68]:
# Draw the test rows (10% of the data) only from rows that are not in `train`;
# sampling `data` again would put training rows into the evaluation set.
test = data.drop(train.index).sample(int(float(data.shape[0]) * 0.1), random_state=42).copy()

# Machine Learning


In [69]:
train_target = train[['price_aprox_usd']].copy().astype(int)

In [71]:
train.drop(inplace=True,labels=['price_aprox_usd'],axis=1)

In [72]:
test_target = test[['price_aprox_usd']].copy()

In [73]:
test.drop(inplace=True,labels=['price_aprox_usd'],axis=1)

In [74]:
# Empty results table. The column labels are given as a list: a set has no
# defined order and is not accepted as `columns` by recent pandas versions.
msqDF = pd.DataFrame(columns=['k-Neighbors','p_minkowski','mean_squear_error'])

In [75]:
from sklearn import linear_model

In [84]:
# NOTE(review): the target fitted below is `price_aprox_usd` cast to int, i.e. a
# continuous price, while LogisticRegression is a classification estimator, so
# every distinct price is presumably treated as its own class. Confirm this is
# intended, or switch to a regression estimator.
log_reg = linear_model.LogisticRegression(n_jobs=-1)

In [85]:
log_reg

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=-1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [None]:
log_reg.fit(train,train_target)

In [None]:
test_prediction = log_reg.predict(test)

In [None]:
msq_log_reg = mean_squared_error(test_prediction, test_target)

In [None]:
msq_log_reg

 # Calculando diferencia entre prediccion y valor esperado

In [None]:
# One-column frame holding the model predictions. The label is passed as a
# list because a set is not accepted as `columns` by recent pandas versions.
predictionDF = pd.DataFrame(test_prediction,columns=["prediction"])


In [None]:

predictionDF.loc[:,'spectate'] = test_target.values


In [None]:

def predictError(predict,spectate):
    """Return the percentage difference between a prediction and the expected value.

    The absolute difference is divided by the larger of the two values, so
    the measure is symmetric in its arguments.

    Parameters
    ----------
    predict : number
        Predicted value.
    spectate : number
        Expected (actual) value.

    Returns
    -------
    float
        Percentage error; 0.0 when both values are equal.
    """
    # Work with floats so integer inputs are not truncated by Python 2's
    # integer division.
    predict = float(predict)
    spectate = float(spectate)
    # Equal values (including 0 and 0) have no error; this also avoids the
    # 0/0 division the unguarded formula would perform.
    if predict == spectate:
        return 0.0
    if spectate > predict:
        return ((spectate - predict) / spectate) * 100
    return ((predict - spectate) / predict) * 100


In [None]:
# Row-wise percentage error. The values are read by column label, since
# positional `row[0]` / `row[1]` on a label-indexed row is deprecated in pandas.
predictionDF['errorValue'] = predictionDF.apply(
    axis=1, func=lambda row : predictError(row['prediction'], row['spectate']))


In [None]:
predictionDF.sample(5)


In [None]:
# Mean percentage error of the model fitted in this notebook (logistic
# regression; the previous message named a different algorithm). A single
# formatted string keeps the statement valid in both Python 2 and Python 3.
print('El error promedio que comete Logistic Regression al predecir es de %s %%' % float(predictionDF['errorValue'].mean()))


In [None]:
# Share of predictions whose percentage error is below the threshold. The
# message is built from the same constant as the filter so the two cannot
# disagree (the filter used 10 while the text said 15%).
# NOTE(review): 10 is the value the code actually applied; confirm it is the
# intended threshold.
error_threshold = 10
share_below_threshold = (float(predictionDF[predictionDF['errorValue'] < error_threshold].shape[0])
                         / float(predictionDF.shape[0]) * 100)
print('El algoritmo Logistic Regression predice un %s %% de los valores, con un error menor al %s%%'
      % (share_below_threshold, error_threshold))

In [None]:
predictionDF.describe()

In [None]:
predictionDF.errorValue.plot.hist()