In [67]:
import pandas as pd
import numpy as np
import pickle

from sklearn.ensemble import RandomForestRegressor

# ---------------------------------------------------------------------------
# Data preparation: load the listings, clean them, keep only Barcelona and
# enrich every row with the average price of its neighbourhood.
# ---------------------------------------------------------------------------

# load data
data = pd.read_csv("version_nueva.csv", sep=",", encoding="utf-8")

# drop some features
data = data.drop(['propertyCode', 'thumbnail', 'url', 'province',
                  'newDevelopment', 'title', 'subtitle',
                  'externalReference', 'priceByArea', 'propertyType',
                  'numPhotos', 'hasVideo', 'showAddress'], axis=1)

# replace string by numbers in 'floor' column
# The substitution is applied to 'floor' only: a frame-wide replace would also
# rewrite any other column that happens to hold one of these codes.
# NOTE(review): the codes look like listing abbreviations for ground floor /
# mezzanine / basement levels -- confirm against the data source.
floor_codes = {'bj': '0', 'en': '1', 'ss': '-1', 'st': '-1'}
data['floor'] = data['floor'].fillna(value='0').replace(floor_codes)

#Converting some selected features to float
tofloat = ['price', 'size', 'rooms', 'floor', 'bathrooms', 'exterior']
###To be used when the floor feature is only numeric:
###tofloat=['price','priceByArea','numPhotos','size','propertyCode','rooms','floor']
for col in tofloat:
    data[col] = data[col].astype(float)

#Take only municipality Barcelona
data = data[data['municipality'] == 'Barcelona']
data = data.drop(['municipality'], axis=1)

################
#NEW IN VERSION 4:
# 1. Add priceperarea data from the Open data BCN external file

dataprice = pd.read_csv("LLOGUER_MITJA_MENSUAL_2016_edited.csv", encoding="utf-8")
price_columns_name = ["area", "priceperarea"]
dataprice.columns = price_columns_name
# Strip the thousands separator, then convert. The column is replaced as a
# whole (rather than written through .loc[:, ...]) so that it really ends up
# with a numeric dtype.
dataprice['priceperarea'] = pd.to_numeric(dataprice['priceperarea'].str.replace(',', ''))

# Left join: every listing is kept; listings whose neighbourhood has no match
# in the price table get NaN in 'priceperarea'.
left = data
right = dataprice
data = pd.merge(left, right, how='left', left_on="neighborhood", right_on="area")
data = data.drop(['area', 'neighborhood'], axis=1)

# 2. Add a combinated column about typology
# "type_<typology>" when there is no sub-typology, otherwise
# "type_<typology>_<subTypology>".
data['typologycombined'] = np.where(data['subTypology'].isnull(),
                                    "type_" + data["typology"],
                                    "type_" + data["typology"] + "_" + data["subTypology"])
data = data.drop(['typology', 'subTypology'], axis=1)
################

# Reset floors recorded as 1000 to 0. A single .loc assignment is used instead
# of chained indexing (data['floor'][mask] = ...), which may write to a
# temporary copy and triggers pandas' SettingWithCopyWarning.
data.loc[data['floor'] == 1000, 'floor'] = 0.0

def dummies_for(data, cols, binary=None):
    """Replace each column named in ``cols`` by its one-hot indicator columns.

    For every name in ``cols`` the indicator columns produced by
    ``pd.get_dummies`` (cast to int32) are appended on the right of the frame
    and the source column is then dropped. The indicator columns are named
    after the category values themselves (no prefix is added).

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame; it is not modified, a new frame is returned.
    cols : iterable of str
        Names of the categorical columns to expand, processed in order.
    binary : unused
        Accepted but never read by the function body.

    Returns
    -------
    pandas.DataFrame
        The frame with the categorical columns replaced by their indicators.
    """
    for name in cols:
        indicators = pd.get_dummies(data[name]).astype(np.int32)
        data = pd.concat((data, indicators), axis=1).drop(name, axis=1)
    return data

# One-hot encode the categorical columns. They are expanded in this order, so
# the indicator columns are appended as: typology, then status, then district.
data = dummies_for(data, ['typologycombined', 'status', 'district'])

#Outliers removal
# Keep only listings whose price lies strictly between the 2.5th and the 89th
# percentile of the price distribution.
per = np.percentile(np.array(data['price']), [2.5, 89])
data = data[(data['price'] > per[0]) & (data['price'] < per[1])]

# Target vector and feature matrix as plain arrays. The column order of X is
# the column order of `data` without 'price'.
y = np.array(data['price'])
X = np.array(data.drop(['price'], axis=1))

clf = RandomForestRegressor(n_estimators=200, max_features='sqrt', bootstrap=False, random_state=44, n_jobs=-1)
clf = clf.fit(X, y)

# Persist the fitted model. The context manager guarantees the file is flushed
# and closed before any later cell reads it back.
filename = 'myRandomForest_district.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(clf, model_file)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [68]:
# Reload the persisted model and predict the row at index 4 of X as a check.
# The file is opened in a `with` block so the handle is always closed.
# NOTE: pickle.load can execute arbitrary code -- only load trusted model files.
filename = 'myRandomForest_district.sav'
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)
# predict() expects a 2-D array, hence the reshape to a single-row matrix.
result = loaded_model.predict(X[4].reshape(1, -1))
print(result)

[ 750.]


In [60]:
# Price target stored at index 4 of y.
y[4]

750.0

In [61]:
# Columns of the prepared frame; apart from 'price' these are the model's
# input features, in the order used to build X.
data.columns

Index([u'rooms', u'bathrooms', u'exterior', u'hasLift', u'size', u'floor',
       u'longitude', u'latitude', u'price', u'hasParkingSpace',
       u'isParkingSpaceIncludedInPrice', u'priceperarea', u'type_chalet',
       u'type_chalet_independantHouse', u'type_chalet_semidetachedHouse',
       u'type_chalet_terracedHouse', u'type_flat', u'type_flat_duplex',
       u'type_flat_penthouse', u'type_flat_studio', u'good', u'newdevelopment',
       u'renew', u'Ciutat Vella', u'Eixample', u'Gràcia', u'Horta Guinardó',
       u'Les Corts', u'Nou Barris', u'Sant Andreu', u'Sant Martí',
       u'Sants-Montjuïc', u'Sarrià-Sant Gervasi'],
      dtype='object')

In [62]:
# Feature vector of the row at index 4 of X.
X[4,]

array([   3.     ,    1.     ,    1.     ,    1.     ,   75.     ,
          2.     ,    2.13782,   41.35947,    0.     ,    0.     ,
        542.3    ,    0.     ,    0.     ,    0.     ,    0.     ,
          1.     ,    0.     ,    0.     ,    0.     ,    1.     ,
          0.     ,    0.     ,    0.     ,    0.     ,    0.     ,
          0.     ,    0.     ,    0.     ,    0.     ,    0.     ,
          1.     ,    0.     ])

In [63]:
# Price target for the row at index 4 of y.
y[4]

750.0

In [64]:
values={u'rooms': 3., u'bathrooms': 1., u'exterior': 1., u'hasLift': 1., u'size': 75., u'floor': 2.,
       u'longitude': 2.13782, u'latitude': 41.35947, u'hasParkingSpace': 0.,
       u'isParkingSpaceIncludedInPrice': 0., u'priceperarea': 542.3, u'type_chalet': 0.,
       u'type_chalet_independantHouse': 0., u'type_chalet_semidetachedHouse': 0.,
       u'type_chalet_terracedHouse': 0., u'type_flat': 1., u'type_flat_duplex': 0.,
       u'type_flat_penthouse': 0., u'type_flat_studio': 0., u'good': 1., u'newdevelopment': 0.,
       u'renew': 0., u'Ciutat Vella': 0., u'Eixample': 0., u'Gràcia': 0., u'Horta Guinardó': 0.,
       u'Les Corts': 0., u'Nou Barris': 0., u'Sant Andreu': 0., u'Sant Martí': 0.,
       u'Sants-Montjuïc': 1., u'Sarrià-Sant Gervasi': 0.}

In [69]:
# Feature names in the exact order of the columns the model is fed with.
# The order matters: the model receives a bare array, not named columns.
_FEATURE_ORDER = (
    u'rooms', u'bathrooms', u'exterior', u'hasLift', u'size',
    u'floor', u'longitude', u'latitude', u'hasParkingSpace',
    u'isParkingSpaceIncludedInPrice', u'priceperarea', u'type_chalet',
    u'type_chalet_independantHouse', u'type_chalet_semidetachedHouse',
    u'type_chalet_terracedHouse', u'type_flat', u'type_flat_duplex',
    u'type_flat_penthouse', u'type_flat_studio', u'good', u'newdevelopment',
    u'renew', u'Ciutat Vella', u'Eixample', u'Gràcia',
    u'Horta Guinardó', u'Les Corts', u'Nou Barris', u'Sant Andreu',
    u'Sant Martí', u'Sants-Montjuïc', u'Sarrià-Sant Gervasi',
)


def mypredict(values):
    """Predict the price of one listing with the persisted random forest.

    Parameters
    ----------
    values : mapping
        Maps every feature name listed in ``_FEATURE_ORDER`` to its numeric
        value. Extra keys are ignored.

    Returns
    -------
    The first (and only) element of the model's prediction for that listing.

    Raises
    ------
    KeyError
        If one or more required features are missing from ``values``; the
        message lists all of them.
    IOError / OSError
        If the model file cannot be opened.
    """
    missing = [name for name in _FEATURE_ORDER if name not in values]
    if missing:
        # %r keeps the message ASCII-safe even for the accented district names.
        raise KeyError('missing feature(s): %r' % (missing,))

    X = np.array([values[name] for name in _FEATURE_ORDER])

    filename = 'myRandomForest_district.sav'
    # NOTE: pickle.load can execute arbitrary code; only load a model file
    # that was produced by a trusted source.
    with open(filename, 'rb') as model_file:
        loaded_model = pickle.load(model_file)

    # predict() expects a 2-D array: reshape the vector into a one-row matrix.
    result = loaded_model.predict(X.reshape(1, -1))
    return result[0]

In [70]:
# Function-call form of print: same output on Python 2 for a single argument,
# and valid on Python 3 as well.
print(mypredict(values))

750.0


rooms: numeric
bathrooms: numeric
exterior: binary
hasLift: binary
size: numeric
floor: numeric
address: string
CP (postal code): numeric
hasParkingSpace: binary
isParkingSpaceIncludedInPrice: binary
priceperarea: not necessary
type: [u'type_chalet',
       u'type_chalet_independantHouse', u'type_chalet_semidetachedHouse',
       u'type_chalet_terracedHouse', u'type_flat', u'type_flat_duplex',
       u'type_flat_penthouse', u'type_flat_studio']
status: [u'good', u'newdevelopment', u'renew']
district: [u'Ciutat Vella', u'Eixample', u'Gràcia', u'Horta Guinardó',
       u'Les Corts', u'Nou Barris', u'Sant Andreu', u'Sant Martí',
       u'Sants-Montjuïc', u'Sarrià-Sant Gervasi']
