#Importando as Bibliotecas:

In [1]:
# importar os pacotes necessários
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# importar o dataset para um dataframe
df = pd.read_csv('sao-paulo-properties-april-2019.csv')

# ver as 5 primeiras entradas
df.head()

  import pandas.util.testing as tm


Unnamed: 0,Price,Condo,Size,Rooms,Toilets,Suites,Parking,Elevator,Furnished,Swimming Pool,New,District,Negotiation Type,Property Type,Latitude,Longitude
0,930,220,47,2,2,1,1,0,0,0,0,Artur Alvim/São Paulo,rent,apartment,-23.543138,-46.479486
1,1000,148,45,2,2,1,1,0,0,0,0,Artur Alvim/São Paulo,rent,apartment,-23.550239,-46.480718
2,1000,100,48,2,2,1,1,0,0,0,0,Artur Alvim/São Paulo,rent,apartment,-23.542818,-46.485665
3,1000,200,48,2,2,1,1,0,0,0,0,Artur Alvim/São Paulo,rent,apartment,-23.547171,-46.483014
4,1300,410,55,2,2,1,1,1,0,0,0,Artur Alvim/São Paulo,rent,apartment,-23.525025,-46.482436


#Limpando alguns outliers:

In [0]:
df = df[df.Rooms<7]
df = df[df.Toilets<6]
df = df[df.Suites<5]
df = df[df.Parking<5]

In [0]:
def tirar_outlier(df, lista):
  df_clean = df[lista]
  for col, dados in df_clean.iteritems():
    df_clean = df_clean[((df_clean[col] - df_clean[col].mean()) / df_clean[col].std()).abs() < 3]
    df_clean = df_clean[np.abs(df_clean[col]-df_clean[col].mean()) <= (3*df_clean[col].std())]

  valores_retirados = df.shape[0] - df_clean.shape[0]
  print('Foi retirado {} valores de {} ({:.2f})%'.format(valores_retirados, df.shape[0], valores_retirados/df.shape[0]*100))
  print('Ficamos com {} resultados'.format(df_clean.shape[0]))

  for i in range(0, len(df)):
    if i not in df_clean.index:
        df = df[df.index != i]

  df = df.reset_index(drop=True)
  return df

In [4]:
df = tirar_outlier(df, ['Latitude', 'Longitude'])
df['District'] = df['District'].apply(lambda x: x.split('/')[0])

Foi retirado 985 valores de 13485 (7.30)%
Ficamos com 12500 resultados


#Criando o Data Frame com os valores médios para os bairros:

In [5]:
Bairros_LatLong = pd.DataFrame()
Bairros_LatLong['Latitude'] = df.Latitude.groupby(df.District).mean()
Bairros_LatLong['Longitude'] = df.Longitude.groupby(df.District).mean()

df.drop(['District', 'Property Type'], axis = 1, inplace = True)
Bairros_LatLong

Unnamed: 0_level_0,Latitude,Longitude
District,Unnamed: 1_level_1,Unnamed: 2_level_1
Alto de Pinheiros,-23.545107,-46.711620
Anhanguera,-23.589292,-46.671385
Aricanduva,-23.553916,-46.522669
Artur Alvim,-23.544731,-46.486363
Barra Funda,-23.523698,-46.662652
...,...,...
Vila Matilde,-23.536455,-46.526908
Vila Olimpia,-23.597079,-46.682313
Vila Prudente,-23.588982,-46.575585
Vila Sônia,-23.598046,-46.733250


#Criando o Data Frame para Sale:

In [6]:
Sale = df[df['Negotiation Type'] == 'sale'].drop('Negotiation Type', axis = 1)
Sale.head(1)

Unnamed: 0,Price,Condo,Size,Rooms,Toilets,Suites,Parking,Elevator,Furnished,Swimming Pool,New,Latitude,Longitude
4455,732600,1000,74,1,2,1,2,1,0,1,0,-23.552129,-46.692244


In [7]:
print('Para o Data Frame Sale:')
Sale_clean = tirar_outlier(Sale, ['Price'])

Para o Data Frame Sale:
Foi retirado 261 valores de 5905 (4.42)%
Ficamos com 5644 resultados


#Primeiro Modelo:

In [8]:
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.metrics import r2_score

X = Sale_clean.drop(['Price'], axis = 1)
y = Sale_clean[['Price']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

modelo = XGBRegressor(random_state=42, silent=True)
modelo.fit(X_train, y_train)
y_pred = modelo.predict(X_test)
resultados = r2_score(y_test, y_pred)

#print('R² = {:.3f}'.format(r2_score(y_test, y_pred)))
resultados.round(3)

0.888

#Exportando os Modelos:

In [0]:
import pickle

with open('modelo_simples.pkl', 'wb') as file:
    pickle.dump(modelo, file)

In [0]:
features_simples = X_train.columns.values

with open('features_simples.names', 'wb') as file:
    pickle.dump(features_simples, file)

In [0]:
with open('/content/modelo_simples.pkl', 'rb') as file:
    modelo_simples = pickle.load(file)
with open('/content/features_simples.names', 'rb') as file:
    features_names = pickle.load(file)

In [0]:
Bairros_LatLong.to_csv('Bairros_LatLong.csv')