In [98]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime
import warnings

from sklearn.model_selection import (train_test_split, 
                                     cross_validate,
                                     cross_val_score)

from sklearn.tree import (DecisionTreeRegressor, 
                          ExtraTreeRegressor)

from sklearn.ensemble import RandomForestRegressor

from sklearn.linear_model import (LinearRegression, 
                                  ElasticNet, 
                                  ElasticNetCV, 
                                  Ridge, 
                                  RidgeCV, 
                                  Lasso, 
                                  LassoCV)

from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest

from sklearn.preprocessing import (MinMaxScaler, 
                                   RobustScaler, 
                                   StandardScaler, 
                                   MaxAbsScaler) 

from sklearn.metrics import (r2_score, 
                             mean_squared_error, 
                             mean_absolute_error)

from matplotlib import pyplot as plt
import seaborn as sns

In [4]:
warnings.filterwarnings('ignore') 

In [5]:
sns.set_style('darkgrid')

In [6]:
df = pd.read_csv('kc_house_data.csv')

In [7]:
df.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3,1.0,1180,5650,1.0,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000.0,3,2.25,2570,7242,2.0,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,20150225T000000,180000.0,2,1.0,770,10000,1.0,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,20141209T000000,604000.0,4,3.0,1960,5000,1.0,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,20150218T000000,510000.0,3,2.0,1680,8080,1.0,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503


In [8]:
df.drop('id', axis = 1, inplace = True)
df.date = pd.to_datetime(df.date)
df.waterfront = df.waterfront.astype('bool')
df.condition = df.condition.astype('category')
df.grade = df.grade.astype('category')
df.view = df.view.astype('category')
df.yr_built = pd.to_datetime(df.yr_built, format = '%Y')#.dt.year
df.zipcode = df.zipcode.astype('object')

df.yr_renovated = df.yr_renovated.replace(0, np.nan)
df.yr_renovated = pd.to_datetime(df.yr_renovated, format = '%Y')

## Definição de features e target

In [9]:
X = df.drop(['lat','long','zipcode','price','date', 'yr_renovated','yr_built'], axis = 1)
y = df.price

In [96]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42, shuffle = True)

## Criação de Pipelines

In [100]:
scalers = [StandardScaler(), RobustScaler(), MinMaxScaler(), ]
regressors = [LinearRegression(), RandomForestRegressor(), DecisionTreeRegressor(), ExtraTreeRegressor()]

for scaler in scalers:
    for regressor in regressors:
        pipe = Pipeline(steps = [('scaling', scaler), 
                                 ('modeling', regressor)])
        
        score = cross_validate(pipe, X=X_train, y=y_train, scoring = 'r2', cv=5)['test_score'].mean()
        print(regressor, scaler, score)

LinearRegression() StandardScaler() 0.602194423309228
RandomForestRegressor() StandardScaler() 0.7249741638203858
DecisionTreeRegressor() StandardScaler() 0.4698085141625018
ExtraTreeRegressor() StandardScaler() 0.4371392027433723
LinearRegression() RobustScaler() 0.6021944233092282
RandomForestRegressor() RobustScaler() 0.727790979786534
DecisionTreeRegressor() RobustScaler() 0.4823680478224161
ExtraTreeRegressor() RobustScaler() 0.43147551946099155
LinearRegression() MinMaxScaler() 0.6021944233092281
RandomForestRegressor() MinMaxScaler() 0.7245050925957911
DecisionTreeRegressor() MinMaxScaler() 0.48337644470870467
ExtraTreeRegressor() MinMaxScaler() 0.429687738051722


In [43]:
cross_validate(LinearRegression(), X=X_train, y=y_train, scoring = 'r2', cv=5)['test_score'].mean()

0.602194423309226

In [83]:
asd = pipe.fit(X_train, y_train)

In [89]:
asd.named_steps['modeling'].coef_

array([-30759.75043018,  -9489.12779261,  89019.01930107,   -128.21641324,
        -1609.38607266,  47840.43661714,  46358.36283763,  33471.57130865,
       115846.01536672,  70835.60184519,  51436.03322329,   7369.75148202,
       -18090.06442584])