In [2]:
import numpy as np
import os
import pandas as pd
from pandas.plotting import scatter_matrix
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer
from sklearn.svm import SVR
from sklearn.feature_selection import RFE, RFECV
from sklearn.impute import SimpleImputer
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.pipeline import make_pipeline, FeatureUnion, Pipeline
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, cross_validate
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score 
from sklearn.metrics import confusion_matrix, precision_recall_curve, roc_curve, roc_auc_score, auc, log_loss
from sklearn.compose import ColumnTransformer
import matplotlib.pyplot as plt

In [3]:
# Acessar o patch anterior ao patch do código e acessar a pasta "data"
PATH = os.path.join(os.path.dirname(os.getcwd()), "data")

In [4]:
def load_train_test_data(path=PATH):
    train_path = os.path.join(path, "train.csv")
    test_path = os.path.join(path, "test.csv")
    return pd.read_csv(train_path), pd.read_csv(test_path)

In [5]:
train, test = load_train_test_data(PATH)

In [6]:
train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [7]:
train.shape

(1460, 81)

In [8]:
train.isnull().sum()

Id                 0
MSSubClass         0
MSZoning           0
LotFrontage      259
LotArea            0
                ... 
MoSold             0
YrSold             0
SaleType           0
SaleCondition      0
SalePrice          0
Length: 81, dtype: int64

In [9]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [10]:
variables = train.drop("SalePrice", axis=1)
var_resp = train["SalePrice"].copy()

In [11]:
test.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal


In [12]:
x_train, x_test, y_train, y_test = train_test_split(variables, var_resp, test_size=0.2, random_state=2)

In [13]:
x_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1168 entries, 455 to 1192
Data columns (total 80 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1168 non-null   int64  
 1   MSSubClass     1168 non-null   int64  
 2   MSZoning       1168 non-null   object 
 3   LotFrontage    967 non-null    float64
 4   LotArea        1168 non-null   int64  
 5   Street         1168 non-null   object 
 6   Alley          74 non-null     object 
 7   LotShape       1168 non-null   object 
 8   LandContour    1168 non-null   object 
 9   Utilities      1168 non-null   object 
 10  LotConfig      1168 non-null   object 
 11  LandSlope      1168 non-null   object 
 12  Neighborhood   1168 non-null   object 
 13  Condition1     1168 non-null   object 
 14  Condition2     1168 non-null   object 
 15  BldgType       1168 non-null   object 
 16  HouseStyle     1168 non-null   object 
 17  OverallQual    1168 non-null   int64  
 18  Overal

In [14]:
# Colunas com alta qtd de missings
columns = ["Alley", "PoolQC", "MiscFeature", "Fence", "FireplaceQu"]

x_train.drop(columns, axis = 1, inplace = True)
x_test.drop(columns, axis = 1, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [15]:
# A razao da qtd de quartos pela qtd de comodos e a razao da qtd de banheiros
# pela qtd de quartos podem ser variaveis que melhorem o modelo
def add_features(data):
    data["bedrooms_per_rooms_abv_grad"] = data["BedroomAbvGr"].div(data["TotRmsAbvGrd"]).replace(np.inf, 0)
    data["bath_per_bedrooms_abv_grad"] = data["FullBath"].div(data["BedroomAbvGr"]).replace(np.inf, 0)
    data["LotFrontage"] = data.groupby('Neighborhood')['LotFrontage'].transform(lambda x: x.fillna(x.median()))
    data["YrBltAndRemod"] = data['YearBuilt'] + data['YearRemodAdd']
    data['TotalSF'] = data['TotalBsmtSF'] + data['1stFlrSF'] + data['2ndFlrSF']
    data['Total_sqr_footage'] = data['BsmtFinSF1'] + data['BsmtFinSF2'] + data['1stFlrSF'] + data['2ndFlrSF']
    data['Total_Bathrooms'] = (data['FullBath'] + (0.5 * data['HalfBath']) +
                               data['BsmtFullBath'] + (0.5 * data['BsmtHalfBath']))
    data['Total_porch_sf'] = (data['OpenPorchSF'] + data['3SsnPorch'] +
                              data['EnclosedPorch'] + data['ScreenPorch'] +
                              data['WoodDeckSF'])
    return data

In [16]:
x_train = add_features(x_train)
x_test = add_features(x_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["bedrooms_per_rooms_abv_grad"] = data["BedroomAbvGr"].div(data["TotRmsAbvGrd"]).replace(np.inf, 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["bath_per_bedrooms_abv_grad"] = data["FullBath"].div(data["BedroomAbvGr"]).replace(np.inf, 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  da

In [17]:
x_train.Exterior1st.unique()

array(['HdBoard', 'BrkFace', 'Plywood', 'VinylSd', 'Wd Sdng', 'MetalSd',
       'Stucco', 'WdShing', 'CemntBd', 'CBlock', 'AsbShng', 'Stone',
       'BrkComm', 'AsphShn'], dtype=object)

In [18]:
num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('std_scaler', StandardScaler()),
    ])

cat_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="most_frequent")),
        ('oneHot', OneHotEncoder(handle_unknown='ignore')),
    ])

In [19]:
num_attribs = x_train.select_dtypes(include=np.number).columns.tolist()
cat_attribs = x_train.select_dtypes(include='object').columns.tolist()

In [20]:
full_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_attribs),
        ("cat", cat_pipeline, cat_attribs),
    ])

In [21]:
train_prepared = full_pipeline.fit_transform(x_train)

In [22]:
train_prepared.shape

(1168, 276)

In [23]:
test_prepared = full_pipeline.transform(x_test)

In [24]:
test_prepared

<292x276 sparse matrix of type '<class 'numpy.float64'>'
	with 23942 stored elements in Compressed Sparse Row format>

## Melhorando o desempenho com gridsearch e cross validation

In [27]:
C_options = [0.001, 0.01, 0.1, 1, 10, 50, 100]
gamma_options = [0.001, 0.01, 0.1, 1]

param_grid = dict(C = C_options, gamma = gamma_options)

svr = SVR()
grid_search = GridSearchCV(svr, param_grid, cv=5, scoring='neg_mean_squared_error', verbose=1, return_train_score=True, n_jobs=-1)
grid_search = grid_search.fit(train_prepared, y_train)

Fitting 5 folds for each of 28 candidates, totalling 140 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:   16.7s
[Parallel(n_jobs=-1)]: Done 140 out of 140 | elapsed:   48.6s finished


In [28]:
grid_search.best_params_

{'C': 100, 'gamma': 0.01}

In [29]:
grid_search.best_estimator_

SVR(C=100, gamma=0.01)

In [31]:
grid_search.best_estimator_.score

<bound method RegressorMixin.score of SVR(C=100, gamma=0.01)>

In [33]:
# Colunas com alta qtd de missings
columns = ["Alley", "PoolQC", "MiscFeature", "Fence", "FireplaceQu"]

test.drop(columns, axis = 1, inplace = True)

test = add_features(test)

In [34]:
final_test = full_pipeline.transform(test)

In [35]:
final_test

<1459x276 sparse matrix of type '<class 'numpy.float64'>'
	with 119638 stored elements in Compressed Sparse Row format>

In [36]:
final_test.shape

(1459, 276)

In [37]:
y_predicted_test = grid_search.predict(final_test)

In [38]:
y_predicted_test.shape

(1459,)

In [39]:
y_predicted_test

array([155153.59690212, 163364.2182036 , 166876.57670732, ...,
       162390.78412577, 158511.38517037, 168731.66069934])

In [40]:
test_ID = test['Id'].copy()

In [41]:
dataset = pd.DataFrame({'Id': test_ID, 'SalePrice': y_predicted_test})

In [42]:
dataset.head()

Unnamed: 0,Id,SalePrice
0,1461,155153.596902
1,1462,163364.218204
2,1463,166876.576707
3,1464,168708.745564
4,1465,165712.821627


In [43]:
dataset.to_csv(os.path.join(PATH, "svr_output.csv"), sep=",", index=False)