In [3]:
# Importação das bibliotecas
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_squared_error,r2_score
from sklearn.feature_selection import GenericUnivariateSelect, f_regression

# Pré-processamento

In [6]:
# Dados de treino e teste
train_data = pd.read_csv("/train.csv")
test_data = pd.read_csv("/train.csv")

In [7]:
# Copiá dos dados
train_prep = train_data.copy()
test_prep = test_data.copy()

print(train_prep.shape)
print(test_prep.shape)

(1460, 81)
(1460, 81)


In [8]:
# Verificação de valores Nan
train_prep.info()
test_prep.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

### Lidando com valores NaN para recursos categóricos e numéricos em dados de treinamento

In [9]:
# Lidando com dados Nan
list_none=['Alley','BsmtQual', 'BsmtCond', 'BsmtExposure' ,'BsmtFinType1', 'BsmtFinType2' ,'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual' ,'GarageCond', 'PoolQC' ,'Fence','MiscFeature']
train_prep[list_none] = train_prep[list_none].fillna('None')
train_prep['LotFrontage'] = train_prep['LotFrontage'].fillna(np.round_(np.mean(train_prep['LotFrontage'])))
train_prep['GarageYrBlt'] = train_prep['GarageYrBlt'].fillna(-1)
list_Mas = ['MasVnrType','MasVnrArea','Electrical']
train_prep[list_Mas] = train_prep[list_Mas].fillna(train_prep.mode().iloc[0])

### Lidando com valores NaN para recursos categóricos e numéricos em dados de teste

In [10]:
# Lidando com dados Nan
list_none_test=['Alley','BsmtQual', 'BsmtCond','MSZoning' ,'BsmtExposure' ,'BsmtFinType1', 'BsmtFinType2' ,'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual' ,'GarageCond', 'PoolQC' ,'Fence','MiscFeature']
test_prep[list_none_test] = test_prep[list_none_test].fillna('None')
test_prep['LotFrontage'] = test_prep['LotFrontage'].fillna(np.round_(np.mean(test_prep['LotFrontage'])))
test_prep['GarageYrBlt'] = test_prep['GarageYrBlt'].fillna(-1)
list_Mas = ['MasVnrType','MasVnrArea','Electrical','Utilities','Exterior1st','Exterior2nd','BsmtHalfBath','BsmtFullBath','KitchenQual','Functional','GarageCars','GarageArea','SaleType']
test_prep[list_Mas] = test_prep[list_Mas].fillna(test_prep.mode().iloc[0])
list_0 = ['BsmtFinSF1','BsmtFinSF2','BsmtUnfSF','TotalBsmtSF']
test_prep[list_0] = test_prep[list_0].fillna(0)

In [11]:
# Remoção da feature 'Id'
train_prep.drop(['Id'], axis=1, inplace=True)
test_prep.drop(['Id'], axis=1, inplace=True)

In [12]:
# Remoção de outliers
Q1= train_prep['SalePrice'].quantile(0.95)
Q3= train_prep['SalePrice'].quantile(0.05)
train_prep = train_prep[(train_prep['SalePrice'] <= Q1) & (train_prep['SalePrice'] >= Q3)]

In [13]:
# Remoção dos recursos numéricos com menor correção
train_prep = train_prep.drop(['OverallCond', 'KitchenAbvGr','EnclosedPorch'], axis=1)
test_prep = test_prep.drop(['OverallCond', 'KitchenAbvGr','EnclosedPorch'], axis=1)

In [14]:
# Separando o X e Y
X = train_prep.drop(['SalePrice'], axis=1)
Y = train_prep['SalePrice']

In [16]:
# standard scaler para as features em treinamento e teste
SS = StandardScaler()
cols = X.select_dtypes('number').keys()
X[cols] =  SS.fit_transform(X[cols])
test_prep[cols] = SS.fit_transform(test_prep[cols])

In [17]:
# Modelo get dummy
X_final = pd.get_dummies(X,dtype=float, drop_first=True)
test_final = pd.get_dummies(test_prep, dtype=float, drop_first=True)

In [18]:
print(X_final.shape)
print(test_final.shape)

(1316, 250)
(1460, 257)


In [20]:
# Remoção das colunas que não estão em ambos dados
for feature in X_final:
    if feature not in test_final.keys():
        X_final = X_final.drop(feature, axis=1)


for feature in test_final:
    if feature not in X_final.keys():
        test_final = test_final.drop(feature, axis=1)

# GenericUniSel

In [21]:
selector = GenericUnivariateSelect(score_func=f_regression, mode='percentile',param=70)

X_feature = selector.fit_transform(X_final, Y)
cols = selector.get_support(indices=True)
selected_cols = X_final.columns[cols].tolist()

In [22]:
# Separação do treino e teste
X_train, X_test, Y_train, Y_test = train_test_split(X_feature , Y, test_size=0.3, random_state=1234)

Separando as features nos dados de treinamento e fazendo o mesmo para os dados de teste

In [23]:
final = pd.DataFrame()

for i in selected_cols:
    if i in test_final:
        final[i] = test_final[i]

  final[i] = test_final[i]
  final[i] = test_final[i]
  final[i] = test_final[i]
  final[i] = test_final[i]
  final[i] = test_final[i]
  final[i] = test_final[i]
  final[i] = test_final[i]
  final[i] = test_final[i]
  final[i] = test_final[i]
  final[i] = test_final[i]
  final[i] = test_final[i]
  final[i] = test_final[i]
  final[i] = test_final[i]
  final[i] = test_final[i]
  final[i] = test_final[i]
  final[i] = test_final[i]
  final[i] = test_final[i]
  final[i] = test_final[i]
  final[i] = test_final[i]
  final[i] = test_final[i]
  final[i] = test_final[i]
  final[i] = test_final[i]
  final[i] = test_final[i]
  final[i] = test_final[i]
  final[i] = test_final[i]
  final[i] = test_final[i]
  final[i] = test_final[i]
  final[i] = test_final[i]
  final[i] = test_final[i]
  final[i] = test_final[i]
  final[i] = test_final[i]
  final[i] = test_final[i]
  final[i] = test_final[i]
  final[i] = test_final[i]
  final[i] = test_final[i]
  final[i] = test_final[i]
  final[i] = test_final[i]
 

# Modelo (MLPRegressor)

In [24]:
# Modelo MLP
ML = MLPRegressor(random_state=1234, max_iter=35, solver='lbfgs', alpha=0.1, hidden_layer_sizes=(10,50))
ML.fit(X_feature, Y)

Y_pred_ML = ML.predict(X_test)
ML.score(X_test, Y_test)
Y_pred_test_ML = ML.predict(final)

r2_ML = r2_score(Y_test, Y_pred_ML)


output = pd.DataFrame({'Id': test_data.Id, 'SalePrice': Y_pred_test_ML})
output.to_csv('submission.csv', index=False)
print("Seu envio foi salvo com sucesso!")

Seu envio foi salvo com sucesso!


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
