In [1]:
import pandas as pd
from sklearn.linear_model import Ridge
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_squared_error

# Relative root of the project data directory used by this notebook.
DATA_PATH = '../data/'

# Load the processed test inputs; the trailing expression displays
# (n_rows, n_columns) as the cell output.
test_df = pd.read_csv(f'{DATA_PATH}processed/test_input.csv')
test_df.shape

(8059, 64)

In [2]:
# Per-model prediction files that also carry the fold column. Only the columns
# from each model's first prediction column onward are kept; the targets and
# the fold id are re-attached from a single file in a later cell.
df_linear, df_ridge, df_catboost, df_knn = (
    pd.read_csv(csv_path).loc[:, first_pred_col:]
    for csv_path, first_pred_col in (
        ('model_preds/linear_regression.csv', 'reg_linear_pred_price'),
        ('model_preds/ridge.csv', 'reg_ridge_pred_price'),
        ('model_preds/catboost.csv', 'catboost_pred_price'),
        ('model_preds/knn.csv', 'knn_pred_price'),
    )
)

# Matching per-model prediction files for the submission set, loaded unsliced.
df_linear_sub, df_ridge_sub, df_catboost_sub, df_knn_sub = (
    pd.read_csv(csv_path)
    for csv_path in (
        'model_preds/linear_regression_sub.csv',
        'model_preds/ridge_sub.csv',
        'model_preds/catboost_sub.csv',
        'model_preds/knn_sub.csv',
    )
)

In [3]:
# The sliced prediction frames no longer contain the two targets or the fold
# id, so read those three columns back from the linear-regression file.
missing_columns = (
    pd.read_csv('model_preds/linear_regression.csv')
    .loc[:, ['cent_price_cor', 'cent_trans_cor', 'kfold']]
)
missing_columns.head(5)

Unnamed: 0,cent_price_cor,cent_trans_cor,kfold
0,-0.169,0.375,0.0
1,-0.102,0.358,0.0
2,-0.231,0.403,0.0
3,-0.248,0.461,0.0
4,-0.305,0.417,0.0


In [4]:
# Assemble the meta-model training frame: one block of prediction columns per
# base model, followed by the targets and the fold id. Rows are aligned on the
# index, so this assumes every file lists the samples in the same order
# (TODO confirm against the notebooks that wrote them).
df = pd.concat(
    objs=[df_linear, df_ridge, df_catboost, df_knn, missing_columns],
    axis='columns',
)
df.head(5)

Unnamed: 0,reg_linear_pred_price,reg_linear_pred_trans,reg_ridge_pred_price,reg_ridge_pred_trans,catboost_pred_price,catboost_pred_trans,knn_pred_price,knn_pred_trans,cent_price_cor,cent_trans_cor,kfold
0,-0.20664,-0.20664,-0.206477,-0.206477,-0.176531,-0.176531,-0.199641,-0.199641,-0.169,0.375,0.0
1,-0.186976,-0.186976,-0.187075,-0.187075,-0.169957,-0.169957,-0.205234,-0.205234,-0.102,0.358,0.0
2,-0.210221,-0.210221,-0.210126,-0.210126,-0.199793,-0.199793,-0.202009,-0.202009,-0.231,0.403,0.0
3,-0.219074,-0.219074,-0.21914,-0.21914,-0.239445,-0.239445,-0.204926,-0.204926,-0.248,0.461,0.0
4,-0.196843,-0.196843,-0.196967,-0.196967,-0.231418,-0.231418,-0.202181,-0.202181,-0.305,0.417,0.0


In [5]:
def treinar_fold(fold, regressor):
    """Train ``regressor`` on all folds but one and evaluate it on the held-out fold.

    Reads the notebook-level ``df`` frame. Rows whose ``kfold`` differs from
    ``fold`` are used for fitting; rows whose ``kfold`` equals ``fold`` are
    used for validation. Every column other than ``kfold``, ``cent_price_cor``
    and ``cent_trans_cor`` is treated as a feature.

    Parameters
    ----------
    fold
        Value compared against ``df.kfold`` to select the validation rows.
    regressor
        Object with ``fit(X, y)`` and ``predict(X)``. ``predict`` must return a
        2-D array with one column per target, in the order
        ``['cent_price_cor', 'cent_trans_cor']``. It is fitted in place.

    Returns
    -------
    tuple
        ``(df_valid, mse)``: the validation rows with two added columns
        (``ensemble_pred_price`` and ``ensemble_pred_trans``), and the value
        returned by ``mean_squared_error`` for those rows.
    """
    target_cols = ['cent_price_cor', 'cent_trans_cor']
    non_feature_cols = ['kfold'] + target_cols

    df_train = df[df.kfold != fold].reset_index(drop=True)
    df_valid = df[df.kfold == fold].reset_index(drop=True)

    x_train = df_train.drop(non_feature_cols, axis=1)
    y_train = df_train[target_cols]
    x_valid = df_valid.drop(non_feature_cols, axis=1)
    y_valid = df_valid[target_cols]

    regressor.fit(x_train, y_train)
    y_pred = regressor.predict(x_valid)

    mse = mean_squared_error(y_valid, y_pred)
    print(f"Fold: {fold}, MSE = {mse}")

    # Prediction columns follow target_cols: column 0 is the price target and
    # column 1 is the trans target. The trans column previously reused
    # column 0, which silently stored the price prediction twice.
    df_valid.loc[:, 'ensemble_pred_price'] = y_pred[:, 0]
    df_valid.loc[:, 'ensemble_pred_trans'] = y_pred[:, 1]

    return df_valid, mse

In [6]:
# Meta-model: one Ridge regressor per target, wrapped for two-column output.
reg = MultiOutputRegressor(estimator=Ridge())

# Run the five-fold loop, keeping each fold's validation frame and adding up
# the per-fold MSE so the mean can be reported afterwards.
dfs = []
mean_mse = 0
for i in range(5):
    fold_df, fold_mse = treinar_fold(fold=i, regressor=reg)
    dfs.append(fold_df)
    mean_mse = mean_mse + fold_mse
mean_mse = mean_mse / 5

# Stack the validation folds (per-fold indices are kept as-is) and write the
# combined predictions to disk.
fin_valid_df = pd.concat(dfs)
print(f'MSE Media:{mean_mse}')
print(fin_valid_df.shape)
fin_valid_df.to_csv('model_preds/ensemble_basico.csv', index=False)

Fold: 0, MSE = 0.012834957189071775
Fold: 1, MSE = 0.013771748248445032
Fold: 2, MSE = 0.013965803013943826
Fold: 3, MSE = 0.01308646811923244
Fold: 4, MSE = 0.013536311424177425
MSE Media:0.0134390575989741
(11940, 13)


## Submissão

In [7]:
# Submission-time feature frame: the base models' prediction columns placed
# side by side, in the same model order as the training frame.
df_sub = pd.concat(
    objs=[df_linear_sub, df_ridge_sub, df_catboost_sub, df_knn_sub],
    axis='columns',
)
df_sub.head(5)

Unnamed: 0,reg_linear_pred_price,reg_linear_pred_trans,reg_ridge_pred_price,reg_ridge_pred_trans,catboost_pred_price,catboost_pred_trans,knn_pred_price,knn_pred_trans
0,-0.195824,0.361692,-0.195665,0.361681,-0.17275,0.323119,-0.199809,0.368067
1,-0.198507,0.373562,-0.198512,0.373535,-0.213441,0.407223,-0.204037,0.364823
2,-0.206054,0.37077,-0.2063,0.370804,-0.201747,0.354451,-0.201943,0.357534
3,-0.207104,0.358972,-0.207049,0.358971,-0.143301,0.350456,-0.201533,0.366661
4,-0.205721,0.361012,-0.205692,0.361056,-0.196469,0.348525,-0.19656,0.360271


In [8]:
# Refit the meta-model on every row of the training frame (all folds) before
# scoring the submission set.
X_train = df.drop(['kfold', 'cent_price_cor', 'cent_trans_cor'], axis=1)
y_train = df[['cent_price_cor', 'cent_trans_cor']]

reg.fit(X_train, y_train)

# Predict from a DataFrame whose columns are selected by name in the training
# order. Passing `df_sub.values` discarded the column names, so a reordered or
# extra column in df_sub would have been consumed positionally without any
# error, and scikit-learn warns when a model fitted with feature names receives
# an unnamed array. A missing column now raises a KeyError instead.
y_pred = reg.predict(df_sub[X_train.columns])

# NOTE(review): in the training frame displayed earlier, every *_pred_trans
# column equals its *_pred_price sibling, while the submission frame shows
# distinct values for the two. The meta-model is therefore fitted and applied
# on differently distributed features, which likely explains the near-constant
# predictions below. Confirm the per-model prediction files before submitting.
df_submission = pd.DataFrame(y_pred, columns=['cent_price_cor', 'cent_trans_cor'])
df_submission

Unnamed: 0,cent_price_cor,cent_trans_cor
0,-0.245937,0.405295
1,-0.245261,0.405526
2,-0.244761,0.404619
3,-0.246438,0.405803
4,-0.245809,0.405001
...,...,...
8054,-0.245103,0.404993
8055,-0.245369,0.405212
8056,-0.244966,0.404509
8057,-0.245986,0.405318


In [9]:
# Write the ensemble's submission predictions to disk without the row index.
df_submission.to_csv(path_or_buf='model_preds/ensemble_sub.csv', index=False)