In [1]:
import os
import json
import pandas as pd

In [2]:
# Move to the project root so the relative paths used below (settings/, data/, ...)
# resolve. Guarded so that re-running this cell does not climb one more directory
# each time: the chdir is skipped once settings/ is visible from the working directory.
if not os.path.isdir("settings"):
    os.chdir("..")

#### Importação de Bibliotecas

In [3]:
from calysto.utils import load_settings
from calysto.feature_engineering import Engineering
from calysto.model_selection import Training
from calysto.submission import KaggleSubmission

#### Load Settings

In [4]:
# Load the three JSON settings files that drive this run. Each file is opened in a
# `with` block so its handle is closed as soon as parsing finishes instead of being
# left open for the lifetime of the kernel.
with open("settings/main.json", "r") as settings_file:
    # Passed to KaggleSubmission and read for "competition_name" further down.
    kaggle_params = json.load(settings_file)
with open("settings/train.json", "r") as settings_file:
    # Passed to Training as the estimator / hyper-parameter configuration.
    model_params = json.load(settings_file)
with open("settings/dataprep_best.json", "r") as settings_file:
    # Passed to Engineering.feature() for both the train and the test frames.
    feature_params = json.load(settings_file)

In [5]:
# Display the configuration loaded from settings/train.json for inspection.
model_params

[{'estimator': ["GradientBoostingRegressor(random_state=1903, loss='huber')"],
  'estimator__n_estimators': [750],
  'estimator__learning_rate': [0.08],
  'estimator__subsample': [0.5],
  'estimator__min_samples_split': [2, 3, 5],
  'estimator__min_samples_leaf': [8, 5],
  'estimator__min_impurity_decrease': [0],
  'estimator__alpha': [0.3]}]

#### Read Data

In [6]:
# Load the raw training and test sets from the CSV files under data/.
train = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")

#### Feature Engineering

In [7]:
# Wrap the raw training frame in calysto's Engineering helper.
data = Engineering(train)

In [8]:
# Build the feature matrix by passing the settings from settings/dataprep_best.json
# to Engineering.feature().
X = data.feature(feature_params)

In [9]:
# Obtain the modelling target for the SalePrice column via Engineering.target().
# NOTE(review): the later detarget() / np.exp calls suggest the target is
# log-transformed here — confirm in calysto.feature_engineering.
y = data.target("SalePrice")

#### Model training

In [10]:
# Bundle features, target and the model configuration into a Training object.
model = Training(X, y, model_params)

In [11]:
# Split the data held by `model`. Later cells read model.X_train, model.y_train,
# model.X_valid and model.y_valid — presumably populated here; confirm in
# calysto.model_selection.
model.train_test_split()

In [None]:
# Run the hyper-parameter search over model_params.
# NOTE(review): random=False presumably selects an exhaustive grid search rather
# than a randomized one — confirm in calysto.model_selection.
model.gridsearch(random=False)

In [None]:
# Keep whatever Training.best() returns; below it is used as a fitted estimator
# (feature_importances_, predict). Presumably the best model from the search — confirm.
best_model = model.best()

In [None]:
# Pair every training column name with the importance value the best model
# reports for it, in a two-column frame ("value", "var").
importance = pd.DataFrame(
    {
        "value": best_model.feature_importances_,
        "var": model.X_train.columns,
    }
)

In [None]:
# Show the 30 rows with the largest importance values, highest first.
importance.sort_values("value", ascending = False).head(30)

In [None]:
# Look up the importance row of the single feature named RoofStyle_tipo2.
importance[importance["var"] == "RoofStyle_tipo2"]

In [None]:
# List the features whose importance is exactly zero.
importance[importance["value"] == 0]

In [None]:
import seaborn as sns

sns.set(style="whitegrid")

# Horizontal bar chart of the ten largest strictly positive importances,
# one bar per feature name.
datq = (
    importance.loc[importance["value"] > 0]
    .sort_values(by="value", ascending=False)
    .head(10)
)
ax = sns.barplot(data=datq, x="value", y="var")

In [None]:
# Display the best_params_ attribute of the search results stored on `model`.
model.results.best_params_

In [None]:
# Keep the value returned by Training.metrics(); it is logged further down as
# 'mean_squared_error_train'.
train_mse = model.metrics()

#### Validação

In [None]:
# Keep the value returned by Training.validate(); it is logged further down as
# 'mean_squared_error_valid'.
valid_mse = model.validate()

In [None]:
# Diagnostic plot provided by Training.corrplot().
# NOTE(review): presumably predicted vs. observed values — confirm in calysto.model_selection.
model.corrplot()

In [None]:
# Diagnostic plot provided by Training.hetplot().
# NOTE(review): presumably a heteroscedasticity / residual plot — confirm in calysto.model_selection.
model.hetplot()

#### Predição do conjunto de teste

In [12]:
# Wrap the raw test frame in its own Engineering instance.
test_data = Engineering(test)

In [13]:
# Apply the same feature settings that were used for the training frame.
# NOTE(review): assumes this yields the same columns, in the same order, as X — TODO confirm.
X_test = test_data.feature(feature_params)

In [None]:
# Predict on the engineered test features with the selected model.
y_pred_test = best_model.predict(X_test)

In [None]:
# Pass the predictions through Engineering.detarget().
# NOTE(review): presumably the inverse of the transform applied by target() — confirm.
y_pred_test_transform = test_data.detarget(y_pred_test)

### Submissão ao Kaggle

In [None]:
# Create the submission helper from the settings loaded from settings/main.json.
predictions = KaggleSubmission(kaggle_params)

In [None]:
# Hand the raw test frame and the back-transformed predictions to KaggleSubmission.save().
predictions.save(test, y_pred_test_transform)

# NOTE(review): submit() presumably uploads to Kaggle, so re-running this cell
# would create another submission — confirm before running.
predictions.submit(message = "feature_eng melhorada")

### Versionamento do experimento

In [None]:
import neptune

In [None]:
# Select the Neptune project "<user>/<competition name>", with the competition
# name taken from the Kaggle settings.
neptune.init("/".join(["adelmofilho", kaggle_params["competition_name"]]))

In [None]:
# Start a Neptune experiment that records the feature settings and the first
# entry of the model parameter list.
neptune.create_experiment(name="new features", params={"feature": feature_params,
                                                       "model":model_params[0]})

In [None]:
# Display the training and validation metrics side by side.
(train_mse, valid_mse)

In [None]:
# Log the three error metrics to Neptune: train, then validation, then test.
# NOTE(review): the test value is a hard-coded literal, presumably copied by hand
# from the Kaggle leaderboard — confirm and update it for each submission.
for metric_name, metric_value in (
    ("mean_squared_error_train", train_mse),
    ("mean_squared_error_valid", valid_mse),
    ("mean_squared_error_test", 0.12857),
):
    neptune.log_metric(metric_name, metric_value)

In [None]:
# Write the SageMaker training file: target in the first column, features after
# it, no header row and no index column.
# Multiplying by 1 presumably turns boolean columns into 0/1 — confirm.
# NOTE(review): this rebinds `train`, which earlier held the raw data/train.csv frame.
train = model.X_train.reset_index(drop=True) * 1
train.insert(loc=0, column="y", value=model.y_train)
train.to_csv("sagemaker/train.csv", header=False, index=False)

In [None]:
# Write the SageMaker validation file: target in the first column, features
# after it, no header row and no index column.
# Multiplying by 1 presumably turns boolean columns into 0/1 — confirm.
valid = model.X_valid.reset_index(drop=True) * 1
valid.insert(loc=0, column="y", value=model.y_valid)
valid.to_csv("sagemaker/valid.csv", header=False, index=False)

In [None]:
# Export the test features (no target column) without header or index.
# Multiplying by 1 presumably turns boolean columns into 0/1 — confirm.
(X_test*1).to_csv("sagemaker/test.csv", index=False, header=False)

In [15]:
import pickle as pkl

In [None]:
!python -m pip install xgboost==0.90

In [19]:
import xgboost as xgb

In [None]:
!pip freeze

In [45]:
# Unpack the model archive into the working directory and load the pickled model
# it contains. Both the archive and the model file are opened with context
# managers so they are closed even if extraction or unpickling raises.
import tarfile

# NOTE(security): extractall() writes every archive member relative to the
# current directory; only use it on archives from a trusted source.
with tarfile.open('sagemaker/model.tar.gz') as tar:
    tar.extractall()

# NOTE(security): pickle.load can execute arbitrary code from the file; only
# load model artifacts produced by a trusted training job.
with open("xgboost-model", "rb") as model_file:
    modelv0 = pkl.load(model_file)



In [46]:
# Reload the exported training file (it has no header row) and show its shape.
train_xgb = pd.read_csv("sagemaker/train.csv",header=None)
train_xgb.shape

(1168, 222)

In [47]:
# Reload the exported test file (it has no header row) and show its shape.
test_xgb = pd.read_csv("sagemaker/test.csv",header=None)
test_xgb.shape

(1459, 221)

In [48]:
import numpy as np

In [49]:
# Predict with the unpickled model on the test values wrapped in an xgboost DMatrix.
# NOTE(review): np.exp presumably undoes a log-transformed target — confirm
# against Engineering.target().
prediction = np.exp(modelv0.predict(xgb.DMatrix(test_xgb.values)))

In [50]:
# Two-column submission frame: the Id column of the raw test set next to the
# predicted SalePrice.
df = pd.DataFrame({"Id": test["Id"], "SalePrice": prediction})

In [51]:
# Write the submission frame to CSV without the index column.
df.to_csv("submissions/testxgb.csv", index=False)

In [52]:
# Display the submission frame for a visual check.
df

Unnamed: 0,Id,SalePrice
0,1461,119324.492188
1,1462,158253.031250
2,1463,183085.078125
3,1464,159258.484375
4,1465,221199.203125
...,...,...
1454,2915,90598.609375
1455,2916,93849.484375
1456,2917,169159.156250
1457,2918,109011.828125
