### Importação de bibliotecas Python

In [1]:
import os
import json
import pickle
import neptune
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()

from datetime import datetime
from sklearn.metrics import make_scorer
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

### Importação de módulos do projeto

In [3]:
# Later cells import from `src` and read `data/...` with paths relative to the
# project root, one level above this notebook. Only climb when `src` is not
# already visible, so re-running this cell does not move further up the tree.
if not os.path.isdir("src"):
    os.chdir("..")

In [4]:
from src.train import KaggleTrain
from src.submission import KaggleSubmission
from src.competition import KaggleCompetition

In [5]:
from src.utils import load_settings
from src.targeteng import target_eng
from src.featureng import feature_eng

### Load Settings

In [7]:
# Load the project settings mapping. Cells below read the keys
# "competition_name", "id_column" and "target_column" from it.
settings = load_settings()

In [11]:
# Build the competition wrapper from three settings entries, passed
# positionally in this order: competition name, id column, target column.
competition = KaggleCompetition(
    *(settings[key] for key in ("competition_name", "id_column", "target_column"))
)

### Read Data

In [14]:
# Read the training set; the path is relative to the current working directory.
train = pd.read_csv("data/train.csv")

In [15]:
# Read the test set; the path is relative to the current working directory.
test = pd.read_csv("data/test.csv")

### Exploratory Analysis

# Print the data description file with plain Python I/O instead of the `!cat`
# shell escape, so the cell also runs where the `cat` command is unavailable.
# `end=""` avoids appending a newline that `cat` would not emit.
with open("data/data_description.txt", "r") as description_file:
    print(description_file.read(), end="")

### Análise descritiva

# Column to group by in the two descriptive summaries below; change this value
# to inspect a different variable.
variavel_analise = 'BsmtFullBath'

# Row count of the test set per level of `variavel_analise`.
# Every NaN in the frame is first replaced by the string 'missing', so rows
# without a value for the grouping column form their own group.
DescribeCount = (
    test
    .fillna('missing')
    .groupby([variavel_analise])['Id']
    .describe()[['count']]
    .apply(lambda col: col.round(1))
    .reset_index()
)

DescribeCount

# Summary statistics of SalePrice in the training set per level of
# `variavel_analise` (mean, count, std, min, max), rounded to one decimal.
# Every NaN in the frame is first replaced by the string 'missing'.
# Note: this rebinds `DescribeCount`, replacing the test-set summary above.
DescribeCount = (
    train
    .fillna('missing')
    .groupby([variavel_analise])['SalePrice']
    .describe()[['mean', 'count', 'std', 'min', 'max']]
    .apply(lambda col: col.round(1))
    .reset_index()
)

DescribeCount

### Feature Engineering

### Train-Validation split

In [None]:
# Apply the project's `feature_eng` to the training frame; the result is used
# as the feature matrix `X` in the train/validation split below.
# NOTE(review): `dataprep_list` is not assigned in any cell visible in this
# notebook — it must be defined before this cell can run on a fresh kernel.
X = feature_eng(train, dataprep_list)

In [None]:
# Apply the project's `target_eng` to the training frame to obtain `y`.
# The target column name is read from `settings` (the same key used to build
# `competition`), because no `target_column` variable is assigned in this notebook.
y = target_eng(train, settings["target_column"])

In [None]:
# Hold out 20% of the rows for validation; the fixed seed makes the split
# repeatable across runs.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1903,
)

### Neptune Experiments

In [None]:
# Point Neptune at the project "<user>/<competition name>".
# The competition name is read from `settings` (the same key used to build
# `competition`), because no `competition_name` variable is assigned in this notebook.
neptune.init('adelmofilho/' + settings["competition_name"])

### Grid Search

In [None]:
# Load the grid-search configuration; it is handed to KaggleTrain and also
# recorded with the Neptune experiment further down.
with open("settings/train.json", "r") as grid_file:
    parameters = json.load(grid_file)

In [None]:
# Wrap the training split and the grid-search configuration in the project's
# training helper.
model = KaggleTrain(X_train, y_train, parameters)

In [None]:
# Run the helper's grid search (presumably over `parameters` on the training
# split passed to the constructor — confirm in src/train).
model.gridsearch()

In [None]:
# Keep the object returned by `best()`; it is used later to predict on the
# test features via `.predict`.
best_model = model.best()

In [None]:
# `validate` returns two values, unpacked here as the training and validation
# errors; both are logged to Neptune below.
mse_train, mse_valid = model.validate(X_valid, y_valid)

In [None]:
# Preview the first rows of whatever `residuals()` returns.
model.residuals().head()

In [None]:
# Keep the result of the helper's correlation plot (presumably a figure — confirm).
corrplot = model.corrplot()

In [None]:
# Keep the result of `hetplot()` (presumably a heteroscedasticity diagnostic
# plot, judging by the name — confirm in src/train).
hetplot = model.hetplot()

In [None]:
# Persist the trained model through the helper; the destination is decided
# inside `save()` and is not visible from this notebook.
model.save()

### Register Experiment

In [None]:
# Parameters recorded with the Neptune experiment: the data-prep steps and the
# grid-search configuration.
# NOTE(review): `dataprep_list` is not assigned in any cell visible in this notebook.
PARAMS = dict(dataprep=dataprep_list, grid_search=parameters)

In [None]:
# Open a Neptune experiment and attach the parameters collected in PARAMS.
neptune.create_experiment(name="new features", params=PARAMS)

In [None]:
# Record the two errors returned by `model.validate` under fixed metric names.
neptune.log_metric("mean_squared_error_train", mse_train)
neptune.log_metric("mean_squared_error_valid", mse_valid)

### Predição do conjunto de teste

In [None]:
# Apply the same `feature_eng` call used for the training frame to the test frame.
# NOTE(review): `dataprep_list` is not assigned in any cell visible in this notebook.
X_test = feature_eng(test, dataprep_list)

In [None]:
# Display the engineered test columns as a visual check before predicting.
X_test.columns

In [None]:
# Predict on the engineered test features with the object returned by `model.best()`.
y_pred_test = best_model.predict(X_test)

In [None]:
# Call `target_eng` with inverse=True on the predictions (presumably undoing
# the transform applied to the training target — confirm in src/targeteng).
# The target column name is read from `settings` (the same key used to build
# `competition`), because no `target_column` variable is assigned in this notebook.
y_pred_test_transform = target_eng(
    y_pred_test,
    target_column=settings["target_column"],
    inverse=True,
)

In [None]:
# Create an empty submission object; the cells below assign columns to it,
# then call its `save` and `submit` methods.
predictions = KaggleSubmission()

In [None]:
# Fill the submission: ids come from the test frame, prices from the
# inverse-transformed predictions.
predictions["Id"] = test["Id"]
predictions["SalePrice"] = y_pred_test_transform

In [None]:
# Save the submission through the helper.
# NOTE(review): the competition slug and the id/target column names are
# hard-coded here, while `competition` is built from `settings`; confirm the
# two stay in agreement.
predictions.save('house-prices-advanced-regression-techniques', "Id", "SalePrice")

In [None]:
# Submit through the helper, passing a free-text message for this submission.
predictions.submit(message = "featureeng melhorada")

# Logs a literal value, not one computed in this notebook.
# NOTE(review): presumably the score reported after the submission above,
# copied by hand — it must be updated manually for every new submission.
neptune.log_metric('mean_squared_error_test', 0.15738)