# Modeling

## Imports

In [1]:
from modeling import *
import pandas as pd
import numpy as np

## Parameters

In [2]:
invalid_cols = ["has_elevator", "rural_urbano", "geohash_4", "geohash_3", "geohash"]

seed = 1993

target_col = "price"

## Load train dataset

In [3]:
df_train = pd.read_feather("../data/processed/train.feather")
print(df_train.shape)

(72131, 48)


In [4]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72131 entries, 0 to 72130
Data columns (total 48 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   usableAreas                    72121 non-null  float64
 1   parkingSpaces                  70911 non-null  float64
 2   suites                         66178 non-null  float64
 3   bathrooms                      72130 non-null  float64
 4   totalAreas                     43795 non-null  float64
 5   bedrooms                       72131 non-null  int64  
 6   publicationType                72131 non-null  object 
 7   geohash                        72123 non-null  object 
 8   price                          72131 non-null  int64  
 9   businessType                   72131 non-null  object 
 10  yearlyIptu                     62150 non-null  float64
 11  monthlyCondoFee                68357 non-null  float64
 12  has_gym                        72131 non-null 

## Prepare dataset

In [5]:
X = prep_modeling(df_train, invalid_cols)

In [6]:
y = X[target_col].values
X = X.drop(columns=[target_col])

In [9]:
y_binned = bins_y(y)

In [128]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1993, stratify=y_binned)

X_train.reset_index(drop=True, inplace=True)
X_test.reset_index(drop=True, inplace=True)

print("train size:", X_train.shape)
print("test size:", X_test.shape)

train size: (50491, 44)
test size: (21640, 44)


## Remove outliers values

Based on **Target feature stats** of [exploratory_analysis.ipynb](exploratory_analysis.ipynb) we have data outliers that were entered incorrectly.

Remove them isn't a best choicce, but for the first version of the model, I'll do that.

I'll remove all observations with price lower than 50000 and higher than 5000000.

In [129]:
X_train, y_train = remove_outliers(X_train, y_train, lower=10000, higher=13000000)

## Training

### XGBoost Regressor

In [159]:
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from xgboost.sklearn import XGBRegressor
from sklearn.model_selection import GridSearchCV

### Baseline

In [152]:
xgb_score = {}

xgb_model = XGBRegressor()
xgb_model.fit(X_train, y_train)

y_pred = xgb_model.predict(X_test)

xgb_score["MSE"] = mean_squared_error(y_test, y_pred)
xgb_score["Score"] = xgb_model.score(X_test, y_test)

xgb_score

{'MSE': 341956618186.56, 'Score': 0.714044332229657}

### Grid Search

In the next cell we'll run Grid Search to find best parameters. We'll apply cross-validation as well.

In [None]:
xgb_params = {
    "n_estimators": [50, 100, 150, 200],
    "max_depth": [10, 20, 30, 44],
    "learning_rate": [1e-1, 1e-2, 1e-3],
    "random_state": [seed]
}

grid_obj = GridSearchCV(xgb_model, xgb_params, n_jobs=1, cv=5, verbose=1)
grid_obj = grid_obj.fit(X[:1000], y[:1000])

xgb_model = grid_obj.best_estimator_

### Cross validation

In [166]:
cv_n = 5

xgb_scores = cross_val_score(xgb_model, X, y, cv=cv_n)

for n in range(cv_n):
    print("Fold {} - Score: {:.4f}".format(n+1, xgb_scores[n]))

Fold 1 - Score: 0.4675
Fold 2 - Score: 0.8631
Fold 3 - Score: 0.8450
Fold 4 - Score: 0.8209
Fold 5 - Score: 0.8181


### Check some predictions

In [167]:
for i in range(50):
    print(check_prediction(xgb_model, X_test.iloc[i], y_test[i]))

{'Prediction': 'R$ 1,174,789.62', 'Real': 'R$ 1,330,000.00'}
{'Prediction': 'R$ 736,565.31', 'Real': 'R$ 595,000.00'}
{'Prediction': 'R$ 389,548.00', 'Real': 'R$ 371,000.00'}
{'Prediction': 'R$ 589,651.25', 'Real': 'R$ 556,500.00'}
{'Prediction': 'R$ 3,139,063.50', 'Real': 'R$ 3,430,000.00'}
{'Prediction': 'R$ 278,799.31', 'Real': 'R$ 237,999.00'}
{'Prediction': 'R$ 554,661.38', 'Real': 'R$ 420,000.00'}
{'Prediction': 'R$ 2,200,463.25', 'Real': 'R$ 3,500,000.00'}
{'Prediction': 'R$ 264,893.16', 'Real': 'R$ 293,300.00'}
{'Prediction': 'R$ 383,267.31', 'Real': 'R$ 503,999.00'}
{'Prediction': 'R$ 212,278.17', 'Real': 'R$ 224,000.00'}
{'Prediction': 'R$ 1,344,855.75', 'Real': 'R$ 1,113,000.00'}
{'Prediction': 'R$ 274,496.62', 'Real': 'R$ 262,500.00'}
{'Prediction': 'R$ 1,247,619.62', 'Real': 'R$ 1,295,000.00'}
{'Prediction': 'R$ 2,077,493.25', 'Real': 'R$ 2,912,700.00'}
{'Prediction': 'R$ 416,478.56', 'Real': 'R$ 406,000.00'}
{'Prediction': 'R$ 1,165,462.62', 'Real': 'R$ 1,042,999.00'}
{'P

## Save trained model

In [169]:
xgb_model.save_model("model.xgb")

## Predict test dataset

In [192]:
df_test = pd.read_feather("../data/processed/test.feather")
print(df_test.shape)

(16036, 48)


In [193]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16036 entries, 0 to 16035
Data columns (total 48 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   usableAreas                    16029 non-null  float64
 1   parkingSpaces                  15772 non-null  float64
 2   suites                         14641 non-null  float64
 3   bathrooms                      16035 non-null  float64
 4   totalAreas                     9942 non-null   float64
 5   bedrooms                       16036 non-null  int64  
 6   publicationType                16036 non-null  object 
 7   geohash                        16031 non-null  object 
 8   price                          0 non-null      object 
 9   businessType                   16036 non-null  object 
 10  yearlyIptu                     13639 non-null  float64
 11  monthlyCondoFee                15100 non-null  float64
 12  has_gym                        16036 non-null 

In [None]:
test_preds = test_prediction(df_test, xgb_model)
test_preds = pd.DataFrame(test_preds)

print(test_preds.shape)
test_preds.head()

## Save predictions.csv

In [None]:
test_preds.to_csv("../predictions.csv", index=False, encoding="utf-8")

---
---
---

# That's the end!