In [8]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
import importlib.util
import sys
from catboost import CatBoostRegressor

spec = importlib.util.spec_from_file_location("preprocess", "../src/data/preprocess.py")
preprocess = importlib.util.module_from_spec(spec)
sys.modules["preprocess"] = preprocess
spec.loader.exec_module(preprocess)

spec = importlib.util.spec_from_file_location("build_features", "../src/features/build_features.py")
build_features = importlib.util.module_from_spec(spec)
sys.modules["build_features"] = build_features
spec.loader.exec_module(build_features)

spec = importlib.util.spec_from_file_location("predict_model", "../src/models/predict_model.py")
predict_model = importlib.util.module_from_spec(spec)
sys.modules["predict_model"] = predict_model
spec.loader.exec_module(predict_model)

spec = importlib.util.spec_from_file_location("train_model", "../src/models/train_model.py")
train_model = importlib.util.module_from_spec(spec)
sys.modules["train_model"] = train_model
spec.loader.exec_module(train_model)

In [9]:
x = pd.read_csv('../data/raw/train.csv')
x_test = pd.read_csv('../data/raw/test.csv')

In [10]:
NOT_FULL = [col for col in x.columns if x[col].dropna().shape[0] != 1460]
OBJ_TYPE = [col for col in x.columns if x[col].dtype == object]
FULL = [col for col in x.columns if x[col].dropna().shape[0] == 1460]
NUM_TYPE = [col for col in x.columns if x[col].dtype == np.int64 or x[col].dtype == np.float64]

In [11]:
x_train = x.drop(['SalePrice'], axis=1)
y = x['SalePrice']

In [12]:
a = x_train.corr(numeric_only=True)
correlations = dict()
for col1 in a.columns:
    for col2 in a.columns:
        if col1 != col2:
            if abs(a[col1][col2]) > 0.6:
                if col1 in correlations:
                    correlations[col1].append((col2, round(a[col1][col2], 1)))
                else:
                    correlations[col1] = [(col2, round(a[col1][col2], 1))]

Здесь можно увидеть множество самых сильных корреляций среди признаков

In [13]:
for i in correlations:
    print(i, correlations[i])

OverallQual [('GarageCars', 0.6)]
YearBuilt [('GarageYrBlt', 0.8)]
YearRemodAdd [('GarageYrBlt', 0.6)]
BsmtFinSF1 [('BsmtFullBath', 0.6)]
TotalBsmtSF [('1stFlrSF', 0.8)]
1stFlrSF [('TotalBsmtSF', 0.8)]
2ndFlrSF [('GrLivArea', 0.7), ('HalfBath', 0.6), ('TotRmsAbvGrd', 0.6)]
GrLivArea [('2ndFlrSF', 0.7), ('FullBath', 0.6), ('TotRmsAbvGrd', 0.8)]
BsmtFullBath [('BsmtFinSF1', 0.6)]
FullBath [('GrLivArea', 0.6)]
HalfBath [('2ndFlrSF', 0.6)]
BedroomAbvGr [('TotRmsAbvGrd', 0.7)]
TotRmsAbvGrd [('2ndFlrSF', 0.6), ('GrLivArea', 0.8), ('BedroomAbvGr', 0.7)]
GarageYrBlt [('YearBuilt', 0.8), ('YearRemodAdd', 0.6)]
GarageCars [('OverallQual', 0.6), ('GarageArea', 0.9)]
GarageArea [('GarageCars', 0.9)]


# Preprocessing and validation in Pipeline

In [69]:
pipe_of_insight = Pipeline([
    ('cleaning', preprocess.PreprocessTransformer()),
    ('features', build_features.FeaturesTransformer()),
    ('catmodel', CatBoostRegressor(n_estimators=100,
                                   depth=2,
                                   learning_rate=1, 
                                   loss_function='MAE',
                                 verbose=False))
])

scores = cross_val_score(estimator=pipe_of_insight,
                         X=x_train, y=y, 
                         scoring='r2', cv=5, 
                         n_jobs=1)
print(scores)

  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_featu

[0.87121004 0.7302451  0.74908027 0.83832124 0.66747786]


  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,


Для оценки модели в задаче регрессии я выбрал метрику R2 score, поскольку она отражает объективную оценку качества предсказаний, что позволяет сравнивать предсказания разных моделей. В то время как значения MSE или MAE могут сильно отличаться в зависимости от задачи, являясь из-за этого очень субъективными. 

Так же попробовал добавить 2 новых фичи: наличие обеих ванн на первом этаже и на этажах выше. Однако R2 с ними упал с 0.82 на тестовой выборке до 0.77, поэтому функция их добавления закомментирована в скрипте добавления фич

# Preprocessing and validation in sklearn functions

In [14]:
transformer1 = build_features.FeaturesTransformer()
transformer1.fit(x_train, y)
x_traint = transformer1.transform(x_train)

transformer2 = preprocess.PreprocessTransformer()
transformer2.fit(x_train, y)
x_train = transformer2.transform(x_train)

x_test = preprocess.prep_test(x_test)

X_train, X_test, y_train, y_test = train_test_split(x_train, y, random_state=64)
model = train_model.train(X_train, y_train)
test_score, train_score = predict_model.predict(model, X_train, X_test, y_train, y_test)

print(test_score)
print(train_score)

  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,


0.820060174527121
0.909855806317886


### n_estimators = 50, max_depth = 8
    R2 for test: 0.651
    R2 for train: 0.979
### n_estimators = 50, max_depth = 4
    R2 for test: 0.653
    R2 for train: 0.936
### n_estimators = 100, max_depth = 8
    R2 for test: 0.626
    R2 for train: 0.992
### n_estimators = 100, max_depth = 4
    R2 for test: 0.656
    R2 for train: 0.952
### n_estimators = 100, max_depth = 2
    R2 for test: 0.82
    R2 for train: 0.91
### n_estimators = 50, max_depth = 2
    R2 for test: 0.813
    R2 for train: 0.889

In [79]:
final_pred = model.predict(x_test)

  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,


In [80]:
final_pred = pd.Series(data=final_pred)

In [81]:
final_pred

0       124830.000071
1       163408.999992
2       189550.000005
3       183074.999964
4       193138.000025
            ...      
1454     74851.999988
1455     86439.999942
1456    175686.999990
1457    126642.000010
1458    242407.000088
Length: 1459, dtype: float64

In [82]:
final_pred.to_csv('../data/raw/sample_solution.csv')

Это наши предсказания по валидационной выборке. За неимением y_val проверить их точность, к сожалению, не можем, поэтому просто сохраняем в .csv