In [85]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', 100)

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')

import warnings
warnings.filterwarnings('ignore')

from tqdm import tqdm

In [156]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

In [53]:
from sklearn.linear_model import ElasticNet, Lasso
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor

from sklearn.model_selection import cross_val_score, GridSearchCV

In [3]:
def create_dict_of_features():
    """
    Create dictionary with explanation of all attributes.

    Reads ``./data_description.txt`` and keeps every line that starts in the
    first column (no leading space or tab) and is longer than 10 characters.
    Such a line is treated as ``<name>: <description>``; lines without a
    colon are skipped.

    Return
        dictionary mapping each feature name to its description text.
    """
    features = dict()
    # the context manager closes the file, no explicit close() is needed
    with open('./data_description.txt', 'r') as file:
        for line in file:
            if line[0] in (' ', '\t') or len(line) <= 10:
                continue
            # split on the first colon only, so a description that itself
            # contains ':' is kept whole
            name, separator, description = line.partition(':')
            if not separator:
                # no "name: description" structure on this line
                continue
            features[name] = description.strip('\t\n').lstrip()
    return features

# feature name -> description lookup; feature_info() reads this module-level dict
features = create_dict_of_features()

In [4]:
def feature_info(feature_name, plots=True):
    """
    Print the description and summary statistics of a feature.

    The name is looked up in the module-level ``features`` mapping and the
    column is read from the module-level ``houses`` frame. For non-object
    columns the correlation with ``SalePrice`` is printed as well.

    Parameters
        feature_name: str
            Name of feature from housing price dataset

        plots: bool, default - True
            Intended to draw plots with the chosen feature; the body does
            not use this flag at the moment.
    """
    if feature_name not in features.keys():
        print('There is no such a feature.')
        return

    column = houses[feature_name]

    print(features[feature_name])
    print('   Feature statistics: ')

    if column.dtype == 'object':
        print(column.describe(include='object'))
        return

    print(column.describe())

    print('\nCorreleation: ')
    print(houses[[feature_name, 'SalePrice']].corr())

In [5]:
def check_amount_of_na(ds, percent_of_nan = 40):
    """
    Report columns whose share of missing values exceeds a threshold.

    Parameters
        ds: pandas DataFrame
            Frame to inspect.

        percent_of_nan: int or float, default = 40
            Only columns with strictly more than this percentage of NaN
            values are returned.

    Return
        pandas Series of NaN percentages indexed by column name,
        sorted in descending order.
    """
    # inspect the frame that was passed in rather than the global `houses`
    nan = ds.isna().sum() / ds.shape[0] * 100
    return nan[nan > percent_of_nan].sort_values(ascending=False)

In [89]:
def plot_initial_vs_new(df, feature_name, function_to_transform=np.log):
    """
    Draw the distribution of a feature before and after a transformation.

    Two histograms with a KDE overlay are placed side by side: the left one
    shows the column as it is, the right one shows the column after
    ``function_to_transform`` has been applied to every value.

    Parameters
        df: pandas DataFrame
            Frame that holds the feature.

        feature_name: str
            Name of feature for plotting.

        function_to_transform: callable, default = np.log
            Function applied element-wise to the feature for the second plot.
    """
    fig, (ax_original, ax_new) = plt.subplots(1, 2, figsize=(8, 3))

    original_values = df[feature_name]
    sns.histplot(original_values, ax=ax_original, kde=True)
    ax_original.set_title('original')

    transformed_values = df[feature_name].apply(function_to_transform)
    sns.histplot(transformed_values, ax=ax_new, kde=True)
    ax_new.set_title('new')

    plt.tight_layout()
    plt.show()

In [58]:
# load the train and test datasets from CSV files in the working directory
houses = pd.read_csv('./train.csv')
houses_test = pd.read_csv('./test.csv')

In [81]:
# percentage of NaN values per column of the houses dataset
# (a threshold of 0 lists every column that has at least one NaN)
check_amount_of_na(houses, percent_of_nan=0)

PoolQC          99.520548
MiscFeature     96.301370
Alley           93.767123
Fence           80.753425
FireplaceQu     47.260274
LotFrontage     17.739726
GarageType       5.547945
GarageYrBlt      5.547945
GarageFinish     5.547945
GarageQual       5.547945
GarageCond       5.547945
BsmtExposure     2.602740
BsmtFinType2     2.602740
BsmtFinType1     2.534247
BsmtCond         2.534247
BsmtQual         2.534247
MasVnrArea       0.547945
MasVnrType       0.547945
Electrical       0.068493
dtype: float64

In [132]:
# separate the target from the predictors; the identifier and the five
# columns with the highest share of NaN are left out of the feature matrix
y = houses['SalePrice']
X = houses.drop(columns=['Id', 'SalePrice', 'PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu'])

In [133]:
# map quality categorical features to numeric
# The list has its own name so that the `features` description dict read by
# feature_info() is not overwritten.
quality_features = ['ExterQual', 'ExterCond', 'BsmtQual',
                    'BsmtCond', 'HeatingQC', 'KitchenQual',
                    'GarageQual', 'GarageCond']
qual_dict = {'Ex': 5, 'Gd': 4, 'TA': 3,
             'Fa': 2, 'Po': 1, 'NA': 0}
# NOTE(review): the basement/garage columns hold NaN (see the NaN summary),
# not the string 'NA', so the 'NA' entry does not match them and they stay
# NaN after mapping - confirm whether they should become 0 instead.
for feature in quality_features:
    # map from the untouched `houses` frame (X shares its index) so that
    # re-running this cell yields the same values instead of all-NaN columns
    X[feature] = houses[feature].map(qual_dict)

In [192]:
# split training set into categorical and numerical features
# MSSubClass has a numeric dtype but is treated as a category: it is appended
# as the last column of the categorical frame and removed from the numerical
# one. `.assign` builds a new frame instead of writing into a derived one.
X_cat = X.select_dtypes(include='object').assign(MSSubClass=X['MSSubClass'].values)
X_num = X.select_dtypes(exclude='object').drop('MSSubClass', axis=1)

In [193]:
# names of the numerical columns that contain NaN, most affected first
num_na_counts = X_num.isna().sum().sort_values(ascending=False)
num_cols_with_na = num_na_counts[num_na_counts != 0].index

In [194]:
# names of the categorical columns that contain NaN, most affected first
cat_na_counts = X_cat.isna().sum().sort_values(ascending=False)
cat_cols_with_na = cat_na_counts[cat_na_counts != 0].index

In [195]:
# positional indices of the NaN-containing columns inside each frame,
# converted to plain Python lists for use as ColumnTransformer selectors
num_positions = X_num.columns.get_indexer_for(num_cols_with_na)
cat_positions = X_cat.columns.get_indexer_for(cat_cols_with_na)
num_cols_ixs = num_positions.tolist()
cat_cols_ixs = cat_positions.tolist()

In [196]:
# impute only the columns that contain NaN (median for numerical features,
# most frequent value for categorical ones); all other columns pass through
median_imputer = SimpleImputer(strategy='median')
mode_imputer = SimpleImputer(strategy='most_frequent')

num_filler = ColumnTransformer(
    transformers=[('filler', median_imputer, num_cols_ixs)],
    remainder='passthrough',
)
cat_filler = ColumnTransformer(
    transformers=[('filler', mode_imputer, cat_cols_ixs)],
    remainder='passthrough',
)

In [197]:
# numerical features: fill NaN, then standardise
pipeline_num = Pipeline([('num_fill', num_filler),
                         ('scaler', StandardScaler())])

# categorical features: fill NaN, then one-hot encode into a dense array.
# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to
# `sparse_output` and 1.4 removed the old name, so try the new spelling
# first and fall back to the old one on earlier releases.
try:
    dense_encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    dense_encoder = OneHotEncoder(sparse=False)
pipeline_cat = Pipeline([('cat_fill', cat_filler),
                         ('encoder', dense_encoder)])

In [198]:
# Fit both preprocessing pipelines and transform the training features.
# NOTE: X_num / X_cat are rebound here from DataFrames to the transformed
# arrays, so the cells above that call .isna() / .columns on them need the
# categorical/numerical split cell re-run before this cell is executed again.
# NOTE(review): the imputers, scaler and encoder are fitted on the whole
# training set before the cross-validation below - confirm this is acceptable.
X_num = pipeline_num.fit_transform(X_num)
X_cat = pipeline_cat.fit_transform(X_cat)

In [202]:
# place the numerical and the encoded categorical blocks side by side
X_tr = np.hstack((X_num, X_cat))

In [182]:
# lists of features for conversion 
# features_40NaN: the columns with more than 40% NaN in the summary above;
# these are the same columns that are dropped when X is built
features_40NaN = ['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu']
# candidate transformations, currently disabled:
# features_to_logarithm = ['LotFrontage', 'LotArea', '1stFlrSF', 'GrLivArea']
# features_to_sqrt = ['TotalBsmtSF', '2ndFlrSF', 'WoodDeckSF', 'OpenPorchSF']

In [208]:
# 5-fold cross-validated RMSE of a default SVR
# (the scorer returns the negated RMSE, hence the sign flip)
svr = SVR()
svr_neg_rmse = cross_val_score(svr, X_tr, y,
                               scoring='neg_root_mean_squared_error',
                               cv=5, n_jobs=-1, verbose=1)
-svr_neg_rmse.mean()

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    2.3s finished


81117.73912499049

In [209]:
# 5-fold cross-validated RMSE of a default Lasso
# (the scorer returns the negated RMSE, hence the sign flip)
lasso = Lasso()
lasso_neg_rmse = cross_val_score(lasso, X_tr, y,
                                 scoring='neg_root_mean_squared_error',
                                 cv=5, n_jobs=-1, verbose=1)
-lasso_neg_rmse.mean()

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    2.1s finished


34485.73755637911

In [211]:
# 5-fold cross-validated RMSE of a random forest with default hyper-parameters.
# The seed is fixed so the forest, and therefore the reported score, is the
# same on every run.
forest = RandomForestRegressor(random_state=42)
-cross_val_score(forest, X_tr, y, scoring='neg_root_mean_squared_error', cv=5, n_jobs=-1, verbose=1).mean()

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    9.4s finished


30000.193005167705