In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.linear_model import LassoCV, RidgeCV

In [2]:
def root_mean_log_squared_error(y_test,predicted):
    return np.sqrt(np.sum((np.log(1+y_test)-np.log(1+predicted))**2)/len(predicted))

In [3]:
data = pd.read_csv('train.csv')
data_test = pd.read_csv('test.csv')

# Обработка данных

In [4]:
answer = pd.DataFrame()
answer['Id'] = data_test['Id']

In [5]:
y = data['SalePrice']
del data['SalePrice'] 

In [6]:
#объединение сетов
data_full = pd.concat([data,data_test])

### Пропущенные данные

In [7]:
data_full.columns[data_full.isnull().any()]

Index(['MSZoning', 'LotFrontage', 'Alley', 'Utilities', 'Exterior1st',
       'Exterior2nd', 'MasVnrType', 'MasVnrArea', 'BsmtQual', 'BsmtCond',
       'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Electrical', 'BsmtFullBath',
       'BsmtHalfBath', 'KitchenQual', 'Functional', 'FireplaceQu',
       'GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea',
       'GarageQual', 'GarageCond', 'PoolQC', 'Fence', 'MiscFeature',
       'SaleType'],
      dtype='object')

In [40]:
# Percentage of missing values
mis_val_percent = (100 * data_full.isnull().sum() /len(data_full)).sort_values(ascending=False)[:10]
mis_val_percent

PoolQC          99.657417
MiscFeature     96.402878
Alley           93.216855
Fence           80.438506
FireplaceQu     48.646797
LotFrontage     16.649538
GarageCond       5.447071
GarageQual       5.447071
GarageYrBlt      5.447071
GarageFinish     5.447071
dtype: float64

In [8]:
#удаление столбцов с NaN>80
data_full = data_full.drop(['Alley','PoolQC','MiscFeature','Fence'], axis = 1)

In [9]:
#Заполнение категориальных NaN
for col in ('FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'BsmtFinType2', 
            'BsmtExposure', 'BsmtFinType1', 'BsmtCond', 'BsmtQual', 'MasVnrType', 'MSZoning', 'Functional', 'Utilities', 
            'SaleType', 'KitchenQual', 'Exterior1st','Exterior2nd','Electrical' ):
    data_full[col] = data_full[col].fillna('None')

In [10]:
#Заполнение числовых NaN 
for col in ['GarageYrBlt', 'GarageArea', 'GarageCars', 'MasVnrArea', 'BsmtHalfBath', 'BsmtFullBath', 'TotalBsmtSF', 'BsmtUnfSF',
           'BsmtFinSF2', 'BsmtFinSF1','LotFrontage']:
    data_full[col] = data_full[col].fillna(int(0))

### Корреляция

In [11]:
#Корреляция
data_full = data_full.drop(['Id', 'BsmtFinSF2', 'BsmtUnfSF', 'BsmtHalfBath','LowQualFinSF',
              'GarageArea', 'MoSold', 'MiscVal', 'OpenPorchSF', 'YrSold', 'PoolArea', 
              '3SsnPorch', 'EnclosedPorch', 'HalfBath', 'GrLivArea', 'FullBath'], axis = 1)

### Кодирование 

In [12]:
#Кодирование
data_full = pd.get_dummies(data_full)
data_test = data_full.iloc[data.shape[0]:,:]
data = data_full.iloc[:data.shape[0],:]

In [13]:
print(np.all(data.columns==data_test.columns)) 

True


# Linear regression

In [14]:
X_train, X_test, y_train, y_test = train_test_split(data, y, test_size=0.2, random_state=7)

In [15]:
regressor = LinearRegression()
regressor.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [17]:
root_mean_log_squared_error(y_test, regressor.predict(X_test))

  


0.1888945562875964

# Lasso regression

In [18]:
#Возвращает одномерный массив из указанного количества элементов, значения которых равномерно распределенны по 
#логарифмической шкале внутри заданного интервала.
np.logspace(-1,4,25)

array([1.00000000e-01, 1.61559810e-01, 2.61015722e-01, 4.21696503e-01,
       6.81292069e-01, 1.10069417e+00, 1.77827941e+00, 2.87298483e+00,
       4.64158883e+00, 7.49894209e+00, 1.21152766e+01, 1.95734178e+01,
       3.16227766e+01, 5.10896977e+01, 8.25404185e+01, 1.33352143e+02,
       2.15443469e+02, 3.48070059e+02, 5.62341325e+02, 9.08517576e+02,
       1.46779927e+03, 2.37137371e+03, 3.83118685e+03, 6.18965819e+03,
       1.00000000e+04])

In [19]:
lasso = LassoCV(alphas = np.logspace(-1,4,25))

In [20]:
lasso.fit(X_train,y_train)



LassoCV(alphas=array([1.00000e-01, 1.61560e-01, 2.61016e-01, 4.21697e-01, 6.81292e-01,
       1.10069e+00, 1.77828e+00, 2.87298e+00, 4.64159e+00, 7.49894e+00,
       1.21153e+01, 1.95734e+01, 3.16228e+01, 5.10897e+01, 8.25404e+01,
       1.33352e+02, 2.15443e+02, 3.48070e+02, 5.62341e+02, 9.08518e+02,
       1.46780e+03, 2.37137e+03, 3.83119e+03, 6.18966e+03, 1.00000e+04]),
    copy_X=True, cv='warn', eps=0.001, fit_intercept=True, max_iter=1000,
    n_alphas=100, n_jobs=None, normalize=False, positive=False,
    precompute='auto', random_state=None, selection='cyclic', tol=0.0001,
    verbose=False)

In [21]:
y_predicted = lasso.predict(X_test)
root_mean_log_squared_error(y_predicted,y_test)

0.132367276216172