In [1]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LassoCV

from sklearn.metrics import mean_squared_log_error

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import ShuffleSplit

In [2]:
import warnings

# Install a global filter that ignores every warning category.
warnings.simplefilter('ignore')

In [3]:
def rmsle(clf, X_test, y_test):
    """Score a fitted estimator by root mean squared logarithmic error.

    The (estimator, X, y) signature lets this function be passed as the
    ``scoring`` callable of ``cross_val_score``.

    Parameters
    ----------
    clf : fitted estimator exposing ``predict``.
    X_test : features handed to ``clf.predict``.
    y_test : true target values.

    Returns
    -------
    Square root of ``mean_squared_log_error(y_test, predictions)``.
    """
    predictions = clf.predict(X_test)
    msle = mean_squared_log_error(y_test, predictions)
    return np.sqrt(msle)

#### В данной модели будет использована самая простая предобработка данных без отбора признаков, учета корреляции и т.д.

In [4]:
# Read the train and test CSV files from the working directory.
data, data_test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

## Обработка данных

In [5]:
# Keep the test-set Id column aside; it is reattached to the outputs written later.
answer = pd.DataFrame({'Id': data_test['Id']})

In [6]:
# Stack the train and test frames so both go through the same preprocessing.
data_full = pd.concat((data, data_test), axis=0)

In [7]:
# Remove the Id column from the combined frame.
data_full = data_full.drop(columns='Id')

### Пропущенные данные

In [8]:
# Names of the columns that contain at least one missing value.
data_full.columns[data_full.isna().any(axis=0)]

Index(['MSZoning', 'LotFrontage', 'Alley', 'Utilities', 'Exterior1st',
       'Exterior2nd', 'MasVnrType', 'MasVnrArea', 'BsmtQual', 'BsmtCond',
       'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Electrical', 'BsmtFullBath',
       'BsmtHalfBath', 'KitchenQual', 'Functional', 'FireplaceQu',
       'GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea',
       'GarageQual', 'GarageCond', 'PoolQC', 'Fence', 'MiscFeature',
       'SaleType', 'SalePrice'],
      dtype='object')

In [9]:
# Share of missing values per column, in percent; keep the ten largest.
null_counts = data_full.isnull().sum()
mis_val_percent = (100 * null_counts / len(data_full)).sort_values(ascending=False).head(10)
mis_val_percent

PoolQC         99.657417
MiscFeature    96.402878
Alley          93.216855
Fence          80.438506
SalePrice      49.982871
FireplaceQu    48.646797
LotFrontage    16.649538
GarageCond      5.447071
GarageYrBlt     5.447071
GarageQual      5.447071
dtype: float64

In [10]:
# Drop the columns in which more than 80% of the values are missing.
sparse_columns = ['Alley', 'PoolQC', 'MiscFeature', 'Fence']
data_full = data_full.drop(columns=sparse_columns)

In [11]:
# Fill missing values in the numeric columns with 0.
numeric_fill_columns = [
    'GarageYrBlt', 'GarageArea', 'GarageCars', 'MasVnrArea',
    'BsmtHalfBath', 'BsmtFullBath', 'TotalBsmtSF', 'BsmtUnfSF',
    'BsmtFinSF2', 'BsmtFinSF1', 'LotFrontage',
]
data_full = data_full.fillna(dict.fromkeys(numeric_fill_columns, 0))

In [12]:
# Fill missing values in the categorical columns with the literal string 'None'.
categorical_fill_columns = (
    'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
    'BsmtFinType2', 'BsmtExposure', 'BsmtFinType1', 'BsmtCond', 'BsmtQual',
    'MasVnrType', 'MSZoning', 'Functional', 'Utilities', 'SaleType',
    'KitchenQual', 'Exterior1st', 'Exterior2nd', 'Electrical',
)
data_full = data_full.fillna(dict.fromkeys(categorical_fill_columns, 'None'))

### Outliers

In [13]:
# Split the combined frame back into its train and test parts by position.
# The right-hand side is evaluated in full before `data` is rebound, so both
# slices use the row count of the train frame as it was before this cell.
data, data_test = data_full.iloc[:len(data)], data_full.iloc[len(data):]

In [14]:
# Upper bound per column; train rows at or above a bound are discarded.
outliers = {
    "GrLivArea": 4000,
    "LotArea": 160000,
    "1stFlrSF": 4000,
    "BsmtFinSF1": 4200,
    "TotalBsmtSF": 6100,
}

for column, upper_bound in outliers.items():
    data = data.loc[data[column].lt(upper_bound)]

In [15]:
# Rebuild the combined frame from the filtered train rows and the test rows.
data_full = pd.concat((data, data_test), axis=0)

### Кодирование 

In [16]:
# Take the target column out of the combined frame; only the leading
# train rows are kept as y.
y = data_full.pop('SalePrice').iloc[:data.shape[0]]

In [17]:
# Dummy-encode the combined frame, then split it back into train and test by position.
data_full = pd.get_dummies(data_full)
data, data_test = data_full.iloc[:len(data)], data_full.iloc[len(data):]

In [18]:
# Sanity check: the train and test frames expose identical column labels.
print((data.columns == data_test.columns).all())

True


# Linear regression (LassoCV): 0.14461 score

In [19]:
# Lasso with the regularisation strength picked by built-in cross-validation
# from 25 log-spaced candidates between 1e-1 and 1e4.
alpha_grid = np.logspace(-1, 4, num=25)
clf = LassoCV(alphas=alpha_grid)

In [20]:
# Mean RMSLE over random train/validation splits of the training data.
# Averaging with .mean() keeps the result correct if n_splits is changed,
# instead of dividing by a second, hard-coded copy of the split count.
splitter = ShuffleSplit(n_splits=8, random_state=0)
cv_scores = cross_val_score(clf, data, y, cv=splitter, scoring=rmsle)
print(cv_scores.mean())

0.1340948547041632


In [21]:
# Empty frame that the submission columns are attached to.
result = pd.DataFrame(data=None)

In [22]:
# Refit on all train rows, then predict the test rows.
clf.fit(data, y)
result['Id'] = answer['Id']
result['SalePrice'] = clf.predict(data_test)

In [23]:
# Write the predictions to disk with a header row and without the index column.
result.to_csv(path_or_buf='out.csv', header=True, index=False)

## Запишем данные

### Для использования в других моделях

In [24]:
# `data` and `data_test` are positional slices of `data_full`; writing a column
# into such a slice is chained assignment (SettingWithCopyWarning). assign()
# returns new frames instead, aligning the added Series on the index exactly
# as `frame[col] = series` does.
data = data.assign(SalePrice=y)
data_test = data_test.assign(Id=answer['Id'])

In [25]:
# Persist the prepared frames for use in other models; the index is written
# as well because index=False is not passed.
for frame, csv_name in ((data, 'ready_data.csv'), (data_test, 'ready_data_test.csv')):
    frame.to_csv(csv_name, header=True)