In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.linear_model import LassoCV, RidgeCV
from sklearn.metrics import mean_squared_log_error

In [2]:
import warnings
# Suppress every Python warning for the rest of the session.
# NOTE(review): this also hides pandas / scikit-learn diagnostics raised by later
# cells (e.g. copy or convergence warnings) — consider narrowing the filter.
warnings.filterwarnings('ignore')

#### В данной модели используется самая простая предобработка данных: без отбора признаков, учёта корреляций и т. д.

In [3]:
# Read the train and test CSV files from the working directory.
data, data_test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

## Обработка данных

In [4]:
# Keep the test-set Ids aside; the Id column is removed from the features below
# and is needed again when the predictions are written out.
answer = pd.DataFrame({'Id': data_test['Id']})

In [5]:
# Stack the train and test sets (train rows first) so both get identical preprocessing.
# The original row labels are kept: the index is not reset.
data_full = pd.concat((data, data_test), axis=0)

In [6]:
# Id is a row identifier, not a feature; remove it from the combined frame.
data_full = data_full.drop(columns=['Id'])

### Пропущенные данные

In [7]:
# List the columns that contain at least one missing value.
has_missing = data_full.isnull().any()
data_full.columns[has_missing]

Index(['MSZoning', 'LotFrontage', 'Alley', 'Utilities', 'Exterior1st',
       'Exterior2nd', 'MasVnrType', 'MasVnrArea', 'BsmtQual', 'BsmtCond',
       'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Electrical', 'BsmtFullBath',
       'BsmtHalfBath', 'KitchenQual', 'Functional', 'FireplaceQu',
       'GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea',
       'GarageQual', 'GarageCond', 'PoolQC', 'Fence', 'MiscFeature',
       'SaleType', 'SalePrice'],
      dtype='object')

In [8]:
# Percentage of missing values per column; show the ten highest.
missing_counts = data_full.isnull().sum()
missing_share = 100 * missing_counts / len(data_full)
mis_val_percent = missing_share.sort_values(ascending=False).head(10)
mis_val_percent

PoolQC         99.657417
MiscFeature    96.402878
Alley          93.216855
Fence          80.438506
SalePrice      49.982871
FireplaceQu    48.646797
LotFrontage    16.649538
GarageCond      5.447071
GarageYrBlt     5.447071
GarageQual      5.447071
dtype: float64

In [9]:
# Drop the columns in which more than 80% of the values are NaN.
data_full = data_full.drop(columns=['Alley', 'PoolQC', 'MiscFeature', 'Fence'])

In [10]:
# Fill NaNs in the numeric columns with 0.
numeric_nan_cols = ['GarageYrBlt', 'GarageArea', 'GarageCars', 'MasVnrArea',
                    'BsmtHalfBath', 'BsmtFullBath', 'TotalBsmtSF', 'BsmtUnfSF',
                    'BsmtFinSF2', 'BsmtFinSF1', 'LotFrontage']
# A dict passed to fillna applies the fill value only to the listed columns.
data_full = data_full.fillna(value=dict.fromkeys(numeric_nan_cols, 0))

In [11]:
# Fill NaNs in the categorical columns with the string 'None'.
categorical_nan_cols = ('FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual',
                        'GarageCond', 'BsmtFinType2', 'BsmtExposure', 'BsmtFinType1',
                        'BsmtCond', 'BsmtQual', 'MasVnrType', 'MSZoning', 'Functional',
                        'Utilities', 'SaleType', 'KitchenQual', 'Exterior1st',
                        'Exterior2nd', 'Electrical')
# A dict passed to fillna applies the fill value only to the listed columns.
data_full = data_full.fillna(value=dict.fromkeys(categorical_nan_cols, 'None'))

### Outliers

In [12]:
# Split the combined frame back by position: the first len(data) rows are the
# training set, the remainder is the test set.
n_train_rows = data.shape[0]
data, data_test = data_full.iloc[:n_train_rows, :], data_full.iloc[n_train_rows:, :]

In [13]:
# Per-column thresholds: clean_outliers keeps only rows strictly below these values.
outliers = {
    "LotArea": 150000,
    "BsmtFinSF1": 4000,
    "TotalBsmtSF": 6000,
    "1stFlrSF": 4000,
    "GrLivArea": 5000,
}

In [14]:
def clean_outliers(data, outliers):
    """Return the rows of ``data`` that fall below every threshold in ``outliers``.

    Parameters
    ----------
    data : frame indexed by column name and filterable with a boolean mask
        (a pandas DataFrame in this notebook).
    outliers : dict mapping a column name to its exclusive upper bound.

    Returns
    -------
    The filtered frame. Rows whose value is greater than or equal to the bound
    are removed, as are rows whose comparison is False for any other reason
    (e.g. NaN). With an empty ``outliers`` the input object itself is returned.
    """
    kept = data
    for column, upper_bound in outliers.items():
        below_bound = kept[column] < upper_bound
        kept = kept[below_bound]
    return kept

# Remove outlier rows from the training set only and report the row counts.
rows_before = len(data)
data = clean_outliers(data, outliers)
rows_after = len(data)

print("Before cleaning: %d" % rows_before)
print("After cleaning: %d" % rows_after)

Before cleaning: 1460
After cleaning: 1456


In [15]:
# Re-assemble the combined frame from the cleaned training rows followed by the test rows.
data_full = pd.concat((data, data_test), axis=0)

### Кодирование 

In [16]:
# Take SalePrice out of the features; only the first len(data) rows (the
# training rows) are kept as the target.
y = data_full.pop('SalePrice').iloc[:data.shape[0]]

In [17]:
# Encoding: one-hot encode the categorical columns of the combined frame so that
# train and test end up with the same set of dummy columns.
data_full = pd.get_dummies(data_full)
# Split back by position. data.shape[0] is still the training row count from
# before encoding, so data_test has to be sliced before data is rebound.
# .copy() detaches both frames from data_full: columns are added to them later,
# and writing into a bare iloc slice triggers pandas' SettingWithCopyWarning.
data_test = data_full.iloc[data.shape[0]:,:].copy()
data = data_full.iloc[:data.shape[0],:].copy()

In [18]:
# Display the encoded training frame (the notebook renders a truncated view).
data

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,SaleType_New,SaleType_None,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,60,65.0,8450,7,5,2003,2003,196.0,706.0,0.0,...,0,0,0,1,0,0,0,0,1,0
1,20,80.0,9600,6,8,1976,1976,0.0,978.0,0.0,...,0,0,0,1,0,0,0,0,1,0
2,60,68.0,11250,7,5,2001,2002,162.0,486.0,0.0,...,0,0,0,1,0,0,0,0,1,0
3,70,60.0,9550,7,5,1915,1970,0.0,216.0,0.0,...,0,0,0,1,1,0,0,0,0,0
4,60,84.0,14260,8,5,2000,2000,350.0,655.0,0.0,...,0,0,0,1,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,60,62.0,7917,6,5,1999,2000,0.0,0.0,0.0,...,0,0,0,1,0,0,0,0,1,0
1456,20,85.0,13175,6,6,1978,1988,119.0,790.0,163.0,...,0,0,0,1,0,0,0,0,1,0
1457,70,66.0,9042,7,9,1941,2006,0.0,275.0,0.0,...,0,0,0,1,0,0,0,0,1,0
1458,20,68.0,9717,5,6,1950,1996,0.0,49.0,1029.0,...,0,0,0,1,0,0,0,0,1,0


In [19]:
# Sanity check: train and test must have the same columns in the same order.
columns_match = (data.columns == data_test.columns).all()
print(columns_match)

True


# Linear regression (LassoCV): 0.14461 score

In [20]:
# Hold out 20% of the training rows for validation; random_state fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    y,
    test_size=0.2,
    random_state=7,
)

In [21]:
# Lasso regression whose regularisation strength is picked by LassoCV's built-in
# cross-validation from 25 log-spaced candidates between 1e-1 and 1e4.
# LassoCV and np come from the import cell at the top of the notebook; the
# per-cell re-imports (including the unused Ridge) were removed.
clf = LassoCV(alphas = np.logspace(-1,4,25))

# Fit on the training split; as the last expression this also displays the estimator.
clf.fit(X_train, y_train)

LassoCV(alphas=array([1.00000000e-01, 1.61559810e-01, 2.61015722e-01, 4.21696503e-01,
       6.81292069e-01, 1.10069417e+00, 1.77827941e+00, 2.87298483e+00,
       4.64158883e+00, 7.49894209e+00, 1.21152766e+01, 1.95734178e+01,
       3.16227766e+01, 5.10896977e+01, 8.25404185e+01, 1.33352143e+02,
       2.15443469e+02, 3.48070059e+02, 5.62341325e+02, 9.08517576e+02,
       1.46779927e+03, 2.37137371e+03, 3.83118685e+03, 6.18965819e+03,
       1.00000000e+04]),
        copy_X=True, cv=None, eps=0.001, fit_intercept=True, max_iter=1000,
        n_alphas=100, n_jobs=None, normalize=False, positive=False,
        precompute='auto', random_state=None, selection='cyclic', tol=0.0001,
        verbose=False)

In [22]:
# Root mean squared logarithmic error on the held-out split.
holdout_predictions = clf.predict(X_test)
np.sqrt(mean_squared_log_error(y_test, holdout_predictions))

0.1286638429532744

In [23]:
# Start from an empty frame; the Id and SalePrice columns are added in the next cell.
result = pd.DataFrame()

In [24]:
# Predict prices for the test set and pair them with the saved test Ids.
submission_prices = clf.predict(data_test)
result['Id'] = answer['Id']
result['SalePrice'] = submission_prices

In [25]:
# Write the predictions to out.csv with a header row and without the index column.
result.to_csv('out.csv', header=True, index=False)

## Запишем данные

### Для использования в других моделях

In [26]:
# Re-attach the target to the training features and the Ids to the test features
# before saving. assign() builds new frames instead of writing columns into
# frames that were sliced out of data_full (which raises SettingWithCopyWarning).
# Values are aligned on the row labels, exactly as with column assignment.
data = data.assign(SalePrice=y)
data_test = data_test.assign(Id=answer['Id'])

In [27]:
# Save the prepared train and test frames (index included) for use in other models.
for prepared_frame, csv_path in ((data, 'ready_data.csv'),
                                 (data_test, 'ready_data_test.csv')):
    prepared_frame.to_csv(csv_path,  header=True)