In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

from scipy import stats

sns.set_theme()

In [2]:
train_data = pd.read_csv("train.csv")
train_data.head(10)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
5,6,50,RL,85.0,14115,Pave,,IR1,Lvl,AllPub,...,0,,MnPrv,Shed,700,10,2009,WD,Normal,143000
6,7,20,RL,75.0,10084,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,307000
7,8,60,RL,,10382,Pave,,IR1,Lvl,AllPub,...,0,,,Shed,350,11,2009,WD,Normal,200000
8,9,50,RM,51.0,6120,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2008,WD,Abnorml,129900
9,10,190,RL,50.0,7420,Pave,,Reg,Lvl,AllPub,...,0,,,,0,1,2008,WD,Normal,118000


In [3]:
train_data.dtypes.unique()

array([dtype('int64'), dtype('O'), dtype('float64')], dtype=object)

In [4]:
test_data = pd.read_csv("test.csv")
test_data.head(10)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal
5,1466,60,RL,75.0,10000,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,4,2010,WD,Normal
6,1467,20,RL,,7980,Pave,,IR1,Lvl,AllPub,...,0,0,,GdPrv,Shed,500,3,2010,WD,Normal
7,1468,60,RL,63.0,8402,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,5,2010,WD,Normal
8,1469,20,RL,85.0,10176,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,2,2010,WD,Normal
9,1470,20,RL,70.0,8400,Pave,,Reg,Lvl,AllPub,...,0,0,,MnPrv,,0,4,2010,WD,Normal


In [5]:
print(train_data.shape)
print(test_data.shape)

(1460, 81)
(1459, 80)


In [6]:
numeric_columns = list(train_data.drop(["SalePrice"], axis=1).select_dtypes(exclude=['O']))
categorical_columns = list(train_data.select_dtypes(include=['O']))

In [7]:
for column in numeric_columns:
    corr = train_data["SalePrice"].corr(train_data[column])
    if corr < 0.1:
        train_data = train_data.drop([column], axis=1)
        test_data = test_data.drop([column], axis=1)

numeric_columns = list(train_data.drop(["SalePrice"], axis=1).select_dtypes(exclude=['O']))

In [8]:
for column in numeric_columns:
    unique_values = train_data[column].unique()
    if len(unique_values) < 50:
        print(f"{column}: Кол-во уникальных значений = {unique_values}")

OverallQual: Кол-во уникальных значений = [ 7  6  8  5  9  4 10  3  1  2]
BsmtFullBath: Кол-во уникальных значений = [1 0 2 3]
FullBath: Кол-во уникальных значений = [2 1 3 0]
HalfBath: Кол-во уникальных значений = [1 0 2]
BedroomAbvGr: Кол-во уникальных значений = [3 4 1 2 0 5 6 8]
TotRmsAbvGrd: Кол-во уникальных значений = [ 8  6  7  9  5 11  4 10 12  3  2 14]
Fireplaces: Кол-во уникальных значений = [0 1 2 3]
GarageCars: Кол-во уникальных значений = [2 3 1 0 4]


In [9]:
train_data["Bath"] = train_data["BsmtFullBath"] + train_data["FullBath"] + train_data["HalfBath"]
train_data = train_data.drop(["BsmtFullBath", "FullBath", "HalfBath"], axis=1)

test_data["Bath"] = test_data["BsmtFullBath"] + test_data["FullBath"] + test_data["HalfBath"]
test_data = test_data.drop(["BsmtFullBath", "FullBath", "HalfBath"], axis=1)

In [10]:
train_data["Rooms"] = train_data["BedroomAbvGr"] + train_data["TotRmsAbvGrd"]
train_data = train_data.drop(["BedroomAbvGr", "TotRmsAbvGrd"], axis=1)

test_data["Rooms"] = test_data["BedroomAbvGr"] + test_data["TotRmsAbvGrd"]
test_data = test_data.drop(["BedroomAbvGr", "TotRmsAbvGrd"], axis=1)

In [11]:
bath_mode = stats.mode(train_data["Bath"]).mode[0]
train_data["Bath"] = train_data["Bath"].fillna(bath_mode)
test_data["Bath"] = test_data["Bath"].fillna(bath_mode)

rooms_mode = stats.mode(train_data["Rooms"]).mode[0]
train_data["Rooms"] = train_data["Rooms"].fillna(rooms_mode)
test_data["Rooms"] = test_data["Rooms"].fillna(rooms_mode)

In [12]:
numeric_columns = list(train_data.drop(["SalePrice"], axis=1).select_dtypes(exclude=['O']))

for column in numeric_columns:
    null_values_train = np.sum(train_data[column].isnull())
    null_values_test = np.sum(test_data[column].isnull())
    if null_values_train > 0:
        mean_train = train_data[column].mean()
        train_data[column] = train_data[column].fillna(mean_train)
    if null_values_test > 0:
        mean_test = test_data[column].mean()
        test_data[column] = test_data[column].fillna(mean_test)

In [13]:
multicollinear_features = []

for column1 in numeric_columns:
    for column2 in numeric_columns:
        if column1 != column2 and column2 not in multicollinear_features:
            corr = train_data[column1].corr(train_data[column2])
            if corr >= 0.7:
                multicollinear_features.append(column1)

train_data = train_data.drop(multicollinear_features, axis=1)
test_data = test_data.drop(multicollinear_features, axis=1)

numeric_columns = list(train_data.drop(["SalePrice"], axis=1).select_dtypes(exclude=['O']))

In [14]:
# train_data = train_data[train_data["LotFrontage"] < 150]
# train_data = train_data[train_data["LotArea"] < 20000]
# train_data = train_data[train_data["MasVnrArea"] < 450]
# train_data = train_data[train_data["BsmtFinSF1"] < 2000]
# train_data = train_data[train_data["BsmtUnfSF"] < 1750]
# train_data = train_data[train_data["1stFlrSF"] < 2250]
# train_data = train_data[train_data["GarageArea"] < 1000]
# train_data = train_data[train_data["WoodDeckSF"] < 450]
# train_data = train_data[train_data["OpenPorchSF"] < 200]

In [15]:
for column in categorical_columns:
    unique_values = train_data[column].nunique()
    if unique_values < 2:
        train_data = train_data.drop([column], axis=1)
        test_data = test_data.drop([column], axis=1)

categorical_columns = list(train_data.select_dtypes(include=['O']))

In [16]:
for column in categorical_columns:
    null_values = np.sum(train_data[column].isnull())
    if null_values >= 300:
        print(f"Колонка {column} содержит {null_values} нулевых значений")
        train_data = train_data.drop([column], axis=1)
        test_data = test_data.drop([column], axis=1)

categorical_columns = list(train_data.select_dtypes(include=['O']))

Колонка Alley содержит 1369 нулевых значений
Колонка FireplaceQu содержит 690 нулевых значений
Колонка PoolQC содержит 1453 нулевых значений
Колонка Fence содержит 1179 нулевых значений
Колонка MiscFeature содержит 1406 нулевых значений


In [17]:
for column in categorical_columns:
    train_mode = stats.mode(train_data[column]).mode[0]
    train_data[column] = train_data[column].fillna(train_mode)
    
    test_mode = stats.mode(test_data[column]).mode[0]
    test_data[column] = test_data[column].fillna(test_mode)

In [18]:
for column in categorical_columns:
    train_data[column] = train_data[column].astype("category")
    train_data[column] = train_data[column].cat.codes
    
    test_data[column] = test_data[column].astype("category")
    test_data[column] = test_data[column].cat.codes

In [19]:
multicollinear_features = []

for column1 in categorical_columns:
    for column2 in categorical_columns:
        if column1 != column2 and column2 not in multicollinear_features:
            corr = train_data[column1].corr(train_data[column2])
            if corr >= 0.7:
                multicollinear_features.append(column1)

train_data = train_data.drop(multicollinear_features, axis=1)
test_data = test_data.drop(multicollinear_features, axis=1)

In [20]:
from sklearn.ensemble import GradientBoostingRegressor

In [21]:
# transformer = RobustScaler().fit(train_data[numeric_columns])
# train_data[numeric_columns] = transformer.transform(train_data[numeric_columns])

# test_data[numeric_columns] = transformer.transform(test_data[numeric_columns])

In [22]:
y = train_data["SalePrice"]
X = train_data.drop(["SalePrice"], axis=1)

In [23]:
model = GradientBoostingRegressor()
model.fit(X,y)

In [24]:
train_score = model.score(X, y)
print(f"Оценка на тренировочных данных = {round(train_score, 3)}\n")

Оценка на тренировочных данных = 0.958



In [25]:
data = pd.read_csv("test.csv")
predict = model.predict(test_data)

result = pd.concat([data["Id"], pd.Series(predict)], axis=1)
result = result.rename({result.columns[1]: "SalePrice"}, axis=1)

result.to_csv("result.csv", index=False)