In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor

In [None]:
# Load the training and test tables from the local data directory.
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in ("./data/train.csv", "./data/test.csv")
)

In [None]:
# Show the first five rows of the training table as a quick sanity check.
train_data.head(n=5)

In [None]:
# Impute missing values per dtype instead of writing the string 'Unknown'
# everywhere: a single 'Unknown' in a numeric column turns the whole column
# into object dtype, which makes the later cells treat it as categorical.
#   * numeric columns -> median of the same column in the TRAINING data
#     (0 when the training data has no usable median for that column)
#   * all other columns -> the 'Unknown' placeholder category
# Both frames are still modified in place, as before.
train_medians = train_data.median(numeric_only=True)
for frame in (train_data, test_data):
    fill_values = {}
    for col in frame.columns:
        if pd.api.types.is_numeric_dtype(frame[col]):
            median = train_medians.get(col, np.nan)
            fill_values[col] = 0 if pd.isna(median) else median
        else:
            fill_values[col] = 'Unknown'
    frame.fillna(value=fill_values, inplace=True)

In [None]:
# Partition the columns of each frame into numeric and non-numeric names.
# Uses the public select_dtypes API instead of the private
# DataFrame._get_numeric_data(); 'bool' is listed explicitly so boolean
# columns stay on the numeric side, as they did with the private helper.
# Both lists follow the frame's column order, so iteration is deterministic
# (the previous set difference produced an arbitrary order).
train_data_numeric = train_data.select_dtypes(include=['number', 'bool']).columns.tolist()
train_data_category = train_data.select_dtypes(exclude=['number', 'bool']).columns.tolist()
test_data_numeric = test_data.select_dtypes(include=['number', 'bool']).columns.tolist()
test_data_category = test_data.select_dtypes(exclude=['number', 'bool']).columns.tolist()

In [None]:
# Ordinal-encode the non-numeric columns.
# The mapping for each column is learned from the TRAINING data only and then
# reused for the test data, so a given category gets the same integer code in
# both frames. Re-fitting the encoder on the test set (as was done before)
# assigns codes from the test set's own sorted categories, which can disagree
# with the codes the models were trained on.
# Categories that appear only in the test set are encoded as -1.
oe = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
for col in train_data_category:
    train_values = np.asarray(train_data[col].astype('str')).reshape(-1, 1)
    train_data[col] = oe.fit_transform(train_values).ravel()
    if col in test_data.columns:
        test_values = np.asarray(test_data[col].astype('str')).reshape(-1, 1)
        test_data[col] = oe.transform(test_values).ravel()

# Columns that are non-numeric in the test frame only.
for col in test_data_category:
    if col in train_data_category:
        continue  # already encoded with the training mapping above
    if col in train_data.columns:
        # Numeric in train but holding strings in test (e.g. a placeholder
        # written by an earlier fillna): restore the numbers and fill the
        # non-numeric entries with the training median so the column means
        # the same thing in both frames.
        restored = pd.to_numeric(test_data[col], errors='coerce')
        test_data[col] = restored.fillna(train_data[col].median())
    else:
        # No training counterpart exists; encode the column on its own.
        test_values = np.asarray(test_data[col].astype('str')).reshape(-1, 1)
        test_data[col] = oe.fit_transform(test_values).ravel()

In [None]:
# Winsorize every numeric training column (except 'Id') to mean +/- 3 std.
# The limits are kept as floats: truncating them with int() rounds toward
# zero, which can collapse a low-variance column to a constant (a column with
# mean 0.05 and std 0.2 got both limits truncated to 0) and raises on a NaN
# std. Series.clip leaves a column unclipped when a limit is NaN.
# Boolean columns are skipped; clipping them is not meaningful.
# NOTE(review): 'SalePrice' is numeric and is therefore clipped as well, so
# the models are trained and scored against a clipped target - confirm intended.
for col in train_data.select_dtypes(include='number').columns:
    if col == 'Id':
        continue
    col_mean = train_data[col].mean()
    col_std = train_data[col].std()
    lower_limit = col_mean - 3 * col_std
    upper_limit = col_mean + 3 * col_std
    train_data[col] = train_data[col].clip(lower=lower_limit, upper=upper_limit)

In [None]:
# Winsorize every numeric test column (except 'Id') to mean +/- 3 std.
# The mean and std are taken from the matching column of train_data as it
# stands when this cell runs, so the test features are bounded by what the
# models see during training rather than by the test set's own statistics.
# If train_data has no numeric column of that name, the test column's own
# statistics are used as a fallback.
# The limits are kept as floats: truncating them with int() rounds toward
# zero and can collapse a low-variance column to a constant. Series.clip
# leaves a column unclipped when a limit is NaN.
for col in test_data.select_dtypes(include='number').columns:
    if col == 'Id':
        continue
    if col in train_data.columns and pd.api.types.is_numeric_dtype(train_data[col]):
        reference = train_data[col]
    else:
        reference = test_data[col]
    lower_limit = reference.mean() - 3 * reference.std()
    upper_limit = reference.mean() + 3 * reference.std()
    test_data[col] = test_data[col].clip(lower=lower_limit, upper=upper_limit)

In [None]:
# Separate the feature matrix from the regression target.
# The target is removed by name rather than by slicing off the last column,
# so X never contains 'SalePrice' (and never loses a real feature) if the
# column order of the CSV differs from what the positional slice assumed.
# NOTE(review): 'Id' is still part of X; the prediction cells pass the full
# test_data (which also carries 'Id'), so removing it has to be done in both
# places at once.
X = train_data.drop(columns=['SalePrice'])
y = train_data.loc[:, 'SalePrice']

# 全连接 — Fully connected network (MLP)

In [None]:
# Fully connected regressor with two hidden layers of 1000 units each.
# random_state pins the weight initialisation and batch shuffling so a
# Restart & Run All reproduces the same fit; without it every run gives
# different numbers.
# NOTE(review): the inputs are fed in unscaled - confirm this is intended.
model = MLPRegressor(
    hidden_layer_sizes=(1000, 1000),
    activation='relu',
    solver='adam',
    alpha=0.0001,
    batch_size='auto',
    learning_rate='constant',
    random_state=42,
)

In [None]:
# Train the MLP on the full training set.
model.fit(X=X, y=y)

In [None]:
# RMSE of the MLP on the same rows it was trained on (in-sample error, not a
# validation score).
pred = model.predict(X=X)
print(np.sqrt(mean_squared_error(y_true=y, y_pred=pred)))

In [None]:
# Predict with the MLP on the preprocessed test frame (all columns are passed).
pred = model.predict(X=test_data)

In [None]:
# Write the MLP predictions as a two-column file (Id, SalePrice).
# index=False keeps the DataFrame's row index out of the CSV; without it the
# file gets an extra unnamed leading column.
result_df = pd.DataFrame({'Id': test_data['Id'], 'SalePrice': pred})
result_df.to_csv("./data/result/result11.csv", index=False)

# 随机森林 — Random forest

In [None]:
# Random forest with 2000 trees.
# random_state pins the bootstrap samples and feature sub-sampling so repeated
# runs give the same forest; n_jobs=-1 builds the trees on all available
# cores, which does not change the fitted model for a fixed random_state.
rfr = RandomForestRegressor(n_estimators=2000, random_state=42, n_jobs=-1)

In [None]:
# Train the random forest on the full training set.
rfr.fit(X=X, y=y)

In [None]:
# RMSE of the random forest on the same rows it was trained on (in-sample
# error, not a validation score).
pred = rfr.predict(X=X)
print(np.sqrt(mean_squared_error(y_true=y, y_pred=pred)))

In [None]:
# Predict with the random forest on the preprocessed test frame (all columns
# are passed).
pred = rfr.predict(X=test_data)

In [None]:
# Write the random-forest predictions as a two-column file (Id, SalePrice).
# index=False keeps the DataFrame's row index out of the CSV; without it the
# file gets an extra unnamed leading column.
result_df = pd.DataFrame({'Id': test_data['Id'], 'SalePrice': pred})
result_df.to_csv("./data/result/result16.csv", index=False)

# XGBOOST

In [None]:
# Gradient-boosted trees with 50 boosting rounds; every other setting is left
# at the library default.
xgb = XGBRegressor(
    n_estimators=50,
)

In [None]:
# Train the boosted model on the full training set.
xgb.fit(X, y)

In [None]:
# RMSE of the boosted model on the same rows it was trained on (in-sample
# error, not a validation score).
pred = xgb.predict(X)
print(np.sqrt(mean_squared_error(y_true=y, y_pred=pred)))

In [None]:
# Predict with the boosted model on the preprocessed test frame. The whole
# frame is passed, so its columns must line up with those of X used for fitting.
pred = xgb.predict(test_data)

In [None]:
# Write the XGBoost predictions as a two-column file (Id, SalePrice).
# index=False keeps the DataFrame's row index out of the CSV; without it the
# file gets an extra unnamed leading column.
result_df = pd.DataFrame({'Id': test_data['Id'], 'SalePrice': pred})
result_df.to_csv("./data/result/result19.csv", index=False)