In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split 
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression 
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import ElasticNet

from sklearn import metrics
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.model_selection import cross_val_score

from scipy import stats

In [2]:
train = pd.read_csv("./data/train.csv")
train_y = train[["SALE PRICE"]]
train_X = train
del train_X["SALE PRICE"]

test_X = pd.read_csv("./data/test.csv")
test_y = pd.read_csv("./data/test_groundtruth.csv")

print("train_X:",train_X.shape)
print("train_y:",train_y.shape)
print("test_X:",test_X.shape)
print("test_y:",test_y.shape)

train_X: (43064, 19)
train_y: (43064, 1)
test_X: (10767, 19)
test_y: (10767, 1)


In [3]:
train.head()

Unnamed: 0,BOROUGH,NEIGHBORHOOD,BUILDING CLASS CATEGORY,TAX CLASS AT PRESENT,BLOCK,LOT,BUILDING CLASS AT PRESENT,ADDRESS,APARTMENT NUMBER,ZIP CODE,RESIDENTIAL UNITS,COMMERCIAL UNITS,TOTAL UNITS,LAND SQUARE FEET,GROSS SQUARE FEET,YEAR BUILT,TAX CLASS AT TIME OF SALE,BUILDING CLASS AT TIME OF SALE,SALE DATE
0,1,UPPER EAST SIDE (79-96),13 CONDOS - ELEVATOR APARTMENTS,2,1569,1027,R4,402 EAST 90TH STREET,5A,10128,1,0,1,-,-,1910,2,R4,2016-09-30 00:00:00
1,4,HOLLISWOOD,10 COOPS - ELEVATOR APARTMENTS,2,10538,70,D4,"87-50 204TH STREET, B42",,11423,0,0,0,-,-,1954,2,D4,2017-02-10 00:00:00
2,3,DOWNTOWN-FULTON MALL,13 CONDOS - ELEVATOR APARTMENTS,2,170,1042,R4,265 STATE STREET,910,11201,1,0,1,0,0,2014,2,R4,2017-01-25 00:00:00
3,4,FLUSHING-NORTH,13 CONDOS - ELEVATOR APARTMENTS,2,4410,1023,R4,137-11 32 AVENUE,4W,11354,1,0,1,-,-,0,2,R4,2017-03-17 00:00:00
4,3,PARK SLOPE,09 COOPS - WALKUP APARTMENTS,2C,1067,29,C6,"862 PRESIDENT STREET, 1",,11215,0,0,0,0,0,1920,2,C6,2016-09-09 00:00:00


In [4]:
num_train_samples = len(train_X)
num_test_samples=len(test_X)
data_X = pd.concat([train_X, test_X])

# Data Pre Processing

In [5]:

# Let's delete some of the columns that we ** may not ** need
# 请注意 下面删除的特征很可能是有用的，合理的处理能够获得更为准确的预测模型，请探索所删除特征的使用
del data_X['ADDRESS']
del data_X['APARTMENT NUMBER']
#del data_X['BLOCK']
#del data_X['LOT']
#del data_X['BUILDING CLASS AT PRESENT']
#del data_X['BUILDING CLASS AT TIME OF SALE']
#del data_X['NEIGHBORHOOD']
#del data_X['SALE DATE']
#del data_X['LAND SQUARE FEET']
#del data_X['GROSS SQUARE FEET']

In [6]:
aa = data_X.isnull().sum()
aa[aa>0].sort_values(ascending=False)  

Series([], dtype: int64)

In [7]:
# Let's convert some of the columns to appropriate datatype

data_X['TAX CLASS AT TIME OF SALE'] = data_X['TAX CLASS AT TIME OF SALE'].astype('category')
data_X['TAX CLASS AT PRESENT'] = data_X['TAX CLASS AT PRESENT'].astype('category')
data_X['BUILDING CLASS AT TIME OF SALE'] = data_X['BUILDING CLASS AT TIME OF SALE'].astype('category')
data_X['BUILDING CLASS AT PRESENT'] = data_X['BUILDING CLASS AT PRESENT'].astype('category')
#data_X['APARTMENT NUMBER'] = data_X['APARTMENT NUMBER'].astype('category')
data_X['BOROUGH'] = data_X['BOROUGH'].astype('category')
data_X['SALE DATE']=data_X['SALE DATE'].astype(str)
data_X['oSALE DATE']=data_X['SALE DATE'].astype(str)
data_X.replace(' -  ',0,inplace=True)
data_X['GROSS SQUARE FEET']=data_X['GROSS SQUARE FEET'].astype("int64")
data_X['LAND SQUARE FEET']=data_X['LAND SQUARE FEET'].astype("int64")

In [8]:
#data_X["oTAX CLASS AT TIME OF SALE"]=data_X["TAX CLASS AT TIME OF SALE"].map({1:58,2:68,4:99})
data_X["oTAX CLASS AT PRESENT"]=data_X["TAX CLASS AT PRESENT"].map({"1":59,"1A":40,"1B":40,"1C":95,"2":63,"2A":110,"2B":200,"2C":85,"4":95," ":95})

In [9]:
data_X_train=data_X[:num_train_samples]
data_X_test=data_X[num_train_samples:]
for i in range(num_train_samples):
    tmp=data_X_train['SALE DATE'][i].split('-')
    data_X_train['oSALE DATE'][i]=(int(tmp[0])-2016)*12+int(tmp[1])
    #data_X_train['ADDRESS'][i]=data_X_train["ADDRESS"][i].split(',')[0]
for i in range ( num_test_samples ):
    tmp=data_X_test['SALE DATE'][i].split('-')
    data_X_test['oSALE DATE'][i]=(int(tmp[0])-2016)*12+int(tmp[1])
    #data_X_test['ADDRESS'][i]=data_X_test["ADDRESS"][i].split(',')[0]
data_X = pd.concat([data_X_train, data_X_test])

In [10]:
data_X['NEIGHBORHOOD'] = data_X['NEIGHBORHOOD'].astype('category')

In [11]:
data_X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 53831 entries, 0 to 10766
Data columns (total 19 columns):
 #   Column                          Non-Null Count  Dtype   
---  ------                          --------------  -----   
 0   BOROUGH                         53831 non-null  category
 1   NEIGHBORHOOD                    53831 non-null  category
 2   BUILDING CLASS CATEGORY         53831 non-null  object  
 3   TAX CLASS AT PRESENT            53831 non-null  category
 4   BLOCK                           53831 non-null  int64   
 5   LOT                             53831 non-null  int64   
 6   BUILDING CLASS AT PRESENT       53831 non-null  category
 7   ZIP CODE                        53831 non-null  int64   
 8   RESIDENTIAL UNITS               53831 non-null  int64   
 9   COMMERCIAL UNITS                53831 non-null  int64   
 10  TOTAL UNITS                     53831 non-null  int64   
 11  LAND SQUARE FEET                53831 non-null  int64   
 12  GROSS SQUARE FEET 

In [12]:
data_X.head()

Unnamed: 0,BOROUGH,NEIGHBORHOOD,BUILDING CLASS CATEGORY,TAX CLASS AT PRESENT,BLOCK,LOT,BUILDING CLASS AT PRESENT,ZIP CODE,RESIDENTIAL UNITS,COMMERCIAL UNITS,TOTAL UNITS,LAND SQUARE FEET,GROSS SQUARE FEET,YEAR BUILT,TAX CLASS AT TIME OF SALE,BUILDING CLASS AT TIME OF SALE,SALE DATE,oSALE DATE,oTAX CLASS AT PRESENT
0,1,UPPER EAST SIDE (79-96),13 CONDOS - ELEVATOR APARTMENTS,2,1569,1027,R4,10128,1,0,1,0,0,1910,2,R4,2016-09-30 00:00:00,9,63
1,4,HOLLISWOOD,10 COOPS - ELEVATOR APARTMENTS,2,10538,70,D4,11423,0,0,0,0,0,1954,2,D4,2017-02-10 00:00:00,14,63
2,3,DOWNTOWN-FULTON MALL,13 CONDOS - ELEVATOR APARTMENTS,2,170,1042,R4,11201,1,0,1,0,0,2014,2,R4,2017-01-25 00:00:00,13,63
3,4,FLUSHING-NORTH,13 CONDOS - ELEVATOR APARTMENTS,2,4410,1023,R4,11354,1,0,1,0,0,0,2,R4,2017-03-17 00:00:00,15,63
4,3,PARK SLOPE,09 COOPS - WALKUP APARTMENTS,2C,1067,29,C6,11215,0,0,0,0,0,1920,2,C6,2016-09-09 00:00:00,9,85


In [13]:
#Select the variables to be one-hot encoded
one_hot_features = ['BOROUGH', 'BUILDING CLASS CATEGORY','TAX CLASS AT PRESENT','TAX CLASS AT TIME OF SALE','BUILDING CLASS AT PRESENT','BUILDING CLASS AT TIME OF SALE','SALE DATE','NEIGHBORHOOD']

In [14]:
# Convert categorical variables into dummy/indicator variables (i.e. one-hot encoding).
one_hot_encoded = pd.get_dummies(data_X[one_hot_features])
one_hot_encoded.info(verbose=True, memory_usage=True, null_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 53831 entries, 0 to 10766
Data columns (total 866 columns):
 #    Column                                                                Non-Null Count  Dtype
---   ------                                                                --------------  -----
 0    BOROUGH_1                                                             53831 non-null  uint8
 1    BOROUGH_2                                                             53831 non-null  uint8
 2    BOROUGH_3                                                             53831 non-null  uint8
 3    BOROUGH_4                                                             53831 non-null  uint8
 4    BOROUGH_5                                                             53831 non-null  uint8
 5    BUILDING CLASS CATEGORY_01 ONE FAMILY DWELLINGS                       53831 non-null  uint8
 6    BUILDING CLASS CATEGORY_02 TWO FAMILY DWELLINGS                       53831 non-null  uint8
 7    BU

In [15]:
data_X = data_X.drop(one_hot_features,axis=1)
data_X = pd.concat([data_X, one_hot_encoded] ,axis=1)

In [16]:
data_X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 53831 entries, 0 to 10766
Columns: 877 entries, BLOCK to NEIGHBORHOOD_WYCKOFF HEIGHTS
dtypes: int64(10), object(1), uint8(866)
memory usage: 49.4+ MB


In [17]:
train_X = data_X[:num_train_samples].to_numpy()
test_X = data_X[num_train_samples:].to_numpy()

# Regression

In [140]:
rf_regr = RandomForestRegressor()
rf_regr.fit(train_X, train_y)
Y_pred_rf = rf_regr.predict(test_X)

# MAPE metric
mean_absolute_percentage_error(test_y,Y_pred_rf)

0.33963923237201954

In [18]:
from sklearn.preprocessing import RobustScaler, StandardScaler
scaler = RobustScaler()  # 对数据进行缩放
n_train=train.shape[0] #获得有多少条训练数据
#del data_X["NEIGHBORHOOD"]
#del data_X["ADDRESS"]
#del data_X["SALE DATE"]
X = data_X[:n_train] # 获得训练数据
test_X = data_X[n_train:] # 获得测试数据
y= test_y

X_scaled = scaler.fit(X).transform(X)
y_log = np.log(y) # 对于 y 进行人为定义log 进行缩放 
test_X_scaled = scaler.transform(test_X)

In [19]:
y_logg=np.log(train_y)

In [123]:
rf_regr = RandomForestRegressor(verbose=True)
rf_regr.fit(X_scaled,y_logg)
Y_pred_rf = rf_regr.predict(test_X_scaled)

# MAPE metric
mean_absolute_percentage_error(test_y,np.exp(Y_pred_rf))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:  2.6min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.4s finished


0.31791926117269637

In [20]:
from sklearn.model_selection import cross_val_score, GridSearchCV, KFold
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.svm import SVR, LinearSVR
from sklearn.linear_model import ElasticNet, SGDRegressor, BayesianRidge
from sklearn.kernel_ridge import KernelRidge
from xgboost import XGBRegressor

In [21]:
def rmse_cv(model,X,y):
    rmse = np.sqrt(-cross_val_score(model, X, y, scoring="neg_mean_squared_error", cv=1))  # 5折交叉验证
    return rmse

In [None]:
models = [Ridge(),Lasso(alpha=0.01,max_iter=10000),RandomForestRegressor(), GradientBoostingRegressor(),  LinearSVR(),
          ElasticNet(alpha=0.001,max_iter=10000), BayesianRidge(), KernelRidge (alpha=0.6, kernel='polynomial', degree=2, coef0=2.5),
          ExtraTreesRegressor(),XGBRegressor()] 
models=[Ridge(), LinearSVR(), BayesianRidge(),ExtraTreesRegressor(),XGBRegressor()]
names = ["Ridge", "LinSVR", "Bay","Extra","Xgb"]
for name, model in zip(names, models):
    score = rmse_cv(model, X_scaled, y_logg)
    print("{}: {:.6f}, {:.4f}".format(name,score.mean(),score.std()))

Ridge: 537623.427874, 8893.4063
LinSVR: 707419.379389, 22409.0992


In [22]:
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline, make_pipeline
from scipy.stats import skew
from sklearn.decomposition import PCA, KernelPCA
from sklearn.impute import SimpleImputer as Imputer

In [23]:
class AverageWeight(BaseEstimator, RegressorMixin):
    def __init__(self,mod,weight):
        self.mod = mod
        self.weight = weight
        self.nums=len(mod)
    def fit(self,X,y):
        self.models_ = [clone(x) for x in self.mod]
        for model in self.models_:
            model.fit(X,y)
            print("完成！")
        return self
    
    def predict(self,X,models,weight):
        self.weight = weight
        u = X.shape[0]
        w=[]
        for i in range(u):
            w.append(0)
        for i in models:
            print("基础")
            tmp=self.models_[i].predict(X)*self.weight[i]
            for j in range(u):
                w[j]=w[j]+tmp[j]
        #pred = np.array([model.predict(X) for model in self.models_])
        # for every data point, single model prediction times weight, then add them together
        #for data in range(pred.shape[1]):
         #   single = [pred[model,data]*weight for model,weight in zip(range(pred.shape[0]),self.weight)]
          #  w.append(np.sum(single))
        return w

In [24]:
lasso = Lasso(alpha=0.001,max_iter=10000)
ridge = Ridge()
RF = RandomForestRegressor()
GBR = GradientBoostingRegressor()
LinSVR = LinearSVR()
Ela =  ElasticNet(alpha=0.001,max_iter=10000)
Bay =BayesianRidge() 
Extra =  ExtraTreesRegressor()
Xgb = XGBRegressor(max_depth=6,n_estimators=800,learning_rate=0.16)

In [25]:
w1 = 0.2
w2 = 0.2
w3 = 0.6
w4 = 0.3
w5 = 0.05
w6 = 0.1

In [36]:
weight_avg = AverageWeight(mod = [lasso,ridge,RF,GBR,LinSVR,Ela,Bay,Extra,Xgb],weight=[w1,w2,w3])
weight_avg.fit(X_scaled,y_logg)

完成！
完成！
完成！
完成！
完成！
完成！
完成！
完成！
完成！


In [45]:
def predict(Self,X,models,weight):
    Self.weight = weight
    u = X.shape[0]
    w=[]
    for i in range(u):
        w.append(0)
    for i in range(len(models)):
        print("基础")
        tmp=Self.models_[models[i]].predict(X)*weight[i]
        for j in range(u):
            w[j]=w[j]+tmp[j]
        #pred = np.array([model.predict(X) for model in self.models_])
        # for every data point, single model prediction times weight, then add them together
        #for data in range(pred.shape[1]):
         #   single = [pred[model,data]*weight for model,weight in zip(range(pred.shape[0]),self.weight)]
          #  w.append(np.sum(single))
    return w

In [37]:
W=[]
for i in range(weight_avg.nums):
            print("基础")
            tmp=weight_avg.models_[i].predict(test_X_scaled)
            W.append(mean_absolute_percentage_error(test_y,np.exp(tmp)))
W

基础
基础
基础
基础
基础
基础
基础
基础
基础


[0.40257969217595857,
 0.37496789140252884,
 0.31799019183544625,
 0.38904074412066963,
 0.371064836879462,
 0.3901713394313403,
 0.3749617099592213,
 0.33806137893260235,
 0.3412323535932328]

In [51]:
Y_pred_rf = predict(weight_avg,test_X_scaled,[2,7],[0.8,0.2])
mean_absolute_percentage_error(test_y,np.exp(Y_pred_rf))

基础
基础


0.3177509369177875

In [None]:
weight_avg = AverageWeight(mod = [Extra, ridge, Xgb, RF,GBR, Bay],weight=[w1,w2,w3,w4,w5,w6])
rmse_cv(weight_avg,X_scaled,y_logg),  rmse_cv(weight_avg,X_scaled,y_logg).mean()

In [138]:
rf_regr.fit(X_scaled, y_logg)

In [139]:

import numpy as np
Y_pred_rf = rf_regr.predict(test_X_scaled)
mean_absolute_percentage_error(test_y,np.exp(Y_pred_rf))

0.31875050389417536

In [None]:
rf_regr.fit(X_scaled, train_y)

In [63]:
import numpy as np
Y_pred_rf = rf_regr.predict(test_X_scaled)
mean_absolute_percentage_error(test_y,Y_pred_rf)

0.33900199335374487

In [32]:
pd.DataFrame({"pred":Y_pred_rf}).to_csv("学号_姓名.csv")

In [26]:
import datetime

WWW=[]
depth=[6]
trees=[400,800】]
lr=[0.1,0.15,0.2]
for de in depth:
    for tree in trees:
        for lrr in lr:
            start = datetime.datetime.now()
            mm=XGBRegressor(max_depth=de,n_estimators=tree,learning_rate=lrr)        # 使用多少棵树来拟合，也可以理解为多少次迭代。默认100；
            mm.fit(X_scaled,y_logg)
            YY=mm.predict(test_X_scaled)
            tmp=mean_absolute_percentage_error(test_y,np.exp(YY))
            end = datetime.datetime.now()
            print(de,end=" ")
            print(tree,end=" ")
            print(lrr,end=" ")
            print(tmp)
            print('totally time is ' ,end = "")
            print(end-start)
            WWW.append(tmp)
WWW

6 800 0.1 0.3303402501199265
totally time is 0:02:46.693772
6 800 0.15 0.32683184550392264
totally time is 0:02:46.436442
6 800 0.2 0.32766503871163666
totally time is 0:02:45.732962
6 1200 0.1 0.3277392439911786
totally time is 0:04:09.835459
6 1200 0.15 0.32629573384049854
totally time is 0:04:08.190134
6 1200 0.2 0.328235370728801
totally time is 0:04:07.282323
6 1600 0.1 0.3265961848798322
totally time is 0:05:32.041322
6 1600 0.15 0.32719867177775813
totally time is 0:05:30.230619
6 1600 0.2 0.3310832375424873
totally time is 0:05:29.484162
6 2400 0.1 0.3273328511501888
totally time is 0:08:15.320929
6 2400 0.15 0.3291598506620614
totally time is 0:08:13.284119
6 2400 0.2 0.3372376604996416
totally time is 0:08:11.234391


[0.3303402501199265,
 0.32683184550392264,
 0.32766503871163666,
 0.3277392439911786,
 0.32629573384049854,
 0.328235370728801,
 0.3265961848798322,
 0.32719867177775813,
 0.3310832375424873,
 0.3273328511501888,
 0.3291598506620614,
 0.3372376604996416]

In [None]:
import datetime

WWW=[]
depth=[6]
trees=[800,1200,1600]
lr=[0.1,0.15,0.2]
for de in depth:
    for tree in trees:
        for lrr in lr:
            start = datetime.datetime.now()
            mm=XGBRegressor(max_depth=de,n_estimators=tree,learning_rate=lrr)        # 使用多少棵树来拟合，也可以理解为多少次迭代。默认100；
            mm.fit(X_scaled,y_logg)
            YY=mm.predict(test_X_scaled)
            tmp=mean_absolute_percentage_error(test_y,np.exp(YY))
            end = datetime.datetime.now()
            print(de,end=" ")
            print(tree,end=" ")
            print(lrr,end=" ")
            print(tmp)
            print('totally time is ' ,end = "")
            print(end-start)
            WWW.append(tmp)
WWW

6 800 0.05 0.3350308184883919
totally time is 0:01:56.563457
6 800 0.1 0.3303402501199265
totally time is 0:02:03.939198
6 800 0.15 0.32683184550392264
totally time is 0:02:07.982595
6 800 0.2 0.32766503871163666
totally time is 0:02:08.521636
6 800 0.4 0.33831505484808144
totally time is 0:02:08.446361
6 1200 0.05 0.33193211481385765
totally time is 0:03:17.188579
6 1200 0.1 0.3277392439911786
totally time is 0:03:16.967780
6 1200 0.15 0.32629573384049854
totally time is 0:03:19.211351
6 1200 0.2 0.328235370728801
totally time is 0:03:18.798700
6 1200 0.4 0.3443793738692462
totally time is 0:03:17.496033
6 1600 0.05 0.329758949381
totally time is 0:04:24.511116


In [30]:
import datetime

WWW=[]
depth=[6]
trees=[800]
lr=[500]
for de in depth:
    for tree in trees:
        for lrr in lr:
            start = datetime.datetime.now()
            mm=RandomForestRegressor(n_estimators=lrr,verbose=True)        # 使用多少棵树来拟合，也可以理解为多少次迭代。默认100；
            mm.fit(X_scaled,y_logg)
            YY=mm.predict(test_X_scaled)
            tmp=mean_absolute_percentage_error(test_y,np.exp(YY))
            end = datetime.datetime.now()
            print(de,end=" ")
            print(tree,end=" ")
            print(lrr,end=" ")
            print(tmp)
            print('totally time is ' ,end = "")
            print(end-start)
            WWW.append(tmp)
WWW

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed: 25.4min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


6 800 500 0.3169416656576877
totally time is 0:25:27.676281


[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed:    2.8s finished


[0.3169416656576877]

In [31]:
import datetime

WWW=[]
depth=[6]
trees=[3200]
lr=[0.1,0.15,0.2]
for de in depth:
    for tree in trees:
        for lrr in lr:
            start = datetime.datetime.now()
            mm=XGBRegressor(max_depth=de,n_estimators=tree,learning_rate=lrr)        # 使用多少棵树来拟合，也可以理解为多少次迭代。默认100；
            mm.fit(X_scaled,y_logg)
            YY=mm.predict(test_X_scaled)
            tmp=mean_absolute_percentage_error(test_y,np.exp(YY))
            end = datetime.datetime.now()
            print(de,end=" ")
            print(tree,end=" ")
            print(lrr,end=" ")
            print(tmp)
            print('totally time is ' ,end = "")
            print(end-start)
            WWW.append(tmp)
WWW

6 3200 0.1 0.32946608987565384
totally time is 0:11:16.280694
6 3200 0.15 0.33263571701142064
totally time is 0:11:12.309581
6 3200 0.2 0.34177348246326283
totally time is 0:11:07.716917


[0.32946608987565384, 0.33263571701142064, 0.34177348246326283]

In [39]:
import datetime

WWW=[]
depth=[160]
trees=[160]
lr=[0.1]
for de in depth:
    for tree in trees:
        for lrr in lr:
            start = datetime.datetime.now()
            mm=ExtraTreesRegressor(max_depth=de,n_estimators=tree)        # 使用多少棵树来拟合，也可以理解为多少次迭代。默认100；
            mm.fit(X_scaled,y_logg)
            YY=mm.predict(test_X_scaled)
            tmp=mean_absolute_percentage_error(test_y,np.exp(YY))
            end = datetime.datetime.now()
            print(de,end=" ")
            print(tree,end=" ")
            print(lrr,end=" ")
            print(tmp)
            print('totally time is ' ,end = "")
            print(end-start)
            WWW.append(tmp)
WWW

160 160 0.1 0.33785220544221456
totally time is 0:06:52.035447


[0.33785220544221456]