In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split 
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression 
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import ElasticNet

from sklearn import metrics
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.model_selection import cross_val_score

from scipy import stats

In [2]:
train = pd.read_csv("./data/train.csv")
train_y = train[["SALE PRICE"]]
train_X = train
del train_X["SALE PRICE"]

test_X = pd.read_csv("./data/test.csv")
test_y = pd.read_csv("./data/test_groundtruth.csv")

print("train_X:",train_X.shape)
print("train_y:",train_y.shape)
print("test_X:",test_X.shape)
print("test_y:",test_y.shape)

train_X: (43064, 19)
train_y: (43064, 1)
test_X: (10767, 19)
test_y: (10767, 1)


In [3]:
train.head()

Unnamed: 0,BOROUGH,NEIGHBORHOOD,BUILDING CLASS CATEGORY,TAX CLASS AT PRESENT,BLOCK,LOT,BUILDING CLASS AT PRESENT,ADDRESS,APARTMENT NUMBER,ZIP CODE,RESIDENTIAL UNITS,COMMERCIAL UNITS,TOTAL UNITS,LAND SQUARE FEET,GROSS SQUARE FEET,YEAR BUILT,TAX CLASS AT TIME OF SALE,BUILDING CLASS AT TIME OF SALE,SALE DATE
0,1,UPPER EAST SIDE (79-96),13 CONDOS - ELEVATOR APARTMENTS,2,1569,1027,R4,402 EAST 90TH STREET,5A,10128,1,0,1,-,-,1910,2,R4,2016-09-30 00:00:00
1,4,HOLLISWOOD,10 COOPS - ELEVATOR APARTMENTS,2,10538,70,D4,"87-50 204TH STREET, B42",,11423,0,0,0,-,-,1954,2,D4,2017-02-10 00:00:00
2,3,DOWNTOWN-FULTON MALL,13 CONDOS - ELEVATOR APARTMENTS,2,170,1042,R4,265 STATE STREET,910,11201,1,0,1,0,0,2014,2,R4,2017-01-25 00:00:00
3,4,FLUSHING-NORTH,13 CONDOS - ELEVATOR APARTMENTS,2,4410,1023,R4,137-11 32 AVENUE,4W,11354,1,0,1,-,-,0,2,R4,2017-03-17 00:00:00
4,3,PARK SLOPE,09 COOPS - WALKUP APARTMENTS,2C,1067,29,C6,"862 PRESIDENT STREET, 1",,11215,0,0,0,0,0,1920,2,C6,2016-09-09 00:00:00


In [4]:
num_train_samples = len(train_X)
num_test_samples=len(test_X)
data_X = pd.concat([train_X, test_X])

# Data Pre Processing

In [5]:

# Let's delete some of the columns that we ** may not ** need
# 请注意 下面删除的特征很可能是有用的，合理的处理能够获得更为准确的预测模型，请探索所删除特征的使用
del data_X['ADDRESS']
del data_X['APARTMENT NUMBER']
#del data_X['BLOCK']
#del data_X['LOT']
#del data_X['BUILDING CLASS AT PRESENT']
#del data_X['BUILDING CLASS AT TIME OF SALE']
#del data_X['NEIGHBORHOOD']
#del data_X['SALE DATE']
#del data_X['LAND SQUARE FEET']
#del data_X['GROSS SQUARE FEET']

In [6]:
aa = data_X.isnull().sum()
aa[aa>0].sort_values(ascending=False)  

Series([], dtype: int64)

In [7]:
# Let's convert some of the columns to appropriate datatype

data_X['TAX CLASS AT TIME OF SALE'] = data_X['TAX CLASS AT TIME OF SALE'].astype('category')
data_X['TAX CLASS AT PRESENT'] = data_X['TAX CLASS AT PRESENT'].astype('category')
data_X['BUILDING CLASS AT TIME OF SALE'] = data_X['BUILDING CLASS AT TIME OF SALE'].astype('category')
data_X['BUILDING CLASS AT PRESENT'] = data_X['BUILDING CLASS AT PRESENT'].astype('category')
#data_X['APARTMENT NUMBER'] = data_X['APARTMENT NUMBER'].astype('category')
data_X['BOROUGH'] = data_X['BOROUGH'].astype('category')
data_X['SALE DATE']=data_X['SALE DATE'].astype(str)
data_X['oSALE DATE']=data_X['SALE DATE'].astype(str)
data_X.replace(' -  ',0,inplace=True)
data_X['GROSS SQUARE FEET']=data_X['GROSS SQUARE FEET'].astype("int64")
data_X['LAND SQUARE FEET']=data_X['LAND SQUARE FEET'].astype("int64")

In [8]:
#data_X["oTAX CLASS AT TIME OF SALE"]=data_X["TAX CLASS AT TIME OF SALE"].map({1:58,2:68,4:99})
data_X["oTAX CLASS AT PRESENT"]=data_X["TAX CLASS AT PRESENT"].map({"1":59,"1A":40,"1B":40,"1C":95,"2":63,"2A":110,"2B":200,"2C":85,"4":95," ":95})

In [9]:
data_X_train=data_X[:num_train_samples]
data_X_test=data_X[num_train_samples:]
for i in range(num_train_samples):
    tmp=data_X_train['SALE DATE'][i].split('-')
    data_X_train['oSALE DATE'][i]=(int(tmp[0])-2016)*12+int(tmp[1])
    #data_X_train['ADDRESS'][i]=data_X_train["ADDRESS"][i].split(',')[0]
for i in range ( num_test_samples ):
    tmp=data_X_test['SALE DATE'][i].split('-')
    data_X_test['oSALE DATE'][i]=(int(tmp[0])-2016)*12+int(tmp[1])
    #data_X_test['ADDRESS'][i]=data_X_test["ADDRESS"][i].split(',')[0]
data_X = pd.concat([data_X_train, data_X_test])

In [10]:
data_X['NEIGHBORHOOD'] = data_X['NEIGHBORHOOD'].astype('category')

In [11]:
data_X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 53831 entries, 0 to 10766
Data columns (total 19 columns):
 #   Column                          Non-Null Count  Dtype   
---  ------                          --------------  -----   
 0   BOROUGH                         53831 non-null  category
 1   NEIGHBORHOOD                    53831 non-null  category
 2   BUILDING CLASS CATEGORY         53831 non-null  object  
 3   TAX CLASS AT PRESENT            53831 non-null  category
 4   BLOCK                           53831 non-null  int64   
 5   LOT                             53831 non-null  int64   
 6   BUILDING CLASS AT PRESENT       53831 non-null  category
 7   ZIP CODE                        53831 non-null  int64   
 8   RESIDENTIAL UNITS               53831 non-null  int64   
 9   COMMERCIAL UNITS                53831 non-null  int64   
 10  TOTAL UNITS                     53831 non-null  int64   
 11  LAND SQUARE FEET                53831 non-null  int64   
 12  GROSS SQUARE FEET 

In [12]:
data_X.head()

Unnamed: 0,BOROUGH,NEIGHBORHOOD,BUILDING CLASS CATEGORY,TAX CLASS AT PRESENT,BLOCK,LOT,BUILDING CLASS AT PRESENT,ZIP CODE,RESIDENTIAL UNITS,COMMERCIAL UNITS,TOTAL UNITS,LAND SQUARE FEET,GROSS SQUARE FEET,YEAR BUILT,TAX CLASS AT TIME OF SALE,BUILDING CLASS AT TIME OF SALE,SALE DATE,oSALE DATE,oTAX CLASS AT PRESENT
0,1,UPPER EAST SIDE (79-96),13 CONDOS - ELEVATOR APARTMENTS,2,1569,1027,R4,10128,1,0,1,0,0,1910,2,R4,2016-09-30 00:00:00,9,63
1,4,HOLLISWOOD,10 COOPS - ELEVATOR APARTMENTS,2,10538,70,D4,11423,0,0,0,0,0,1954,2,D4,2017-02-10 00:00:00,14,63
2,3,DOWNTOWN-FULTON MALL,13 CONDOS - ELEVATOR APARTMENTS,2,170,1042,R4,11201,1,0,1,0,0,2014,2,R4,2017-01-25 00:00:00,13,63
3,4,FLUSHING-NORTH,13 CONDOS - ELEVATOR APARTMENTS,2,4410,1023,R4,11354,1,0,1,0,0,0,2,R4,2017-03-17 00:00:00,15,63
4,3,PARK SLOPE,09 COOPS - WALKUP APARTMENTS,2C,1067,29,C6,11215,0,0,0,0,0,1920,2,C6,2016-09-09 00:00:00,9,85


In [13]:
#Select the variables to be one-hot encoded
one_hot_features = ['BOROUGH', 'BUILDING CLASS CATEGORY','TAX CLASS AT PRESENT','TAX CLASS AT TIME OF SALE','BUILDING CLASS AT PRESENT','BUILDING CLASS AT TIME OF SALE','SALE DATE','NEIGHBORHOOD']

In [14]:
# Convert categorical variables into dummy/indicator variables (i.e. one-hot encoding).
one_hot_encoded = pd.get_dummies(data_X[one_hot_features])
one_hot_encoded.info(verbose=True, memory_usage=True, null_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 53831 entries, 0 to 10766
Data columns (total 866 columns):
 #   Column                                                                Non-Null Count  Dtype
---  ------                                                                --------------  -----
 0   BOROUGH_1                                                             53831 non-null  uint8
 1   BOROUGH_2                                                             53831 non-null  uint8
 2   BOROUGH_3                                                             53831 non-null  uint8
 3   BOROUGH_4                                                             53831 non-null  uint8
 4   BOROUGH_5                                                             53831 non-null  uint8
 5   BUILDING CLASS CATEGORY_01 ONE FAMILY DWELLINGS                       53831 non-null  uint8
 6   BUILDING CLASS CATEGORY_02 TWO FAMILY DWELLINGS                       53831 non-null  uint8
 7   BUILDING CLA

In [15]:
data_X = data_X.drop(one_hot_features,axis=1)
data_X = pd.concat([data_X, one_hot_encoded] ,axis=1)

In [16]:
data_X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 53831 entries, 0 to 10766
Columns: 877 entries, BLOCK to NEIGHBORHOOD_WYCKOFF HEIGHTS
dtypes: int64(10), object(1), uint8(866)
memory usage: 49.4+ MB


In [17]:
train_X = data_X[:num_train_samples].to_numpy()
test_X = data_X[num_train_samples:].to_numpy()

# Regression

In [18]:
'''不做任何处理
rf_regr = RandomForestRegressor(n_jobs=64)
rf_regr.fit(train_X, train_y)
Y_pred_rf = rf_regr.predict(test_X)

# MAPE metric
mean_absolute_percentage_error(test_y,Y_pred_rf)
'''

'不做任何处理\nrf_regr = RandomForestRegressor(n_jobs=64)\nrf_regr.fit(train_X, train_y)\nY_pred_rf = rf_regr.predict(test_X)\n\n# MAPE metric\nmean_absolute_percentage_error(test_y,Y_pred_rf)\n'

In [19]:
from sklearn.preprocessing import RobustScaler, StandardScaler
scaler = RobustScaler()  # 对数据进行缩放
n_train=train.shape[0] #获得有多少条训练数据
#del data_X["NEIGHBORHOOD"]
#del data_X["ADDRESS"]
#del data_X["SALE DATE"]
X = data_X[:n_train] # 获得训练数据
test_X = data_X[n_train:] # 获得测试数据
y= test_y

X_scaled = scaler.fit(X).transform(X)
y_log = np.log(y) # 对于 y 进行人为定义log 进行缩放 
test_X_scaled = scaler.transform(test_X)

In [20]:
y_logg=np.log(train_y)

In [21]:
'''只做变形处理
rf_regr = RandomForestRegressor(verbose=True,n_jobs=64)
rf_regr.fit(X_scaled,y_logg)
Y_pred_rf = rf_regr.predict(test_X_scaled)

# MAPE metric
mean_absolute_percentage_error(test_y,np.exp(Y_pred_rf))
'''

'只做变形处理\nrf_regr = RandomForestRegressor(verbose=True,n_jobs=64)\nrf_regr.fit(X_scaled,y_logg)\nY_pred_rf = rf_regr.predict(test_X_scaled)\n\n# MAPE metric\nmean_absolute_percentage_error(test_y,np.exp(Y_pred_rf))\n'

In [22]:
from sklearn.model_selection import cross_val_score, GridSearchCV, KFold
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.svm import SVR, LinearSVR
from sklearn.linear_model import ElasticNet, SGDRegressor, BayesianRidge
from sklearn.kernel_ridge import KernelRidge
from xgboost import XGBRegressor

In [23]:
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline, make_pipeline
from scipy.stats import skew
from sklearn.decomposition import PCA, KernelPCA
from sklearn.impute import SimpleImputer as Imputer

In [24]:
class AverageWeight(BaseEstimator, RegressorMixin):
    def __init__(self,mod,weight):
        self.mod = mod
        self.weight = weight
        self.nums=len(mod)
    def fit(self,X,y):
        self.models_ = [clone(x) for x in self.mod]
        for model in self.models_:
            model.fit(X,y)
            print("完成！")
        return self
    
    def predict(self,X,models=[],weight=[]):
        if(len(models)==0):
            models=self.mod
        if(len(weight)==0):
            weight=self.weight
        u = X.shape[0]
        w=[]
        for i in range(u):
            w.append(0)
        for i in models:
            print("基础")
            tmp=self.models_[i].predict(X)*weight[i]
            for j in range(u):
                w[j]=w[j]+tmp[j]
        #pred = np.array([model.predict(X) for model in self.models_])
        # for every data point, single model prediction times weight, then add them together
        #for data in range(pred.shape[1]):
         #   single = [pred[model,data]*weight for model,weight in zip(range(pred.shape[0]),self.weight)]
          #  w.append(np.sum(single))
        return w

In [25]:
RF = RandomForestRegressor(n_estimators=600,n_jobs=200,verbose=False)
Extra =  ExtraTreesRegressor(max_depth=160,n_estimators=160,n_jobs=200) 
Xgb = XGBRegressor(max_depth=6,n_estimators=1600,learning_rate=0.16)

In [26]:
w1 = 0.15
w2 = 0.25
w3 = 0.6

In [27]:
weight_avg = AverageWeight(mod = [Extra,Xgb, RF],weight=[w1,w2,w3])
weight_avg.fit(X_scaled,y_logg)

完成！
完成！
完成！


AverageWeight(mod=[ExtraTreesRegressor(max_depth=160, n_estimators=160,
                                       n_jobs=200),
                   XGBRegressor(base_score=None, booster=None,
                                colsample_bylevel=None, colsample_bynode=None,
                                colsample_bytree=None, enable_categorical=False,
                                gamma=None, gpu_id=None, importance_type=None,
                                interaction_constraints=None,
                                learning_rate=0.16, max_delta_step=None,
                                max_depth=6, min_child_weight=None, missing=nan,
                                monotone_constraints=None, n_estimators=1600,
                                n_jobs=None, num_parallel_tree=None,
                                predictor=None, random_state=None,
                                reg_alpha=None, reg_lambda=None,
                                scale_pos_weight=None, subsample=None,
        

In [28]:
def predict(Self,X,models,weight,special=0):
    Self.weight = weight
    u = X.shape[0]
    w=[]
    for i in range(u):
        w.append(0)
    for i in range(len(models)):
        #print("基础")
        if special==0:
            tmp=Self.models_[models[i]].predict(X)*weight[i]
            #print(type(tmp))
            for j in range(u):
                w[j]=w[j]+tmp[j]
        else:
            tmp=Self.models_[models[i]].predict(X)*1
            #print(type(tmp))
            for j in range(u):
                if w[j]>tmp[j] or i == 0:
                    w[j]=tmp[j]
        for j in range(u):
            w[j]=float(w[j])
        #pred = np.array([model.predict(X) for model in self.models_])
        # for every data point, single model prediction times weight, then add them together
        #for data in range(pred.shape[1]):
         #   single = [pred[model,data]*weight for model,weight in zip(range(pred.shape[0]),self.weight)]
          #  w.append(np.sum(single))
    return w

In [29]:
W=[]
for i in range(weight_avg.nums):
            print("基础")
            tmp=weight_avg.models_[i].predict(test_X_scaled)
            W.append(mean_absolute_percentage_error(test_y,np.exp(tmp)))
W

基础
基础
基础


[0.33844527590997797, 0.3271116616335088, 0.31692621196099924]

In [30]:
Y_pred_rf = predict(weight_avg,test_X_scaled,[0,1,2],[w1,w2,w3],special=1)  
print(mean_absolute_percentage_error(test_y,np.exp(Y_pred_rf)))

0.3034559640754448


In [47]:
import sys
#sys.setdefaultencoding("utf8")
print(type(Y_pred_rf))
print(Y_pred_rf)
pd.DataFrame({"pred":Y_pred_rf}).to_csv("1.csv",encoding="utf8")

<class 'list'>
[12.836150169372559, 14.695767698654471, 13.009067007905875, 12.784393196768411, 13.328808384582908, 13.366321381333133, 12.976964950561523, 12.233906686349112, 14.08782218357431, 12.829177667421028, 14.081542567019415, 13.27489185333252, 13.754205703735352, 12.957970067523936, 13.95553849792952, 13.780826727343229, 13.109390465056151, 12.864083290100098, 14.631561429033178, 13.838311699391777, 12.80798625946045, 12.84852960100788, 12.589336154150901, 12.85194630278761, 13.824828147888184, 14.527101971014238, 12.805231673479039, 12.852903366088867, 13.732353376007191, 13.152005195617676, 13.380142211914062, 13.440589904785156, 13.463309039134224, 13.638387680053711, 13.001553535461426, 14.026462725494175, 12.018844645935491, 13.0335054397583, 13.850131034851074, 12.70158863067627, 12.344858169555664, 12.27520214611982, 13.46543025970459, 13.88353599700049, 13.868197441101074, 14.119567141610036, 13.962686538696289, 13.771711349487305, 13.418049873643028, 12.9876804351806