In [85]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import base
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import LocalOutlierFactor
from sklearn.model_selection import train_test_split

%matplotlib inline

train = pd.read_csv("/content/drive/My Drive/遊び/Re不動産価格推定/train_data.csv")
test = pd.read_csv("/content/drive/My Drive/遊び/Re不動産価格推定/test_data.csv")
train.head()

Unnamed: 0,id,種類,地域,市区町村コード,都道府県名,市区町村名,地区名,最寄駅：名称,最寄駅：距離（分）,間取り,面積（㎡）,土地の形状,間口,延床面積（㎡）,建築年,建物の構造,用途,今後の利用目的,前面道路：方位,前面道路：種類,前面道路：幅員（ｍ）,都市計画,建ぺい率（％）,容積率（％）,取引時点,改装,取引の事情等,y
0,1,中古マンション等,,13101,東京都,千代田区,飯田橋,飯田橋,1,２ＬＤＫ,55,,,,昭和59年,ＳＲＣ,,住宅,,,,商業地域,80.0,600.0,2017年第４四半期,改装済,,66.0
1,2,中古マンション等,,13101,東京都,千代田区,飯田橋,飯田橋,5,１Ｋ,20,,,,平成15年,ＲＣ,,住宅,,,,商業地域,80.0,500.0,2017年第４四半期,未改装,,19.0
2,3,中古マンション等,,13101,東京都,千代田区,飯田橋,飯田橋,3,１ＬＤＫ,45,,,,平成24年,ＲＣ,住宅,その他,,,,商業地域,80.0,500.0,2017年第３四半期,未改装,,37.0
3,4,中古マンション等,,13101,東京都,千代田区,飯田橋,飯田橋,5,１Ｒ,20,,,,平成15年,ＲＣ,住宅,住宅,,,,商業地域,80.0,500.0,2017年第３四半期,未改装,,18.0
4,5,宅地(土地と建物),商業地,13101,東京都,千代田区,飯田橋,飯田橋,3,,80,ほぼ台形,6.8,330.0,昭和61年,ＲＣ,住宅、事務所、店舗,事務所,南西,区道,8.0,商業地域,80.0,500.0,2017年第２四半期,,,240.0


In [86]:
class KFoldTargetEncoderTrain(base.BaseEstimator, base.TransformerMixin):
  def __init__(self, colnames, targetName, n_fold=5, verbosity=True, discardOriginal_col=False):
    self.colnames = colnames
    self.targetName = targetName
    self.n_fold = n_fold
    self.verbosity = verbosity
    self.discardOriginal_col = discardOriginal_col
  
  # モデルを返す
  def fit(self, X, y=None):
    return self
  
  def transform(self, X):
    assert(type(self.targetName) == str)
    assert(type(self.colnames) == str)
    assert(self.colnames in X.columns)
    assert(self.targetName in X.columns)

    mean_of_target = X[self.targetName].mean()
    kf = KFold(n_splits = self.n_fold, shuffle = False, random_state=2019)
    col_mean_name = self.colnames + '_' + 'KFold_Target_Enc'
    X[col_mean_name] = np.nan

    for tr_ind, val_ind in kf.split(X):
      X_tr, X_val = X.iloc[tr_ind],X.iloc[val_ind]
      X.loc[X.index[val_ind], col_mean_name] = X_val[self.colnames].map(X_tr.groupby(self.colnames)[self.targetName].mean())
      X[col_mean_name].fillna(mean_of_target, inplace = True)  # nanになってしまったところは平均値で埋める --(1)

    if self.verbosity:            
      encoded_feature = X[col_mean_name].values
      print('Correlation between the new feature, {} and, {} is {}.'.format(col_mean_name,self.targetName, 
                                                                                  np.corrcoef(X[self.targetName].values,encoded_feature)[0][1]))
    if self.discardOriginal_col:
      X = X.drop(self.targetName, axis=1)
    return X
class TargetEncoderTest(base.BaseEstimator, base.TransformerMixin):
  """How to use.
  test_targetc = TargetEncoderTest(new_train,
                                   'Feature',
                                    'Feature_Kfold_Target_Enc')
  new_test = test_targetc.fit_transform(test)
  """

  def __init__(self,train,colNames,encodedName):

      self.train = train
      self.colNames = colNames
      self.encodedName = encodedName

  def fit(self, X, y=None):
      return self

  def transform(self,X):       
      mean =  self.train[[self.colNames, self.encodedName]].groupby(self.colNames).mean().reset_index() 

      dd = {}
      for index, row in mean.iterrows():
          dd[row[self.colNames]] = row[self.encodedName]
          X[self.encodedName] = X[self.colNames]
      X = X.replace({self.encodedName: dd})
      return X

In [89]:
def preprocessor(train,test):
  # カテゴリ変数を数値に変換

  # 日本語変数名を英語変数名に
  En_namelist = ['id', 'build_type', 'area', 'area_code', 'prefectures', 'city', 'district', 'station_name', 'station_dist', 'floor_plan', 'floor', 'shape', 'frontage', 'total_floor', 'age', 'structure',
                'usage', 'purpose', 'road_azimuth', 'road_type', 'road_width', 'cityplan', 'coverage', 'FloorArea_ratio', 'treadtime', 'renovation', 'tread_circumstance', 'y']
  Ja_namelist = list(train.columns)
  d = dict(zip(Ja_namelist, En_namelist))
  train = train.rename(columns=d)
  del d["y"]
  test = test.rename(columns=d)

  # いらない変数をドロップ
  # 'id'
  train = train.drop('id',axis=1)
  test = test.drop('id',axis=1)
  

  # kFoldTargetEncoding
  df = pd.concat([X_train, test])

  cat_cols = list(train.columns[train.dtypes==object])
  for colname in cat_cols:
    targetc = KFoldTargetEncoderTrain(colname,'y',n_fold=5)
    train = targetc.fit_transform(train)
    test_targetc = TargetEncoderTest(train, colname, '{}_KFold_Target_Enc'.format(colname))
    test = test_targetc.fit_transform(test)

  X_train = train.drop(cat_cols, axis=1)
  X_train = X_train.drop("y", axis=1)
  y_train = train['y']
  test = test.drop(cat_cols, axis=1)
  print(test)

  # 標準化
  scaler = StandardScaler()
  scaler.fit(X_train)
  scaler.transform(X_train)
  X_train = pd.DataFrame(scaler.transform(X_train), columns=X_train.columns)
  scaler = StandardScaler()
  scaler.fit(test)
  scaler.transform(test)
  test = pd.DataFrame(scaler.transform(test), columns=test.columns)

  return X_train, y_train, test

In [90]:
X_train,y_train,test = preprocessor(train,test)



Correlation between the new feature, build_type_KFold_Target_Enc and, y is 0.07706653891425425.
Correlation between the new feature, area_KFold_Target_Enc and, y is 0.1859880841612339.
Correlation between the new feature, prefectures_KFold_Target_Enc and, y is -0.04705246355958163.
Correlation between the new feature, city_KFold_Target_Enc and, y is 0.09510476850601315.
Correlation between the new feature, district_KFold_Target_Enc and, y is 0.12453392325509882.
Correlation between the new feature, station_name_KFold_Target_Enc and, y is 0.1317937802551869.
Correlation between the new feature, station_dist_KFold_Target_Enc and, y is 0.05039018938212178.
Correlation between the new feature, floor_plan_KFold_Target_Enc and, y is 0.0960623039836554.
Correlation between the new feature, floor_KFold_Target_Enc and, y is 0.33137522003129627.
Correlation between the new feature, shape_KFold_Target_Enc and, y is 0.07256676865510782.
Correlation between the new feature, frontage_KFold_Target_En

ValueError: ignored

In [None]:
sns.pairplot(train2)

In [74]:
X_train

StandardScaler(copy=       build_type  ... tread_circumstance_KFold_Target_Enc
0        中古マンション等  ...                           65.434766
1        中古マンション等  ...                           65.434766
2        中古マンション等  ...                           65.434766
3        中古マンション等  ...                           65.434766
4       宅地(土地と建物)  ...                           65.434766
...           ...  ...                                 ...
356339     宅地(土地)  ...                           65.434766
356340     宅地(土地)  ...                           65.434766
356341     宅地(土地)  ...                          109.249681
356342     宅地(土地)  ...                          109.249681
356343  宅地(土地と建物)  ...                           65.434766

[356344 rows x 48 columns],
               with_mean=True, with_std=True)

In [None]:
def LOF(dfs):
  clf = LocalOutlierFactor(n_neighbors=8, contamination=0.01)
  pred = clf.fit_predict(dfs)

In [None]:
def LGB():
  