In [1]:
import time
import numpy as np
import pandas as pd
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score #评价指标

In [2]:
# Load the dataset and hold out 20% of the rows as a test set.
# NOTE(review): data_path is an empty string here — point it at the CSV file
# before running this cell.
data_path = ''
df = pd.read_csv(data_path)

# 'area' is the prediction target; every remaining column goes into X.
y = df[['area']]  # double brackets keep y as a one-column DataFrame
X = df.drop(columns='area')

# Fixed random_state so the split is reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1212,
)

In [3]:
# Predictor column names, grouped into the two subsets that are trained on
# separately further down.
human_features = [
    'build_c3s_MEAN', 'build_c3s_STD',
    'Light_MEAN', 'Light_STD',
    'OSM_length',
    'POP_MEAN', 'POP_STD',
]
natural_features = [
    'DEM_MEAN', 'DEM_STD',
    'Slope_MEAN', 'Slope_STD',
    'tree_c3s_MEAN', 'tree_c3s_STD',
]
# Both groups together, in case-insensitive alphabetical order. Column order is
# kept deterministic on purpose: it is the order the model sees the features in.
features_x_lonlat = sorted(human_features + natural_features, key=str.lower)

# Training

In [4]:
def train_rf_regressor(X_train, y_train, save_path):
    """Fit a random-forest regressor and persist it with joblib.

    Parameters
    ----------
    X_train
        Feature matrix, passed unchanged to ``RandomForestRegressor.fit``.
    y_train
        Target container exposing ``.values.ravel()`` (e.g. a one-column
        DataFrame or a Series); it is flattened to 1-D before fitting.
    save_path
        Destination handed to ``joblib.dump`` for the fitted model.

    Returns
    -------
    RandomForestRegressor
        The fitted estimator, so callers can use it directly instead of
        reloading it from ``save_path``.
    """
    # Hyperparameters and seed are fixed so repeated runs give the same forest.
    rf_regressor = RandomForestRegressor(
        max_depth=50, min_samples_leaf=5, min_samples_split=10,
        n_estimators=500, n_jobs=-1, random_state=1212
    )
    rf_regressor.fit(X_train, y_train.values.ravel())
    joblib.dump(rf_regressor, save_path)
    return rf_regressor
    
def train_rf_classifier(X_train, y_train, save_path):
    """Fit a random-forest classifier and persist it with joblib.

    Parameters
    ----------
    X_train
        Feature matrix, passed unchanged to ``RandomForestClassifier.fit``.
    y_train
        Label container exposing ``.values.ravel()`` (e.g. a Series or a
        one-column DataFrame); it is flattened to 1-D before fitting.
    save_path
        Destination handed to ``joblib.dump`` for the fitted model.

    Returns
    -------
    RandomForestClassifier
        The fitted estimator, so callers can use it directly instead of
        reloading it from ``save_path``.
    """
    # Hyperparameters and seed are fixed so repeated runs give the same forest.
    rf_classifier = RandomForestClassifier(
        max_depth=50, min_samples_leaf=5, min_samples_split=10,
        n_estimators=500, n_jobs=-1, random_state=1212
    )
    rf_classifier.fit(X_train, y_train.values.ravel())
    joblib.dump(rf_classifier, save_path)
    return rf_classifier

In [6]:
#### Naive random forest on each feature subset, trained in this order:
####   1. human features
####   2. natural features
####   3. features without xy (human + natural combined)
# NOTE(review): every call passes an empty save path — supply a distinct file
# path per model so the fitted forests can actually be written and kept apart.
for feature_subset in (human_features, natural_features, features_x_lonlat):
    train_rf_regressor(X_train[feature_subset], y_train, '')

In [7]:
#### Naive random forest
# Uses every column of X_train except CONTINENT.
X_train_ = X_train.drop(columns=['CONTINENT'])
train_rf_regressor(X_train_, y_train, '')

#### Classification & regression
# Two-stage setup: a classifier trained on all rows with a 0/1 label
# (area != 0), and a regressor trained only on rows whose area is > 0.
# NOTE(review): the regressor filter uses `> 0` while the label uses `!= 0`;
# the two only agree if 'area' is never negative or NaN — confirm.
train_area = y_train['area']
train_area_positive = train_area > 0

reg_X_train = X_train_[train_area_positive]
reg_y_train = y_train[train_area_positive]
cls_X_train = X_train_
cls_y_train = train_area.ne(0).astype(int)

train_rf_classifier(cls_X_train, cls_y_train, '')
train_rf_regressor(reg_X_train, reg_y_train, '')

In [8]:
#### Partition
# Train the same three models (naive regressor, zero/non-zero classifier,
# positive-area regressor) separately on the training rows of each continent.
# NOTE(review): all save paths are empty strings — each continent/model pair
# needs its own path, otherwise nothing usable is persisted.
continents = ['Europe', 'North America', 'Asia', 'Africa', 'South America', 'Oceania']
for cont in tqdm(continents):
    # Row mask for this continent, shared by features and target.
    in_continent = X_train['CONTINENT'] == cont
    X_train_ = X_train[in_continent].drop(columns=['CONTINENT'])
    y_train_ = y_train[in_continent]

    # Regressor subset: rows with area > 0. Classifier label: 1 if area != 0.
    continent_area = y_train_['area']
    continent_area_positive = continent_area > 0
    reg_X_train = X_train_[continent_area_positive]
    reg_y_train = y_train_[continent_area_positive]
    cls_X_train = X_train_
    cls_y_train = continent_area.ne(0).astype(int)

    train_rf_regressor(X_train_, y_train_, '')
    train_rf_classifier(cls_X_train, cls_y_train, '')
    train_rf_regressor(reg_X_train, reg_y_train, '')

  0%|          | 0/6 [00:00<?, ?it/s]