# 导入依赖库

In [28]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
import numpy as np

# 项目一
使用随机森林预测波士顿房价

In [29]:
# Load the housing dataset from the CSV file in the working directory.
data = pd.read_csv(filepath_or_buffer='./housing.csv')

In [30]:
# Feature matrix: all columns except 'Unnamed: 0' (presumably a saved index column —
# TODO confirm) and the 'PRICE' target.
X = data.drop(['Unnamed: 0', 'PRICE'], axis=1)

In [31]:
# Regression target: the 'PRICE' column.
y = data.loc[:, 'PRICE']

In [32]:
# Hold out 25% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
    test_size=0.25,
)

In [33]:
# Random forest regressor with 50 trees and a fixed seed.
regressor = RandomForestRegressor(
    random_state=0,
    n_estimators=50,
)

In [34]:
# Fit on the training split; the call is the cell's last expression, so its return value is displayed.
regressor.fit(X=X_train, y=y_train)

RandomForestRegressor(n_estimators=50, random_state=0)

In [35]:
# Predict the target for the held-out test rows.
y_pred = regressor.predict(X=X_test)

Error Metrics for Regression
1. Mean Absolute Error
2. Mean Squared Error
3. Root Mean Squared Error

In [36]:
# Evaluate the fitted regressor on the held-out test split.
# `metrics` and `np` are already imported in the notebook's first cell, so they are
# not re-imported here.
mae = metrics.mean_absolute_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)  # computed once and reused for the RMSE

print('Mean Absolute Error:', mae)
print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', np.sqrt(mse))

Mean Absolute Error: 2.561559055118111
Mean Squared Error: 15.767481102362202
Root Mean Squared Error: 3.9708287676960086


## 调参

In [37]:
# Candidate forest sizes (number of trees) to compare.
n_estimators = [
    40,
    50,
    60,
    100,
]

In [38]:
# Sweep over the candidate forest sizes and record the test-set error metrics.
# Each list ends up with one entry per value in `n_estimators`, in the same order.
mean_abs_error = []
mean_squ_error = []
root_mean_squ_error = []
for n in n_estimators:
    # NOTE: `regressor` and `y_pred` are rebound on every iteration, so after the
    # loop they refer to the model / predictions for the last value of `n`.
    regressor = RandomForestRegressor(n_estimators=n, random_state=0)
    regressor.fit(X_train, y_train)
    y_pred = regressor.predict(X_test)

    # Compute the MSE once and derive the RMSE from it instead of re-evaluating the metric.
    mse = metrics.mean_squared_error(y_test, y_pred)
    mean_abs_error.append(metrics.mean_absolute_error(y_test, y_pred))
    mean_squ_error.append(mse)
    root_mean_squ_error.append(np.sqrt(mse))

In [39]:
# Mean absolute error on the test split, one entry per candidate in n_estimators.
print(mean_abs_error)

[2.5239370078740153, 2.561559055118111, 2.5656824146981636, 2.5606062992126]


In [40]:
# Mean squared error on the test split, one entry per candidate in n_estimators.
print(mean_squ_error)

[15.093564862204724, 15.767481102362202, 16.055020013123357, 16.735970952755924]


In [41]:
# Root mean squared error on the test split, one entry per candidate in n_estimators.
print(root_mean_squ_error)

[3.88504374006326, 3.9708287676960086, 4.006871599280835, 4.090962106003419]


在所测试的取值（40、50、60、100）中，n_estimators为40时三项误差指标（MAE、MSE、RMSE）在测试集上均最低，模型表现最好

# Balanced Weights For Imbalanced Classification
项目地址：https://medium.com/grabngoinfo/balanced-weights-for-imbalanced-classification-465f0e13c5ad

In [42]:
from sklearn.datasets import make_classification
from collections import Counter
from sklearn.model_selection import train_test_split, cross_validate, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.utils import class_weight
from sklearn.metrics import classification_report

In [43]:
# Synthetic binary classification problem with two informative features and a
# heavily skewed class prior (requested weights 0.995 / 0.005).
# NOTE(review): the class proportions printed further down (~0.99 / ~0.01) differ from
# the requested weights; presumably this comes from make_classification's default
# label noise (flip_y) — confirm against the scikit-learn documentation.
X, y = make_classification(
    n_samples=100000,
    n_classes=2,
    weights=[0.995, 0.005],
    n_features=2,
    n_informative=2,
    n_redundant=0,
    n_repeated=0,
    n_clusters_per_class=1,
    class_sep=0.5,
    random_state=0,
)


In [44]:
# Put the two feature columns and the label into one frame for inspection.
df = pd.DataFrame(X[:, :2], columns=['feature_1', 'feature_2']).assign(target=y)

In [45]:
# Show the first five rows.
df.head(n=5)

Unnamed: 0,feature_1,feature_2,target
0,1.236497,0.579272,0
1,-0.18872,-0.022001,0
2,-0.653844,0.794309,0
3,0.005572,0.59337,0
4,0.482573,0.796902,0


In [46]:
# Relative frequency of each class label.
df.loc[:, 'target'].value_counts(normalize=True)

0    0.9897
1    0.0103
Name: target, dtype: float64

类别严重不均衡：类别0占比约为99%，类别1占比仅约为1%

In [47]:
# Hold out 20% of the samples for testing, with a fixed seed for reproducibility.
# NOTE(review): the split is not stratified; with a ~1% minority class, passing
# stratify=y would keep the class ratio equal in both splits, but it would also
# change every number reported below, so it is left as-is here.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

## 使用交叉验证建立baseline
class_weight未指定时，默认所有的class的weight都为1

In [48]:
# Baseline: random forest with no class weighting, scored by recall over
# 5 stratified cross-validation folds of the training split.
rf = RandomForestClassifier(n_jobs=-1, random_state=0)
baseline_model_cv = cross_validate(
    rf,
    X_train,
    y_train,
    scoring="recall",
    cv=StratifiedKFold(n_splits=5),
    n_jobs=-1,
)
# Report mean +/- standard deviation of the per-fold recall.
print(f"{baseline_model_cv['test_score'].mean():.3f} +/- {baseline_model_cv['test_score'].std():.3f}")

0.043 +/- 0.016


5折交叉验证的平均recall值为0.043，标准差为0.016

## 计算权重
每个类别的权重为该类别占比的倒数

### 手动计算

In [49]:
# y_train only holds the labels 0 and 1, so summing it counts the 1s.
y_train.sum()

817

In [50]:
# Share of class 1 in the training labels.
y_train.sum() / len(y_train)

0.0102125

In [51]:
# Share of class 0 in the training labels.
(len(y_train) - y_train.sum()) / len(y_train)

0.9897875

In [52]:
# Weight of class 1: the reciprocal of its share.
1 / (y_train.sum() / len(y_train))

97.91921664626683

In [53]:
# Weight of class 0: the reciprocal of its share.
1 / ((len(y_train) - y_train.sum()) / len(y_train))

1.0103178712602452

### 使用 sklearn的内置函数进行计算

In [54]:
# Let scikit-learn compute the 'balanced' weight for each label present in y_train.
sklearn_weights = class_weight.compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_train),
    y=y_train,
)
sklearn_weights

array([ 0.50515894, 48.95960832])

In [55]:
# Multiplying by 2 reproduces the manually computed weights.
# (Presumably the factor is the number of classes, which the 'balanced' heuristic
# divides by — confirm against the scikit-learn documentation.)
2 * sklearn_weights

array([ 1.01031787, 97.91921665])

将class_weight设置为balanced时，sklearn按 n_samples / (n_classes * np.bincount(y)) 计算weight，即上面手动计算的占比倒数再除以类别数（此处为2）

## 将class_weight设置为balanced，再与基线进行比较

In [56]:
# Same cross-validation protocol as the baseline, but with class_weight='balanced'.
rf_balanced = RandomForestClassifier(
    class_weight='balanced',
    n_jobs=-1,
    random_state=0,
)
rf_balanced_cv = cross_validate(
    rf_balanced,
    X_train,
    y_train,
    scoring="recall",
    cv=StratifiedKFold(n_splits=5),
    n_jobs=-1,
)
# Report mean +/- standard deviation of the per-fold recall.
print(f"{rf_balanced_cv['test_score'].mean():.3f} +/- {rf_balanced_cv['test_score'].std():.3f}")

0.032 +/- 0.013


与基线相比，将class_weight设置为balanced时，recall值反而下降（0.032 对比基线的 0.043）

## 将class_weight设置为Balanced Subsample

In [57]:
# Same cross-validation protocol, now with class_weight='balanced_subsample'.
# Note: this rebinds rf_balanced / rf_balanced_cv from the previous experiment.
rf_balanced = RandomForestClassifier(
    class_weight='balanced_subsample',
    n_jobs=-1,
    random_state=0,
)
rf_balanced_cv = cross_validate(
    rf_balanced,
    X_train,
    y_train,
    scoring="recall",
    cv=StratifiedKFold(n_splits=5),
    n_jobs=-1,
)
# Report mean +/- standard deviation of the per-fold recall.
print(f"{rf_balanced_cv['test_score'].mean():.3f} +/- {rf_balanced_cv['test_score'].std():.3f}")

0.037 +/- 0.014


将class_weight设置为balanced_subsample，表现仍然比不上baseline

In [58]:
# Same cross-validation protocol, now with explicit per-class weights:
# class 0 -> 1, class 1 -> 97.
# Note: this rebinds rf_balanced / rf_balanced_cv from the previous experiment.
rf_balanced = RandomForestClassifier(
    class_weight={0: 1, 1: 97},
    n_jobs=-1,
    random_state=0,
)
rf_balanced_cv = cross_validate(
    rf_balanced,
    X_train,
    y_train,
    scoring="recall",
    cv=StratifiedKFold(n_splits=5),
    n_jobs=-1,
)
# Report mean +/- standard deviation of the per-fold recall.
print(f"{rf_balanced_cv['test_score'].mean():.3f} +/- {rf_balanced_cv['test_score'].std():.3f}")

0.032 +/- 0.013
