In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

import matplotlib.pyplot as plt

In [2]:
dataset = pd.read_csv('../Data/dataset.csv')

dataset.head(3)

Unnamed: 0,B0043GG,B0043HH,B0043II,Protein,Leukocyte,SG,Bilirubin,Acidity,Nitrite,Urobilinogen,...,Urin_color_Dk.Yellow,Urin_color_Lt. Yellow,Urin_color_Lt.Orange,Urin_color_Orange,Urin_color_Other,Urin_color_Red,Urin_color_Reddish,Urin_color_Straw,Urin_color_Yellow,Urin_color_yellow
0,9.3,251.5,4.3,1,119,1.02,0.1,5.0,103842,0.7,...,0,0,0,0,0,0,0,0,1,0
1,5.1,548.3,9.1,4,106,1.02,0.0,5.0,108098,0.7,...,0,0,0,0,0,0,0,0,1,0
2,12.2,1590.0,23.5,6,657,1.016,0.1,5.0,27515,0.7,...,0,0,0,0,0,0,0,0,1,0


In [3]:
dataset.corr()['Occult_blood'].abs().sort_values(ascending=False)

Occult_blood             1.000000
B0043GG                  0.227341
B0043HH                  0.193826
Leukocyte                0.183746
Urin_color_Yellow        0.179217
SG                       0.149275
B0043II                  0.133286
Urobilinogen             0.131899
Urin_color_Red           0.123134
Ketone_body              0.118204
Protein                  0.103824
Urin_color_Orange        0.101566
Acidity                  0.096756
Urin_color_Dk.Yellow     0.081907
Urin_color_Reddish       0.081403
Glucose                  0.076694
Urin_color_Lt.Orange     0.045071
Urin_color_yellow        0.028532
Nitrite                  0.027737
Urin_color_DK.Orange     0.026125
Urin_color_Dk.Red        0.014742
Urin_color_Other         0.007839
Bilirubin                0.007515
Urin_color_Lt. Yellow    0.006680
Urin_color_DK.Yellow     0.006448
Urin_color_Brown         0.005380
Urin_color_Straw         0.005165
Urin_color_Dk.Orange     0.004353
Urin_color_Dk. Yellow    0.002204
Urin_color_Amb

In [5]:
# Selecing columns

dataset = dataset[['B0043GG', 'B0043HH', 'Leukocyte', 'Urin_color_Yellow', 'SG',
                   'B0043II', 'Urobilinogen', 'Urin_color_Red', 'Ketone_body',
                   'Protein', 'Urin_color_Orange', 'Occult_blood']]

dataset.head(3)

Unnamed: 0,B0043GG,B0043HH,Leukocyte,Urin_color_Yellow,SG,B0043II,Urobilinogen,Urin_color_Red,Ketone_body,Protein,Urin_color_Orange,Occult_blood
0,9.3,251.5,119,1,1.02,4.3,0.7,0,1,1,0,3
1,5.1,548.3,106,1,1.02,9.1,0.7,0,1,4,0,2
2,12.2,1590.0,657,1,1.016,23.5,0.7,0,3,6,0,40


In [6]:
dataset.shape

(74337, 12)

In [7]:
X = dataset.iloc[:, :-1]
y = dataset.iloc[:, -1]

X.shape, y.shape

((74337, 11), (74337,))

# Train, Test Split

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2022)

X_train.shape, X_test.shape

((59469, 11), (14868, 11))

In [13]:
# X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=2022)

# X_train.shape, X_val.shape

((47575, 3), (11894, 3))

# Data Scaling

In [10]:
from sklearn.preprocessing import StandardScaler

In [11]:
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
# X_val = ss.transform(X_val)
X_test = ss.transform(X_test)

In [12]:
X_train[:5, :]

array([[-0.18844861, -0.25766079, -0.53870365,  0.26455711, -0.59688789,
        -0.34908997, -0.28070469, -0.13197082, -0.52379031, -0.4568329 ,
        -0.12375171],
       [-0.13281085, -0.25494616, -0.48245593,  0.26455711, -0.34512929,
        -0.19377343, -0.11140888, -0.13197082, -0.55737155, -0.44443037,
        -0.12375171],
       [-0.18133555, -0.25781403, -0.55628106,  0.26455711, -1.60392227,
        -0.3678028 ,  0.05788693, -0.13197082, -0.49020907, -0.48411847,
        -0.12375171],
       [-0.17867221, -0.25744186, -0.48245593,  0.26455711,  0.91366369,
        -0.17880316, -0.11140888, -0.13197082,  1.96122139, -0.44443037,
        -0.12375171],
       [ 0.15541414, -0.22661775, -0.40863081, -3.7799022 , -0.84864648,
        -0.30230788, -0.22427276,  7.57743263, -0.55737155,  0.03926827,
        -0.12375171]])

# Modeling

## 1. RandomForestRegressor

In [13]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, KFold

In [14]:
params = {
    'n_estimators' : range(100, 200, 5),
    'verbose' : [True]
}

In [15]:
rfr = RandomForestRegressor(n_jobs=8, random_state=2022)

In [20]:
# 0값 확인
# 0값 존재 -> MAPE 사용 불가 (실제 y값으로 나누기 때문(y > 0 필수))

len(dataset.loc[dataset.Occult_blood == 0])

2284

In [21]:
kfold = KFold(5, shuffle=True, random_state=2022)

### Learning

In [22]:
grid = GridSearchCV(rfr, params, n_jobs=8, cv=kfold, refit=True)
grid.fit(X_train, y_train)

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    1.0s
[Parallel(n_jobs=8)]: Done 195 out of 195 | elapsed:    5.7s finished


GridSearchCV(cv=KFold(n_splits=5, random_state=2022, shuffle=True),
             estimator=RandomForestRegressor(n_jobs=8, random_state=2022),
             n_jobs=8,
             param_grid={'n_estimators': range(100, 200, 5), 'verbose': [True]})

In [23]:
grid.best_params_

{'n_estimators': 195, 'verbose': True}

In [24]:
y_pred = grid.best_estimator_.predict(X_test)

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 195 out of 195 | elapsed:    0.0s finished


### Evaluating

In [25]:
from sklearn.metrics import mean_absolute_percentage_error, mean_absolute_error, mean_squared_error

In [26]:
mean_absolute_percentage_error(y_test, y_pred)

1610411646391085.2

In [27]:
mean_absolute_error(y_test, y_pred)

120.73357571426095

In [28]:
np.sqrt(mean_squared_error(y_test, y_pred))

183.56210799219392

### Comparising y_pred, y_result

In [29]:
comparison = pd.DataFrame({'prediction' : y_pred, 'result' : y_test.values.ravel()})

In [30]:
comparison

Unnamed: 0,prediction,result
0,208.557699,212
1,273.288200,408
2,259.903433,490
3,243.458094,4
4,2.041026,2
...,...,...
14863,292.086639,131
14864,208.636825,606
14865,9.621538,38
14866,290.740904,536


### Saving Model

In [31]:
import pickle

In [34]:
saved_model = pickle.dump(grid.best_estimator_, open('../Model/{0}_RFT_n_{1}.pkl' \
    .format(dataset.columns[-1], grid.best_params_.get('n_estimators')), 'wb'),
                          protocol=pickle.HIGHEST_PROTOCOL)

### Loading Model

In [35]:
loaded_model = pickle.load(open('../Model/{0}_RFT_n_{1}.pkl' \
    .format(dataset.columns[-1], grid.best_params_.get('n_estimators')), 'rb'))

In [36]:
y_pred = loaded_model.predict(X_test)

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 195 out of 195 | elapsed:    0.0s finished


In [38]:
mean_absolute_error(y_test, y_pred)

120.73357571426095

In [39]:
np.sqrt(mean_squared_error(y_test, y_pred))

183.56210799219392