In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

import matplotlib.pyplot as plt

In [3]:
dataset = pd.read_csv('../Data/dataset.csv')
dataset = dataset[['Protein', 'Leukocyte', 'SG', 'Bilirubin', 'Acidity', 'Nitrite',
                   'Urobilinogen','Occult_blood', 'Ketone_body', 'Glucose']]

dataset.head(3)

Unnamed: 0,Protein,Leukocyte,SG,Bilirubin,Acidity,Nitrite,Urobilinogen,Occult_blood,Ketone_body,Glucose
0,1,119,1.02,0.1,5.0,103842,0.7,3,1,16
1,4,106,1.02,0.0,5.0,108098,0.7,2,1,65
2,6,657,1.016,0.1,5.0,27515,0.7,40,3,79


In [4]:
dataset.corr()['Nitrite'].abs().sort_values(ascending=False)

Nitrite         1.000000
Bilirubin       0.082774
Urobilinogen    0.049850
Acidity         0.034194
Occult_blood    0.027737
SG              0.020128
Leukocyte       0.018149
Protein         0.014989
Ketone_body     0.012623
Glucose         0.004171
Name: Nitrite, dtype: float64

In [5]:
# Selecing columns

dataset = dataset[['Protein', 'Leukocyte', 'SG', 'Bilirubin', 'Acidity', 'Urobilinogen',
                   'Occult_blood', 'Ketone_body', 'Glucose', 'Nitrite']]

dataset.head(3)

Unnamed: 0,Protein,Leukocyte,SG,Bilirubin,Acidity,Urobilinogen,Occult_blood,Ketone_body,Glucose,Nitrite
0,1,119,1.02,0.1,5.0,0.7,3,1,16,103842
1,4,106,1.02,0.0,5.0,0.7,2,1,65,108098
2,6,657,1.016,0.1,5.0,0.7,40,3,79,27515


In [6]:
dataset.shape

(74337, 10)

In [7]:
X = dataset.iloc[:, :-1]
y = dataset.iloc[:, -1]

X.shape, y.shape

((74337, 9), (74337,))

# Train, Test Split

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2022)

X_train.shape, X_test.shape

((59469, 9), (14868, 9))

In [13]:
# X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=2022)

# X_train.shape, X_val.shape

((47575, 3), (11894, 3))

# Data Scaling

In [10]:
from sklearn.preprocessing import StandardScaler

In [11]:
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
# X_val = ss.transform(X_val)
X_test = ss.transform(X_test)

In [12]:
X_train[:5, :]

array([[-0.4568329 , -0.53870365, -0.59688789,  0.19038558,  0.27419382,
        -0.28070469, -0.68592602, -0.52379031, -0.6161976 ],
       [-0.44443037, -0.48245593, -0.34512929, -0.54462091, -0.87977981,
        -0.11140888, -0.55110695, -0.55737155, -0.55352856],
       [-0.48411847, -0.55628106, -1.60392227,  0.55788882, -0.87977981,
         0.05788693,  1.72284137, -0.49020907, -0.65403552],
       [-0.44443037, -0.48245593,  0.91366369, -0.54462091, -0.87977981,
        -0.11140888,  2.00596142,  1.96122139, -0.55352856],
       [ 0.03926827, -0.40863081, -0.84864648,  0.55788882, -0.49512194,
        -0.22427276, -0.12867386, -0.55737155, -0.60792056]])

# Modeling

## 1. RandomForestRegressor

In [13]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, KFold

In [14]:
params = {
    'n_estimators' : range(100, 200, 5),
    'verbose' : [True]
}

In [15]:
rfr = RandomForestRegressor(n_jobs=8, random_state=2022)

In [16]:
# 0값 확인
# 0값 존재 -> MAPE 사용 불가 (실제 y값으로 나누기 때문(y > 0 필수))

len(dataset.loc[dataset.Nitrite == 0])

0

In [17]:
kfold = KFold(10, shuffle=True, random_state=2022)

### Learning

In [18]:
grid = GridSearchCV(rfr, params, n_jobs=8, cv=kfold, refit=True)
grid.fit(X_train, y_train)

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    1.0s
[Parallel(n_jobs=8)]: Done 195 out of 195 | elapsed:    5.3s finished


GridSearchCV(cv=KFold(n_splits=10, random_state=2022, shuffle=True),
             estimator=RandomForestRegressor(n_jobs=8, random_state=2022),
             n_jobs=8,
             param_grid={'n_estimators': range(100, 200, 5), 'verbose': [True]})

In [19]:
grid.best_params_

{'n_estimators': 195, 'verbose': True}

In [20]:
y_pred = grid.best_estimator_.predict(X_test)

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 195 out of 195 | elapsed:    0.0s finished


### Evaluating

In [21]:
from sklearn.metrics import mean_absolute_percentage_error, mean_absolute_error, mean_squared_error

In [22]:
mean_absolute_percentage_error(y_test, y_pred)

2.420722774713554

In [23]:
mean_absolute_error(y_test, y_pred)

9169.328156933658

In [24]:
np.sqrt(mean_squared_error(y_test, y_pred))

18218.386397459246

In [43]:
# Test_data 14868개 중 MAE 값이 9169 -> data 하나당 평균 오차 약 1

print('Mean:', y_test.mean())
print('Meidan:', y_test.median())
print('Max:', y_test.max())
print('Min:', y_test.min())
print('y_test length:', len(y_test))
print('Average Error: ', mean_absolute_error(y_test, y_pred) / len(y_test))

Mean: 52230.90260963143
Meidan: 53091.0
Max: 109992
Min: 4
y_test length: 14868
Average Error:  0.6167156414402514


### Comparising y_pred, y_result

In [29]:
comparison = pd.DataFrame({'prediction' : y_pred, 'result' : y_test.values.ravel()})

In [30]:
comparison

Unnamed: 0,prediction,result
0,53091.000000,53091
1,83948.000000,83948
2,84887.000000,84887
3,53091.000000,53091
4,63492.000000,63492
...,...,...
14863,83948.000000,83948
14864,53091.000000,53091
14865,54898.317842,33303
14866,55594.000000,55594


### Saving Model

In [31]:
import pickle

In [32]:
saved_model = pickle.dump(grid.best_estimator_, open('../Model/{0}_RFT_n_{1}.pkl' \
    .format(dataset.columns[-1], grid.best_params_.get('n_estimators')), 'wb'),
                          protocol=pickle.HIGHEST_PROTOCOL)

### Loading Model

In [33]:
loaded_model = pickle.load(open('../Model/{0}_RFT_n_{1}.pkl' \
    .format(dataset.columns[-1], grid.best_params_.get('n_estimators')), 'rb'))

In [34]:
y_pred = loaded_model.predict(X_test)

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 195 out of 195 | elapsed:    0.0s finished


In [35]:
mean_absolute_percentage_error(y_test, y_pred)

2.420722774713554

In [36]:
mean_absolute_error(y_test, y_pred)

9169.328156933658

In [37]:
np.sqrt(mean_squared_error(y_test, y_pred))

18218.386397459246