# Загрузка данных

In [1]:
from google.colab import files

!pip install kaggle

files.upload()  # upload the kaggle.json

!mkdir ~/.kaggle
!mv kaggle.json ~/.kaggle/
!kaggle datasets download -d edumagalhaes/quality-prediction-in-a-mining-process
!unzip quality-prediction-in-a-mining-process.zip
# MiningProcess_Flotation_Plant_Database.csv



Saving kaggle.json to kaggle.json
Downloading quality-prediction-in-a-mining-process.zip to /content
 94% 48.0M/50.9M [00:02<00:00, 25.2MB/s]
100% 50.9M/50.9M [00:02<00:00, 18.6MB/s]
Archive:  quality-prediction-in-a-mining-process.zip
  inflating: MiningProcess_Flotation_Plant_Database.csv  


# Обработка данных

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

dataset = pd.read_csv(
    "MiningProcess_Flotation_Plant_Database.csv",
    parse_dates=['date'],
    infer_datetime_format=True,
    decimal=','
).drop_duplicates()

# удаление первой колонки
dataset = dataset.drop('date', axis=1)

# приведение данных к единому формату
for col in dataset.columns:
    dataset[col] = dataset[col].apply(lambda x : float( str(x).replace(",", ".") ))
dataset = dataset.astype(np.float64)

# выделение целевых значений
y_target = dataset[["% Iron Concentrate", "% Silica Concentrate"]]
x_data = dataset.drop(columns=["% Iron Concentrate", "% Silica Concentrate"], axis=1)

# сохранение данных (до масштабирования)
NEED_SAVE_TRAIN_DATA, NEED_SAVE_TEST_DATA = False, False
if NEED_SAVE_TRAIN_DATA or NEED_SAVE_TEST_DATA:
    x_train, x_test, y_train, y_test = train_test_split(
        x_data, y_target, train_size=0.8, random_state=42)
    if NEED_SAVE_TRAIN_DATA:
        x_train.to_excel("x_train.xlsx")
        y_train.to_excel("y_train.xlsx")
    if NEED_SAVE_TEST_DATA:
        x_test.to_excel("x_test.xlsx")
        y_test.to_excel("y_test.xlsx")

# масштабирование данных
scale = StandardScaler()
x_data = pd.DataFrame(scale.fit_transform(x_data))

# разбиение набора на выборки
x_train, x_test, y_train, y_test = train_test_split(
    x_data, y_target, train_size=0.8, random_state=42
)

print(f"x_train shape: {x_train.shape}\nx_test shape: {x_test.shape}")
print(f"y_train shape: {y_train.shape}\ny_test shape: {y_test.shape}")

x_train shape: (589025, 21)
x_test shape: (147257, 21)
y_train shape: (589025, 2)
y_test shape: (147257, 2)


# Машинное обучение

In [3]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.metrics import(
    mean_squared_error as mse,
    r2_score as r2,
)

import seaborn as sns
import matplotlib.pyplot as plt

## Линейная регрессия

In [4]:
lr_model = LinearRegression()
lr_model.fit(x_train, y_train)
lr_pred = lr_model.predict(x_test)

lr_mse = mse(y_test, lr_pred)
lr_r2 = r2(y_test, lr_pred)
print(f"mse: {lr_mse:.5f}\nr2: {lr_r2:.5f}")

mse: 1.07191
r2: 0.14649


## Случайный лес

In [None]:
rf_model_1 = RandomForestRegressor(n_estimators=20, n_jobs=-1, random_state=42)
rf_model_1.fit(x_train, y_train)
rf_pred_1 = rf_model_1.predict(x_test)

rf_mse_1 = mse(y_test, rf_pred_1)
rf_r2_1 = r2(y_test, rf_pred_1)
print(f"mse: {rf_mse_1:.5f}\nr2: {rf_r2_1:.5f}")

mse: 0.04379
r2: 0.96513


In [None]:
rf_model_2 = RandomForestRegressor(n_estimators=50, n_jobs=-1, random_state=42)
rf_model_2.fit(x_train, y_train)
rf_pred_2 = rf_model_2.predict(x_test)

rf_mse_2 = mse(y_test, rf_pred_2)
rf_r2_2 = r2(y_test, rf_pred_2)
print(f"mse: {rf_mse_2:.5f}\nr2: {rf_r2_2:.5f}")

mse: 0.03981
r2: 0.96830


In [5]:
rf_model_3 = RandomForestRegressor(n_estimators=75, n_jobs=-1, random_state=42)
rf_model_3.fit(x_train, y_train)
rf_pred_3 = rf_model_3.predict(x_test)

rf_mse_3 = mse(y_test, rf_pred_3)
rf_r2_3 = r2(y_test, rf_pred_3)
print(f"mse: {rf_mse_3:.5f}\nr2: {rf_r2_3:.5f}")

mse: 0.03916
r2: 0.96882


In [None]:
rf_model_4 = RandomForestRegressor(n_estimators=100, n_jobs=-1, random_state=42)
rf_model_4.fit(x_train, y_train)
rf_pred_4 = rf_model_4.predict(x_test)

rf_mse_4 = mse(y_test, rf_pred_4)
rf_r2_4 = r2(y_test, rf_pred_4)
print(f"mse: {rf_mse_4:.5f}\nr2: {rf_r2_4:.5f}")

mse: 0.03850
r2: 0.96935


In [6]:
rf_model_5 = RandomForestRegressor(n_estimators=150, n_jobs=-1, random_state=42)
rf_model_5.fit(x_train, y_train)
rf_pred_5 = rf_model_5.predict(x_test)

rf_mse_5 = mse(y_test, rf_pred_5)
rf_r2_5 = r2(y_test, rf_pred_5)
print(f"mse: {rf_mse_5:.5f}\nr2: {rf_r2_5:.5f}")

mse: 0.03802
r2: 0.96973


## Экстремальный градиентный бустинг

In [None]:
xgb_model_1 = XGBRegressor(n_estimators=500)
xgb_model_1.fit(x_train, y_train, early_stopping_rounds=5,
    eval_set=[(x_test, y_test)], verbose=False)
xgb_pred_1 = xgb_model_1.predict(x_test)

xgb_mse_1 = mse(y_test, xgb_pred_1)
xgb_r2_1 = r2(y_test, xgb_pred_1)
print(f"mse: {xgb_mse_1:.5f}\nr2: {xgb_r2_1:.5f}")



mse: 0.17122
r2: 0.86365


In [None]:
xgb_model_2 = XGBRegressor(n_estimators=1000)
xgb_model_2.fit(x_train, y_train, early_stopping_rounds=5,
    eval_set=[(x_test, y_test)], verbose=False)
xgb_pred_2 = xgb_model_2.predict(x_test)

xgb_mse_2 = mse(y_test, xgb_pred_2)
xgb_r2_2 = r2(y_test, xgb_pred_2)
print(f"mse: {xgb_mse_2:.5f}\nr2: {xgb_r2_2:.5f}")



mse: 0.13006
r2: 0.89642


In [7]:
xgb_model_3 = XGBRegressor(n_estimators=1500)
xgb_model_3.fit(x_train, y_train, early_stopping_rounds=5,
    eval_set=[(x_test, y_test)], verbose=False)
xgb_pred_3 = xgb_model_3.predict(x_test)

xgb_mse_3 = mse(y_test, xgb_pred_3)
xgb_r2_3 = r2(y_test, xgb_pred_3)
print(f"mse: {xgb_mse_3:.5f}\nr2: {xgb_r2_3:.5f}")



mse: 0.11281
r2: 0.91016


In [17]:
xgb_model_4 = XGBRegressor(n_estimators=2000)
xgb_model_4.fit(x_train, y_train, early_stopping_rounds=5,
    eval_set=[(x_test, y_test)], verbose=False)
xgb_pred_4 = xgb_model_4.predict(x_test)

xgb_mse_4 = mse(y_test, xgb_pred_4)
xgb_r2_4 = r2(y_test, xgb_pred_4)
print(f"mse: {xgb_mse_4:.5f}\nr2: {xgb_r2_4:.5f}")



mse: 0.10353
r2: 0.91756


## Метрики оценки качества

In [18]:
NEED_FAKE = False
if NEED_FAKE:
    lr_mse, lr_r2 = 1.07191, 0.14649
    rf_mse_5, rf_r2_5 = 0.03802, 0.96973
    xgb_mse_4, xgb_r2_4 = 0.10353, 0.91756

models_name = ['Linear Regression', 'Random Forest',
               'eXtreme Gradient Boosting']
models_metrics = {'MSE': [lr_mse, rf_mse_5, xgb_mse_4],
                  'R2': [lr_r2, rf_r2_5, xgb_r2_4]}
pd.DataFrame(data=models_metrics, index=models_name)

Unnamed: 0,MSE,R2
Linear Regression,1.071913,0.146491
Random Forest,0.038019,0.969727
eXtreme Gradient Boosting,0.103528,0.917556


# Тестирование модели

In [9]:
best_model = rf_model_5
best_model_pred = rf_pred_5

y_test_np = np.round(y_test.to_numpy(), 2)
y_pred_np = np.round(best_model_pred, 2)
y_error = np.absolute(np.round(y_test_np - y_pred_np, 2))

print(f"Model answer\tCorrect answer\tError")
for i in range(20):
  print(f"{y_pred_np[i]}\t{y_test_np[i]}\t{y_error[i]}")

Model answer	Correct answer	Error
[63.49  4.9 ]	[63.49  4.9 ]	[0. 0.]
[65.59  1.48]	[65.58  1.49]	[0.01 0.01]
[65.19  1.26]	[65.19  1.26]	[0. 0.]
[64.7   1.98]	[64.69  1.98]	[0.01 0.  ]
[65.09  2.2 ]	[65.09  2.2 ]	[0. 0.]
[66.68  1.73]	[66.68  1.73]	[0. 0.]
[63.12  2.5 ]	[62.83  2.32]	[0.29 0.18]
[65.3   1.88]	[65.31  1.84]	[0.01 0.04]
[64.7   1.56]	[64.71  1.55]	[0.01 0.01]
[64.59  3.88]	[64.59  3.88]	[0. 0.]
[66.17  1.56]	[66.18  1.55]	[0.01 0.01]
[63.26  5.29]	[63.24  5.31]	[0.02 0.02]
[64.36  3.85]	[64.01  4.48]	[0.35 0.63]
[65.85  1.08]	[65.81  1.06]	[0.04 0.02]
[64.79  2.68]	[64.86  2.57]	[0.07 0.11]
[65.07  2.47]	[65.07  2.47]	[0. 0.]
[65.11  2.1 ]	[65.11  2.11]	[0.   0.01]
[63.94  4.9 ]	[63.94  4.9 ]	[0. 0.]
[66.44  1.29]	[66.48  1.24]	[0.04 0.05]
[64.06  3.84]	[64.05  3.86]	[0.01 0.02]


In [10]:
print(f"First 5 max errors")
print(np.sort(y_error, axis=0)[-1:-6:-1])

First 5 max errors
[[3.45 3.64]
 [3.16 3.22]
 [3.07 3.21]
 [2.92 2.97]
 [2.91 2.96]]


In [16]:
print(f"Test dataset shape: {y_error.shape}")
array_len = y_error.shape[0]
print(f"\nRange\t\tCount\t\tPercent")
# ВЕРНЫЙ ОТВЕТ: ошибка больше от 0 до 0.5
errors_0_05 = y_error[(y_error >= 0) & (y_error < 0.5)].shape[0] // 2
errors_0_05_percent = errors_0_05 / array_len * 100
print(f"[0 ; 0.5)\t {errors_0_05}\t\t{errors_0_05_percent:.3f}%")
# МАЛЕНЬКАЯ ПОГРЕШНОСТЬ: ошибка больше от 0.5 до 1
errors_05_1 = y_error[(y_error > 0.5) & (y_error < 1)].shape[0] // 2
errors_05_1_percent = errors_05_1 / array_len * 100
print(f"[0.5 ; 1)\t {errors_05_1}\t\t{errors_05_1_percent:.3f}%")
# СРЕДНЯЯ ПОГРЕШНОСТЬ: ошибка больше от 1 до 2
errors_1_2 = y_error[(y_error >= 1) & (y_error < 2)].shape[0] // 2
errors_1_2_percent = errors_1_2 / array_len * 100
print(f"[1 ; 2)\t\t {errors_1_2}\t\t{errors_1_2_percent:.3f}%")
# БОЛЬШАЯ ПОГРЕШНОСТЬ: ошибка больше от 2 до 3
errors_2_3 = y_error[(y_error >= 2) & (y_error < 3)].shape[0] // 2
errors_2_3_percent = errors_2_3 / array_len * 100
print(f"[2 ; 3)\t\t {errors_2_3}\t\t{errors_2_3_percent:.3f}%")
# НЕВЕРНЫЙ ОТВЕТ: ошибка больше 3
errors_3 = y_error[y_error > 3].shape[0]
errors_3_percent = errors_3 / array_len * 100
print(f"[3 ; +inf)\t {errors_3}\t\t{errors_3_percent:.3f}%")

Test dataset shape: (147257, 2)

Range		Count		Percent
[0 ; 0.5)	 142500		96.770%
[0.5 ; 1)	 3505		2.380%
[1 ; 2)		 1019		0.692%
[2 ; 3)		 96		0.065%
[3 ; +inf)	 6		0.004%


## Экспорт модели

In [None]:
import pickle

with open('model.pkl','wb') as f:
    pickle.dump(best_model, f)

In [None]:
import joblib

with open('model.joblib','wb') as f:
    joblib.dump(best_model, f)