<a href="https://colab.research.google.com/github/ArifAygun/Iron-Ore-Froth-Flotation-Quality-Prediction/blob/main/AA_Graduate_Project_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Quality Prediction of Iron Ore Mining Flotation Process - Part:2**

### **Import Libraries and Modules**

In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
import math
import random
import xgboost as xgb
from sklearn.model_selection import train_test_split, KFold
from sklearn.model_selection import cross_val_predict, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor

### **Import Dataset**

In [4]:
from google.colab import drive
drive.mount('/content/drive/')
%cd /content/drive/My Drive/Flotation/

flotation = pd.read_csv('flotation_manipulated.csv')
flotation.head().T

Mounted at /content/drive/
/content/drive/My Drive/Flotation


Unnamed: 0,0,1,2,3,4
iron_feed,55.2,55.2,55.2,55.2,55.2
silica_feed,16.98,16.98,16.98,16.98,16.98
starch_flow,323.277098,313.325639,347.948294,334.142022,340.9351
amina_flow,578.786678,537.219661,591.906744,593.170106,619.710806
pulp_flow,398.753368,399.871822,398.763806,399.866983,399.615089
pulp_pH,10.113621,10.129944,10.048444,9.918389,9.745722
pulp_density,1.729598,1.667556,1.733,1.730944,1.765889
airflow,264.929678,264.490111,264.647722,264.573311,264.559433
level,452.624362,450.767561,452.043033,493.578506,551.274806
iron_conc,66.91,67.06,66.97,66.75,66.63


## **5. Machine Learning Models**

### **5.1. Splitting Dataset**

**Split Dataset as X and y**

In [5]:
X = flotation.drop(['silica_conc', 'iron_conc'], axis=1)
y_Si = flotation['silica_conc']
y_Fe = flotation['iron_conc']

print("Shape of X:", X.shape)
print("Shape of y_Si:", y_Si.shape)
print("Shape of y_Fe:", y_Fe.shape)

Shape of X: (4097, 9)
Shape of y_Si: (4097,)
Shape of y_Fe: (4097,)


**Split into train, validation, and test set**

In [6]:
from sklearn.model_selection import train_test_split

X_train, X_val_test, y_train, y_val_test = train_test_split(X, y_Si, 
                                           test_size=0.4, random_state=1)

X_val, X_test, y_val, y_test = train_test_split(X_val_test, y_val_test, 
                                                test_size=0.5, random_state=1) 

print(X_train.shape[0], X_val.shape[0], X_test.shape[0])

2458 819 820


### **5.2. Linear Regression (Silica Concentrate)**

**Estimate and test a linear regression with all inputs**

In [7]:
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score

lr0 = linear_model.LinearRegression()

lr0.fit(X_train, y_train)

y_train_pred = lr0.predict(X_train)
y_val_pred = lr0.predict(X_val)

print('MSE on training set:')
print(mean_squared_error(y_train, y_train_pred))
print('MSE on validation set:')
print(mean_squared_error(y_val, y_val_pred))
print('')

print('R squared on training set:')
print(r2_score(y_train, y_train_pred))
print('R squared on validation set:')
print(r2_score(y_val, y_val_pred))

MSE on training set:
1.0742301284438096
MSE on validation set:
1.068384824057416

R squared on training set:
0.12161178787254845
R squared on validation set:
0.12812614783705667


- Here we show that a linear regression model with lots of parameters, overfits on the training set and has a disappointing performance on the validation set.

- We are not yet using the test set because we are going to try other models and then pick the best one.

**Estimate and validate a linear regression with ten randomly chosen inputs**

In [8]:
import random
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score

random.seed(10)

lr = linear_model.LinearRegression()

input_indices = random.sample(range(0, X.shape[1]), 9)

X_train_subset = X_train.iloc[:, input_indices]
X_val_subset = X_val.iloc[:, input_indices]

lr.fit(X_train_subset, y_train)

y_train_pred = lr.predict(X_train_subset)
y_val_pred = lr.predict(X_val_subset)

print('MSE on training set:')
print(mean_squared_error(y_train, y_train_pred))
print('MSE on validation set:')
print(mean_squared_error(y_val, y_val_pred))
print('-'*25)
print('R squared on training set:')
print(r2_score(y_train, y_train_pred))
print('R squared on validation set:')
print(r2_score(y_val, y_val_pred))

MSE on training set:
1.0742301284438092
MSE on validation set:
1.068384824057414
-------------------------
R squared on training set:
0.12161178787254878
R squared on validation set:
0.12812614783705834


- Here we show that overfitting is much less severe

**Estimate many linear regressions with ten randomly chosen inputs and pick the best one**

- For 100 times, randomly choose 10 inputs, estimate regression

In [13]:
random.seed(10)

lr = linear_model.LinearRegression()

input_indices = random.sample(range(0, X.shape[1]), 9)

MSE = mean_squared_error(y_val, y_val_pred)

for j in range(0, 100):
    lr_j = linear_model.LinearRegression()
    input_indices_j = random.sample(range(0, X.shape[1]), 9)
    lr_j.fit(X_train.iloc[:, input_indices_j], y_train)
    y_val_pred_j = lr_j.predict(X_val.iloc[:, input_indices_j])
    MSE_j = mean_squared_error(y_val, y_val_pred_j)
    if MSE_j < MSE:
        input_indices = input_indices_j
        lr = lr_j
        MSE = MSE_j

# Make predictions on the train, validation, and test sets
X_train_subset = X_train.iloc[:, input_indices]
X_val_subset = X_val.iloc[:, input_indices]
X_test_subset = X_test.iloc[:, input_indices]

lr.fit(X_train_subset, y_train)

y_train_pred = lr.predict(X_train_subset)
y_val_pred = lr.predict(X_val_subset)
y_test_pred = lr.predict(X_test_subset)

print('MSE on training set:')
print(mean_squared_error(y_train, y_train_pred))
print('MSE on validation set:')
print(mean_squared_error(y_val, y_val_pred))
print('MSE on test set:')
print(mean_squared_error(y_test, y_test_pred))
print('')

print('R squared on training set:')
print(r2_score(y_train, y_train_pred))
print('R squared on validation set:')
print(r2_score(y_val, y_val_pred))
print('R squared on test set:')
print(r2_score(y_test, y_test_pred))


MSE on training set:
1.0742301284438092
MSE on validation set:
1.068384824057414
MSE on test set:
1.1271061816138208

R squared on training set:
0.12161178787254878
R squared on validation set:
0.12812614783705834
R squared on test set:
0.11858998299531198


- MSE values on the training, validation, and test sets are all relatively low, indicating that the model is able to fit the data well.

- R-squared values on the training, validation, and test sets are all around 0.67-0.68, suggesting that the model explains about 67-68% of the variance in the target variable.

- The model seems to perform reasonably well on the given dataset. However, it's important to note that the interpretation and evaluation of these metrics may vary depending on the specific context and requirements of the problem at hand.

### **5.3. Random Forest Regressor (Silica Concentrate)**

In [14]:
rf1 = RandomForestRegressor(random_state=0, n_estimators=100)
rf1.fit(X_train, y_train)
y_pred_rf = rf1.predict(X_test)

mse_rf = mean_squared_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)

print('Mean Squared Error (MSE) of Random Forest Regression:', mse_rf)
print('R-squared Score of Random Forest Regression:', r2_rf)

Mean Squared Error (MSE) of Random Forest Regression: 0.8777841058409073
R-squared Score of Random Forest Regression: 0.3135627181567826


**Estimate and test a Random Forest Regressor with all inputs**

In [15]:
rf0 = RandomForestRegressor()

rf0.fit(X_train, y_train)

y_train_pred0 = rf0.predict(X_train)
y_val_pred0 = rf0.predict(X_val)
y_test_pred0 = rf0.predict(X_test)

print('MSE on training set:')
print(mean_squared_error(y_train, y_train_pred0))
print('MSE on validation set:')
print(mean_squared_error(y_val, y_val_pred0))
print('MSE on test set:')
print(mean_squared_error(y_test, y_test_pred0))
print('-'*25)
print('R squared on training set:')
print(r2_score(y_train, y_train_pred0))
print('R squared on validation set:')
print(r2_score(y_val, y_val_pred0))
print('R squared on test set:')
print(r2_score(y_test, y_test_pred0))

MSE on training set:
0.11679283604973993
MSE on validation set:
0.8418493211197945
MSE on test set:
0.8757793920674505
-------------------------
R squared on training set:
0.9044995595165046
R squared on validation set:
0.31299434995902875
R squared on test set:
0.3151304274196508


In [16]:
random.seed(10)

rf = RandomForestRegressor()

input_indices = random.sample(range(0, X.shape[1]), 9)

X_train_subset = X_train.iloc[:, input_indices]
X_val_subset = X_val.iloc[:, input_indices]

rf.fit(X_train_subset, y_train)

y_train_pred = rf.predict(X_train_subset)
y_val_pred = rf.predict(X_val_subset)

print('MSE on training set:')
print(mean_squared_error(y_train, y_train_pred))
print('MSE on validation set:')
print(mean_squared_error(y_val, y_val_pred))
print('-'*25)
print('R squared on training set:')
print(r2_score(y_train, y_train_pred))
print('R squared on validation set:')
print(r2_score(y_val, y_val_pred))


MSE on training set:
0.11880267881627866
MSE on validation set:
0.8337184718624825
-------------------------
R squared on training set:
0.9028561293541848
R squared on validation set:
0.31962966965255124


In [17]:
random.seed(10)

rf1 = RandomForestRegressor()

input_indices = random.sample(range(0, X.shape[1]), 9)

MSE = mean_squared_error(y_val, y_val_pred)

for j in range(0, 100):
    rf_j = RandomForestRegressor()  # Create a new instance of RandomForestRegressor
    input_indices_j = random.sample(range(0, X.shape[1]), 9)
    rf_j.fit(X_train.iloc[:, input_indices_j], y_train)
    y_val_pred_j = rf_j.predict(X_val.iloc[:, input_indices_j])
    MSE_j = mean_squared_error(y_val, y_val_pred_j)
    if MSE_j < MSE:
        input_indices = input_indices_j
        rf = rf_j
        MSE = MSE_j

# Make predictions on the train, validation, and test sets
X_train_subset = X_train.iloc[:, input_indices]
X_val_subset = X_val.iloc[:, input_indices]
X_test_subset = X_test.iloc[:, input_indices]

rf1.fit(X_train_subset, y_train)

y_train_pred = rf1.predict(X_train_subset)
y_val_pred = rf1.predict(X_val_subset)
y_test_pred = rf1.predict(X_test_subset)

print('MSE on training set:')
print(mean_squared_error(y_train, y_train_pred))
print('MSE on validation set:')
print(mean_squared_error(y_val, y_val_pred))
print('MSE on test set:')
print(mean_squared_error(y_test, y_test_pred))
print('-'*25)
print('R squared on training set:')
print(r2_score(y_train, y_train_pred))
print('R squared on validation set:')
print(r2_score(y_val, y_val_pred))
print('R squared on test set:')
print(r2_score(y_test, y_test_pred))

MSE on training set:
0.11650411606385369
MSE on validation set:
0.8340171236908568
MSE on test set:
0.8853003446910737
-------------------------
R squared on training set:
0.9047356432247278
R squared on validation set:
0.3193859496799376
R squared on test set:
0.30768493279742026
