In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn import ensemble
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn import metrics
from math import sqrt

%matplotlib inline

  import pandas.util.testing as tm


In [2]:
# Load the dataset from data.csv (path is relative to the notebook's working directory)
data = pd.read_csv('data.csv')
# Preview the first five rows
data.head()

Unnamed: 0,Wellhead Temp. (C),Wellhead Press (psi),MMCFD- gas,BOPD (barrel of oil produced per day),BWPD (barrel of water produced per day),BSW - basic solid and water (%),CO2 mol. (%) @ 25 C & 1 Atm.,Gas Grav.,CR-corrosion defect
0,64.13,2058.81,2.53,1307.94,5815.68,21.06,4.1099,0.7434,0.4052
1,68.21,1883.68,2.73,610.06,6343.57,9.71,0.933,0.7421,0.4044
2,45.27,948.74,3.9,480.06,6251.32,23.71,4.1899,0.7915,0.3997
3,66.97,2036.34,15.26,700.38,7795.69,61.04,1.6463,0.9139,0.3991
4,67.21,1340.54,10.36,1209.46,123.96,20.47,2.1917,0.8877,0.398


In [3]:
# Check each column's dtype and non-null count
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 292 entries, 0 to 291
Data columns (total 9 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   Wellhead Temp. (C)                       292 non-null    float64
 1   Wellhead Press (psi)                     292 non-null    float64
 2   MMCFD- gas                               292 non-null    float64
 3   BOPD (barrel of oil produced per day)    292 non-null    float64
 4   BWPD (barrel of water produced per day)  292 non-null    float64
 5   BSW - basic solid and water (%)          292 non-null    float64
 6   CO2 mol. (%) @ 25 C & 1 Atm.             292 non-null    float64
 7   Gas Grav.                                292 non-null    float64
 8   CR-corrosion defect                      292 non-null    float64
dtypes: float64(9)
memory usage: 20.7 KB


In [4]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for every column
data.describe()

Unnamed: 0,Wellhead Temp. (C),Wellhead Press (psi),MMCFD- gas,BOPD (barrel of oil produced per day),BWPD (barrel of water produced per day),BSW - basic solid and water (%),CO2 mol. (%) @ 25 C & 1 Atm.,Gas Grav.,CR-corrosion defect
count,292.0,292.0,292.0,292.0,292.0,292.0,292.0,292.0,292.0
mean,59.882979,1476.940137,7.672979,1022.48476,4232.614281,41.500479,2.764615,0.825091,0.21045
std,8.689875,528.118015,4.994977,544.609518,2763.672374,25.891736,0.999888,0.063128,0.121707
min,41.07,382.08,0.23,129.47,40.61,0.13,0.6786,0.7111,0.0009
25%,52.7625,1039.45,3.2825,562.8175,1833.5,18.0425,1.99685,0.772075,0.107625
50%,61.92,1546.675,6.905,998.915,3671.875,39.955,2.88785,0.82915,0.2224
75%,66.6375,1913.565,11.7525,1455.1825,6845.135,64.17,3.561775,0.879225,0.316375
max,73.87,2317.23,17.54,2087.43,9314.26,89.26,4.2982,0.9319,0.4052


In [5]:
# Count the missing entries in each column
data.isna().sum()

Wellhead Temp. (C)                         0
Wellhead Press (psi)                       0
MMCFD- gas                                 0
BOPD (barrel of oil produced per day)      0
BWPD (barrel of water produced per day)    0
BSW - basic solid and water (%)            0
CO2 mol. (%) @ 25 C & 1 Atm.               0
Gas Grav.                                  0
CR-corrosion defect                        0
dtype: int64

In [21]:
# Graphical analysis of the data (pair plot) -- currently disabled; uncomment the lines below to run it
# from scipy.stats import norm
# ax = sns.pairplot(data)

## Data Modeling

In [6]:
# Split the frame positionally: the first eight columns are the features,
# the ninth column is the target.
y = data.iloc[:, 8]
# Rescale every feature column with a min-max scaler.
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split, so test-set minima/maxima influence the scaling -- consider fitting it
# on the training rows only.
X = MinMaxScaler().fit_transform(data.iloc[:, :8].to_numpy())

In [7]:
# Display the scaled feature matrix
X

array([[0.70304878, 0.86645996, 0.13287117, ..., 0.23482554, 0.94797768,
        0.14628623],
       [0.82743902, 0.77596052, 0.14442519, ..., 0.10748345, 0.07028401,
        0.14039855],
       [0.12804878, 0.29282485, 0.21201618, ..., 0.26455739, 0.97007957,
        0.36413043],
       ...,
       [0.39512195, 0.98430613, 0.073368  , ..., 0.08650286, 0.85031495,
        0.85144928],
       [0.14085366, 0.6676485 , 0.18428654, ..., 0.25748906, 0.86343795,
        0.3754529 ],
       [0.67804878, 0.89966152, 0.16117851, ..., 0.73465724, 0.69585037,
        0.20561594]])

In [8]:
# Display the target Series
y

0      0.4052
1      0.4044
2      0.3997
3      0.3991
4      0.3980
        ...  
287    0.0031
288    0.0027
289    0.0027
290    0.0022
291    0.0009
Name: CR-corrosion defect , Length: 292, dtype: float64

In [10]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts
print(X_train.shape, X_test.shape)

(233, 8) (59, 8)


### Create a baseline linear regression model

In [11]:
# Fit the linear-regression baseline on the training split, then predict the test split.
regr_model = LinearRegression().fit(X_train, y_train)
y_pred = regr_model.predict(X_test)

In [12]:
# Report the baseline's score and error metrics on the test split.
# The MSE is computed once and reused for the RMSE line.
mse = metrics.mean_squared_error(y_test, y_pred)
print("Score:", regr_model.score(X_test, y_test))
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', np.sqrt(mse))

Score: -0.03140122551476332
Mean Absolute Error: 0.11273898761453433
Mean Squared Error: 0.016674740981322343
Root Mean Squared Error: 0.12913071277322968


### Gradient Boosting Regressor

In [13]:
# Hyperparameters for the gradient boosting regressor, gathered in one place.
gbm_params = dict(
    n_estimators=15000,
    max_depth=4,
    min_samples_leaf=15,
    min_samples_split=10,
    learning_rate=0.01,
    loss='huber',
    random_state=5,
)
gbm_model = ensemble.GradientBoostingRegressor(**gbm_params)

# y_train is passed as the pandas Series produced by the split; no reshaping is done here.
# Fit the model (kept as the last expression so the fitted estimator is displayed).
gbm_model.fit(X_train, y_train)

GradientBoostingRegressor(learning_rate=0.01, loss='huber', max_depth=4,
                          min_samples_leaf=15, min_samples_split=10,
                          n_estimators=15000, random_state=5)

In [14]:
# Predict on the held-out test features with the fitted gradient boosting model.
# NOTE: this rebinds `y_pred`, which the linear-regression cell also assigns, so the
# baseline predictions are no longer available after this cell runs.
y_pred = gbm_model.predict(X_test)


In [15]:
# Report the gradient boosting model's score and error metrics on the test split.
# The MSE is computed once and reused for the RMSE line.
mse = metrics.mean_squared_error(y_test, y_pred)
print("Score:", gbm_model.score(X_test, y_test))
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', np.sqrt(mse))

Score: -0.23182405262349204
Mean Absolute Error: 0.1193983799641853
Mean Squared Error: 0.019914991861492124
Root Mean Squared Error: 0.1411204870367592
