In [1]:
# Import necessary library
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn import ensemble
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn import metrics
from math import sqrt

%matplotlib inline

  import pandas.util.testing as tm


In [2]:
# Read the generated dataset (path is relative to the working directory)
# and preview its first five rows.
data = pd.read_csv(filepath_or_buffer='generated_dataset.csv')
data.head(n=5)

Unnamed: 0,Wellhead Temp. (C),Wellhead Press (psi),MMCFD- gas,BOPD (barrel of oil produced per day),BWPD (barrel of water produced per day),BSW - basic solid and water (%),CO2 mol. (%) @ 25 C & 1 Atm.,Gas Grav.,CR-corrosion defect
0,53.35,1105.13,12.87,1378.93,2812.62,75.64,3.3628,0.7205,0.2245
1,72.25,1026.31,3.42,1028.75,919.92,44.21,3.8679,0.894,0.2262
2,65.08,722.96,6.23,2017.92,1212.42,17.55,2.3552,0.7661,0.2305
3,60.71,1557.23,11.71,558.22,1716.09,65.79,1.7253,0.7738,0.2303
4,46.19,1304.42,8.58,1280.47,1929.22,37.45,1.8327,0.7611,0.2202


In [3]:
# Summarize the frame: per-column dtype and non-null count, plus memory usage
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10292 entries, 0 to 10291
Data columns (total 9 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   Wellhead Temp. (C)                       10292 non-null  float64
 1   Wellhead Press (psi)                     10292 non-null  float64
 2   MMCFD- gas                               10292 non-null  float64
 3   BOPD (barrel of oil produced per day)    10292 non-null  float64
 4   BWPD (barrel of water produced per day)  10292 non-null  float64
 5   BSW - basic solid and water (%)          10292 non-null  float64
 6   CO2 mol. (%) @ 25 C & 1 Atm.             10292 non-null  float64
 7   Gas Grav.                                10292 non-null  float64
 8   CR-corrosion defect                      10292 non-null  float64
dtypes: float64(9)
memory usage: 723.8 KB


In [4]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for every column
data.describe()

Unnamed: 0,Wellhead Temp. (C),Wellhead Press (psi),MMCFD- gas,BOPD (barrel of oil produced per day),BWPD (barrel of water produced per day),BSW - basic solid and water (%),CO2 mol. (%) @ 25 C & 1 Atm.,Gas Grav.,CR-corrosion defect
count,10292.0,10292.0,10292.0,10292.0,10292.0,10292.0,10292.0,10292.0,10292.0
mean,57.352401,1361.783583,8.851533,1103.563396,4636.556014,44.87674,2.502574,0.821436,0.211285
std,9.423529,559.27542,4.968737,565.392744,2685.799834,25.710558,1.041002,0.063142,0.037283
min,41.07,382.08,0.23,129.47,40.61,0.13,0.6786,0.7111,0.0009
25%,49.22,880.0,4.57,611.645,2295.52,22.8875,1.60865,0.7668,0.1921
50%,57.36,1364.9,8.88,1106.085,4591.995,45.075,2.5181,0.8216,0.2135
75%,65.4125,1848.2525,13.09,1589.71,6997.4425,67.2125,3.406125,0.8763,0.2328
max,73.87,2317.23,17.54,2087.43,9314.26,89.26,4.2982,0.9319,0.4052


In [5]:
# Count the missing entries in each column (isna is the same check as isnull)
data.isna().sum()

Wellhead Temp. (C)                         0
Wellhead Press (psi)                       0
MMCFD- gas                                 0
BOPD (barrel of oil produced per day)      0
BWPD (barrel of water produced per day)    0
BSW - basic solid and water (%)            0
CO2 mol. (%) @ 25 C & 1 Atm.               0
Gas Grav.                                  0
CR-corrosion defect                        0
dtype: int64

In [21]:
# Graphical analysis of the data (currently disabled).
# Uncomment the lines below to draw a seaborn pairplot of every column pair.
# from scipy.stats import norm
# ax = sns.pairplot(data)

## Data Modeling

In [22]:
# Features are the first eight columns; the target is the ninth
# ('CR-corrosion defect').
feature_matrix = data.iloc[:, :8].to_numpy()
y = data.iloc[:, 8]

# Rescale each feature column to the [0, 1] range.
# NOTE(review): the scaler is fitted on every row here, i.e. before any
# train/test split has been made.
X = MinMaxScaler().fit_transform(feature_matrix)

In [23]:
# Display the scaled feature matrix (NumPy prints a truncated view)
X

array([[0.37439024, 0.37364029, 0.73021375, ..., 0.8471895 , 0.74157365,
        0.04257246],
       [0.95060976, 0.33290959, 0.18428654, ..., 0.49455851, 0.88111946,
        0.82835145],
       [0.7320122 , 0.17615172, 0.34662045, ..., 0.19544486, 0.46320035,
        0.2490942 ],
       ...,
       [0.39512195, 0.98430613, 0.073368  , ..., 0.08650286, 0.85031495,
        0.85144928],
       [0.14085366, 0.6676485 , 0.18428654, ..., 0.25748906, 0.86343795,
        0.3754529 ],
       [0.67804878, 0.89966152, 0.16117851, ..., 0.73465724, 0.69585037,
        0.20561594]])

In [24]:
# Display the target Series (the 'CR-corrosion defect' column)
y

0        0.2245
1        0.2262
2        0.2305
3        0.2303
4        0.2202
          ...  
10287    0.0031
10288    0.0027
10289    0.0027
10290    0.0022
10291    0.0009
Name: CR-corrosion defect, Length: 10292, dtype: float64

In [25]:
# Splitting the dataset into the Training set and Test set.
#
# The split is made on the raw (unscaled) features and the MinMaxScaler is then
# fitted on the training rows only. Fitting the scaler on the full dataset
# before splitting lets the test rows' min/max leak into preprocessing.
# The split parameters are unchanged (20% held out, random_state=42), so the
# same rows land in each partition as when splitting a pre-scaled matrix.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    data.iloc[:, 0:8].values, y, test_size=0.2, random_state=42
)

feature_scaler = MinMaxScaler().fit(X_train_raw)
X_train = feature_scaler.transform(X_train_raw)
# Test rows are transformed with the training statistics, so individual values
# may fall slightly outside [0, 1]; that is expected.
X_test = feature_scaler.transform(X_test_raw)
print(X_train.shape,X_test.shape)

(8233, 8) (2059, 8)


### Create a baseline linear regression model

In [26]:
# Ordinary least-squares baseline: fit on the training split (fit returns the
# estimator itself), then predict the held-out split.
regr_model = LinearRegression().fit(X_train, y_train)
y_pred = regr_model.predict(X_test)

In [29]:
# Evaluate the linear baseline on the held-out split.
# All four metrics are derived from the same y_pred, and the MSE is computed
# once and reused for the RMSE instead of being recalculated.
r2 = metrics.r2_score(y_test, y_pred)  # R^2, the quantity regr_model.score(X_test, y_test) reports
mae = metrics.mean_absolute_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)

print("Score:", r2)
print('Mean Absolute Error:', mae)
print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', np.sqrt(mse))

Score: 0.2055109185528412
Mean Absolute Error: 0.02298731180514156
Mean Squared Error: 0.0011402477220824557
Root Mean Squared Error: 0.03376755428043991


### Gradient Boosting Regressor

In [30]:
# Hyperparameters for the gradient-boosting regressor, gathered in one place
# so they are easy to read and tune.
gbm_params = dict(
    n_estimators=15000,
    learning_rate=0.01,
    max_depth=4,
    min_samples_split=10,
    min_samples_leaf=15,
    loss='huber',
    random_state=5,
)
gbm_model = ensemble.GradientBoostingRegressor(**gbm_params)

# Fit on the training split; y_train is passed as-is (a 1-D pandas Series).
# Leaving fit() as the last expression shows the fitted estimator's repr.
gbm_model.fit(X_train, y_train)

GradientBoostingRegressor(learning_rate=0.01, loss='huber', max_depth=4,
                          min_samples_leaf=15, min_samples_split=10,
                          n_estimators=15000, random_state=5)

In [31]:
# Predict the held-out test split with the fitted gradient-boosting model.
# Note: this reuses the name y_pred, replacing the linear baseline's predictions.
y_pred = gbm_model.predict(X_test)


In [32]:
# Evaluate the gradient-boosting model on the held-out split.
# R^2 is computed from the predictions already held in y_pred rather than via
# gbm_model.score(X_test, y_test), which would run the test set through all
# 15000 trees a second time. The MSE is likewise computed once and reused.
r2 = metrics.r2_score(y_test, y_pred)
mae = metrics.mean_absolute_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)

print("Score:", r2)
print('Mean Absolute Error:', mae)
print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', np.sqrt(mse))

Score: 0.6717850156748474
Mean Absolute Error: 0.006246960586793584
Mean Squared Error: 0.0004710529030158549
Root Mean Squared Error: 0.021703753201136777
