In [28]:
# Import necessary library
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn import ensemble
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from math import sqrt
from numpy import mean
from numpy import std

%matplotlib inline

In [29]:
# Load the dataset from a CSV file given by a relative path (resolved against
# the notebook's working directory), then preview the first five rows.
data = pd.read_csv('generated_dataset.csv')
data.head()

Unnamed: 0,Wellhead Temp. (C),Wellhead Press (psi),MMCFD- gas,BOPD (barrel of oil produced per day),BWPD (barrel of water produced per day),BSW - basic solid and water (%),CO2 mol. (%) @ 25 C & 1 Atm.,Gas Grav.,CR-corrosion defect
0,53.35,1105.13,12.87,1378.93,2812.62,75.64,3.3628,0.7205,0.2245
1,72.25,1026.31,3.42,1028.75,919.92,44.21,3.8679,0.894,0.2262
2,65.08,722.96,6.23,2017.92,1212.42,17.55,2.3552,0.7661,0.2305
3,60.71,1557.23,11.71,558.22,1716.09,65.79,1.7253,0.7738,0.2303
4,46.19,1304.42,8.58,1280.47,1929.22,37.45,1.8327,0.7611,0.2202


In [30]:
# Summarise the frame: number of rows, per-column non-null counts, dtypes and memory usage
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10292 entries, 0 to 10291
Data columns (total 9 columns):
Wellhead Temp. (C)                         10292 non-null float64
Wellhead Press (psi)                       10292 non-null float64
MMCFD- gas                                 10292 non-null float64
BOPD (barrel of oil produced per day)      10292 non-null float64
BWPD (barrel of water produced per day)    10292 non-null float64
BSW - basic solid and water (%)            10292 non-null float64
CO2 mol. (%) @ 25 C & 1 Atm.               10292 non-null float64
Gas Grav.                                  10292 non-null float64
CR-corrosion defect                        10292 non-null float64
dtypes: float64(9)
memory usage: 723.8 KB


In [31]:
# Summary statistics (count, mean, std, min, quartiles, max) for every numeric column
data.describe()

Unnamed: 0,Wellhead Temp. (C),Wellhead Press (psi),MMCFD- gas,BOPD (barrel of oil produced per day),BWPD (barrel of water produced per day),BSW - basic solid and water (%),CO2 mol. (%) @ 25 C & 1 Atm.,Gas Grav.,CR-corrosion defect
count,10292.0,10292.0,10292.0,10292.0,10292.0,10292.0,10292.0,10292.0,10292.0
mean,57.352401,1361.783583,8.851533,1103.563396,4636.556014,44.87674,2.502574,0.821436,0.211285
std,9.423529,559.27542,4.968737,565.392744,2685.799834,25.710558,1.041002,0.063142,0.037283
min,41.07,382.08,0.23,129.47,40.61,0.13,0.6786,0.7111,0.0009
25%,49.22,880.0,4.57,611.645,2295.52,22.8875,1.60865,0.7668,0.1921
50%,57.36,1364.9,8.88,1106.085,4591.995,45.075,2.5181,0.8216,0.2135
75%,65.4125,1848.2525,13.09,1589.71,6997.4425,67.2125,3.406125,0.8763,0.2328
max,73.87,2317.23,17.54,2087.43,9314.26,89.26,4.2982,0.9319,0.4052


In [32]:
# Count missing entries per column (`isna` is the canonical name; `isnull` is its alias)
data.isna().sum()

Wellhead Temp. (C)                         0
Wellhead Press (psi)                       0
MMCFD- gas                                 0
BOPD (barrel of oil produced per day)      0
BWPD (barrel of water produced per day)    0
BSW - basic solid and water (%)            0
CO2 mol. (%) @ 25 C & 1 Atm.               0
Gas Grav.                                  0
CR-corrosion defect                        0
dtype: int64

In [33]:
# Optional graphical analysis (currently disabled): pairwise plots of every column.
# NOTE(review): presumably left commented out because it is slow on ~10k rows — confirm,
# and either enable it or delete the cell. The scipy import is not needed by pairplot.
# from scipy.stats import norm
# ax = sns.pairplot(data)

## Data Modeling

In [41]:
# Predictors are columns 1-6 by position (Wellhead Press ... CO2 mol. %);
# the target is column 8 (CR-corrosion defect).
# NOTE(review): this slice leaves out column 0 (Wellhead Temp.) and column 7
# (Gas Grav.) — confirm that excluding them is intentional.
# NOTE(review): the scaler is fitted on the full dataset, i.e. before the
# train/test split, so test-row min/max values feed into the scaling.
feature_frame = data.iloc[:, 1:7]
target_series = data.iloc[:, 8]

y = target_series.to_numpy()
# Rescale every predictor to the [0, 1] range.
X = MinMaxScaler().fit_transform(feature_frame.to_numpy())

In [35]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible.
_split = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = _split
# Sanity check: (rows, features) of the two feature matrices.
print(X_train.shape, X_test.shape)

(8233, 6) (2059, 6)


### Gradient Boosting Regressor

In [42]:
# Baseline gradient-boosting regressor with library-default hyper-parameters.
# A fixed seed makes fits repeatable across kernel restarts; it matters most
# when row subsampling (subsample < 1.0) is explored by the grid search.
model = ensemble.GradientBoostingRegressor(random_state=42)

In [43]:
# Hyper-parameter grid to search exhaustively:
# 4 x 5 x 3 x 3 = 180 candidate configurations.
grid = {
    'n_estimators': [10, 50, 100, 500],
    'learning_rate': [0.0001, 0.001, 0.01, 0.1, 1.0],
    'subsample': [0.5, 0.7, 1.0],
    'max_depth': [3, 7, 9],
}

In [44]:
# Cross-validation scheme for the grid search: 10 folds repeated 3 times
# (30 fits per candidate), seeded so the fold assignment is reproducible.
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)

In [47]:
# Exhaustive search over `grid`, scored by negated MAE (higher is better),
# using the repeated k-fold scheme in `cv` and every available CPU core.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=grid,
    scoring='neg_mean_absolute_error',
    cv=cv,
    n_jobs=-1,
)

In [48]:

# Execute the grid search. Fit on the training split only, so the held-out
# test rows never influence hyper-parameter selection.
grid_result = grid_search.fit(X_train, y_train)

# Summarize the best mean CV score (negated MAE: closer to zero is better)
# and the configuration that achieved it.
print(f"Best: {grid_result.best_score_:f} using {grid_result.best_params_}")

# Summarize all scores that were evaluated.
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
# The loop variables deliberately avoid the names `mean` and `std`: the imports
# cell binds those to NumPy functions, and rebinding them here would break any
# later call such as `mean(scores)`.
for mean_score, std_score, candidate in zip(means, stds, params):
    print(f"{mean_score:f} ({std_score:f}) with: {candidate!r}")

KeyboardInterrupt: 

In [16]:

# Define the evaluation procedure: 10-fold CV repeated 3 times, fixed seed.
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
# Evaluate the model; error_score='raise' surfaces a failing fold instead of
# silently recording a placeholder score.
n_scores = cross_val_score(model, X, y, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1, error_score='raise')
# Report performance. The scores are negated MAE, hence the minus sign in the output.
# np.mean / np.std are used instead of the bare `mean` / `std` imports so this
# cell keeps working even if an earlier cell has rebound those names
# (e.g. by using them as loop variables).
print('MAE: %.3f (%.3f)' % (np.mean(n_scores), np.std(n_scores)))

MAE: -0.012 (0.001)


scikit-learn's `neg_mean_absolute_error` scorer reports the MAE with its sign flipped, so that, like every scikit-learn score, higher is better. A value closer to zero therefore indicates a better model, and a perfect model has an MAE of 0.



In [18]:
# Alternative, heavier configuration kept for reference only (not used below):
# gbm_model = ensemble.GradientBoostingRegressor(n_estimators=15000, max_depth=4, min_samples_leaf=15, 
#                                            min_samples_split=10, learning_rate=0.01, loss='huber', random_state=5)

# NOTE(review): the reshape below is obsolete — y_train is already a NumPy array
# (y is built with `.values` before the split), and DataFrame.as_matrix() has
# been removed from current pandas. Candidate for deletion.
# # Reshape train_target to be a 1d array
# y_train = y_train.as_matrix().flatten()

# Fit the model on the training split only; the test split is kept for evaluation.
model.fit(X_train, y_train)

GradientBoostingRegressor()

In [20]:
# Predict the target for the held-out test rows with the fitted model
y_pred = model.predict(X_test)

In [21]:
# Hold-out evaluation on the test split.
# `model.score` is the estimator's default score (R^2 for scikit-learn regressors).
r2 = model.score(X_test, y_test)
mae = metrics.mean_absolute_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # square root of the MSE, computed once and reused

for label, value in (
    ("Score:", r2),
    ('Mean Absolute Error:', mae),
    ('Mean Squared Error:', mse),
    ('Root Mean Squared Error:', rmse),
):
    print(label, value)

Score: 0.5703278461174064
Mean Absolute Error: 0.012459934003917646
Mean Squared Error: 0.0006166638486893732
Root Mean Squared Error: 0.02483271730377836
