In [None]:
# Project: Model Evaluation (Oil Reservoir Productivity Estimates) | Data_size = 100 | Parameters in use = Depth, Porosity, Permeability (density / specific gravity is fixed at 0.85 and is not a model feature)
# Test Performance of the Model

In [None]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

In [None]:
# Create a sample dataset
# Seed NumPy's global random generator first so the np.random draws in the
# following cells produce the same synthetic data on every fresh run.
np.random.seed(55)  # For reproducibility

In [None]:
# Draw the synthetic reservoir properties.
# Each array holds `data_size` independent uniform samples; the draw order
# (depth, porosity, permeability) matters for reproducibility under a fixed seed.
data_size = 100
depth = np.random.uniform(low=1500, high=3500, size=data_size)        # meters
porosity = np.random.uniform(low=0.1, high=0.3, size=data_size)       # fraction of rock volume
permeability = np.random.uniform(low=100, high=1000, size=data_size)  # millidarcies

In [None]:
# Label every reservoir: 1 = oil present, 0 = no oil.
# randint includes `low` and excludes `high`, so only 0 and 1 can be drawn.
# NOTE: the labels are sampled independently of depth/porosity/permeability,
# so in this synthetic dataset the features carry no real signal about the target.
oil_present = np.random.randint(low=0, high=2, size=data_size)

In [None]:
# Confirm array values (every entry should be either 0 or 1)
print("Oil presence array:", oil_present)

Oil presence array: [1 0 1 1 0 1 0 0 0 0 0 1 1 1 0 1 0 1 1 0 0 0 0 1 1 1 1 0 0 1 1 0 1 1 0 0 0
 1 0 1 0 1 0 1 1 0 0 0 0 0 1 0 0 1 0 1 1 0 0 1 1 0 1 0 0 1 1 1 1 1 0 1 0 0
 1 1 0 1 0 0 0 1 1 0 1 1 0 0 0 1 1 1 0 0 0 1 0 0 1 0]


In [None]:
# Assemble the generated arrays into one table: one row per reservoir,
# one column per property, plus the oil_present label.
df = pd.DataFrame(
    dict(
        depth=depth,
        porosity=porosity,
        permeability=permeability,
        oil_present=oil_present,
    )
)

In [None]:
# View sample df (the first 5 reservoirs and their respective properties)
print (df.head(5))

         depth  porosity  permeability  oil_present
0  1686.216573  0.156381    707.823587            1
1  3443.311840  0.215291    656.489516            0
2  2467.719961  0.198200    983.998141            1
3  1985.045403  0.251003    736.177105            1
4  2562.247660  0.113637    245.558254            0


In [None]:
# Print the DataFrame of all 100 reservoirs; pandas abbreviates the middle rows
# with '..' when a frame is this long, so only the first and last rows appear.
print (df) # A representation of the entire dataset (100 reservoirs) and their respective properties

          depth  porosity  permeability  oil_present
0   1686.216573  0.156381    707.823587            1
1   3443.311840  0.215291    656.489516            0
2   2467.719961  0.198200    983.998141            1
3   1985.045403  0.251003    736.177105            1
4   2562.247660  0.113637    245.558254            0
..          ...       ...           ...          ...
95  3347.064636  0.224470    791.615717            1
96  2429.702764  0.111521    858.721490            0
97  3220.772442  0.164794    781.592537            0
98  1864.272556  0.235567    738.625335            1
99  2067.829905  0.125870    401.037129            0

[100 rows x 4 columns]


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for every column
print (df.describe()) # Supplemental data

             depth    porosity  permeability  oil_present
count   100.000000  100.000000    100.000000   100.000000
mean   2515.408319    0.190524    557.374277     0.480000
std     579.666175    0.057332    248.355728     0.502117
min    1518.277885    0.100539    116.945621     0.000000
25%    2044.932758    0.141704    330.826353     0.000000
50%    2527.868752    0.196832    566.745933     0.000000
75%    3051.982483    0.237815    748.719291     1.000000
max    3471.223164    0.296330    992.976077     1.000000


In [None]:
# Engineer two interaction features as element-wise products of existing columns:
#   depth_porosity        = depth * porosity
#   porosity_permeability = porosity * permeability
df['depth_porosity'] = df['depth'].mul(df['porosity'])
df['porosity_permeability'] = df['porosity'].mul(df['permeability'])

In [None]:
# Separate the label column (y) from the model inputs (X):
# the three raw properties plus the two engineered interaction features.
y = df['oil_present']
X = df.loc[
    :,
    [
        'depth',
        'porosity',
        'permeability',
        'depth_porosity',
        'porosity_permeability',
    ],
]

In [None]:
# Hold out 20% of the rows as a test set; the fixed random_state keeps the
# partition identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=55,
)

In [None]:
# Standardize the features. The scaler learns its statistics from the training
# rows only and is then applied unchanged to both splits, so nothing about the
# test set leaks into the preprocessing.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Tune the gradient-boosting classifier: try every combination in the grid
# (3 x 3 x 3 = 27 candidates), score each with 5-fold cross-validation on the
# training set, and rank the candidates by accuracy.
param_grid = dict(
    n_estimators=[50, 100, 200],
    learning_rate=[0.01, 0.1, 0.2],
    max_depth=[3, 4, 5],
)
grid_search = GridSearchCV(
    estimator=GradientBoostingClassifier(random_state=55),
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
)
grid_search.fit(X_train_scaled, y_train)

In [None]:
# Getting the best model from grid search
# (best_estimator_ is the estimator the fitted search exposes for its
# best-scoring parameter combination)
best_model = grid_search.best_estimator_

In [None]:
# Predictions testing: label the held-out (scaled) test rows with the tuned model
y_pred = best_model.predict(X_test_scaled)

In [None]:
# Score the test-set predictions. The four scalar metrics share the same
# (y_true, y_pred) call signature, so they are computed in one pass.
accuracy, precision, recall, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)
# Confusion matrix layout: rows are the true classes, columns the predicted ones
conf_matrix = confusion_matrix(y_test, y_pred)

# Cell output: every metric plus the winning hyperparameters
(accuracy, precision, recall, f1, conf_matrix, grid_search.best_params_)

(0.55,
 0.42857142857142855,
 0.8571428571428571,
 0.5714285714285714,
 array([[5, 8],
        [1, 6]]),
 {'learning_rate': 0.01, 'max_depth': 4, 'n_estimators': 100})

In [None]:
# Testing the model with sample variables
#
# The sample values use their own `sample_*` names so they do not overwrite the
# `depth`, `porosity` and `permeability` arrays that the dataset cell builds `df`
# from. Reusing those names would make a later re-run of that cell silently
# produce a frame of repeated scalars. pandas is already imported in the import
# cell at the top of the notebook, so it is not re-imported here.

# Example input values (Feel free to replace with actual reservoir values)
density = 0.85  # not a model feature: the prediction below does not use it
sample_depth = 3500
sample_porosity = 0.3
sample_permeability = 1000

# Calculate synthetic data (same interaction features the model was trained on)
sample_depth_porosity = sample_depth * sample_porosity
sample_porosity_permeability = sample_porosity * sample_permeability

# Create a pandas DataFrame with the input features, including synthetic features.
# Column names and order must match the training feature matrix X.
input_features_df = pd.DataFrame({
    'depth': [sample_depth],
    'porosity': [sample_porosity],
    'permeability': [sample_permeability],
    'depth_porosity': [sample_depth_porosity],
    'porosity_permeability': [sample_porosity_permeability]
})

# Standardize the input features using the same scaler used in training
input_features_scaled = scaler.transform(input_features_df)

# Make prediction using the trained model
prediction = best_model.predict(input_features_scaled)

# Interpret prediction result (1 = oil present, 0 = no oil)
if prediction[0] == 1:
    print("Prediction: There is oil present.")
else:
    print("Prediction: There is no oil present.")


Prediction: There is oil present.


In [None]:
# Testing the model with sample variables
#
# The sample values use their own `sample_*` names so they do not overwrite the
# `depth`, `porosity` and `permeability` arrays that the dataset cell builds `df`
# from. Reusing those names would make a later re-run of that cell silently
# produce a frame of repeated scalars. pandas is already imported in the import
# cell at the top of the notebook, so it is not re-imported here.

# Adjusted variables: Depth, Permeability
# Example input values (Feel free to replace with actual reservoir values)
density = 0.85  # not a model feature: the prediction below does not use it
sample_depth = 1500
sample_porosity = 0.3
sample_permeability = 700

# Calculate synthetic data (same interaction features the model was trained on)
sample_depth_porosity = sample_depth * sample_porosity
sample_porosity_permeability = sample_porosity * sample_permeability

# Create a pandas DataFrame with the input features, including synthetic features.
# Column names and order must match the training feature matrix X.
input_features_df = pd.DataFrame({
    'depth': [sample_depth],
    'porosity': [sample_porosity],
    'permeability': [sample_permeability],
    'depth_porosity': [sample_depth_porosity],
    'porosity_permeability': [sample_porosity_permeability]
})

# Standardize the input features using the same scaler used in training
input_features_scaled = scaler.transform(input_features_df)

# Make prediction using the trained model
prediction = best_model.predict(input_features_scaled)

# Interpret prediction result (1 = oil present, 0 = no oil)
if prediction[0] == 1:
    print("Prediction: There is oil present.")
else:
    print("Prediction: There is no oil present.")


Prediction: There is no oil present.


In [None]:
# Sample business queries on the reservoir data.
# These are answered with plain pandas operations on `df`; no SQL engine is
# involved. pandas is already imported in the import cell at the top.

# Using the initial df DataFrame (100 reservoirs)

# Calculate production_capacity
# Assuming specific gravity (SG) = 0.85
SPECIFIC_GRAVITY = 0.85
CAPACITY_DIVISOR = 0.159  # NOTE(review): presumably cubic metres per barrel; confirm
df['production_capacity'] = (
    df['depth'] * df['porosity'] * df['permeability'] * SPECIFIC_GRAVITY / CAPACITY_DIVISOR
)

# Find the best prospect (highest production capacity) from the DataFrame df.
# The index label is kept so the reservoir number can be derived from it directly
# instead of re-locating the row with a float-equality mask.
best_index = df['production_capacity'].idxmax()
best_prospect = df.loc[best_index]


# Formatting data with comma separators to make it more presentable
def format_with_commas(number):
    """Return `number` as a string with comma thousands separators.

    Example: 1234567 -> '1,234,567'.
    """
    return "{:,}".format(number)


# Reserves Estimation
total_reservoirs = len(df)
producing_reservoirs = df['oil_present'].sum()  # labels are 0/1, so the sum is a count
average_depth = df['depth'].mean()
average_porosity = df['porosity'].mean() * 100  # Convert porosity to percentage
average_permeability = df['permeability'].mean()

# Format and display reserves estimation (averages are truncated to whole numbers)
print("Reserves Estimation:")
print(f"Total reservoirs: {format_with_commas(total_reservoirs)}")
print(f"Producing reservoirs: {format_with_commas(producing_reservoirs)}")
print(f"Average depth: {format_with_commas(int(average_depth))} meters")
print(f"Average porosity: {average_porosity:.2f}%")
print(f"Average permeability: {format_with_commas(int(average_permeability))} millidarcies\n")

# Query to find the best reservoir prospect, and summary performance of the entire dataset
reservoir_id = best_index + 1  # reservoirs are reported 1-based; df's index is 0-based
# `best_depth` (not `depth`) so the module-level depth array is not overwritten
best_depth = format_with_commas(int(best_prospect['depth']))
production_capacity = format_with_commas(int(best_prospect['production_capacity']))
print(f"The best prospect is reservoir {reservoir_id} at depth {best_depth} meters, "
      f"with a production capacity of: {production_capacity} barrels")


Reserves Estimation:
Total reservoirs: 100
Producing reservoirs: 48
Average depth: 2,515 meters
Average porosity: 19.05%
Average permeability: 557 millidarcies

The best prospect is reservoir 76 at depth 2,934 meters, with a production capacity of: 3,856,815 barrels
