# Midland Basin Well Data

## Import dependencies and load data

In [109]:
from sqlalchemy import create_engine
from config import db_password
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.cluster import KMeans
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

In [110]:
# Connection string to PostgreSQL
db_string = f"postgresql://postgres:{db_password}@127.0.0.1:5432/MidlandBasin_data"

In [111]:
# Create database engine
engine = create_engine(db_string)

In [135]:
# Load the data as a dataframe
df = pd.read_sql_query('select * from "well_data"', con=engine)
df.head()

Unnamed: 0,well_id,lease_name,well_number,operator_company,county,landing_zone,surf_lat,surf_long,bh_lat,bh_long,...,date_completed,total_fluid,total_proppant,well_spacing,well_name,fluid,prop,avg_ppg,oil_eur,gas_eur
0,42329418360000,BRADFORD TRUST A UNIT 3,1513AH,XTO ENERGY INC,MIDLAND,WCMP A,31.73841,-102.00904,31.71565,-102.00382,...,2018-10-05,17406252.0,10804000.0,,BRADFORD TRUST A UNIT 3 1513AH,54.0,1412.0,0.62,208.0,3005.0
1,42329418710000,WARFIELD EAST C,103LS,DIAMONDBACK EXPLORATION & PROD LLC,MIDLAND,SPBYL SH,31.922233,-102.224514,31.892186,-102.21475,...,2018-06-05,,,,WARFIELD EAST C 103LS,,,,758.0,932.0
2,42329418760000,ST,4043WA,DIAMONDBACK EXPLORATION & PROD LLC,MIDLAND,WCMP A,31.982514,-102.211939,31.955094,-102.205336,...,2018-05-25,15855777.0,16090000.0,,ST 4043WA,37.0,1585.0,1.01,456.0,1758.0
3,42329418860000,MUSTANG,H120UH,PERMIAN DEEP ROCK OIL CO LLC,MIDLAND,WCMP D,31.972125,-102.067947,31.997833,-102.076536,...,2018-09-05,16382508.0,20288000.0,,MUSTANG H120UH,42.0,2197.0,1.24,858.0,1556.0
4,42329418890000,GBG 41/38 UNIT,0032LS,CHEVRON U S A INC,MIDLAND,SPBYL SH,31.85316,-102.129826,31.879517,-102.140645,...,2018-06-26,21463544.0,20104000.0,,GBG 41/38 UNIT 0032LS,52.0,2052.0,0.94,759.0,1338.0


In [136]:
# Summarize initial dataframe
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14702 entries, 0 to 14701
Data columns (total 22 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   well_id           14702 non-null  int64  
 1   lease_name        14702 non-null  object 
 2   well_number       14702 non-null  object 
 3   operator_company  14702 non-null  object 
 4   county            14702 non-null  object 
 5   landing_zone      14176 non-null  object 
 6   surf_lat          14702 non-null  float64
 7   surf_long         14702 non-null  float64
 8   bh_lat            14309 non-null  float64
 9   bh_long           14309 non-null  float64
 10  date_spud         14392 non-null  object 
 11  lateral_len       12126 non-null  float64
 12  date_completed    12747 non-null  object 
 13  total_fluid       11634 non-null  float64
 14  total_proppant    11541 non-null  float64
 15  well_spacing      0 non-null      object 
 16  well_name         14702 non-null  object

## Data Cleaning

In [137]:
# Drop the columns where all values are null
df = df.dropna(axis='columns', how="all")

# Drop the NaN rows
df = df.dropna()

# Remove duplicate columns
df = df.drop(columns = ["lease_name", "well_number"])

# Assign numeric values to landing zone.
def LandingZone_numeric(zone):
    if zone == "SPBYU SD":
        return 1
    elif zone == "SPBYL SD":
        return 2
    elif zone == "SPBYL SH":
        return 3
    elif zone == "WCMP A":
        return 4
    elif zone == "WCMP B":
        return 5
    elif zone == "WCMP C":
        return 6
    elif zone == "WCMP D":
        return 7
    elif zone == "WOLFCAMP":
        return 8
    elif zone == "WOLFCAMP SHALE":
        return 9
    else:
        return np.NaN
    
df["landing_zone"] = df["landing_zone"].apply(LandingZone_numeric)

# Key columns cannot have null values.
df["landing_zone"] = df["landing_zone"].dropna()
df["fluid"] = df["fluid"].dropna()
df["prop"] = df["prop"].dropna()
df["avg_ppg"] = df["avg_ppg"].dropna()
df["oil_eur"] = df["oil_eur"].dropna()

# Replace Oil/Gas EUR columns with a new barrels oil equivalent (BOE) column
df["BOE_EUR"] = (df["gas_eur"]/6) + df["oil_eur"]

df

Unnamed: 0,well_id,operator_company,county,landing_zone,surf_lat,surf_long,bh_lat,bh_long,date_spud,lateral_len,date_completed,total_fluid,total_proppant,well_name,fluid,prop,avg_ppg,oil_eur,gas_eur,BOE_EUR
0,42329418360000,XTO ENERGY INC,MIDLAND,4.0,31.738410,-102.009040,31.715650,-102.003820,2017-08-17,7654.0,2018-10-05,17406252.0,10804000.0,BRADFORD TRUST A UNIT 3 1513AH,54.0,1412.0,0.62,208.0,3005.0,708.833333
2,42329418760000,DIAMONDBACK EXPLORATION & PROD LLC,MIDLAND,4.0,31.982514,-102.211939,31.955094,-102.205336,2018-01-04,10150.0,2018-05-25,15855777.0,16090000.0,ST 4043WA,37.0,1585.0,1.01,456.0,1758.0,749.000000
3,42329418860000,PERMIAN DEEP ROCK OIL CO LLC,MIDLAND,7.0,31.972125,-102.067947,31.997833,-102.076536,2018-05-29,9233.0,2018-09-05,16382508.0,20288000.0,MUSTANG H120UH,42.0,2197.0,1.24,858.0,1556.0,1117.333333
4,42329418890000,CHEVRON U S A INC,MIDLAND,3.0,31.853160,-102.129826,31.879517,-102.140645,2017-09-28,9798.0,2018-06-26,21463544.0,20104000.0,GBG 41/38 UNIT 0032LS,52.0,2052.0,0.94,759.0,1338.0,982.000000
5,42329418900100,CHEVRON U S A INC,MIDLAND,4.0,31.853179,-102.129748,31.880202,-102.137624,2018-01-18,9975.0,2018-06-23,25829358.0,16740000.0,GBG 41/38 UNIT 0033WB,62.0,1678.0,0.65,827.0,2430.0,1232.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14652,42461411070000,COG OPERATING LLC,UPTON,3.0,31.618781,-102.095258,31.660225,-102.109031,2019-10-02,15091.0,2020-03-17,30361752.0,29640000.0,WINDHAM TXL R16 3204LH,48.0,1964.0,0.98,1295.0,1863.0,1605.500000
14660,42461411670000,PIONEER NATURAL RESOURCES USA INC,UPTON,2.0,31.594112,-101.847537,31.613090,-101.857264,2019-06-05,7622.0,2019-09-20,12406909.0,11634000.0,NORTH PEMBROOK SPRABERRY UNIT 1H,39.0,1526.0,0.94,273.0,1015.0,442.166667
14663,42461411890000,PIONEER NATURAL RESOURCES USA INC,UPTON,5.0,31.581047,-102.065315,31.607993,-102.073882,2019-07-11,10065.0,2019-12-19,25339348.0,21688000.0,REESE 15F-10-D 4307H,60.0,2155.0,0.86,839.0,4298.0,1555.333333
14664,42461411930100,PIONEER NATURAL RESOURCES USA INC,UPTON,5.0,31.580888,-102.065264,31.608974,-102.069893,2019-08-12,9838.0,2019-12-19,24850432.0,20732000.0,REESE 15E-10-F 4311H,60.0,2107.0,0.83,889.0,1449.0,1130.500000


In [138]:

df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6792 entries, 0 to 14665
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   well_id           6792 non-null   int64  
 1   operator_company  6792 non-null   object 
 2   county            6792 non-null   object 
 3   landing_zone      6785 non-null   float64
 4   surf_lat          6792 non-null   float64
 5   surf_long         6792 non-null   float64
 6   bh_lat            6792 non-null   float64
 7   bh_long           6792 non-null   float64
 8   date_spud         6792 non-null   object 
 9   lateral_len       6792 non-null   float64
 10  date_completed    6792 non-null   object 
 11  total_fluid       6792 non-null   float64
 12  total_proppant    6792 non-null   float64
 13  well_name         6792 non-null   object 
 14  fluid             6792 non-null   float64
 15  prop              6792 non-null   float64
 16  avg_ppg           6792 non-null   float64

In [139]:
# Convert data to numeric
df["landing_zone"].value_counts()

5.0    2058
4.0    1756
3.0    1556
8.0     508
1.0     235
9.0     203
2.0     193
7.0     183
6.0      93
Name: landing_zone, dtype: int64

## Split to Training/Testing

In [43]:
# Remove outcome target from features
y = df["Oil EUR, Mbbl"]
X = df.drop(columns = ["Oil EUR, Mbbl","Gas EUR, MMcf"])

# Split training/test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

In [44]:
# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

ValueError: could not convert string to float: '21H'