# Midland Basin Well Data

## Import dependencies and load data

In [5]:
from sqlalchemy import create_engine
from config import db_password
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from imblearn.ensemble import BalancedRandomForestClassifier,EasyEnsembleClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans


In [6]:
# Connection string to PostgreSQL
db_string = f"postgresql://postgres:{db_password}@127.0.0.1:5432/MidlandBasin_data"

In [7]:
# Create database engine
engine = create_engine(db_string)

In [22]:
# Load the data as a dataframe
df = pd.read_csv('Resources\Midland Basin Wells Header Data clean.csv')
df.head()

Unnamed: 0,well id,Lease Name,Well Number,Operator compnay,County,Landing Zone,Surf Lat,Surf Long,BH Lat,BH Long,...,Date Completed,Total Fluid (bbl),Total Proppant (lb),"Well Spacing, ft",Well Name,Fluid (bbl/ft),Prop (lb/ft),Avg PPG,"Oil EUR, Mbbl","Gas EUR, MMcf"
0,42329418360000,BRADFORD TRUST A UNIT 3,1513AH,XTO ENERGY INC,MIDLAND,WCMP A,31.73841,-102.00904,31.71565,-102.00382,...,10/5/2018,17406252.0,10804000.0,,BRADFORD TRUST A UNIT 3 1513AH,54.0,1412.0,0.62,208.0,3005.0
1,42329418710000,WARFIELD EAST C,103LS,DIAMONDBACK EXPLORATION & PROD LLC,MIDLAND,SPBYL SH,31.922233,-102.224514,31.892186,-102.21475,...,6/5/2018,,,,WARFIELD EAST C 103LS,,,,758.0,932.0
2,42329418760000,ST,4043WA,DIAMONDBACK EXPLORATION & PROD LLC,MIDLAND,WCMP A,31.982514,-102.211939,31.955094,-102.205336,...,5/25/2018,15855777.0,16090000.0,,ST 4043WA,37.0,1585.0,1.01,456.0,1758.0
3,42329418860000,MUSTANG,H120UH,PERMIAN DEEP ROCK OIL CO LLC,MIDLAND,WCMP D,31.972125,-102.067947,31.997833,-102.076536,...,9/5/2018,16382508.0,20288000.0,,MUSTANG H120UH,42.0,2197.0,1.24,858.0,1556.0
4,42329418890000,GBG 41/38 UNIT,0032LS,CHEVRON U S A INC,MIDLAND,SPBYL SH,31.85316,-102.129826,31.879517,-102.140645,...,6/26/2018,21463544.0,20104000.0,,GBG 41/38 UNIT 0032LS,52.0,2052.0,0.94,759.0,1338.0


In [23]:
# Summarize initial dataframe
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14702 entries, 0 to 14701
Data columns (total 22 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   well id              14702 non-null  int64  
 1   Lease Name           14702 non-null  object 
 2   Well Number          14702 non-null  object 
 3   Operator compnay     14702 non-null  object 
 4   County               14702 non-null  object 
 5   Landing Zone         14176 non-null  object 
 6   Surf Lat             14702 non-null  float64
 7   Surf Long            14702 non-null  float64
 8   BH Lat               14309 non-null  float64
 9   BH Long              14309 non-null  float64
 10  Date Spud            14392 non-null  object 
 11  Lateral Len (ft)     12126 non-null  float64
 12  Date Completed       12747 non-null  object 
 13  Total Fluid (bbl)    11634 non-null  float64
 14  Total Proppant (lb)  11541 non-null  float64
 15  Well Spacing, ft     0 non-null     

## Data Cleaning

In [24]:
# Drop the columns where all values are null
df = df.dropna(axis='columns', how="all")

# Drop the NaN rows
df = df.dropna()

# Remove irrelevant columns
df = df.drop(columns = ["Lease Name", "Operator compnay", "Date Completed"])

# Write a list comprehension to keep the key columns from having null values.
df["Fluid (bbl/ft)"] = df["Fluid (bbl/ft)"] != 'NaN'
df["Prop (lb/ft)"] = df["Prop (lb/ft)"] != 'NaN'
df["Avg PPG"] = df["Avg PPG"] != 'NaN'
df["Oil EUR, Mbbl"] = df["Oil EUR, Mbbl"] != 'NaN'

df

Unnamed: 0,well id,Well Number,County,Landing Zone,Surf Lat,Surf Long,BH Lat,BH Long,Date Spud,Lateral Len (ft),Total Fluid (bbl),Total Proppant (lb),Well Name,Fluid (bbl/ft),Prop (lb/ft),Avg PPG,"Oil EUR, Mbbl","Gas EUR, MMcf"
0,42329418360000,1513AH,MIDLAND,WCMP A,31.738410,-102.009040,31.715650,-102.003820,8/17/2017,7654.0,17406252.0,10804000.0,BRADFORD TRUST A UNIT 3 1513AH,True,True,True,True,3005.0
2,42329418760000,4043WA,MIDLAND,WCMP A,31.982514,-102.211939,31.955094,-102.205336,1/4/2018,10150.0,15855777.0,16090000.0,ST 4043WA,True,True,True,True,1758.0
3,42329418860000,H120UH,MIDLAND,WCMP D,31.972125,-102.067947,31.997833,-102.076536,5/29/2018,9233.0,16382508.0,20288000.0,MUSTANG H120UH,True,True,True,True,1556.0
4,42329418890000,0032LS,MIDLAND,SPBYL SH,31.853160,-102.129826,31.879517,-102.140645,9/28/2017,9798.0,21463544.0,20104000.0,GBG 41/38 UNIT 0032LS,True,True,True,True,1338.0
5,42329418900100,0033WB,MIDLAND,WCMP A,31.853179,-102.129748,31.880202,-102.137624,1/18/2018,9975.0,25829358.0,16740000.0,GBG 41/38 UNIT 0033WB,True,True,True,True,2430.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14651,42461411070000,3204LH,UPTON,SPBYL SH,31.618781,-102.095258,31.660225,-102.109031,10/2/2019,15091.0,30361752.0,29640000.0,WINDHAM TXL R16 3204LH,True,True,True,True,1863.0
14659,42461411670000,1H,UPTON,SPBYL SD,31.594112,-101.847537,31.613090,-101.857264,6/5/2019,7622.0,12406909.0,11634000.0,NORTH PEMBROOK SPRABERRY UNIT 1H,True,True,True,True,1015.0
14662,42461411890000,4307H,UPTON,WCMP B,31.581047,-102.065315,31.607993,-102.073882,7/11/2019,10065.0,25339348.0,21688000.0,REESE 15F-10-D 4307H,True,True,True,True,4298.0
14663,42461411930100,4311H,UPTON,WCMP B,31.580888,-102.065264,31.608974,-102.069893,8/12/2019,9838.0,24850432.0,20732000.0,REESE 15E-10-F 4311H,True,True,True,True,1449.0


In [25]:

df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6792 entries, 0 to 14664
Data columns (total 18 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   well id              6792 non-null   int64  
 1   Well Number          6792 non-null   object 
 2   County               6792 non-null   object 
 3   Landing Zone         6792 non-null   object 
 4   Surf Lat             6792 non-null   float64
 5   Surf Long            6792 non-null   float64
 6   BH Lat               6792 non-null   float64
 7   BH Long              6792 non-null   float64
 8   Date Spud            6792 non-null   object 
 9   Lateral Len (ft)     6792 non-null   float64
 10  Total Fluid (bbl)    6792 non-null   float64
 11  Total Proppant (lb)  6792 non-null   float64
 12  Well Name            6792 non-null   object 
 13  Fluid (bbl/ft)       6792 non-null   bool   
 14  Prop (lb/ft)         6792 non-null   bool   
 15  Avg PPG              6792 non-null   