# Midland Basin Well Data

## Import dependencies and load data

In [152]:
from urllib.parse import quote

import hvplot.pandas  # noqa: F401 -- imported for its side effect: registers the .hvplot accessor
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
from imblearn.metrics import classification_report_imbalanced
from sklearn import ensemble
from sklearn.cluster import KMeans
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sqlalchemy import create_engine

from config import db_password

In [153]:
# Connection string to PostgreSQL.
# The password is percent-encoded before being placed in the URL so that
# characters such as "@", ":", "/" or "%" are not mistaken for URL delimiters.
encoded_password = quote(db_password, safe="")
db_string = f"postgresql://postgres:{encoded_password}@127.0.0.1:5432/FNL_WellData"

# Create database engine
engine = create_engine(db_string)

In [154]:
# Read every row of the "well_data" table into a DataFrame and preview it
well_data_query = 'select * from "well_data"'
df = pd.read_sql_query(sql=well_data_query, con=engine)
df.head()

Unnamed: 0,well_id,lease_name,operator_company,county,landing_zone,date_completed,lateral_len,total_fluid,total_proppant,well_spacing,...,oil_eur,gas_eur,isopach,porosity,sw,tvd,fvf,toc,rock_type,clay_vol
0,42-383389920000,MERCHANT 1401CL,OCCIDENTAL,REAGAN,WOLFCAMP C,2015-08-22,4986.0,177847.0,9966451.0,684.562385,...,85.0,1630.0,845.5,0.061,0.618201,9376.0,1.5,2.036221,3.0,0.348545
1,42-383375670000,UNIVERSITY EAST 4316BH,RRP OPERATING LLC,REAGAN,WOLFCAMP B,2012-04-11,7307.0,153044.0,0.0,1358.097263,...,88.0,1147.0,355.2,0.063,0.237288,6760.2,1.5,1.959924,3.0,0.317504
2,42-461400590000,XBC GIDDINGS ESTATE N 26H,PIONEER,UPTON,WOLFCAMP B,2015-08-29,11525.0,393818.0,9893273.0,1301.45223,...,462.0,916.0,230.6,0.067,0.177291,9383.0,1.5,2.110517,3.0,0.296745
3,42-383385530000,UNIVERSITY 58-32 8H,PIONEER,REAGAN,WOLFCAMP B,2014-08-15,10167.0,279736.0,11295589.0,2053.260873,...,223.0,407.0,335.7,0.07,0.221368,7527.1,1.5,1.912096,3.0,0.345443
4,42-461398150000,UNIVERSITY 3-14 52H,PIONEER,UPTON,WOLFCAMP A,2015-05-17,7886.0,271446.0,8532042.0,933.900355,...,335.0,157.0,336.6,0.065,0.035373,8419.0,1.5,1.407951,0.0,0.290434


## Data Cleaning

In [155]:
# Discard the lease_name column (treated as redundant in this notebook)
df = df.drop("lease_name", axis="columns")

In [156]:
# Drop the columns where all values are null
df = df.dropna(axis="columns", how="all")

# Drop every row that still has a missing value in any remaining column
df = df.dropna()

# Convert date objects to datetime
df["date_completed"] = pd.to_datetime(df["date_completed"], yearfirst=True)

# Key columns cannot have null values.
# The row-wise dropna() above already guarantees this. Re-assigning
# `df[col] = df[col].dropna()` would not remove anything (the assignment
# realigns on the index), so the requirement is verified explicitly instead.
KEY_COLUMNS = [
    "landing_zone", "lateral_len", "fluid", "prop", "avg_ppg",
    "oil_eur", "isopach", "porosity", "sw", "fvf",
]
null_counts = df[KEY_COLUMNS].isnull().sum(axis=0)
assert not null_counts.any(), f"null values in key columns:\n{null_counts[null_counts > 0]}"

# Index the wells by their well_id
df = df.set_index("well_id")

df

Unnamed: 0_level_0,operator_company,county,landing_zone,date_completed,lateral_len,total_fluid,total_proppant,well_spacing,fluid,prop,...,oil_eur,gas_eur,isopach,porosity,sw,tvd,fvf,toc,rock_type,clay_vol
well_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
42-383389920000,OCCIDENTAL,REAGAN,WOLFCAMP C,2015-08-22,4986.0,177847.0,9966451.0,684.562385,35.7,1999.0,...,85.0,1630.0,845.5,0.061,0.618201,9376.0,1.5,2.036221,3.0,0.348545
42-383375670000,RRP OPERATING LLC,REAGAN,WOLFCAMP B,2012-04-11,7307.0,153044.0,0.0,1358.097263,20.9,0.0,...,88.0,1147.0,355.2,0.063,0.237288,6760.2,1.5,1.959924,3.0,0.317504
42-461400590000,PIONEER,UPTON,WOLFCAMP B,2015-08-29,11525.0,393818.0,9893273.0,1301.452230,34.2,858.0,...,462.0,916.0,230.6,0.067,0.177291,9383.0,1.5,2.110517,3.0,0.296745
42-383385530000,PIONEER,REAGAN,WOLFCAMP B,2014-08-15,10167.0,279736.0,11295589.0,2053.260873,27.5,1111.0,...,223.0,407.0,335.7,0.070,0.221368,7527.1,1.5,1.912096,3.0,0.345443
42-461398150000,PIONEER,UPTON,WOLFCAMP A,2015-05-17,7886.0,271446.0,8532042.0,933.900355,34.4,1082.0,...,335.0,157.0,336.6,0.065,0.035373,8419.0,1.5,1.407951,0.0,0.290434
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42-317413520000,DIAMONDBACK,MARTIN,WOLFCAMP A,2019-01-28,12983.0,422593.0,23067956.0,778.950212,32.5,1777.0,...,332.0,3616.0,226.3,0.056,0.193346,9390.1,2.0,1.754126,4.0,0.266024
42-461408000000,CONOCOPHILLIPS,UPTON,LOWER SPRABERRY,2019-01-18,12948.0,598732.0,26048986.0,910.774324,46.2,2012.0,...,514.0,2155.0,404.3,0.058,0.221588,8734.1,1.5,1.838898,3.0,0.319537
42-227395960000,CROWNQUEST,HOWARD,WOLFCAMP A,2018-11-18,10027.0,516450.0,20144200.0,536.653837,51.5,2009.0,...,475.0,1189.0,286.4,0.071,0.162865,8208.6,1.6,2.126808,3.0,0.306010
42-317413940000,EXXON,MARTIN,WOLFCAMP A,2018-12-06,10325.0,597468.0,17529064.0,792.490961,57.9,1698.0,...,497.0,3167.0,249.2,0.059,0.166516,8789.2,2.0,2.044683,4.0,0.299328


In [157]:
# Fold the Wolfcamp B Upper/Lower sub-zones into a single "WOLFCAMP B" zone
df["landing_zone"] = df["landing_zone"].replace(
    {"WOLFCAMP B UPPER": "WOLFCAMP B", "WOLFCAMP B LOWER": "WOLFCAMP B"}
)

# Remove the Upper Spraberry wells first, then the Ector County wells
upper_spraberry_ids = df.index[df["landing_zone"] == "UPPER SPRABERRY"]
df = df.drop(index=upper_spraberry_ids)
ector_ids = df.index[df["county"] == "ECTOR"]
df = df.drop(index=ector_ids)

In [158]:
# SoPhi = porosity * (1 - sw)
# NOTE(review): presumably the non-water share of the pore volume, used as an
# in-place volume metric -- confirm the intended definition.
one_minus_sw = 1 - df["sw"]
df["SoPhi"] = df["porosity"] * one_minus_sw


In [159]:
# Display the frame to check the newly added SoPhi column
df

Unnamed: 0_level_0,operator_company,county,landing_zone,date_completed,lateral_len,total_fluid,total_proppant,well_spacing,fluid,prop,...,gas_eur,isopach,porosity,sw,tvd,fvf,toc,rock_type,clay_vol,SoPhi
well_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
42-383389920000,OCCIDENTAL,REAGAN,WOLFCAMP C,2015-08-22,4986.0,177847.0,9966451.0,684.562385,35.7,1999.0,...,1630.0,845.5,0.061,0.618201,9376.0,1.5,2.036221,3.0,0.348545,0.023290
42-383375670000,RRP OPERATING LLC,REAGAN,WOLFCAMP B,2012-04-11,7307.0,153044.0,0.0,1358.097263,20.9,0.0,...,1147.0,355.2,0.063,0.237288,6760.2,1.5,1.959924,3.0,0.317504,0.048051
42-461400590000,PIONEER,UPTON,WOLFCAMP B,2015-08-29,11525.0,393818.0,9893273.0,1301.452230,34.2,858.0,...,916.0,230.6,0.067,0.177291,9383.0,1.5,2.110517,3.0,0.296745,0.055122
42-383385530000,PIONEER,REAGAN,WOLFCAMP B,2014-08-15,10167.0,279736.0,11295589.0,2053.260873,27.5,1111.0,...,407.0,335.7,0.070,0.221368,7527.1,1.5,1.912096,3.0,0.345443,0.054504
42-461398150000,PIONEER,UPTON,WOLFCAMP A,2015-05-17,7886.0,271446.0,8532042.0,933.900355,34.4,1082.0,...,157.0,336.6,0.065,0.035373,8419.0,1.5,1.407951,0.0,0.290434,0.062701
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42-317413520000,DIAMONDBACK,MARTIN,WOLFCAMP A,2019-01-28,12983.0,422593.0,23067956.0,778.950212,32.5,1777.0,...,3616.0,226.3,0.056,0.193346,9390.1,2.0,1.754126,4.0,0.266024,0.045173
42-461408000000,CONOCOPHILLIPS,UPTON,LOWER SPRABERRY,2019-01-18,12948.0,598732.0,26048986.0,910.774324,46.2,2012.0,...,2155.0,404.3,0.058,0.221588,8734.1,1.5,1.838898,3.0,0.319537,0.045148
42-227395960000,CROWNQUEST,HOWARD,WOLFCAMP A,2018-11-18,10027.0,516450.0,20144200.0,536.653837,51.5,2009.0,...,1189.0,286.4,0.071,0.162865,8208.6,1.6,2.126808,3.0,0.306010,0.059437
42-317413940000,EXXON,MARTIN,WOLFCAMP A,2018-12-06,10325.0,597468.0,17529064.0,792.490961,57.9,1698.0,...,3167.0,249.2,0.059,0.166516,8789.2,2.0,2.044683,4.0,0.299328,0.049176


## Label Encoding

In [160]:
# Store landing_zone as a pandas categorical so integer codes can be derived from it
df = df.assign(landing_zone=df["landing_zone"].astype("category"))

In [161]:
# Label-encode the landing zone: lz_cat holds the integer category code of
# each well's landing_zone
landing_zone_codes = df["landing_zone"].cat.codes
df = df.assign(lz_cat=landing_zone_codes)
df.head()

Unnamed: 0_level_0,operator_company,county,landing_zone,date_completed,lateral_len,total_fluid,total_proppant,well_spacing,fluid,prop,...,isopach,porosity,sw,tvd,fvf,toc,rock_type,clay_vol,SoPhi,lz_cat
well_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
42-383389920000,OCCIDENTAL,REAGAN,WOLFCAMP C,2015-08-22,4986.0,177847.0,9966451.0,684.562385,35.7,1999.0,...,845.5,0.061,0.618201,9376.0,1.5,2.036221,3.0,0.348545,0.02329,6
42-383375670000,RRP OPERATING LLC,REAGAN,WOLFCAMP B,2012-04-11,7307.0,153044.0,0.0,1358.097263,20.9,0.0,...,355.2,0.063,0.237288,6760.2,1.5,1.959924,3.0,0.317504,0.048051,5
42-461400590000,PIONEER,UPTON,WOLFCAMP B,2015-08-29,11525.0,393818.0,9893273.0,1301.45223,34.2,858.0,...,230.6,0.067,0.177291,9383.0,1.5,2.110517,3.0,0.296745,0.055122,5
42-383385530000,PIONEER,REAGAN,WOLFCAMP B,2014-08-15,10167.0,279736.0,11295589.0,2053.260873,27.5,1111.0,...,335.7,0.07,0.221368,7527.1,1.5,1.912096,3.0,0.345443,0.054504,5
42-461398150000,PIONEER,UPTON,WOLFCAMP A,2015-05-17,7886.0,271446.0,8532042.0,933.900355,34.4,1082.0,...,336.6,0.065,0.035373,8419.0,1.5,1.407951,0.0,0.290434,0.062701,4


## Prepare Dataframe for K-Means

In [211]:
# Features used for K-Means rock-type clustering.
# They are selected by name rather than by dropping every other column, so a
# new or renamed column in well_data cannot silently become a clustering
# feature. .copy() gives an independent frame that later cells can add
# columns to.
CLUSTER_FEATURES = ["toc", "clay_vol"]
ml_df = df[CLUSTER_FEATURES].copy()

# NOTE(review): the features are clustered unscaled. In the rows shown in this
# notebook toc is roughly 1-2.5 and clay_vol roughly 0.25-0.35, so toc likely
# dominates the distance metric -- confirm whether scaling is wanted.
ml_df.head(10)

Unnamed: 0_level_0,toc,clay_vol
well_id,Unnamed: 1_level_1,Unnamed: 2_level_1
42-383389920000,2.036221,0.348545
42-383375670000,1.959924,0.317504
42-461400590000,2.110517,0.296745
42-383385530000,1.912096,0.345443
42-461398150000,1.407951,0.290434
42-329402720000,1.850712,0.249911
42-329413330000,1.192114,0.296486
42-317405940000,2.000713,0.31391
42-383397660000,2.450848,0.31046
42-383395770000,2.006208,0.307349


In [212]:
# Confirm which feature columns remain for clustering
ml_df.columns

Index(['toc', 'clay_vol'], dtype='object')

## K-Means for Rock Type Clustering

In [213]:
# Elbow-search setup: `k` holds the candidate cluster counts (1 through 10)
# and `inertia` collects the inertia of the model fitted for each of them.
# (plotly.express, hvplot.pandas and KMeans are imported in the notebook's
# top import cell.)
inertia = []
k = list(range(1, 11))

In [214]:
# Looking for the best K
# `inertia` is rebuilt from scratch so that re-running this cell cannot leave
# it longer than `k`. A "class" column added to ml_df by a later cell is
# excluded so earlier cluster labels are never used as a feature.
elbow_features = ml_df.drop(columns=["class"], errors="ignore")
inertia = []
for n_clusters in k:
    km = KMeans(n_clusters=n_clusters, random_state=0)
    km.fit(elbow_features)
    inertia.append(km.inertia_)

In [215]:
# Tabulate inertia against k and draw the elbow curve with hvPlot
df_elbow = pd.DataFrame({"k": k, "inertia": inertia})
df_elbow.hvplot.line(
    x="k",
    y="inertia",
    title="Elbow Curve",
    xticks=k,
)

In [216]:
# Fit the final K-Means model with 4 clusters.
# Only the feature columns are used: this cell writes a "class" column back
# into ml_df, so on a re-run the previous labels would otherwise be fed to
# the model as an extra feature.
feature_cols = [col for col in ml_df.columns if col != "class"]

# Initializing model
model = KMeans(n_clusters=4, random_state=5)
model.fit(ml_df[feature_cols])

# Get the predictions
predictions = model.predict(ml_df[feature_cols])

# Add each well's cluster label to ml_df as a new "class" column
ml_df["class"] = model.labels_
ml_df.head()

Unnamed: 0_level_0,toc,clay_vol,class
well_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
42-383389920000,2.036221,0.348545,0
42-383375670000,1.959924,0.317504,0
42-461400590000,2.110517,0.296745,0
42-383385530000,1.912096,0.345443,0
42-461398150000,1.407951,0.290434,1


In [218]:
# Scatter of toc against clay_vol, one colour group per cluster label
ml_df.hvplot.scatter(
    x="clay_vol",
    y="toc",
    hover_cols=["class"],
    by="class",
)

In [221]:
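# # Plotting the clusters with three features
# # (Disabled: ml_df is built without the `lz_cat` and `tvd` columns, so this cannot run as written.)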
# fig = px.scatter_3d(ml_df, x="clay_vol", y="lz_cat", z="tvd", color="class", symbol="class", size="clay_vol",width=800)
# fig.update_layout(legend=dict(x=0,y=1))
# fig.show()

In [220]:
# Write the cluster label of every well (indexed by well_id) to CSV
class_labels = ml_df[["class"]]
class_labels.to_csv("k_means_output.csv")