In [19]:
# Import dependencies
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import graphviz
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import os
import subprocess


In [2]:
# Import Excel file into data frame
eagleford_df = pd.read_excel("Eagle Ford Data.xlsx")

In [84]:
eagleford_df.columns.tolist()

['API',
 'Longitude',
 'Latitude',
 'Bottomhole longitude',
 'Bottomhole latitude',
 'Well Number',
 'Lease Name',
 'Well Type',
 'Permit Type',
 'Pad',
 'Operator',
 'Operator (Subsidiary)',
 'Operator (Original)',
 'Operator Thematic Group',
 'Operator US Focus',
 'Play',
 'Sub-play',
 'State',
 'County',
 'Region',
 'Basin',
 'Reservoir',
 'Field',
 'Permit Date',
 'Drilling Start Date',
 'Drilling End Date',
 'Completion Date',
 'First Production Date',
 'IP Date',
 'Vertical Depth (m)',
 'Measured Depth (m)',
 'Lateral Length (m)',
 'Drilling Days (days)',
 'Drilling Speed (m/day)',
 'Fracture Stages (stages)',
 'Water (l)',
 'Proppant (kg)',
 'API Gravity (degree)',
 'Well Cost Total (US$)',
 'Rig Cost (US$)',
 'Casing Cost (US$)',
 'Water Cost (US$)',
 'Proppant Cost (US$)',
 'Pumping Cost (US$)',
 'Other Cost (US$)',
 'Cum30 Oil (bbl)',
 'Cum60 Oil (bbl)',
 'Cum90 Oil (bbl)',
 'Cum180 Oil (bbl)',
 'Cum365 Oil (bbl)',
 'Cum30 Gas (mcf)',
 'Cum60 Gas (mcf)',
 'Cum90 Gas (mcf)',
 

In [85]:
# Build data frame
data_df = pd.DataFrame({
                       'Sub-play':eagleford_df['Sub-play'],
                       'Longitude':eagleford_df['Longitude'],
                       'Latitude':eagleford_df['Latitude'],
                       'Well Cost Total (US$)':eagleford_df['Well Cost Total (US$)'],
                       'Cum365 Total (boe)':eagleford_df['Cum365 Total (boe)'],
                       'Vertical Depth (m)':eagleford_df['Vertical Depth (m)'],
                       'Measured Depth (m)':eagleford_df['Measured Depth (m)'],
                       'Lateral Length (m)':eagleford_df['Lateral Length (m)'],
                       })

In [86]:
# Drop rows with missing cumulative data
data_df = data_df.dropna(axis=0,how='any')

In [87]:
# Price of oil today 8/27/2018
oil_price = 68.92
data_df['Cum365 Revenue (US$)'] = data_df['Cum365 Total (boe)']*oil_price

In [88]:
data_df.head()

Unnamed: 0,Sub-play,Longitude,Latitude,Well Cost Total (US$),Cum365 Total (boe),Vertical Depth (m),Measured Depth (m),Lateral Length (m),Cum365 Revenue (US$)
21,Black Oil,-98.522838,28.922144,7397396.0,20837.0,2332.0,4008.0,1593.0,1436086.04
22,Black Oil,-98.541724,28.759596,7046343.0,51826.0,2657.0,4069.0,1341.0,3571847.92
23,Black Oil,-98.211906,28.846486,6313942.0,67468.0,3064.0,5022.0,1860.0,4649894.56
24,Black Oil,-98.750151,28.771177,5512727.0,20272.0,2377.0,3888.0,1436.0,1397146.24
25,Black Oil,-98.222394,28.84715,5065798.0,28316.0,3050.0,4432.0,1313.0,1951538.72


In [89]:
# Add break even column to data_df
data_df['Break Even'] = data_df.apply(lambda row: 1 
                                 if row['Cum365 Revenue (US$)'] - row['Well Cost Total (US$)'] >= 0 
                                 else 0, axis=1)

In [90]:
# Create training and test data sets
train, test = train_test_split(data_df, test_size=0.2)

In [97]:
# Define training features 
training_features = train.loc[:,['Latitude','Longitude','Vertical Depth (m)','Measured Depth (m)','Lateral Length (m)']]

In [99]:
# Define test features 
test_features = test.loc[:,['Latitude','Longitude','Vertical Depth (m)','Measured Depth (m)','Lateral Length (m)']]

In [94]:
# Define target
target = train['Break Even']

In [118]:
# Create decision tree model
dt = DecisionTreeClassifier()

In [119]:
# Fit training data to the model
dt.fit(training_features, target)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [120]:
# Make prediction by passing test features to the model
predict = dt.predict(test_features)

In [121]:
# Accuracy of predicion
accuracy_score(test['Break Even'], predict)

0.766295707472178

In [15]:
# Function to create tree visiualization as png file
def visualize_tree(tree, feature_names):
    """Create tree png using graphviz.

    Args
    ----
    tree -- scikit-learn DecsisionTree.
    feature_names -- list of feature names.
    """
    with open("dt.dot", 'w') as f:
        export_graphviz(tree, out_file=f,
                        feature_names=feature_names)

    command = ["dot", "-Tpng", "dt.dot", "-o", "dt.png"]
    try:
        subprocess.check_call(command)
    except:
        exit("Could not run dot, ie graphviz, to "
             "produce visualization")

In [83]:
# Create png file 
visualize_tree(dt, list(features))