In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
from sklearn.linear_model import LinearRegression

Load in total well data

In [2]:
# Read the all-wells CSV from the relative ../data directory into a DataFrame.
data = pd.read_csv(filepath_or_buffer=r'../data/all_wells.csv')
# Preview the first five rows.
data.head(n=5)

Unnamed: 0,easting,northing,porosity,permeability,Poisson's ratio,Young's Modulus,water saturation,oil saturation,proppant weight (lbs),pump rate (cubic feet/min),...,water 3,water 4,water 5,water 6,water 7,water 8,water 9,water 10,water 11,water 12
0,6300.0,53700.0,0.05,0.009,0.305,10589350.0,0.118212,0.881788,870417.418369,393.48628,...,,,,,,,,,,
1,6401.0,53700.0,0.03,0.004,0.305,10592060.0,0.11785,0.88215,,,...,,,,,,,,,,
2,6502.0,53700.0,0.05,0.008,0.305,10594870.0,0.117492,0.882508,,,...,,,,,,,,,,
3,6602.0,53700.0,0.03,0.003,0.305,10597760.0,0.117138,0.882862,860887.258744,278.775058,...,,,,,,,,,,
4,6703.0,53700.0,0.03,0.003,0.304,10600740.0,0.116788,0.883212,,,...,,,,,,,,,,


Only keep rows that provide overall well data

In [3]:
# Column labels for the twelve monthly oil-production readings: 'oil 1' .. 'oil 12'.
oil_months = [f'oil {month}' for month in range(1, 13)]

# Keep only rows that have a well name and all twelve monthly oil values.
# The explicit .copy() makes `wells` an independent frame, so adding columns
# to it later does not write into (or warn about) a selection taken from `data`.
wells = data[['well name', *oil_months]].dropna().copy()
wells.head()

Unnamed: 0,well name,oil 1,oil 2,oil 3,oil 4,oil 5,oil 6,oil 7,oil 8,oil 9,oil 10,oil 11,oil 12
8100,Tarragon 4-119H,10809.0,10108.0,9352.0,8626.0,7856.0,7137.0,6430.0,5678.0,4922.0,4205.0,3477.0,2724.0
8101,Fennel 10-129H,2049.0,1932.0,1784.0,1649.0,1499.0,1364.0,1226.0,1093.0,969.0,804.0,689.0,542.0
8102,Federal 14-113H,11699.0,10905.0,10149.0,9365.0,8521.0,7732.0,6919.0,6124.0,5338.0,4536.0,3781.0,2954.0
8103,King 7-184H,5980.0,5580.0,5176.0,4789.0,4405.0,3978.0,3581.0,3159.0,2735.0,2338.0,1914.0,1536.0
8104,Sundae 1-129H,3892.0,3646.0,3399.0,3133.0,2834.0,2577.0,2309.0,2063.0,1800.0,1545.0,1279.0,1007.0


Express each column as its natural log in order to perform exponential regression

In [4]:
# Natural log of every monthly oil value, computed in one vectorized step.
# An exponential decline q(t) = q0 * exp(k * t) becomes the straight line
# ln q(t) = ln q0 + k * t, which a linear regression can fit.
#
# Rows are selected by position rather than by hard-coded index labels, so
# this works for any number of wells and any index on `wells`.
month_columns = ['oil ' + str(month) for month in range(1, 13)]
well_logs = np.log(wells[month_columns]).reset_index(drop=True)

# Relabel the columns 'oil 0' .. 'oil 11' so each label matches the zero-based
# month offset used as the regression's x value.
well_logs.columns = ['oil ' + str(offset) for offset in range(len(month_columns))]

well_logs.head()

Unnamed: 0,oil 0,oil 1,oil 2,oil 3,oil 4,oil 5,oil 6,oil 7,oil 8,oil 9,oil 10,oil 11
0,9.288134,9.221082,9.143346,9.062536,8.969033,8.873048,8.76873,8.644354,8.50147,8.34403,8.153925,7.909857
1,7.625107,7.566311,7.486613,7.407924,7.312553,7.218177,7.111512,6.996681,6.876265,6.689599,6.535241,6.295266
2,9.367259,9.296977,9.22513,9.144735,9.050289,8.953123,8.842027,8.719971,8.582606,8.419801,8.237744,7.990915
3,8.696176,8.626944,8.551788,8.474077,8.390496,8.288534,8.183397,8.058011,7.913887,7.757051,7.556951,7.336937
4,8.266678,8.201386,8.131237,8.049746,7.949444,7.854381,7.74457,7.631917,7.495542,7.342779,7.153834,6.914731


Model the decay rate for each well by fitting a line to its log production (the fitted intercept corresponds to the log of initial production)

In [5]:
# Month offsets 0..11 as a single-feature column vector for the regressor.
x = np.arange(12).reshape(-1, 1)

# Per-well results, in the same row order as `well_logs`.
# NOTE(review): `initial` is never filled; np.exp(model.intercept_) would give
# the fitted initial production, but that estimate is currently not recorded.
decays, initial, scores = [], [], []

for _row_label, log_row in well_logs.iterrows():
    # One well's twelve log-production values.
    y = np.array(log_row)

    # Fit ln(production) against month offset for this well.
    model = LinearRegression().fit(x, y)

    decays.append(model.coef_[0])      # slope of the fitted line = decay rate
    scores.append(model.score(x, y))   # R^2 of the fit on the same points

# Attach the results to the per-well table (assigned by position).
wells['decay'] = decays
wells['regression score'] = scores

In [6]:
# Preview the first five wells.
wells.head(n=5)

Unnamed: 0,well name,oil 1,oil 2,oil 3,oil 4,oil 5,oil 6,oil 7,oil 8,oil 9,oil 10,oil 11,oil 12,decay,regression score
8100,Tarragon 4-119H,10809.0,10108.0,9352.0,8626.0,7856.0,7137.0,6430.0,5678.0,4922.0,4205.0,3477.0,2724.0,-0.119736,0.964746
8101,Fennel 10-129H,2049.0,1932.0,1784.0,1649.0,1499.0,1364.0,1226.0,1093.0,969.0,804.0,689.0,542.0,-0.116082,0.967369
8102,Federal 14-113H,11699.0,10905.0,10149.0,9365.0,8521.0,7732.0,6919.0,6124.0,5338.0,4536.0,3781.0,2954.0,-0.11966,0.965494
8103,King 7-184H,5980.0,5580.0,5176.0,4789.0,4405.0,3978.0,3581.0,3159.0,2735.0,2338.0,1914.0,1536.0,-0.11905,0.964329
8104,Sundae 1-129H,3892.0,3646.0,3399.0,3133.0,2834.0,2577.0,2309.0,2063.0,1800.0,1545.0,1279.0,1007.0,-0.117664,0.966516
