# Прогноз добычи нефти. Подготовка массивов для обучения
Задача - спрогнозировать добычу нефти, исходя из данных по короткому начальному интервалу добычи, а также геологической и технической информации по скважине.

Возможны два подхода. "Параметрический" основан на предположении, что добыча падает по определенному закону (например, по закону Арпса, справедливость которого была подтверждена в первой части проекта), и модель должна подобрать параметры в формуле. "Непараметрический" не предполагает какую-то заданную форму кривой. Во втором случае модель можно будет применять только на временном интервале, использованном при обучении. Параметрический метод дает модель, не ограниченную временным интервалом обучения. Сначала для простоты пробую непараметрический метод.

Если наблюдать за добычей достаточно долго, то можно определить коэффициенты уравнения без машинного обучения. Но в данном случае для прогноза будет использован короткий период.

В этой версии для обучения будет использован одинаковый период времени для всех скважин, равный самой короткой продолжительности наблюдения среди всех скважин.

Целевая переменная - общий объем, добытый за весь этот период. Признаки - зависящие от времени данные (дебит, давление в скважине) за короткий начальный период (train_period) и независящие от времени геологические и технические данные, что потребует построения модели с двумя входами.

In [144]:
import pandas as pd
import numpy as np

In [145]:
# Load the static per-well table; the first CSV column is used as the index.
well_data = pd.read_csv('well_data_oil.csv', index_col=0)
well_data

Unnamed: 0,inverse_spacing,Initial_Pressure_Estimate_ksi,Dew_Point_Pressure_kpsi,Bubble_Point_Pressure_kpsi,Reservoir_Temperature_degFe2,Sandface_Temp_degFe2,Static_Wellhead_Temp_degFe2,Stages_e2,Net_Pay_kft,TVD_kft,...,Bottom_Perf_kft,Initial_GOR_Mscf/bbl,Clusters_e3,Porosity_e1,Clusters_per_Stage_e1,Total_Proppant_lbs_e8,Total_Fluid_Bbls_e6,Oil_Specific_Gravity,Water_Saturation,Oil_Saturation
CARDINAL,1.428571,5.9,2.44,2.44,2.28,2.28,0.7,0.49,0.067,8.759,...,16.226,0.75684,0.735,0.52,1.5,0.265768,0.542527,0.876161,0.36,0.64
CROW,0.769231,5.5,1.764,1.764,2.21,2.2,0.7,0.69,0.063,8.4202,...,19.171,0.51307,1.035,0.52,1.5,0.3902,0.575663,0.813218,0.321,0.679
EAGLE,0.0,5.0,2.122,2.122,2.35,2.34,0.6,0.5,0.074,7.89081,...,15.439,0.5579,0.45,0.63,0.9,0.252674,0.577003,0.847508,0.27,0.73
FALCON,1.0,5.7,1.37,1.37,2.25,2.25,0.6,0.64,0.066,7.501876,...,17.825,0.275,0.96,0.63,1.5,0.35971,0.559202,0.851896,0.27,0.73
HAWK,1.0,5.65,1.37,1.37,2.25,2.25,0.6,0.68,0.067,7.516675,...,18.496,0.275,1.02,0.63,1.5,0.380648,0.596128,0.851896,0.271,0.729
JAY,1.428571,5.7,2.459,2.459,2.28,2.279,0.7,0.48,0.067,8.79071,...,16.173,0.76384,0.432,0.52,0.9,0.25986,0.519038,0.876161,0.361,0.602
KITE,0.714286,5.0,3.064,3.064,2.38,2.35,0.65,0.34,0.056,8.84683,...,14.474,1.0347,0.306,0.5,0.9,0.185548,0.360202,0.808202,0.321,0.679
LARK,1.25,5.7,2.455,2.455,2.28,2.28,0.7,0.5,0.068,8.77986,...,16.199,0.76228,0.45,0.52,0.9,0.270246,0.535169,0.876161,0.363,0.637
OSPREY,0.714286,5.4,1.211,1.211,2.25,2.25,0.7,0.28,0.078,7.6386,...,13.913,0.33572,0.252,0.63,0.9,0.135145,0.275579,0.83832,0.26,0.74
SPARROW,0.666667,5.164,2.1,2.1,2.25,2.25,0.7,0.49,0.062,8.5364,...,16.029,0.6284,0.441,0.52,0.9,0.275507,0.533414,0.876161,0.332,0.668


In [146]:
# Load the day-by-day production table; the first CSV column is used as the
# index, and each row carries the 'period' it belongs to.
production_data = pd.read_csv('production_by_period_oil.csv', index_col=0)
production_data

Unnamed: 0,period,day,Gas Volume (MMscf),flowrate,Gas Lift Inj Volume (MMscf),Pressure Source _Tubing Pressure,Casing_Pressure_ksia,Tubing_Pressure_ksia,Active_Pressure_ksia,Line_Pressure_ksia,Calculated_Sandface_Pressure_ksia,water_thousand_bbl
CARDINAL,CARDINAL_0.0_119.0,0.0,0.24546,321.67,0.0,1,0.908000,1.820000,1.820000,0.059586,5.142505,0.66333
CARDINAL,CARDINAL_0.0_119.0,1.0,0.24935,337.89,0.0,1,0.912000,1.767000,1.767000,0.059586,5.089437,0.69000
CARDINAL,CARDINAL_0.0_119.0,2.0,0.25026,336.47,0.0,1,0.900000,1.747000,1.747000,0.059586,5.049800,0.64667
CARDINAL,CARDINAL_0.0_119.0,3.0,0.25158,353.89,0.0,1,0.897000,1.725000,1.725000,0.059586,4.995304,0.61000
CARDINAL,CARDINAL_0.0_119.0,4.0,0.24229,336.78,0.0,1,0.895000,1.711000,1.711000,0.059586,4.985308,0.59167
...,...,...,...,...,...,...,...,...,...,...,...,...
SWIFT,SWIFT_896.0_1245.0,231.0,0.03237,61.00,0.0,0,0.086496,0.119640,0.086496,0.014696,1.844417,0.04100
SWIFT,SWIFT_896.0_1245.0,232.0,0.03100,53.00,0.0,0,0.084821,0.097029,0.084821,0.014696,1.795906,0.00000
SWIFT,SWIFT_896.0_1245.0,233.0,0.03000,28.00,0.0,0,0.086696,0.117196,0.086696,0.014696,1.851028,0.02300
SWIFT,SWIFT_896.0_1245.0,234.0,0.01400,37.00,0.0,0,0.085596,0.103846,0.085596,0.014696,1.867198,0.05000


In [147]:
# Earliest recorded day of every period, kept as a one-column frame named
# 'start' (indexed by period) so it can be joined back onto the production table.
start = (
    production_data
    .groupby('period')['day']
    .min()
    .to_frame(name='start')
)
start

Unnamed: 0_level_0,start
period,Unnamed: 1_level_1
CARDINAL_0.0_119.0,0.0
CARDINAL_119.0_486.0,0.0
CARDINAL_486.0_781.0,0.0
CROW_24.0_216.0,0.0
EAGLE_0.0_205.0,1.0
EAGLE_205.0_1483.0,0.0
JAY_0.0_132.0,0.0
JAY_132.0_480.0,0.0
JAY_480.0_782.0,0.0
KITE_111.0_1066.0,0.0


Начало разное, поэтому выполняю сдвиг.

In [148]:
# Attach each period's first day as a 'start' column (lookup keyed on 'period').
production_data = production_data.join(start, on='period')

In [149]:
# Show the production table, which now carries the joined 'start' column.
production_data

Unnamed: 0,period,day,Gas Volume (MMscf),flowrate,Gas Lift Inj Volume (MMscf),Pressure Source _Tubing Pressure,Casing_Pressure_ksia,Tubing_Pressure_ksia,Active_Pressure_ksia,Line_Pressure_ksia,Calculated_Sandface_Pressure_ksia,water_thousand_bbl,start
CARDINAL,CARDINAL_0.0_119.0,0.0,0.24546,321.67,0.0,1,0.908000,1.820000,1.820000,0.059586,5.142505,0.66333,0.0
CARDINAL,CARDINAL_0.0_119.0,1.0,0.24935,337.89,0.0,1,0.912000,1.767000,1.767000,0.059586,5.089437,0.69000,0.0
CARDINAL,CARDINAL_0.0_119.0,2.0,0.25026,336.47,0.0,1,0.900000,1.747000,1.747000,0.059586,5.049800,0.64667,0.0
CARDINAL,CARDINAL_0.0_119.0,3.0,0.25158,353.89,0.0,1,0.897000,1.725000,1.725000,0.059586,4.995304,0.61000,0.0
CARDINAL,CARDINAL_0.0_119.0,4.0,0.24229,336.78,0.0,1,0.895000,1.711000,1.711000,0.059586,4.985308,0.59167,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
SWIFT,SWIFT_896.0_1245.0,231.0,0.03237,61.00,0.0,0,0.086496,0.119640,0.086496,0.014696,1.844417,0.04100,0.0
SWIFT,SWIFT_896.0_1245.0,232.0,0.03100,53.00,0.0,0,0.084821,0.097029,0.084821,0.014696,1.795906,0.00000,0.0
SWIFT,SWIFT_896.0_1245.0,233.0,0.03000,28.00,0.0,0,0.086696,0.117196,0.086696,0.014696,1.851028,0.02300,0.0
SWIFT,SWIFT_896.0_1245.0,234.0,0.01400,37.00,0.0,0,0.085596,0.103846,0.085596,0.014696,1.867198,0.05000,0.0


In [150]:
# Re-base the day counter so that every period begins at day 0.
production_data['day'] = production_data['day'].sub(production_data['start'])

In [151]:
# The helper column is no longer needed once the days have been shifted.
del production_data['start']

Проверяю:

In [152]:
# Sanity check: the smallest day of every period should now be 0.
production_data.groupby('period')[['day']].min()

Unnamed: 0_level_0,day
period,Unnamed: 1_level_1
CARDINAL_0.0_119.0,0.0
CARDINAL_119.0_486.0,0.0
CARDINAL_486.0_781.0,0.0
CROW_24.0_216.0,0.0
EAGLE_0.0_205.0,0.0
EAGLE_205.0_1483.0,0.0
JAY_0.0_132.0,0.0
JAY_132.0_480.0,0.0
JAY_480.0_782.0,0.0
KITE_111.0_1066.0,0.0


Добавляю периоды в данные по скважинам

In [153]:
# Take one row per distinct period from the production table, join the static
# well table onto it by index, then key the result by period.
well_data = (
    production_data[['period']]
    .drop_duplicates()
    .join(well_data)
    .set_index('period')
)
well_data

Unnamed: 0_level_0,inverse_spacing,Initial_Pressure_Estimate_ksi,Dew_Point_Pressure_kpsi,Bubble_Point_Pressure_kpsi,Reservoir_Temperature_degFe2,Sandface_Temp_degFe2,Static_Wellhead_Temp_degFe2,Stages_e2,Net_Pay_kft,TVD_kft,...,Bottom_Perf_kft,Initial_GOR_Mscf/bbl,Clusters_e3,Porosity_e1,Clusters_per_Stage_e1,Total_Proppant_lbs_e8,Total_Fluid_Bbls_e6,Oil_Specific_Gravity,Water_Saturation,Oil_Saturation
period,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CARDINAL_0.0_119.0,1.428571,5.9,2.44,2.44,2.28,2.28,0.7,0.49,0.067,8.759,...,16.226,0.75684,0.735,0.52,1.5,0.265768,0.542527,0.876161,0.36,0.64
CARDINAL_119.0_486.0,1.428571,5.9,2.44,2.44,2.28,2.28,0.7,0.49,0.067,8.759,...,16.226,0.75684,0.735,0.52,1.5,0.265768,0.542527,0.876161,0.36,0.64
CARDINAL_486.0_781.0,1.428571,5.9,2.44,2.44,2.28,2.28,0.7,0.49,0.067,8.759,...,16.226,0.75684,0.735,0.52,1.5,0.265768,0.542527,0.876161,0.36,0.64
CROW_24.0_216.0,0.769231,5.5,1.764,1.764,2.21,2.2,0.7,0.69,0.063,8.4202,...,19.171,0.51307,1.035,0.52,1.5,0.3902,0.575663,0.813218,0.321,0.679
EAGLE_0.0_205.0,0.0,5.0,2.122,2.122,2.35,2.34,0.6,0.5,0.074,7.89081,...,15.439,0.5579,0.45,0.63,0.9,0.252674,0.577003,0.847508,0.27,0.73
EAGLE_205.0_1483.0,0.0,5.0,2.122,2.122,2.35,2.34,0.6,0.5,0.074,7.89081,...,15.439,0.5579,0.45,0.63,0.9,0.252674,0.577003,0.847508,0.27,0.73
JAY_0.0_132.0,1.428571,5.7,2.459,2.459,2.28,2.279,0.7,0.48,0.067,8.79071,...,16.173,0.76384,0.432,0.52,0.9,0.25986,0.519038,0.876161,0.361,0.602
JAY_132.0_480.0,1.428571,5.7,2.459,2.459,2.28,2.279,0.7,0.48,0.067,8.79071,...,16.173,0.76384,0.432,0.52,0.9,0.25986,0.519038,0.876161,0.361,0.602
JAY_480.0_782.0,1.428571,5.7,2.459,2.459,2.28,2.279,0.7,0.48,0.067,8.79071,...,16.173,0.76384,0.432,0.52,0.9,0.25986,0.519038,0.876161,0.361,0.602
KITE_111.0_1066.0,0.714286,5.0,3.064,3.064,2.38,2.35,0.65,0.34,0.056,8.84683,...,14.474,1.0347,0.306,0.5,0.9,0.185548,0.360202,0.808202,0.321,0.679


Переносим столбец с источником давления в данные по скважинам, т.к. в данном случае эта величина постоянна в пределах каждого периода.

In [154]:
# Per-period mean of the pressure-source flag, stored as a static feature.
# The assignment aligns on the shared 'period' index.
well_data['Pressure_Source_Tubing_Pressure'] = (
    production_data
    .groupby('period')['Pressure Source _Tubing Pressure']
    .mean()
)
well_data

Unnamed: 0_level_0,inverse_spacing,Initial_Pressure_Estimate_ksi,Dew_Point_Pressure_kpsi,Bubble_Point_Pressure_kpsi,Reservoir_Temperature_degFe2,Sandface_Temp_degFe2,Static_Wellhead_Temp_degFe2,Stages_e2,Net_Pay_kft,TVD_kft,...,Initial_GOR_Mscf/bbl,Clusters_e3,Porosity_e1,Clusters_per_Stage_e1,Total_Proppant_lbs_e8,Total_Fluid_Bbls_e6,Oil_Specific_Gravity,Water_Saturation,Oil_Saturation,Pressure_Source_Tubing_Pressure
period,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CARDINAL_0.0_119.0,1.428571,5.9,2.44,2.44,2.28,2.28,0.7,0.49,0.067,8.759,...,0.75684,0.735,0.52,1.5,0.265768,0.542527,0.876161,0.36,0.64,1.0
CARDINAL_119.0_486.0,1.428571,5.9,2.44,2.44,2.28,2.28,0.7,0.49,0.067,8.759,...,0.75684,0.735,0.52,1.5,0.265768,0.542527,0.876161,0.36,0.64,1.0
CARDINAL_486.0_781.0,1.428571,5.9,2.44,2.44,2.28,2.28,0.7,0.49,0.067,8.759,...,0.75684,0.735,0.52,1.5,0.265768,0.542527,0.876161,0.36,0.64,0.0
CROW_24.0_216.0,0.769231,5.5,1.764,1.764,2.21,2.2,0.7,0.69,0.063,8.4202,...,0.51307,1.035,0.52,1.5,0.3902,0.575663,0.813218,0.321,0.679,1.0
EAGLE_0.0_205.0,0.0,5.0,2.122,2.122,2.35,2.34,0.6,0.5,0.074,7.89081,...,0.5579,0.45,0.63,0.9,0.252674,0.577003,0.847508,0.27,0.73,1.0
EAGLE_205.0_1483.0,0.0,5.0,2.122,2.122,2.35,2.34,0.6,0.5,0.074,7.89081,...,0.5579,0.45,0.63,0.9,0.252674,0.577003,0.847508,0.27,0.73,1.0
JAY_0.0_132.0,1.428571,5.7,2.459,2.459,2.28,2.279,0.7,0.48,0.067,8.79071,...,0.76384,0.432,0.52,0.9,0.25986,0.519038,0.876161,0.361,0.602,1.0
JAY_132.0_480.0,1.428571,5.7,2.459,2.459,2.28,2.279,0.7,0.48,0.067,8.79071,...,0.76384,0.432,0.52,0.9,0.25986,0.519038,0.876161,0.361,0.602,1.0
JAY_480.0_782.0,1.428571,5.7,2.459,2.459,2.28,2.279,0.7,0.48,0.067,8.79071,...,0.76384,0.432,0.52,0.9,0.25986,0.519038,0.876161,0.361,0.602,0.0
KITE_111.0_1066.0,0.714286,5.0,3.064,3.064,2.38,2.35,0.65,0.34,0.056,8.84683,...,1.0347,0.306,0.5,0.9,0.185548,0.360202,0.808202,0.321,0.679,1.0


In [155]:
# The flag now lives in well_data, so remove it from the time-dependent table.
del production_data['Pressure Source _Tubing Pressure']

Выборка очень маленькая, поэтому для окончательной проверки буду использовать только один период - с наибольшей продолжительностью наблюдения.

In [156]:
# Find the period with the largest final day: rank the per-period maxima in
# descending order and look at the top row.
(
    production_data
    .groupby('period')[['day']]
    .max()
    .sort_values(by='day', ascending=False)
    .iloc[0]
)

day    1163.0
Name: EAGLE_205.0_1483.0, dtype: float64

In [157]:
# Hold out the longest period as the test set, indexed by period.
test = (
    production_data
    .query('period == "EAGLE_205.0_1483.0"')
    .set_index('period')
)
test

Unnamed: 0_level_0,day,Gas Volume (MMscf),flowrate,Gas Lift Inj Volume (MMscf),Casing_Pressure_ksia,Tubing_Pressure_ksia,Active_Pressure_ksia,Line_Pressure_ksia,Calculated_Sandface_Pressure_ksia,water_thousand_bbl
period,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
EAGLE_205.0_1483.0,0.0,0.15700,316.730,0.000,0.261669,0.400000,0.400000,0.014696,3.052517,0.19700
EAGLE_205.0_1483.0,1.0,0.15400,393.412,0.000,0.262491,0.250000,0.250000,0.014696,2.730096,0.08000
EAGLE_205.0_1483.0,2.0,0.15100,324.598,0.000,0.263314,0.380000,0.380000,0.014696,2.853773,0.06800
EAGLE_205.0_1483.0,3.0,0.14900,390.811,0.000,0.264137,0.280000,0.280000,0.014696,2.777323,0.07100
EAGLE_205.0_1483.0,4.0,0.14800,414.277,0.000,0.264960,0.280000,0.280000,0.014696,2.827907,0.09100
...,...,...,...,...,...,...,...,...,...,...
EAGLE_205.0_1483.0,1159.0,0.05635,112.910,0.339,0.689696,0.094696,0.094696,0.085197,0.965118,0.06500
EAGLE_205.0_1483.0,1160.0,0.07729,126.250,0.365,0.699696,0.104696,0.104696,0.104611,0.941650,0.06800
EAGLE_205.0_1483.0,1161.0,0.07472,122.090,0.392,0.680696,0.094696,0.094696,0.107219,0.938363,0.09000
EAGLE_205.0_1483.0,1162.0,0.08539,182.500,0.418,0.694696,0.114696,0.114696,0.107219,1.088944,0.10100


In [158]:
# Every other period forms the training set, indexed by period.
train = (
    production_data
    .query('period != "EAGLE_205.0_1483.0"')
    .set_index('period')
)
train

Unnamed: 0_level_0,day,Gas Volume (MMscf),flowrate,Gas Lift Inj Volume (MMscf),Casing_Pressure_ksia,Tubing_Pressure_ksia,Active_Pressure_ksia,Line_Pressure_ksia,Calculated_Sandface_Pressure_ksia,water_thousand_bbl
period,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
CARDINAL_0.0_119.0,0.0,0.24546,321.67,0.0,0.908000,1.820000,1.820000,0.059586,5.142505,0.66333
CARDINAL_0.0_119.0,1.0,0.24935,337.89,0.0,0.912000,1.767000,1.767000,0.059586,5.089437,0.69000
CARDINAL_0.0_119.0,2.0,0.25026,336.47,0.0,0.900000,1.747000,1.747000,0.059586,5.049800,0.64667
CARDINAL_0.0_119.0,3.0,0.25158,353.89,0.0,0.897000,1.725000,1.725000,0.059586,4.995304,0.61000
CARDINAL_0.0_119.0,4.0,0.24229,336.78,0.0,0.895000,1.711000,1.711000,0.059586,4.985308,0.59167
...,...,...,...,...,...,...,...,...,...,...
SWIFT_896.0_1245.0,231.0,0.03237,61.00,0.0,0.086496,0.119640,0.086496,0.014696,1.844417,0.04100
SWIFT_896.0_1245.0,232.0,0.03100,53.00,0.0,0.084821,0.097029,0.084821,0.014696,1.795906,0.00000
SWIFT_896.0_1245.0,233.0,0.03000,28.00,0.0,0.086696,0.117196,0.086696,0.014696,1.851028,0.02300
SWIFT_896.0_1245.0,234.0,0.01400,37.00,0.0,0.085596,0.103846,0.085596,0.014696,1.867198,0.05000


In [159]:
# Static features of the held-out period; the list selector keeps the result
# a one-row DataFrame rather than a Series.
held_out_periods = ["EAGLE_205.0_1483.0"]
well_data_test = well_data.loc[held_out_periods]
well_data_test

Unnamed: 0_level_0,inverse_spacing,Initial_Pressure_Estimate_ksi,Dew_Point_Pressure_kpsi,Bubble_Point_Pressure_kpsi,Reservoir_Temperature_degFe2,Sandface_Temp_degFe2,Static_Wellhead_Temp_degFe2,Stages_e2,Net_Pay_kft,TVD_kft,...,Initial_GOR_Mscf/bbl,Clusters_e3,Porosity_e1,Clusters_per_Stage_e1,Total_Proppant_lbs_e8,Total_Fluid_Bbls_e6,Oil_Specific_Gravity,Water_Saturation,Oil_Saturation,Pressure_Source_Tubing_Pressure
period,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
EAGLE_205.0_1483.0,0.0,5.0,2.122,2.122,2.35,2.34,0.6,0.5,0.074,7.89081,...,0.5579,0.45,0.63,0.9,0.252674,0.577003,0.847508,0.27,0.73,1.0


In [160]:
# Static features of all remaining periods (the held-out row is removed).
well_data_train = well_data.drop(labels=["EAGLE_205.0_1483.0"], axis='index')
well_data_train

Unnamed: 0_level_0,inverse_spacing,Initial_Pressure_Estimate_ksi,Dew_Point_Pressure_kpsi,Bubble_Point_Pressure_kpsi,Reservoir_Temperature_degFe2,Sandface_Temp_degFe2,Static_Wellhead_Temp_degFe2,Stages_e2,Net_Pay_kft,TVD_kft,...,Initial_GOR_Mscf/bbl,Clusters_e3,Porosity_e1,Clusters_per_Stage_e1,Total_Proppant_lbs_e8,Total_Fluid_Bbls_e6,Oil_Specific_Gravity,Water_Saturation,Oil_Saturation,Pressure_Source_Tubing_Pressure
period,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CARDINAL_0.0_119.0,1.428571,5.9,2.44,2.44,2.28,2.28,0.7,0.49,0.067,8.759,...,0.75684,0.735,0.52,1.5,0.265768,0.542527,0.876161,0.36,0.64,1.0
CARDINAL_119.0_486.0,1.428571,5.9,2.44,2.44,2.28,2.28,0.7,0.49,0.067,8.759,...,0.75684,0.735,0.52,1.5,0.265768,0.542527,0.876161,0.36,0.64,1.0
CARDINAL_486.0_781.0,1.428571,5.9,2.44,2.44,2.28,2.28,0.7,0.49,0.067,8.759,...,0.75684,0.735,0.52,1.5,0.265768,0.542527,0.876161,0.36,0.64,0.0
CROW_24.0_216.0,0.769231,5.5,1.764,1.764,2.21,2.2,0.7,0.69,0.063,8.4202,...,0.51307,1.035,0.52,1.5,0.3902,0.575663,0.813218,0.321,0.679,1.0
EAGLE_0.0_205.0,0.0,5.0,2.122,2.122,2.35,2.34,0.6,0.5,0.074,7.89081,...,0.5579,0.45,0.63,0.9,0.252674,0.577003,0.847508,0.27,0.73,1.0
JAY_0.0_132.0,1.428571,5.7,2.459,2.459,2.28,2.279,0.7,0.48,0.067,8.79071,...,0.76384,0.432,0.52,0.9,0.25986,0.519038,0.876161,0.361,0.602,1.0
JAY_132.0_480.0,1.428571,5.7,2.459,2.459,2.28,2.279,0.7,0.48,0.067,8.79071,...,0.76384,0.432,0.52,0.9,0.25986,0.519038,0.876161,0.361,0.602,1.0
JAY_480.0_782.0,1.428571,5.7,2.459,2.459,2.28,2.279,0.7,0.48,0.067,8.79071,...,0.76384,0.432,0.52,0.9,0.25986,0.519038,0.876161,0.361,0.602,0.0
KITE_111.0_1066.0,0.714286,5.0,3.064,3.064,2.38,2.35,0.65,0.34,0.056,8.84683,...,1.0347,0.306,0.5,0.9,0.185548,0.360202,0.808202,0.321,0.679,1.0
LARK_122.0_438.0,1.25,5.7,2.455,2.455,2.28,2.28,0.7,0.5,0.068,8.77986,...,0.76228,0.45,0.52,0.9,0.270246,0.535169,0.876161,0.363,0.637,1.0


Минимальная продолжительность наблюдения.

In [161]:
# Shortest observation length over all periods: take the largest (shifted) day
# of each period, then the minimum of those maxima. The result is a
# one-element Series labelled 'day'; it is deliberately left as a Series
# because later cells index into it.
min_observation_period = production_data.groupby('period').agg({'day': 'max'}).min()
# Use .iloc[0] for positional access: a bare [0] on a label-indexed Series
# relies on a deprecated integer-position fallback.
min_observation_period.iloc[0]

101.0

In [162]:
# Turn 'period' back into an ordinary column so it can be selected and grouped on.
train = train.reset_index()

In [163]:
# Turn 'period' back into an ordinary column so it can be selected and grouped on.
test = test.reset_index()

In [164]:
# Show the training table after the index reset.
train

Unnamed: 0,period,day,Gas Volume (MMscf),flowrate,Gas Lift Inj Volume (MMscf),Casing_Pressure_ksia,Tubing_Pressure_ksia,Active_Pressure_ksia,Line_Pressure_ksia,Calculated_Sandface_Pressure_ksia,water_thousand_bbl
0,CARDINAL_0.0_119.0,0.0,0.24546,321.67,0.0,0.908000,1.820000,1.820000,0.059586,5.142505,0.66333
1,CARDINAL_0.0_119.0,1.0,0.24935,337.89,0.0,0.912000,1.767000,1.767000,0.059586,5.089437,0.69000
2,CARDINAL_0.0_119.0,2.0,0.25026,336.47,0.0,0.900000,1.747000,1.747000,0.059586,5.049800,0.64667
3,CARDINAL_0.0_119.0,3.0,0.25158,353.89,0.0,0.897000,1.725000,1.725000,0.059586,4.995304,0.61000
4,CARDINAL_0.0_119.0,4.0,0.24229,336.78,0.0,0.895000,1.711000,1.711000,0.059586,4.985308,0.59167
...,...,...,...,...,...,...,...,...,...,...,...
7707,SWIFT_896.0_1245.0,231.0,0.03237,61.00,0.0,0.086496,0.119640,0.086496,0.014696,1.844417,0.04100
7708,SWIFT_896.0_1245.0,232.0,0.03100,53.00,0.0,0.084821,0.097029,0.084821,0.014696,1.795906,0.00000
7709,SWIFT_896.0_1245.0,233.0,0.03000,28.00,0.0,0.086696,0.117196,0.086696,0.014696,1.851028,0.02300
7710,SWIFT_896.0_1245.0,234.0,0.01400,37.00,0.0,0.085596,0.103846,0.085596,0.014696,1.867198,0.05000


Отбираю значение целевой переменной за минимальный период наблюдения, чтобы у всех скважин время наблюдения было одинаковым.

In [165]:
# Training target: flowrate summed per period over the common window
# (day <= shortest observation length), so every period covers the same span.
# .iloc[0] replaces the bare [0], which on a label-indexed Series is resolved
# positionally only through a deprecated fallback.
y_train = train\
    .loc[train.day <= min_observation_period.iloc[0], ['period', 'flowrate']]\
    .groupby('period')\
    .agg({'flowrate': 'sum'})\
    .rename(columns={'flowrate': 'total_production'})
y_train

Unnamed: 0_level_0,total_production
period,Unnamed: 1_level_1
CARDINAL_0.0_119.0,32417.24
CARDINAL_119.0_486.0,23956.887
CARDINAL_486.0_781.0,8730.8
CROW_24.0_216.0,60222.0
EAGLE_0.0_205.0,81460.884
JAY_0.0_132.0,27991.78
JAY_132.0_480.0,16254.6258
JAY_480.0_782.0,5073.97
KITE_111.0_1066.0,15051.44
LARK_122.0_438.0,30718.8528


In [166]:
# Persist the training target (index = period) to CSV.
y_train.to_csv('y_train.csv')

Значение целевой переменной для тестовой скважины за минимальный период наблюдения (будем использовать для оценки непараметрического метода)

In [167]:
# Test target over the same common window as the training target
# (day <= shortest observation length).
# .iloc[0] replaces the bare [0], which on a label-indexed Series is resolved
# positionally only through a deprecated fallback.
y_test_short = test\
    .loc[test.day <= min_observation_period.iloc[0], ['period', 'flowrate']]\
    .groupby('period')\
    .agg({'flowrate': 'sum'})\
    .rename(columns={'flowrate': 'total_production'})
y_test_short

Unnamed: 0_level_0,total_production
period,Unnamed: 1_level_1
EAGLE_205.0_1483.0,45209.935


In [168]:
# Persist the windowed test target (index = period) to CSV.
y_test_short.to_csv('y_test_short.csv')

Значение целевой переменной для тестовой скважины за весь период наблюдения (будем использовать для оценки параметрического метода)

In [169]:
# Test target over the whole recorded history: flowrate summed per period
# with no day filter applied.
y_test_full = (
    test[['period', 'flowrate']]
    .groupby('period')
    .sum()
    .rename(columns={'flowrate': 'total_production'})
)
y_test_full

Unnamed: 0_level_0,total_production
period,Unnamed: 1_level_1
EAGLE_205.0_1483.0,238173.9993


In [170]:
# Persist the full-history test target (index = period) to CSV.
y_test_full.to_csv('y_test_full.csv')

Список периодов, чтобы сохранить порядок:

In [171]:
# Period labels in the row order of y_train.
periods = list(y_train.index)

Несколько вариантов продолжительности начального периода для последующего сравнения результатов

In [172]:
# Candidate input-window lengths; each is compared against the 'day' column
# (rows with day <= value are kept) when the feature arrays are built.
train_periods = [5, 10, 20, 30]

In [186]:
# Export the model inputs as .npy files, one set per input-window length.
# Rows of the training arrays follow the order of `periods`.
test_period_names = ["EAGLE_205.0_1483.0"]

# The static well features do not depend on the window length, so build the
# arrays once. They are still saved under every postfix so the set of output
# files stays the same as before.
well_data_train_np = np.array(
    [well_data_train.loc[period, :] for period in periods]
)
well_data_test_np = np.array(
    [well_data_test.loc[period, :] for period in test_period_names]
)

for train_period in train_periods:
    # Shared filename suffix, e.g. '_5'; np.save appends '.npy' itself.
    postfix = '_' + str(train_period)

    # Time-dependent training inputs: rows with day <= train_period, with the
    # target column removed, stacked period by period.
    # NOTE(review): assumes every period has the same number of rows inside
    # the window; otherwise np.array cannot form a regular 3-D array - confirm.
    X_train = train.loc[train.day <= train_period, :]\
        .drop(columns='flowrate')\
        .set_index('period')
    X_train_np = np.array([X_train.loc[period, :] for period in periods])
    np.save('X_train' + postfix, X_train_np)

    np.save('well_data_train' + postfix, well_data_train_np)

    # Same construction for the held-out period.
    X_test = test.loc[test.day <= train_period, :]\
        .drop(columns='flowrate')\
        .set_index('period')
    X_test_np = np.array([X_test.loc[period, :] for period in test_period_names])
    np.save('X_test' + postfix, X_test_np)

    np.save('well_data_test' + postfix, well_data_test_np)
