In [20]:
from IPython.display import clear_output
import os
from tqdm.auto import tqdm

import pandas as pd
from keras.models import Sequential
from keras.layers import LSTM, Dense


from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from functions.graph_with_prediction import graph_with_prediction
from functions.train_test_wells import train_test_wells

In [2]:
train_data = pd.read_csv('../../data/preprocessed_data/train_with_inj_wells_without_gaps.csv')
train_data['MEASURED_IN_DATE'] = pd.to_datetime(train_data['MEASURED_IN_DATE'])
train_data.set_index('MEASURED_IN_DATE', inplace=True)

X = train_data.iloc[:, : -1]
y = train_data['TARGET_OIL_RATE']

data_for_well = train_data[train_data['WELL_NAME'] == 1]
X_well = data_for_well.iloc[:, : -1]
y_well = data_for_well['TARGET_OIL_RATE']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, shuffle=False)
X_well_train, X_well_test, y_well_train, y_well_test = train_test_split(X_well, y_well, test_size=0.1, shuffle=False)

In [3]:
df = pd.DataFrame(train_data.isna().any()).reset_index()
df[df[0] == True]['index'].values


array([], dtype=object)

In [6]:
data_for_well.shape[1], X_train.shape[1]

(114, (32249, 113))

In [4]:
total_days_ago = data_for_well.shape[1] - 1

model = Sequential()
model.add(LSTM(64, input_shape=(total_days_ago, 1)))
model.add(Dense(1))

model.compile(loss='mean_squared_error', optimizer='adam')

model_fit = model.fit(X_train, y_train, epochs=10, batch_size=64, verbose=1)


Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
 25/504 [>.............................] - ETA: 40s - loss: 38746.3867

KeyboardInterrupt: 

In [28]:
clear_output(wait=True)

target = 'OIL_RATE'
test_sizes = [30, 45, 60, 120]


path_to_dir_with_data = '../../data/data_for_model/dataframes/'
name_files = os.listdir(path_to_dir_with_data)[1:]


for data_name in tqdm(name_files, desc='Csv', colour='red', leave=False, position=0):
    count_feature = int(data_name.split('_')[0])
    
    path_to_data = f'{path_to_dir_with_data}{data_name}'
    df = pd.read_csv(path_to_data)
    
    df['MEASURED_IN_DATE'] = pd.to_datetime(df['MEASURED_IN_DATE'])
    df.set_index('MEASURED_IN_DATE', inplace=True)
    
    for test_size in test_sizes:
        X_train, X_test, y_train, y_test = train_test_wells(df, target=target , test_size=test_size)
        X_train = X_train.astype('float')
        X_test = X_test.astype('float') 
        y_train = y_train.astype('float') 
        y_test = y_test.astype('float')
        
        input_feature = X_train.shape[1]
        
        model = Sequential()
        model.add(LSTM(64, input_shape=(input_feature, 1)))
        model.add(Dense(1))

        model.compile(loss='mean_squared_error', optimizer='adam')

        model_fit = model.fit(X_train, y_train, epochs=800, batch_size=64, verbose=1)
        
        for well in list(df['WELL_NAME'].unique()):
                    try:
                    
                        X_test_well = X_test[X_test['WELL_NAME'] == well]
                        y_test_well = y_test[y_test['WELL_NAME'] == well][f'TARGET_{target}']

                        data = X_test_well.iloc[[0]]

                        total_points = X_test_well.shape[0]

                        answer = []

                        for point in range(total_points):

                            value_point = float(model.predict(data))


                            answer.append(value_point)

                            if len(answer) == total_points:
                                break

                            data.index = [X_test_well.index[point + 1]]
                            data.iloc[:, : - count_feature] = [X_test_well.iloc[point + 1, : - count_feature]]

                            data.iloc[:, - count_feature : -1] = data.iloc[:, - count_feature + 1 :];
                            data.iloc[:, - 1] = value_point

                        answer = pd.Series(answer, index=y_test_well.index).astype('float')

                        MSE = mean_squared_error(y_test_well, answer) 
                        MAE = mean_absolute_error(y_test_well, answer)

                        result_new.loc[result_new.shape[0]] = [data_name, 
                                                       well, 
                                                       count_feature,
                                                       test_size,  
                                                       MSE, 
                                                       MAE]

                        result_new.drop_duplicates(inplace=True)
                    
                    except ValueError:
                        result_new.loc[result_new.shape[0]] = [data_name, 
                                                               well,
                                                               count_feature,
                                                               test_size,
                                                               np.nan, 
                                                               np.nan]
                        result_new.drop_duplicates(inplace=True)
                        continue
                

Csv:   0%|[31m                                                                                       [0m| 0/4 [00:00<?, ?it/s][0m

Epoch 1/800
Epoch 2/800
Epoch 3/800
Epoch 4/800
Epoch 5/800
Epoch 6/800
Epoch 7/800
Epoch 8/800
Epoch 9/800
Epoch 10/800
Epoch 11/800
Epoch 12/800
Epoch 13/800
Epoch 14/800
Epoch 15/800
Epoch 16/800
Epoch 17/800
Epoch 18/800
Epoch 19/800
Epoch 20/800
Epoch 21/800
Epoch 22/800
Epoch 23/800
Epoch 24/800
Epoch 25/800
Epoch 26/800
Epoch 27/800
Epoch 28/800
Epoch 29/800
Epoch 30/800
Epoch 31/800
Epoch 32/800
Epoch 33/800
Epoch 34/800
Epoch 35/800

                                                                                                                       

KeyboardInterrupt: 

result_new = pd.DataFrame(columns=['DATAFRAME_NAME', 'WELL_NAME', 'DAY_AGO', 'PREDICT_DAYS', 'MSE', 'MAE'],)
result_new

In [25]:
result_new.groupby(['DATAFRAME_NAME', 'WELL_NAME', 
                    'DAY_AGO', 'PREDICT_DAYS']).mean().dropna().sort_values(by=['MSE', 'MAE'])

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,MSE,MAE
DATAFRAME_NAME,WELL_NAME,DAY_AGO,PREDICT_DAYS,Unnamed: 4_level_1,Unnamed: 5_level_1
5_days_ago_stat.csv,1,5,30,0.161711,0.401682
5_days_ago_stat.csv,4,5,30,2.320119,1.381104
5_days_ago_not_stat.csv,71,5,30,13.228767,3.244256
5_days_ago_not_stat.csv,42,5,30,17.525462,4.013865
5_days_ago_not_stat.csv,81,5,30,18.997009,3.450099
5_days_ago_not_stat.csv,...,...,...,...,...
5_days_ago_not_stat.csv,57,5,30,376040.687579,609.414955
5_days_ago_not_stat.csv,53,5,30,379956.806526,612.096617
5_days_ago_not_stat.csv,53,5,45,410396.702435,638.070240
5_days_ago_not_stat.csv,46,5,45,455583.105508,668.542463


In [None]:
result_new.to_csv('../../data/data_for_model/RNN/result_new.csv)

In [64]:
y_pred = pd.Series(model.predict(X_test).reshape(-1), index=y_test.index)
model.history






<keras.callbacks.History at 0x1b3b5397850>

In [None]:
data = X_well_test.iloc[[0]]

total_point = X_well_test.shape[0]
count_days_ago = data.shape[1] - 1

answer = []

for i in range(total_point):
    
    value_point = float(model.predict(data))
    
    
    answer.append(value_point)
        
    data.iloc[0 , - count_days_ago : - 1] = data.iloc[0 , - count_days_ago + 1 : ]
    data.iloc[0 , - 1] = value_point

answer = pd.Series(answer, index=y_well_test.index)

In [None]:
import matplotlib.pyplot as plt

# Получаем значения функции потерь на каждой эпохе обучения
loss = model_fit.history['loss']

# Получаем значения функции потерь на валидации, если она была использована
if 'val_loss' in model_fit.history:
    val_loss = history.history['val_loss']

# Создаем график функции потерь
plt.plot(loss, label='Training Loss')
if 'val_loss' in model_fit.history:
    plt.plot(val_loss, label='Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

In [None]:
graph_with_prediction(y_well_train, y_well_test, answer, 'RNN')