# Model 1

**In this notebook, we develop a predictive model to classify operational states in offshore wells, distinguishing between normal operations and undesirable events. This binary classification model aims to identify whether each operational instance is in a safe, normal state or if it indicates a potential issue that requires attention.**

In [1]:
import numpy as np
from sklearn.model_selection import train_test_split
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout


In [2]:
# Load the cleaned dataset from parquet and preview its first five rows.
df = pd.read_parquet(path='Data/cleaned_data.parquet')
print(df.head(n=5))

            timestamp  label        well              id  P-MON-CKP  \
0 2013-10-04 22:54:00      9  WELL-00020  20131004225400  7817419.0   
1 2013-10-04 22:54:01      9  WELL-00020  20131004225400  7817328.0   
2 2013-10-04 22:54:02      9  WELL-00020  20131004225400  7817236.0   
3 2013-10-04 22:54:03      9  WELL-00020  20131004225400  7817146.0   
4 2013-10-04 22:54:04      9  WELL-00020  20131004225400  7817055.0   

        P-PDG       P-TPT  T-JUS-CKP     T-TPT  class  
0  17077970.0  25102880.0  -10.62551  3.593652    0.0  
1  17077990.0  25103050.0  -10.62626  3.593652    0.0  
2  17078010.0  25103220.0  -10.62700  3.593652    0.0  
3  17078040.0  25103380.0  -10.62774  3.593652    0.0  
4  17078060.0  25103550.0  -10.62848  3.593652    0.0  


In [3]:
# Treat transient states as undesirable: every non-zero class becomes 1, class 0 stays 0.
# The comparison is vectorized rather than a per-row Python lambda, which matters
# at this row count. Note: a missing (NaN) class compares unequal to 0 and is
# therefore labelled 1.
df['target'] = df['class'].ne(0).astype('int64')

In [4]:
# Report the average time between consecutive rows of a single id.
def get_avg_time_between_rows(df):
    """Print and return the mean gap between consecutive timestamps.

    Intended to be applied to one ``id`` group at a time (for example through
    ``groupby('id').apply``). Only the first ``id`` value of the frame is read
    for the printed header, so the frame is assumed to hold a single id.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``id`` column and a ``timestamp`` column. Rows are
        presumably already in chronological order -- the function does not
        sort them.

    Returns
    -------
    The mean of the row-to-row ``timestamp`` differences. The first row has
    no predecessor, so its difference is missing and is skipped by ``mean``.
    """
    # Avoid shadowing the builtin `id`, and do not add a helper column to the
    # caller's frame: the difference is only needed for the mean.
    instance_id = df['id'].iloc[0]
    mean_gap = df['timestamp'].diff().mean()

    print(f'ID: {instance_id}')
    print(mean_gap)

    # Returning the value lets groupby(...).apply collect one mean per id.
    return mean_gap

# Run the timing check once per id; each group's result is printed by the function.
avg_time_between_rows = (
    df
    .groupby('id')
    .apply(get_avg_time_between_rows)
)

ID: 20131004225400
0 days 00:00:01
ID: 20140124083303
0 days 00:00:01
ID: 20140314144539
0 days 00:00:01
ID: 20140317130658
0 days 00:00:01
ID: 20140318005900
0 days 00:00:01
ID: 20140318135038
0 days 00:00:01
ID: 20140319021600
0 days 00:00:01
ID: 20140319110000
0 days 00:00:01
ID: 20140804190012
0 days 00:00:01
ID: 20140804210008
0 days 00:00:01
ID: 20140804230059
0 days 00:00:01
ID: 20140805010033
0 days 00:00:01
ID: 20140805030025
0 days 00:00:01
ID: 20140805050008
0 days 00:00:01
ID: 20140805070029
0 days 00:00:01
ID: 20140805090050
0 days 00:00:01
ID: 20140805110107
0 days 00:00:01
ID: 20140805130016
0 days 00:00:01
ID: 20140805150150
0 days 00:00:01
ID: 20140805170137
0 days 00:00:01
ID: 20140805190046
0 days 00:00:01
ID: 20140805210116
0 days 00:00:01
ID: 20140805230050
0 days 00:00:01
ID: 20140806010107
0 days 00:00:01
ID: 20140806030012
0 days 00:00:01
ID: 20140806050141
0 days 00:00:01
ID: 20140806070215
0 days 00:00:01
ID: 20140806090103
0 days 00:00:01
ID: 20140806110038
0

In [5]:
import numpy as np

def create_sequences_memmap(data, features, sequence_length, filename="sequences_memmap.dat"):
    """Build sliding-window sequences per ``id`` run and store them in memory-mapped files.

    Every run of consecutive rows sharing the same ``id`` is turned into
    overlapping windows of ``sequence_length`` rows (stride 1). The label of a
    window is the ``target`` value of its last row. Windows never span two
    different runs, and runs shorter than ``sequence_length`` are skipped.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the ``features`` columns plus ``id`` and ``target``.
        Rows belonging to one ``id`` are expected to be contiguous; if the same
        ``id`` appears in several separate stretches, each stretch is windowed
        independently.
    features : sequence of str
        Names of the feature columns, in the order they appear in the output.
    sequence_length : int
        Number of rows per window; must be at least 1.
    filename : str
        Path of the file backing the feature array. The labels are written to
        ``filename`` with ``"_y"`` appended. Existing files are overwritten.

    Returns
    -------
    (X, y)
        ``X`` is float32 with shape ``(num_sequences, sequence_length, len(features))``
        and ``y`` is float32 with shape ``(num_sequences,)``. Both are
        memory-mapped, except when there are no sequences at all, in which case
        plain empty arrays are returned and no file is created.

    Raises
    ------
    ValueError
        If ``sequence_length`` is smaller than 1.
    """
    if sequence_length < 1:
        raise ValueError("sequence_length must be a positive integer")

    feature_cols = list(features)
    num_features = len(feature_cols)

    # One 2-D array holding the feature columns followed by the target column.
    data_values = data[feature_cols + ['target']].values
    ids = data['id'].values

    # Split the rows into runs of consecutive identical ids. Boundaries are
    # taken where the id changes, so the result does not depend on the ids
    # being sorted in ascending order (np.unique would order the start indices
    # by id value and could produce empty or cross-id slices).
    boundaries = np.flatnonzero(ids[1:] != ids[:-1]) + 1
    run_starts = np.r_[0, boundaries]
    run_ends = np.r_[boundaries, len(ids)]

    # Exact number of windows, needed up front to size the memmap files.
    num_sequences = int(sum(
        max(0, end - start - sequence_length + 1)
        for start, end in zip(run_starts, run_ends)
    ))

    # Nothing to write: avoid asking np.memmap for a zero-length mapping, which
    # may be rejected depending on the NumPy version.
    if num_sequences == 0:
        empty_X = np.empty((0, sequence_length, num_features), dtype='float32')
        empty_y = np.empty((0,), dtype='float32')
        return empty_X, empty_y

    # Pre-allocate the memory-mapped files for X and y.
    X_memmap = np.memmap(filename, dtype='float32', mode='w+',
                         shape=(num_sequences, sequence_length, num_features))
    y_memmap = np.memmap(f"{filename}_y", dtype='float32', mode='w+',
                         shape=(num_sequences,))

    idx = 0  # Next free row in the memmap arrays
    for start, end in zip(run_starts, run_ends):
        # Skip runs shorter than the sequence length
        if end - start < sequence_length:
            continue

        group_data = data_values[start:end]

        # Read-only view of all windows; sliding_window_view puts the window
        # axis last, so swap it to get (window, time step, feature).
        windows = np.lib.stride_tricks.sliding_window_view(
            group_data[:, :-1], sequence_length, axis=0
        )
        sequences = windows.transpose(0, 2, 1)

        # Label of each window = target of its last row
        targets = group_data[sequence_length - 1:, -1]

        # Copy into the memmaps (values are cast to float32 on assignment)
        end_idx = idx + len(sequences)
        X_memmap[idx:end_idx] = sequences
        y_memmap[idx:end_idx] = targets
        idx = end_idx

    # Flush to disk and return the filled portion of the memmap arrays
    X_memmap.flush()
    y_memmap.flush()
    return X_memmap[:idx], y_memmap[:idx]

# Model inputs: the five measurement columns, windowed over 60 consecutive rows
features = ['P-MON-CKP', 'P-PDG', 'P-TPT', 'T-JUS-CKP', 'T-TPT']
sequence_length = 60

# Hold out 20% of the ids (not 20% of the rows) so that no id contributes
# windows to both the training and the test set
train_ids, test_ids = train_test_split(
    df['id'].unique(),
    test_size=0.2,
    random_state=42,
)
train_df = df.loc[df['id'].isin(train_ids)]
test_df = df.loc[df['id'].isin(test_ids)]

# Build the windowed arrays; each split is backed by its own memory-mapped file
X_train, y_train = create_sequences_memmap(
    data=train_df,
    features=features,
    sequence_length=sequence_length,
    filename="train_sequences.dat",
)
X_test, y_test = create_sequences_memmap(
    data=test_df,
    features=features,
    sequence_length=sequence_length,
    filename="test_sequences.dat",
)

# Sanity-check the resulting array shapes
print("Training shape:", X_train.shape, y_train.shape)
print("Testing shape:", X_test.shape, y_test.shape)


  X_memmap[idx:end_idx] = sequences
  X_memmap[idx:end_idx] = sequences


Training shape: (42757778, 60, 5) (42757778,)
Testing shape: (10221227, 60, 5) (10221227,)


In [None]:
# Build the network layer by layer: two stacked LSTMs, each followed by dropout,
# ending in a single sigmoid unit that outputs the probability of the positive class.
model = Sequential()
# First LSTM returns the full sequence so the second LSTM receives one vector per time step
model.add(LSTM(64, input_shape=(sequence_length, len(features)), return_sequences=True))
model.add(Dropout(0.2))
# Second LSTM keeps only its final state, summarising the whole window
model.add(LSTM(32, return_sequences=False))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))  # Sigmoid for binary classification

# Binary cross-entropy matches the 0/1 target; accuracy is tracked as the metric
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Fit for 5 epochs, holding back 20% of the training sequences for validation.
# NOTE(review): with a batch size of 8 and the tens of millions of training
# sequences reported by the shape check, one epoch is several million steps --
# consider a larger batch size.
# NOTE(review): the data preview shows pressure columns on the order of 1e7 and
# no scaling step is visible in this notebook -- confirm whether inputs should
# be normalised before training.
history = model.fit(
    X_train,
    y_train,
    epochs=5,
    batch_size=8,
    validation_split=0.2,
)

Epoch 1/5

In [None]:
# Score the trained model on the held-out test sequences; the two values
# unpacked here are the loss and the accuracy metric the model was compiled with.
test_loss, test_accuracy = model.evaluate(x=X_test, y=y_test)
print("Test Loss:", test_loss)
print("Test Accuracy:", test_accuracy)