In [1]:
# keep the events that has long transient time (12 hours or more) and drop the rest, drop the class that points to steady faulty,
#  predict the transient class
#Model used XGBOOST
# train, test, validation dataset were carefully created to have all target class values 

# Model XGBoost V1

**In this notebook, we develop a predictive model to classify operational state of the offshore wells that are in an abnormal state, the aim is to give the people incharge a chance to
evaluate the sitatuin before the well reaches the faulty state**

In [2]:
import numpy as np
from sklearn.model_selection import train_test_split
import pandas as pd
import xgboost as xgb  # Import XGBoost
from sklearn.utils.class_weight import compute_class_weight  # For handling class imbalance
from sklearn.metrics import accuracy_score, classification_report  # For evaluation


In [3]:
df = pd.read_parquet('Data/cleaned_data.parquet')
print(df.head())

            timestamp  label        well              id  P-MON-CKP  \
0 2013-10-04 22:54:00      9  WELL-00020  20131004225400  7817419.0   
1 2013-10-04 22:54:01      9  WELL-00020  20131004225400  7817328.0   
2 2013-10-04 22:54:02      9  WELL-00020  20131004225400  7817236.0   
3 2013-10-04 22:54:03      9  WELL-00020  20131004225400  7817146.0   
4 2013-10-04 22:54:04      9  WELL-00020  20131004225400  7817055.0   

        P-PDG       P-TPT  T-JUS-CKP     T-TPT  class  
0  17077970.0  25102880.0  -10.62551  3.593652    0.0  
1  17077990.0  25103050.0  -10.62626  3.593652    0.0  
2  17078010.0  25103220.0  -10.62700  3.593652    0.0  
3  17078040.0  25103380.0  -10.62774  3.593652    0.0  
4  17078060.0  25103550.0  -10.62848  3.593652    0.0  


In [4]:
# Filter the DataFrame to keep only rows with labels 0, 1, 5, or 7
df = df[df['label'].isin([0, 1, 5, 7])]

# Reset the index if needed
df.reset_index(drop=True, inplace=True)

# Check the results
print(df.info())
print(df['label'].value_counts())  # Verify only the desired labels are included


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35717592 entries, 0 to 35717591
Data columns (total 10 columns):
 #   Column     Dtype         
---  ------     -----         
 0   timestamp  datetime64[us]
 1   label      int64         
 2   well       object        
 3   id         object        
 4   P-MON-CKP  float64       
 5   P-PDG      float64       
 6   P-TPT      float64       
 7   T-JUS-CKP  float64       
 8   T-TPT      float64       
 9   class      float64       
dtypes: datetime64[us](1), float64(6), int64(1), object(2)
memory usage: 2.7+ GB
None
label
5    13301677
1     9089136
7     8619224
0     4707555
Name: count, dtype: int64


In [5]:
# Display unique values in the 'class' column
unique_values = df['class'].unique()

print("Unique values in the 'class' column:")
print(unique_values)


Unique values in the 'class' column:
[  0. 101.   1. 105.   5. 107.   7.]


In [6]:
# Filter the DataFrame to keep only rows with class [0,101,105,107]
df = df[df['class'].isin([0,101,105,107])]

# Reset the index if needed
df.reset_index(drop=True, inplace=True)


In [7]:
#check
# Display unique values in the 'class' column
unique_values = df['class'].unique()

print("Unique values in the 'class' column:")
print(unique_values)


Unique values in the 'class' column:
[  0. 101. 105. 107.]


In [8]:
df = df.drop(columns=['label'], errors='ignore')


In [15]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.preprocessing import StandardScaler
import xgboost as xgb


# Features and sequence length
features = ['P-MON-CKP', 'P-PDG', 'P-TPT', 'T-JUS-CKP', 'T-TPT']
sequence_length = 60

# Step 1: Split IDs for `class` 0 separately
class_0_ids = df[df['class'] == 0]['id'].unique()
train_0_ids, test_0_ids = train_test_split(class_0_ids, test_size=0.2, random_state=42)
train_0_ids, val_0_ids = train_test_split(train_0_ids, test_size=0.2, random_state=42)

# Step 2: Split IDs for other classes (`101`, `105`, `107`) separately
class_other_ids = df[df['class'].isin([101, 105, 107])].groupby('class')['id'].unique()

train_other_ids, val_other_ids, test_other_ids = [], [], []
for cls, ids in class_other_ids.items():
    train_ids, test_ids = train_test_split(ids, test_size=0.2, random_state=42)
    train_ids, val_ids = train_test_split(train_ids, test_size=0.2, random_state=42)
    train_other_ids.extend(train_ids)
    val_other_ids.extend(val_ids)
    test_other_ids.extend(test_ids)

# Combine IDs
train_ids = np.unique(np.concatenate([train_0_ids, train_other_ids]))
val_ids = np.unique(np.concatenate([val_0_ids, val_other_ids]))
test_ids = np.unique(np.concatenate([test_0_ids, test_other_ids]))

# Step 3: Split the DataFrame based on IDs
train_df = df[df['id'].isin(train_ids)]
val_df = df[df['id'].isin(val_ids)]
test_df = df[df['id'].isin(test_ids)]


In [16]:

# Flatten the data to 2D for XGBoost
def prepare_x_y(df, features):
    X = df[features].values
    y = df['class'].values
    return X, y

X_train, y_train = prepare_x_y(train_df, features)
X_val, y_val = prepare_x_y(val_df, features)
X_test, y_test = prepare_x_y(test_df, features)

# Scale the features using StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

# Map class labels to zero-indexed format
class_mapping = {cls: idx for idx, cls in enumerate(sorted(np.unique(y_train)))}
y_train_mapped = np.array([class_mapping[label] for label in y_train])  # Map y_train to 0-indexed labels
y_val_mapped = np.array([class_mapping[label] for label in y_val])      # Map y_val to 0-indexed labels
y_test_mapped = np.array([class_mapping[label] for label in y_test])    # Map y_test to 0-indexed labels

# Compute class weights to handle class imbalance
class_weights = compute_class_weight('balanced', classes=np.unique(y_train_mapped), y=y_train_mapped)
class_weights_dict = {i: weight for i, weight in enumerate(class_weights)}

print("Class mapping (original to sequential):", class_mapping)
print("Class weights (balanced):", class_weights_dict)


Class mapping (original to sequential): {0.0: 0, 101.0: 1, 105.0: 2, 107.0: 3}
Class weights (balanced): {0: 0.8867230524608758, 1: 1.0160307460760165, 2: 2.0956117428184666, 3: 0.7087964129399412}


In [17]:

xgb_model = xgb.XGBClassifier(
    objective='multi:softmax',  # Softmax for multi-class classification
    num_class=len(class_mapping),  # Number of unique classes
    eval_metric='mlogloss',       # Multi-class log loss
    use_label_encoder=False,      # Avoid label encoder warnings
    n_estimators=100,             # Number of trees
    max_depth=6,                  # Tree depth
    learning_rate=0.1,            # Learning rate
    random_state=42
)

# Train the model with early stopping
xgb_model.fit(
    X_train, y_train_mapped,
    eval_set=[(X_val, y_val_mapped)],
    verbose=True
)


Parameters: { "use_label_encoder" } are not used.



[0]	validation_0-mlogloss:1.24278
[1]	validation_0-mlogloss:1.12600
[2]	validation_0-mlogloss:1.02590
[3]	validation_0-mlogloss:0.94144
[4]	validation_0-mlogloss:0.86776
[5]	validation_0-mlogloss:0.80517
[6]	validation_0-mlogloss:0.75083
[7]	validation_0-mlogloss:0.70188
[8]	validation_0-mlogloss:0.65718
[9]	validation_0-mlogloss:0.61783
[10]	validation_0-mlogloss:0.58202
[11]	validation_0-mlogloss:0.55014
[12]	validation_0-mlogloss:0.52103
[13]	validation_0-mlogloss:0.49625
[14]	validation_0-mlogloss:0.47348
[15]	validation_0-mlogloss:0.44975
[16]	validation_0-mlogloss:0.43031
[17]	validation_0-mlogloss:0.41245
[18]	validation_0-mlogloss:0.39655
[19]	validation_0-mlogloss:0.38183
[20]	validation_0-mlogloss:0.36779
[21]	validation_0-mlogloss:0.35679
[22]	validation_0-mlogloss:0.34618
[23]	validation_0-mlogloss:0.33722
[24]	validation_0-mlogloss:0.32769
[25]	validation_0-mlogloss:0.31949
[26]	validation_0-mlogloss:0.31143
[27]	validation_0-mlogloss:0.30422
[28]	validation_0-mlogloss:0.2

In [18]:

# Evaluate the model on the test set
test_accuracy = xgb_model.score(X_test, y_test_mapped)
print(f"Test Accuracy: {test_accuracy}")

# Reverse mapping for predictions
reverse_class_mapping = {v: k for k, v in class_mapping.items()}  # Reverse the mapping
predictions = xgb_model.predict(X_test)
original_labels = [reverse_class_mapping[pred] for pred in predictions]  # Map back to original class labels

# Display sample predictions
print("Sample predictions (original labels):", original_labels[:10])


Test Accuracy: 0.9372290976087746
Sample predictions (original labels): [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]


In [19]:
# Evaluate the model on the validation set
val_accuracy = xgb_model.score(X_val, y_val_mapped)
print(f"Validation Accuracy: {val_accuracy}")

# Predictions on the validation set
val_predictions = xgb_model.predict(X_val)

# Reverse mapping for validation predictions
val_original_labels = [reverse_class_mapping[pred] for pred in val_predictions]

# Display a few sample predictions for the validation set
print("Sample validation predictions (original labels):", val_original_labels[:10])

# Optionally, you can also compute additional metrics
from sklearn.metrics import classification_report, confusion_matrix

# Classification report for validation set
print("\nClassification Report on Validation Set:")
print(classification_report(y_val_mapped, val_predictions, target_names=[str(reverse_class_mapping[idx]) for idx in range(len(class_mapping))]))

# Confusion matrix for validation set
print("\nConfusion Matrix on Validation Set:")
print(confusion_matrix(y_val_mapped, val_predictions))


Validation Accuracy: 0.9274812204497658
Sample validation predictions (original labels): [105.0, 105.0, 105.0, 105.0, 105.0, 105.0, 105.0, 105.0, 105.0, 105.0]

Classification Report on Validation Set:
              precision    recall  f1-score   support

         0.0       0.84      0.88      0.86   1430815
       101.0       1.00      0.99      0.99   1526440
       105.0       0.90      1.00      0.95    682570
       107.0       0.95      0.88      0.92   1914488

    accuracy                           0.93   5554313
   macro avg       0.92      0.94      0.93   5554313
weighted avg       0.93      0.93      0.93   5554313


Confusion Matrix on Validation Set:
[[1262496    5428   76451   86440]
 [  10981 1515458       0       1]
 [   1887       0  680641      42]
 [ 221562       0       0 1692926]]


In [20]:
# Check unique 'class' values in the train, validation, and test datasets
train_classes = train_df['class'].unique()
val_classes = val_df['class'].unique()
test_classes = test_df['class'].unique()

print("Unique 'class' values in training dataset:", train_classes)
print("Unique 'class' values in validation dataset:", val_classes)
print("Unique 'class' values in test dataset:", test_classes)


Unique 'class' values in training dataset: [  0. 101. 105. 107.]
Unique 'class' values in validation dataset: [  0. 105. 107. 101.]
Unique 'class' values in test dataset: [  0. 101. 105. 107.]
