Data Cleaning + Prepration and ML Modling 

# 1. Importing Libraries

In [1]:
import seaborn as sns
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
import sys
import os

%matplotlib inline

In [3]:
import pandas as pd
import dask.dataframe as dd
import numpy as np

# 2. Loading the Datasets


In [4]:
# Load the dataset
r_df = pd.read_csv('real_instances.csv')

In [5]:
pd.set_option('display.max_columns', None)

## 2. Pre-processing real_instances

In [6]:
temp_df = r_df

#### 2.1.  Drop features with >85^% of meissing values

In [7]:
# Drop features with high missing data (> 85%) based on EDA
features_to_drop = ['P-JUS-BS', 'P-JUS-CKP', 'P-MON-CKGL', 'T-MON-CKP', 'P-MON-SDV-P', 'PT-P', 'QBS']

temp_df.drop(columns=features_to_drop, inplace=True)


#### 2.2.  Drop instances messing classes

In [8]:
# Drop rows where the 'class' column has missing (NaN) values
temp_df = temp_df.dropna(subset=['class'])

# Check the number of rows remaining after dropping
print(f"Number of rows after dropping missing labels: {temp_df.shape[0]}")

# Check if there are any remaining missing values in 'class'
print(temp_df['class'].isnull().sum())  # Should print 0


Number of rows after dropping missing labels: 28843850
0


#### 2.3.  Impute rest of missing features using mean across same class type and well

In [10]:
# Function to impute missing values with the mean of the same class and well
def impute_class_well_mean(df):
    # Get a list of columns with missing values
    columns_with_missing_values = df.columns[df.isnull().any()].tolist()
    
    for column in columns_with_missing_values:
        # Group by both class and well, and fill missing values with the group-wise mean
        df[column] = df.groupby(['well', 'class'])[column].transform(lambda x: x.fillna(x.mean()))
    
    return df

# Impute missing values in all columns with missing values
temp_df = impute_class_well_mean(temp_df)

# Check if missing values are filled
print(temp_df.isnull().sum())


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column] = df.groupby(['well', 'class'])[column].transform(lambda x: x.fillna(x.mean()))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column] = df.groupby(['well', 'class'])[column].transform(lambda x: x.fillna(x.mean()))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column] = df.groupby

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column] = df.groupby(['well', 'class'])[column].transform(lambda x: x.fillna(x.mean()))


timestamp               0
label                   0
well                    0
id                      0
ABER-CKGL        21379029
ABER-CKP         17588881
ESTADO-DHSV      16668218
ESTADO-M1        12782303
ESTADO-M2        12806080
ESTADO-PXO       12580613
ESTADO-SDV-GL    13400572
ESTADO-SDV-P      6614910
ESTADO-W1        11910304
ESTADO-W2        12307438
ESTADO-XO        12221682
P-ANULAR          6534177
P-JUS-CKGL        6268383
P-MON-CKP         5644192
P-PDG             6244642
P-TPT             4795839
QGL              12716265
T-JUS-CKP        14022372
T-PDG             8918366
T-TPT             3585222
class                   0
state                   0
dtype: int64


### 2.1.  Stratified Sampling

In [11]:
temp_df['class'] = temp_df['class'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_df['class'] = temp_df['class'].astype(int)


In [12]:
from sklearn.model_selection import train_test_split

# Stratified sampling to maintain class proportions (ensuring all labels are present)
r_df_sample, _ = train_test_split(temp_df, test_size=0.90, stratify=temp_df['class'], random_state=42)

# Check the size and distribution of the sample
print(r_df_sample.shape)
print(r_df_sample['class'].value_counts(normalize=True))  # Check class proportions


(2884385, 26)
0      0.505139
107    0.231265
108    0.126731
4      0.085109
3      0.019732
105    0.011117
109    0.006776
8      0.004886
102    0.003089
101    0.002265
7      0.001052
2      0.000738
9      0.000615
6      0.000461
5      0.000458
1      0.000339
106    0.000228
Name: class, dtype: float64


## 3 ML Classifcation Model


### 3.1 Expriment 1: XGBoost Classfication

Model Loading & Prepration

In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score

# Assuming 'r_df_sample' is your dataset and 'class' is the target variable
# X will be the features, y will be the target class
X = r_df_sample.drop(columns=['class'])  # Features (drop the class column)
y = r_df_sample['class']  # Target (the class labels)

# Split the data into training and test sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)


In [14]:
from sklearn.preprocessing import LabelEncoder

# Initialize LabelEncoder
label_encoder = LabelEncoder()

# Fit and transform y for both training and test sets
y_train = label_encoder.fit_transform(y_train)
y_test = label_encoder.transform(y_test)

# Check the transformed class labels
print(np.unique(y_train))  # Ensure they are integers starting from 0


[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16]


In [15]:
# Convert class labels to integers if needed
y_train = y_train.astype(int)
y_test = y_test.astype(int)


In [16]:
# Columns to drop from the dataset
columns_to_drop = ['timestamp', 'label', 'well', 'id']

# Drop these columns from both the training and test sets
X_train = X_train.drop(columns=columns_to_drop)
X_test = X_test.drop(columns=columns_to_drop)

# Verify that the columns have been dropped
print("Training set columns after dropping:", X_train.columns)
print("Test set columns after dropping:", X_test.columns)


Training set columns after dropping: Index(['ABER-CKGL', 'ABER-CKP', 'ESTADO-DHSV', 'ESTADO-M1', 'ESTADO-M2',
       'ESTADO-PXO', 'ESTADO-SDV-GL', 'ESTADO-SDV-P', 'ESTADO-W1', 'ESTADO-W2',
       'ESTADO-XO', 'P-ANULAR', 'P-JUS-CKGL', 'P-MON-CKP', 'P-PDG', 'P-TPT',
       'QGL', 'T-JUS-CKP', 'T-PDG', 'T-TPT', 'state'],
      dtype='object')
Test set columns after dropping: Index(['ABER-CKGL', 'ABER-CKP', 'ESTADO-DHSV', 'ESTADO-M1', 'ESTADO-M2',
       'ESTADO-PXO', 'ESTADO-SDV-GL', 'ESTADO-SDV-P', 'ESTADO-W1', 'ESTADO-W2',
       'ESTADO-XO', 'P-ANULAR', 'P-JUS-CKGL', 'P-MON-CKP', 'P-PDG', 'P-TPT',
       'QGL', 'T-JUS-CKP', 'T-PDG', 'T-TPT', 'state'],
      dtype='object')


In [17]:
from sklearn.preprocessing import StandardScaler

# Initialize the scaler
scaler = StandardScaler()

# Fit the scaler on the training data and transform both training and test data
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


Model Training & Test

In [18]:
# Initialize XGBoost classifier for multi-class classification
xgb_simple_model = XGBClassifier(objective='multi:softmax', num_class=len(np.unique(y_train)), random_state=42)

# Train the model on the scaled data
xgb_simple_model.fit(X_train_scaled, y_train)

# Make predictions
y_pred = xgb_simple_model.predict(X_test_scaled)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy * 100:.2f}%")


Model Accuracy: 98.63%


Model validation

In [19]:
# Ensure that class 0.0 is excluded before transforming
subset_df = temp_df.sample(frac=0.1, random_state=42)


In [20]:

# Exclude rows where class is 0
subset_df = subset_df[subset_df['class'] != 0]  # Exclude class 0

# Separate features and target
X_subset = subset_df.drop(columns=columns_to_drop + ['class'])  # Drop unwanted columns
y_subset = subset_df['class']  # Target (class labels)

# 1. Label encode the target variable using the original label encoder
# Ensure that the original label encoder includes class 0 in its classes
y_subset_encoded = label_encoder.transform(y_subset)  # Use the original label_encoder (fitted on the full dataset)

# 2. Convert all feature columns to integer values
#X_subset = X_subset.astype(int)

# 3. Apply the same StandardScaler used in the original model
X_subset_scaled = scaler.transform(X_subset)

# 4. Use the previously trained model to make predictions
y_subset_pred = xgb_simple_model.predict(X_subset_scaled)


# 6. Calculate accuracy on the 10% subset
accuracy_subset = accuracy_score(y_subset_encoded, y_subset_pred)
print(f"Prediction Accuracy on 10% of the data excluding class 0: {accuracy_subset * 100:.2f}%")

# 7. (Optional) Print the classification report
#print(classification_report(y_subset_encoded, y_subset_pred, target_names=label_encoder.classes_))


Prediction Accuracy on 10% of the data excluding class 0: 98.57%


Run the model on the eniter real dataset

In [21]:

# Exclude rows where class is 0
new_df = temp_df[temp_df['class'] != 0]  # Exclude class 0

# Separate features and target
X_subset = new_df.drop(columns=columns_to_drop + ['class'])  # Drop unwanted columns
y_subset = new_df['class']  # Target (class labels)

# 1. Label encode the target variable using the original label encoder
# Ensure that the original label encoder includes class 0 in its classes
y_subset_encoded = label_encoder.transform(y_subset)  # Use the original label_encoder (fitted on the full dataset)

# 2. Convert all feature columns to integer values
#X_subset = X_subset.astype(int)

# 3. Apply the same StandardScaler used in the original model
X_subset_scaled = scaler.transform(X_subset)

# 4. Use the previously trained model to make predictions
y_subset_pred = xgb_simple_model.predict(X_subset_scaled)


# 6. Calculate accuracy on the 10% subset
accuracy_subset = accuracy_score(y_subset_encoded, y_subset_pred)
print(f"Prediction Accuracy on the full real data excluding class 0: {accuracy_subset * 100:.2f}%")

Prediction Accuracy on the full real data excluding class 0: 98.57%


In [22]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

accuracy_subset = accuracy_score(y_subset_encoded, y_subset_pred)

# Calculate precision, recall, F1-score
precision_subset = precision_score(y_subset_encoded, y_subset_pred, average='weighted')  # Weighted for imbalanced classes
recall_subset = recall_score(y_subset_encoded, y_subset_pred, average='weighted')
f1_subset = f1_score(y_subset_encoded, y_subset_pred, average='weighted')

# Generate confusion matrix
conf_matrix = confusion_matrix(y_subset_encoded, y_subset_pred)

# Print out all the metrics
print(f"Prediction Accuracy: {accuracy_subset * 100:.2f}%")
print(f"Precision: {precision_subset * 100:.2f}%")
print(f"Recall: {recall_subset * 100:.2f}%")
print(f"F1 Score: {f1_subset * 100:.2f}%")
print("Confusion Matrix:")
print(conf_matrix)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Prediction Accuracy: 98.57%
Precision: 99.89%
Recall: 98.57%
F1 Score: 99.22%
Confusion Matrix:
[[      0       0       0       0       0       0       0       0       0
        0       0       0       0       0       0       0       0]
 [      0    9576       0       0       0       0       0       0       0
        0     207       0       0       0       0       0       0]
 [      0       0   21190       0       0       0       0       0       0
        0       0      84       0       0       0       0       0]
 [      0       0       0  568555     597       0       0       0       0
        0       0       0       0       0       0       0       0]
 [  29728       0       0     223 2424924       0       0       0       0
        0       8       0       0       0       0       0       0]
 [      0       0       0       0       0   13017       0       0       0
        0       0       0     188       0       0       0       0]
 [    847       0       0       0       0       0   12166 

In [24]:
from sklearn.metrics import accuracy_score

# Ensure y_subset_pred and y_subset_encoded are available (predicted and actual labels)

# Add 'class' and 'well' information back to the predictions
results_df = new_df.copy()  # Use your DataFrame with class and well columns
results_df['predicted_class'] = y_subset_pred  # Add predicted labels to the DataFrame

# 1. Accuracy per well
well_accuracy = results_df.groupby('well').apply(
    lambda x: accuracy_score(x['class'], x['predicted_class'])
)

print("Accuracy per Well:")
print(well_accuracy)

# 2. Accuracy per class
class_accuracy = results_df.groupby('class').apply(
    lambda x: accuracy_score(x['class'], x['predicted_class'])
)

print("Accuracy per Class:")
print(class_accuracy)


Accuracy per Well:
well
WELL-00001    0.844772
WELL-00002    0.946624
WELL-00003    0.202475
WELL-00004    0.992004
WELL-00005    0.999967
WELL-00006    0.091996
WELL-00007    1.000000
WELL-00009    0.347326
WELL-00010    0.996246
WELL-00011    0.190024
WELL-00012    0.144197
WELL-00013    0.111972
WELL-00014    0.910328
WELL-00015    0.067168
WELL-00016    0.396990
WELL-00019    0.054227
WELL-00020    0.014563
WELL-00021    0.000000
WELL-00022    0.000000
WELL-00023    0.000000
WELL-00024    0.001088
WELL-00025    0.059840
WELL-00026    0.004065
WELL-00027    0.045898
WELL-00028    0.020104
WELL-00029    0.046979
WELL-00030    0.066617
WELL-00031    0.008436
WELL-00032    0.091350
WELL-00037    0.000000
WELL-00040    0.000000
WELL-00041    0.000000
WELL-00042    0.159961
dtype: float64
Accuracy per Class:
class
1      0.978841
2      0.996052
3      0.998951
4      0.987796
5      0.985763
6      0.915770
7      0.869277
8      0.985027
9      0.994423
101    0.000000
102    0.000000


In [28]:
from sklearn.metrics import classification_report

# Print the classification report for each class
print(classification_report(y_subset_encoded, y_subset_pred))


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       0.95      0.98      0.96      9783
           2       0.92      1.00      0.95     21274
           3       1.00      1.00      1.00    569152
           4       1.00      0.99      0.99   2454883
           5       0.97      0.99      0.98     13205
           6       0.94      0.92      0.93     13285
           7       0.94      0.87      0.90     30339
           8       0.99      0.99      0.99    140920
           9       1.00      0.99      1.00     17752
          10       1.00      0.98      0.99     65340
          11       1.00      0.97      0.98     89091
          12       1.00      0.99      1.00    320672
          13       0.88      0.31      0.45      6569
          14       1.00      0.99      0.99   6670582
          15       1.00      0.98      0.99   3655418
          16       1.00      0.98      0.99    195443

    accuracy              

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
