In [60]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

# Loading the Datasets

In [2]:
df_real = pd.read_csv('/Users/ghazijalal/Documents/IE University/Term 3/CAPSTONE PROJECT/Data/real_instances.csv')

In [3]:
df_sim = pd.read_csv('/Users/ghazijalal/Documents/IE University/Term 3/CAPSTONE PROJECT/Data/simulated_instances.csv')

In [4]:
df_drawn = pd.read_csv('/Users/ghazijalal/Documents/IE University/Term 3/CAPSTONE PROJECT/Data/hand_drawn_instances.csv')

In [5]:
# combine the dataframes
df_combined = pd.concat([df_real, df_sim, df_drawn], axis=0)

# Data Overview

In [6]:
def data_overview(df):
    """
    Function to provide an overview of the dataset.

    Parameters:
    df (pd.DataFrame): The DataFrame to analyze.

    Returns:
    None: Prints the overview of the DataFrame.
    """
    num_rows = df.shape[0]
    num_columns = df.shape[1]
    num_categorical = df.select_dtypes(include=['object']).shape[1]
    num_numerical = df.select_dtypes(exclude=['object']).shape[1]
    num_instances = df["id"].nunique()

    # Print the overview
    print(f"Number of rows: {num_rows}")
    print(f"Number of columns: {num_columns}")
    print(f"Number of categorical columns: {num_categorical}")
    print(f"Number of numerical columns: {num_numerical}")
    print(f"Number of unique instances: {num_instances}")

In [7]:
data_overview(df_real)

Number of rows: 32872250
Number of columns: 34
Number of categorical columns: 2
Number of numerical columns: 32
Number of unique instances: 1113


In [8]:
data_overview(df_sim)

Number of rows: 40748658
Number of columns: 34
Number of categorical columns: 2
Number of numerical columns: 32
Number of unique instances: 439


In [9]:
data_overview(df_drawn)

Number of rows: 2966410
Number of columns: 34
Number of categorical columns: 2
Number of numerical columns: 32
Number of unique instances: 10


# Unique Values

### Event Reference Table

In [63]:
label_descriptions = {
        0: 'NORMAL',
        1: 'ABRUPT INCREASE OF BSW',
        2: 'SPURIOUS CLOSURE OF DHSV',
        3: 'SEVERE SLUGGING',
        4: 'FLOW INSTABILITY',
        5: 'RAPID PRODUCTIVITY LOSS',
        6: 'QUICK RESTRICTION IN PCK',
        7: 'SCALING IN PCK',
        8: 'HYDRATE IN PRODUCTION LINE',
        9: 'UNKNOWN EVENT'
    }

# Create a DataFrame from the dictionary
event_reference_table = pd.DataFrame(list(label_descriptions.items()), columns=['label', 'Description'])
event_reference_table

Unnamed: 0,label,Description
0,0,NORMAL
1,1,ABRUPT INCREASE OF BSW
2,2,SPURIOUS CLOSURE OF DHSV
3,3,SEVERE SLUGGING
4,4,FLOW INSTABILITY
5,5,RAPID PRODUCTIVITY LOSS
6,6,QUICK RESTRICTION IN PCK
7,7,SCALING IN PCK
8,8,HYDRATE IN PRODUCTION LINE
9,9,UNKNOWN EVENT


### Feature Reference Table

In [54]:
# Dictionary of tags with their units
tags = {
    'ABER-CKGL': '%',
    'ABER-CKP': '%',
    'ESTADO-DHSV': '[0, 0.5, 1]',
    'ESTADO-M1': '[0, 0.5, 1]',
    'ESTADO-M2': '[0, 0.5, 1]',
    'ESTADO-PXO': '[0, 0.5, 1]',
    'ESTADO-SDV-GL': '[0, 0.5, 1]',
    'ESTADO-SDV-P': '[0, 0.5, 1]',
    'ESTADO-W1': '[0, 0.5, 1]',
    'ESTADO-W2': '[0, 0.5, 1]',
    'ESTADO-XO': '[0, 0.5, 1]',
    'P-ANULAR': 'Pa',
    'P-JUS-BS': 'Pa',
    'P-JUS-CKGL': 'Pa',
    'P-JUS-CKP': 'Pa',
    'P-MON-CKGL': 'Pa',
    'P-MON-CKP': 'Pa',
    'P-MON-SDV-P': 'Pa',
    'P-PDG': 'Pa',
    'PT-P': 'Pa',
    'P-TPT': 'Pa',
    'QBS': 'm3/s',
    'QGL': 'm3/s',
    'T-JUS-CKP': '°C',
    'T-MON-CKP': '°C',
    'T-PDG': '°C',
    'T-TPT': '°C',
    'class': 'Label',
    'state': 'Operational Status',
    'label': 'Instance Label',
    'well': 'Well Name',
    'id': 'Instance ID',
    'Instance': 'Instance ID (Manual)',
    'DataType': 'Dataset Type'
}

# Corresponding human-readable names for each tag
names = [
    'Opening of the GLCK (gas lift choke)',
    'Opening of the PCK (production choke)',
    'State of the DHSV (downhole safety valve)',
    'State of the PMV (production master valve)',
    'State of the AMV (annulus master valve)',
    'State of the PXO (pig-crossover) valve',
    'State of the gas lift SDV (shutdown valve)',
    'State of the production SDV (shutdown valve)',
    'State of the PWV (production wing valve)',
    'State of the AWV (annulus wing valve)',
    'State of the XO (crossover) valve',
    'Pressure in the well annulus',
    'Downstream pressure of the SP (service pump)',
    'Downstream pressure of the GLCK (gas lift choke)',
    'Downstream pressure of the PCK (production choke)',
    'Upstream pressure of the GLCK (gas lift choke)',
    'Upstream pressure of the PCK (production choke)',
    'Upstream pressure of the production SDV (shutdown valve)',
    'Pressure at the PDG (permanent downhole gauge)',
    'Downstream pressure of the PWV in the production tube',
    'Pressure at the TPT (temperature and pressure transducer)',
    'Flow rate at the SP (service pump)',
    'Gas lift flow rate',
    'Downstream temperature of the PCK (production choke)',
    'Upstream temperature of the PCK (production choke)',
    'Temperature at the PDG (permanent downhole gauge)',
    'Temperature at the TPT (temperature and pressure transducer)',
    'Label of the observation',
    'Well operational status',
    'Instance label (event type)',
    'Well name',
    'Instance identifier',
    'Instance count based on manual labeling',
    'Real , simulated or hand-drawn'
]

# Creating a reference table
reference_table = pd.DataFrame(index=np.arange(1, len(names) + 1))
reference_table.index.name = 'Number'
reference_table['Tag'] = tags.keys()
reference_table['Name'] = names
reference_table['Unit'] = tags.values()

# Display the table
reference_table


Unnamed: 0_level_0,Tag,Name,Unit
Number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,ABER-CKGL,Opening of the GLCK (gas lift choke),%
2,ABER-CKP,Opening of the PCK (production choke),%
3,ESTADO-DHSV,State of the DHSV (downhole safety valve),"[0, 0.5, 1]"
4,ESTADO-M1,State of the PMV (production master valve),"[0, 0.5, 1]"
5,ESTADO-M2,State of the AMV (annulus master valve),"[0, 0.5, 1]"
6,ESTADO-PXO,State of the PXO (pig-crossover) valve,"[0, 0.5, 1]"
7,ESTADO-SDV-GL,State of the gas lift SDV (shutdown valve),"[0, 0.5, 1]"
8,ESTADO-SDV-P,State of the production SDV (shutdown valve),"[0, 0.5, 1]"
9,ESTADO-W1,State of the PWV (production wing valve),"[0, 0.5, 1]"
10,ESTADO-W2,State of the AWV (annulus wing valve),"[0, 0.5, 1]"


In [11]:
# check for columns in the combined dataframe but not in the reference table
missing_columns = set(df_combined.columns) - set(tags.keys())
missing_columns

set()

In [30]:
def generate_unique_values_table(reference_table, df, column_name):
    # Step 1: Duplicate the reference_table
    unique_values_table = reference_table.copy()

    # Step 2: Add the specified column to store the number of unique values for each tag
    unique_values_table[column_name] = unique_values_table['Tag'].map(
        lambda tag: df[tag].nunique() if tag in df.columns else 0
    )

    return unique_values_table

In [None]:
unique_values_table = generate_unique_values_table(reference_table, df_real, 'Unique Values (Real)')
unique_values_table = generate_unique_values_table(unique_values_table, df_sim, 'Unique Values (Simulated)')
unique_values_table = generate_unique_values_table(unique_values_table, df_drawn, 'Unique Values (Hand-Drawn)')
unique_values_table = generate_unique_values_table(unique_values_table, df_combined, 'Unique Values (Combined)')

In [53]:
def add_value_type_column(unique_values_table):
    # Define the conditions for value type
    def determine_value_type(row):
        combined_value = row['Unique Values (Combined)']
        tag = row['Tag']
        
        # Condition 1: Check if unique values are 0
        if combined_value == 0:
            return "Empty"
        
        # Condition 2: Check if unique values are 3 or less
        if combined_value <= 3:
            return "Categorical"
        
        # Condition 3: Check if tag is 'label', 'class', or 'state'
        if tag in ['label', 'class', 'state']:
            return "Classifier"
        
        # Condition 4: Check if tag is 'well', 'id', or 'Instance'
        if tag in ['well', 'id', 'Instance']:
            return "Identifier"
        
        # Condition 5: Check if tag is 'DataType'
        if tag == 'DataType':
            return "Dataset"
        
        # Default return for other types
        return "Continuous"

    # Apply the function to create the new 'Value Type' column
    unique_values_table['Value Type'] = unique_values_table.apply(determine_value_type, axis=1)
    
    # Reorder columns: place 'Value Type' after 'Unit'
    column_order = ['Tag', 'Name', 'Unit', 'Value Type', 'Unique Values (Real)', 'Unique Values (Simulated)', 
                    'Unique Values (Hand-Drawn)', 'Unique Values (Combined)']
    
    # Reordering the DataFrame columns
    unique_values_table = unique_values_table[column_order]

    return unique_values_table

In [57]:
unique_values_table = add_value_type_column(unique_values_table)
unique_values_table

Unnamed: 0_level_0,Tag,Name,Unit,Value Type,Unique Values (Real),Unique Values (Simulated),Unique Values (Hand-Drawn),Unique Values (Combined)
Number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,ABER-CKGL,Opening of the GLCK (gas lift choke),%,Continuous,300484,0,0,300484
2,ABER-CKP,Opening of the PCK (production choke),%,Continuous,850210,0,0,850210
3,ESTADO-DHSV,State of the DHSV (downhole safety valve),"[0, 0.5, 1]",Categorical,2,0,0,2
4,ESTADO-M1,State of the PMV (production master valve),"[0, 0.5, 1]",Categorical,2,0,0,2
5,ESTADO-M2,State of the AMV (annulus master valve),"[0, 0.5, 1]",Categorical,3,0,0,3
6,ESTADO-PXO,State of the PXO (pig-crossover) valve,"[0, 0.5, 1]",Categorical,2,0,0,2
7,ESTADO-SDV-GL,State of the gas lift SDV (shutdown valve),"[0, 0.5, 1]",Categorical,2,0,0,2
8,ESTADO-SDV-P,State of the production SDV (shutdown valve),"[0, 0.5, 1]",Categorical,2,0,0,2
9,ESTADO-W1,State of the PWV (production wing valve),"[0, 0.5, 1]",Categorical,2,0,0,2
10,ESTADO-W2,State of the AWV (annulus wing valve),"[0, 0.5, 1]",Categorical,3,0,0,3


# Missing Values

In [32]:
def generate_missing_values_table(reference_table, df, column_name):
    # Step 1: Duplicate the reference_table
    missing_values_table = reference_table.copy()

    # Step 2: Add the specified column to store the percentage of missing values for each tag
    missing_values_table[column_name] = missing_values_table['Tag'].map(
        lambda tag: df[tag].isna().mean() * 100 if tag in df.columns else 100
    )

    return missing_values_table

In [58]:
unique_values_table_with_missing = generate_missing_values_table(unique_values_table, df_real, 'Missing Values (%) (Real)')
unique_values_table_with_missing = generate_missing_values_table(unique_values_table_with_missing, df_sim, 'Missing Values (%) (Simulated)')
unique_values_table_with_missing = generate_missing_values_table(unique_values_table_with_missing, df_drawn, 'Missing Values (%) (Hand-Drawn)')
unique_values_table_with_missing = generate_missing_values_table(unique_values_table_with_missing, df_combined, 'Missing Values (%) (Combined)')
unique_values_table_with_missing

Unnamed: 0_level_0,Tag,Name,Unit,Value Type,Unique Values (Real),Unique Values (Simulated),Unique Values (Hand-Drawn),Unique Values (Combined),Missing Values (%) (Real),Missing Values (%) (Simulated),Missing Values (%) (Hand-Drawn),Missing Values (%) (Combined)
Number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,ABER-CKGL,Opening of the GLCK (gas lift choke),%,Continuous,300484,0,0,300484,74.168142,100.0,100.0,88.912638
2,ABER-CKP,Opening of the PCK (production choke),%,Continuous,850210,0,0,850210,62.501934,100.0,100.0,83.905354
3,ESTADO-DHSV,State of the DHSV (downhole safety valve),"[0, 0.5, 1]",Categorical,2,0,0,2,57.820359,100.0,100.0,81.895962
4,ESTADO-M1,State of the PMV (production master valve),"[0, 0.5, 1]",Categorical,2,0,0,2,45.932496,100.0,100.0,76.79354
5,ESTADO-M2,State of the AMV (annulus master valve),"[0, 0.5, 1]",Categorical,3,0,0,3,46.190839,100.0,100.0,76.904424
6,ESTADO-PXO,State of the PXO (pig-crossover) valve,"[0, 0.5, 1]",Categorical,2,0,0,2,44.706544,100.0,100.0,76.267346
7,ESTADO-SDV-GL,State of the gas lift SDV (shutdown valve),"[0, 0.5, 1]",Categorical,2,0,0,2,47.964405,100.0,100.0,77.665661
8,ESTADO-SDV-P,State of the production SDV (shutdown valve),"[0, 0.5, 1]",Categorical,2,0,0,2,26.028468,100.0,100.0,68.250478
9,ESTADO-W1,State of the PWV (production wing valve),"[0, 0.5, 1]",Categorical,2,0,0,2,42.388988,100.0,100.0,75.272622
10,ESTADO-W2,State of the AWV (annulus wing valve),"[0, 0.5, 1]",Categorical,3,0,0,3,44.349833,100.0,100.0,76.114241


In [59]:
# export the tables to csv
unique_values_table_with_missing.to_csv('/Users/ghazijalal/Documents/IE University/Term 3/CAPSTONE PROJECT/Data/dataset_reference_table_with_unique_and_missing.csv')

# Unique Instances

In [36]:
def count_unique_by_feature(df1, df2, df3, label_column, count_basis, df1_name='DF1', df2_name='DF2', df3_name='DF3'):
    # List of dataframes and their names
    dfs = [df1, df2, df3]
    names = [df1_name, df2_name, df3_name]

    # Ensure all values in the label_column are treated as strings
    df1[label_column] = df1[label_column].astype(str)
    df2[label_column] = df2[label_column].astype(str)
    df3[label_column] = df3[label_column].astype(str)

    # Get all unique labels across the three dataframes, ensuring they are strings
    all_labels = sorted(set(df1[label_column]).union(df2[label_column]).union(df3[label_column]))

    # Initialize a DataFrame to store the results
    result_df = pd.DataFrame(index=all_labels + ['Grand Total'], columns=[df1_name, df2_name, df3_name, 'Row Total'])

    # Iterate over the dataframes and calculate counts for each unique label based on the count_basis column
    for df, name in zip(dfs, names):
        # Group by the specified label column and count unique values of the specified count_basis column
        label_counts = df.groupby(label_column)[count_basis].nunique()

        # Create a new Series with the same index as all_labels, filling missing labels with 0
        counts_aligned = pd.Series(label_counts, index=all_labels).fillna(0)

        # Fill in the result dataframe
        result_df[name] = counts_aligned

    # Calculate the row totals
    result_df['Row Total'] = result_df[df1_name] + result_df[df2_name] + result_df[df3_name]

    # Calculate the column totals
    column_totals = result_df.sum(axis=0)

    # Add the column totals to the 'Grand Total' row
    result_df.loc['Grand Total'] = column_totals

    # Convert floats to integers for cleaner presentation
    result_df = result_df.astype(int)

    return result_df

In [37]:
count_unique_by_feature(df_real, df_sim, df_drawn, 'label', 'Instance', 'Real', 'Simulated', 'Hand-Drawn')

Unnamed: 0,Real,Simulated,Hand-Drawn,Row Total
0,594,0,0,594
1,4,114,10,128
2,22,16,0,38
3,32,74,0,106
4,343,0,0,343
5,11,439,0,450
6,6,215,0,221
7,36,0,10,46
8,14,81,0,95
9,57,150,0,207


In [38]:
count_unique_by_feature(df_real, df_sim, df_drawn, 'label', 'id', 'Real', 'Simulated', 'Hand-Drawn')

Unnamed: 0,Real,Simulated,Hand-Drawn,Row Total
0,588,0,0,588
1,4,114,10,128
2,22,16,0,38
3,32,74,0,106
4,343,0,0,343
5,11,439,0,450
6,6,215,0,221
7,36,0,10,46
8,14,81,0,95
9,57,150,0,207


In [44]:
def compare_unique_values(df, label_column, identifier_1, identifier_2):
    # Get all unique labels in the dataframe
    all_labels = sorted(df[label_column].unique())

    # Initialize a DataFrame to store the results
    result_df = pd.DataFrame(index=all_labels + ['Grand Total'], columns=[identifier_1, identifier_2, 'Delta'])

    # Group by label and count unique values for identifier_1 and identifier_2
    identifier_1_counts = df.groupby(label_column)[identifier_1].nunique()
    identifier_2_counts = df.groupby(label_column)[identifier_2].nunique()

    # Create new Series with the same index as all_labels, filling missing labels with 0
    identifier_1_counts_aligned = pd.Series(identifier_1_counts, index=all_labels).fillna(0)
    identifier_2_counts_aligned = pd.Series(identifier_2_counts, index=all_labels).fillna(0)

    # Fill in the result dataframe with counts
    result_df[identifier_1] = identifier_1_counts_aligned
    result_df[identifier_2] = identifier_2_counts_aligned

    # Calculate the delta (difference between the two identifiers)
    result_df['Delta'] = result_df[identifier_1] - result_df[identifier_2]

    # Calculate the grand totals for each column
    grand_totals = result_df.sum(axis=0)
    
    # Add the grand totals to the 'Grand Total' row
    result_df.loc['Grand Total'] = grand_totals

    # Replace NaN values in the label column with 'Unknown' for non-numeric fields
    result_df.index = result_df.index.fillna('Unknown')

    # Replace NaN values in the numeric columns with 0
    result_df = result_df.fillna(0)

    # Convert numeric columns to integers for cleaner presentation
    result_df[[identifier_1, identifier_2, 'Delta']] = result_df[[identifier_1, identifier_2, 'Delta']].astype(int)

    return result_df

In [42]:
compare_unique_values(df_combined, 'label', 'Instance', 'id')

Unnamed: 0,Instance,id,Delta
0,594,588,6
1,128,118,10
2,38,38,0
3,106,106,0
4,343,343,0
5,450,450,0
6,221,221,0
7,46,46,0
8,95,95,0
9,207,207,0


In [45]:
compare_unique_values(df_combined, 'class', 'Instance', 'id')

Unnamed: 0,Instance,id,Delta
Unknown,0,0,0
0.0,1779,1177,602
1.0,124,118,6
2.0,27,27,0
3.0,106,106,0
4.0,343,343,0
5.0,446,446,0
6.0,221,221,0
7.0,9,9,0
8.0,73,73,0


In [46]:
compare_unique_values(df_combined, 'state', 'Instance', 'id')

Unnamed: 0,Instance,id,Delta
Unknown,0,0,0
0.0,2228,1552,676
1.0,12,12,0
2.0,6,6,0
3.0,3,3,0
4.0,4,4,0
5.0,2,2,0
6.0,3,3,0
7.0,6,6,0
8.0,9,9,0


In [18]:
def count_unique_values(df, row_feature, col_feature, count_feature):
    # Ensure all values in the row and column features are treated as strings and handle NaN values
    df[row_feature] = df[row_feature].fillna('Unknown').astype(str)
    df[col_feature] = df[col_feature].fillna('Unknown').astype(str)

    # Get all unique values from the row and column features
    unique_row_values = sorted(df[row_feature].unique())
    unique_col_values = sorted(df[col_feature].unique())

    # Initialize a DataFrame to store the results
    result_df = pd.DataFrame(index=unique_row_values + ['Grand Total'], columns=unique_col_values + ['Row Total'])

    # Group by the row and column features, then count unique values of the count feature
    for row_value in unique_row_values:
        for col_value in unique_col_values:
            # Filter the DataFrame for the specific row and column combination, then count unique 'count_feature'
            count = df[(df[row_feature] == row_value) & (df[col_feature] == col_value)][count_feature].nunique()
            result_df.loc[row_value, col_value] = count

    # Calculate row totals
    result_df['Row Total'] = result_df[unique_col_values].sum(axis=1)

    # Calculate column totals
    column_totals = result_df.sum(axis=0)
    
    # Add the column totals to the 'Grand Total' row
    result_df.loc['Grand Total'] = column_totals

    # Convert floats to integers for cleaner presentation
    result_df = result_df.fillna(0).astype(int)

    return result_df

In [19]:
count_unique_values(df_real, 'label', 'state', 'Instance')

Unnamed: 0,0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,Unknown,Row Total
0,594,1,0,1,1,1,1,1,1,594,1195
1,4,0,0,0,0,0,0,0,0,4,8
2,22,0,0,0,0,0,0,0,0,22,44
3,32,0,0,0,0,0,0,0,0,32,64
4,343,0,0,0,0,0,0,0,0,343,686
5,11,0,0,0,0,0,0,0,0,11,22
6,6,0,0,0,0,0,0,0,0,6,12
7,36,0,0,0,0,0,0,0,0,36,72
8,14,11,6,2,3,1,2,5,8,14,66
9,57,0,0,0,0,0,0,0,0,57,114


In [20]:
count_unique_values(df_real, 'label', 'class', 'Instance')

Unnamed: 0,0.0,1.0,101.0,102.0,105.0,106.0,107.0,108.0,109.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,Unknown,Row Total
0,594,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,594,1188
1,4,4,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,16
2,22,0,0,22,0,0,0,0,0,11,0,0,0,0,0,0,0,22,77
3,0,0,0,0,0,0,0,0,0,0,32,0,0,0,0,0,0,32,64
4,0,0,0,0,0,0,0,0,0,0,0,343,0,0,0,0,0,343,686
5,11,0,0,0,11,0,0,0,0,0,0,0,7,0,0,0,0,11,40
6,6,0,0,0,0,6,0,0,0,0,0,0,0,6,0,0,0,6,24
7,36,0,0,0,0,0,36,0,0,0,0,0,0,0,5,0,0,36,113
8,14,0,0,0,0,0,0,14,0,0,0,0,0,0,0,12,0,14,54
9,57,0,0,0,0,0,0,0,14,0,0,0,0,0,0,0,3,57,131


In [21]:
count_unique_values(df_real, 'state', 'class', 'Instance')

Unnamed: 0,0.0,1.0,101.0,102.0,105.0,106.0,107.0,108.0,109.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,Unknown,Row Total
0.0,744,4,4,22,11,6,36,8,14,11,32,343,7,6,5,1,3,0,1257
1.0,1,0,0,0,0,0,0,11,0,0,0,0,0,0,0,0,0,0,12
2.0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,5,0,0,6
3.0,1,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,3
4.0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,3,0,0,5
5.0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,2
6.0,1,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,3
7.0,1,0,0,0,0,0,0,4,0,0,0,0,0,0,0,3,0,0,8
8.0,1,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0,0,0,9
Unknown,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1119,1119


# Events Deep Dive

In [22]:
# print the value count of each label in each dataset
print("Real Instances:")
print(df_real['label'].value_counts())
print("\nSimulated Instances:")
print(df_sim['label'].value_counts())
print("\nHand-drawn Instances:")
print(df_drawn['label'].value_counts())
print("\nCombined Instances:")

Real Instances:
0    12158183
7     7864945
8     4809035
4     3689683
9     2635372
3      684352
5      439408
2      277001
1      236794
6       77477
Name: label, dtype: int64

Simulated Instances:
5    12862269
1     8323113
9     6845855
6     5804790
3     4264927
8     2186920
2      460784
Name: label, dtype: int64

Hand-drawn Instances:
7    2419210
1     547200
Name: label, dtype: int64

Combined Instances:


In [23]:
def extract_instances_with_label(df, target_label):
    'Extracts all instances that contain a specific label'
    
    # Step 1: Find all 'Instance' values that contain the target label
    instances_with_label = df[df['label'] == target_label]['Instance'].unique()
    
    # Step 2: Extract all rows for the found 'Instance' values
    result_df = df[df['Instance'].isin(instances_with_label)].copy()

    return result_df


## Label 1:  ABRUPT INCREASE OF BSW

In [24]:
target_label = 1
df_real_1 = extract_instances_with_label(df_real, target_label)
df_sim_1 = extract_instances_with_label(df_sim, target_label)
df_drawn_1 = extract_instances_with_label(df_drawn, target_label)
df_combined_1 = extract_instances_with_label(df_combined, target_label)

In [25]:
count_unique_instances(df_real_1, df_sim_1, df_drawn_1, 'Real', 'Simulated', 'Hand-drawn')

Unnamed: 0,Real,Simulated,Hand-drawn,Row Total
1,0,114,10,124
Grand Total,0,114,10,124


In [26]:
unique_values_with_missing_table_0 = generate_unique_values_with_missing_table(unique_values_table, df_real_1, df_sim_1, df_drawn_1, df_combined_1)
unique_values_with_missing_table_0

Unnamed: 0_level_0,Tag,Name,Unit,Unique Real,Unique Simulated,Unique Hand-drawn,Unique Combined,Missing Real (%),Missing Simulated (%),Missing Hand-drawn (%),Missing Combined (%)
Number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,ABER-CKGL,Opening of the GLCK (gas lift choke),%,300484,0,0,300484,,100.0,100.0,100.0
2,ABER-CKP,Opening of the PCK (production choke),%,850210,0,0,850210,,100.0,100.0,100.0
3,ESTADO-DHSV,State of the DHSV (downhole safety valve),"[0, 0.5, 1]",2,0,0,2,,100.0,100.0,97.597228
4,ESTADO-M1,State of the PMV (production master valve),"[0, 0.5, 1]",2,0,0,2,,100.0,100.0,97.597228
5,ESTADO-M2,State of the AMV (annulus master valve),"[0, 0.5, 1]",3,0,0,3,,100.0,100.0,97.597228
6,ESTADO-PXO,State of the PXO (pig-crossover) valve,"[0, 0.5, 1]",2,0,0,2,,100.0,100.0,97.597228
7,ESTADO-SDV-GL,State of the gas lift SDV (shutdown valve),"[0, 0.5, 1]",2,0,0,2,,100.0,100.0,97.597228
8,ESTADO-SDV-P,State of the production SDV (shutdown valve),"[0, 0.5, 1]",2,0,0,2,,100.0,100.0,97.597228
9,ESTADO-W1,State of the PWV (production wing valve),"[0, 0.5, 1]",2,0,0,2,,100.0,100.0,97.597228
10,ESTADO-W2,State of the AWV (annulus wing valve),"[0, 0.5, 1]",3,0,0,3,,100.0,100.0,97.597228


## Label 2: SPURIOUS CLOSURE OF DHSV

In [27]:
target_label = 2
df_real_2 = extract_instances_with_label(df_real, target_label)
df_sim_2 = extract_instances_with_label(df_sim, target_label)
df_drawn_2 = extract_instances_with_label(df_drawn, target_label)
df_combined_2 = extract_instances_with_label(df_combined, target_label)

In [28]:
count_unique_instances(df_real_2, df_sim_2, df_drawn_2, 'Real', 'Simulated', 'Hand-drawn')

Unnamed: 0,Real,Simulated,Hand-drawn,Row Total
2,0,16,0,16
Grand Total,0,16,0,16


In [29]:
unique_values_with_missing_table_2 = generate_unique_values_with_missing_table(unique_values_table, df_real_2, df_sim_2, df_drawn_2, df_combined_2)
unique_values_with_missing_table_2

Unnamed: 0_level_0,Tag,Name,Unit,Unique Real,Unique Simulated,Unique Hand-drawn,Unique Combined,Missing Real (%),Missing Simulated (%),Missing Hand-drawn (%),Missing Combined (%)
Number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,ABER-CKGL,Opening of the GLCK (gas lift choke),%,300484,0,0,300484,,100.0,,100.0
2,ABER-CKP,Opening of the PCK (production choke),%,850210,0,0,850210,,100.0,,97.141173
3,ESTADO-DHSV,State of the DHSV (downhole safety valve),"[0, 0.5, 1]",2,0,0,2,,100.0,,98.598779
4,ESTADO-M1,State of the PMV (production master valve),"[0, 0.5, 1]",2,0,0,2,,100.0,,98.598779
5,ESTADO-M2,State of the AMV (annulus master valve),"[0, 0.5, 1]",3,0,0,3,,100.0,,93.867048
6,ESTADO-PXO,State of the PXO (pig-crossover) valve,"[0, 0.5, 1]",2,0,0,2,,100.0,,87.645859
7,ESTADO-SDV-GL,State of the gas lift SDV (shutdown valve),"[0, 0.5, 1]",2,0,0,2,,100.0,,96.781312
8,ESTADO-SDV-P,State of the production SDV (shutdown valve),"[0, 0.5, 1]",2,0,0,2,,100.0,,90.315878
9,ESTADO-W1,State of the PWV (production wing valve),"[0, 0.5, 1]",2,0,0,2,,100.0,,95.739951
10,ESTADO-W2,State of the AWV (annulus wing valve),"[0, 0.5, 1]",3,0,0,3,,100.0,,87.646537
