This notebook will help with the initial setup and understanding the labels present.

Some initial cleaning will also be done where needed.

In [3]:
"""
For setting up local imports in an Ipython Shell
This is a workaround for ipython, dont need it for basic python scripts
"""
import os
import sys

# Resolve the parent directory to an absolute path and register it on
# sys.path (once) so modules living one level up can be imported.
module_path = os.path.abspath(os.pardir)
already_registered = module_path in sys.path
if not already_registered:
    sys.path.append(module_path)

In [4]:
import warnings

import pandas as pd

# Project-local helpers. Later cells call lib_dyna.MultiLabels / lib_dyna.Features,
# lib_aws.AddData / lib_aws.PostgresRDS / lib_aws.S3 and
# lib_metrics.MultiLabelAlgoTesting, so this import must be active for the
# notebook to run from a fresh kernel (it was previously commented out).
# It relies on the parent directory having been appended to sys.path.
from library import lib_aws, lib_dyna, lib_metrics

# Suppress every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings from pandas — consider narrowing.
warnings.filterwarnings('ignore')

## Importing Labeled Data

In [3]:
# Known misspelled label -> spelling it is replaced with.
label_errors = dict([
    ("Fluid pound", "Fluid Pound"),
])

In [5]:
# s3 Path (prefix under which the labeled CSV files live)
data_path = 's3://et-oasis/labeledData'

# File names
wells = [
    'Bonner 9-12H.csv',
    'Bonner 9X-12HA.csv',
    'Bonner 9X-12HB.csv',
    'Cade 12-19HA.csv'
]

# Only these columns are read from each file
cols = ['NodeID', "Date", "TrueLabel1", "TrueLabel2", 'pocdowncard']

# One DataFrame per successfully loaded well file
frames = []
for well_file in wells:
    # An S3 URL always uses forward slashes, so build it explicitly instead of
    # going through os.path.join (which inserts "\\" on Windows).
    path = "{}/{}".format(data_path.rstrip("/"), well_file)
    print(path)
    try:
        data_temp = pd.read_csv(path, usecols=cols, parse_dates=['Date'])
    except (KeyError, ValueError) as e:
        # pandas reports a usecols / parse_dates mismatch as ValueError, so
        # catching KeyError alone never handled the case this message is about.
        # The exception text is printed because ValueError can also cover
        # other read problems; the file is skipped either way.
        print("Columns Dont Match for well {}: {}".format(well_file, e))
    else:
        frames.append(data_temp)


s3://et-oasis/labeledData/Bonner 9-12H.csv


ImportError: Missing optional dependency 'fsspec'.  Use pip or conda to install fsspec.

In [11]:
# Stack the per-well frames and discard the original row indices.
data = pd.concat(frames).reset_index(drop=True)
l1 = len(data)  # row count before de-duplication

# Keep a single row for each (NodeID, Date) pair.
data = data.drop_duplicates(subset=['NodeID', 'Date'])

print("# of Duplicates dropped {}".format(l1 - len(data)))

# of Duplicates dropped 93


In [12]:
# Replace known misspellings in both label columns; values that are not a key
# of label_errors map to NaN and are then filled back from the original column.
for label_col in ('TrueLabel1', 'TrueLabel2'):
    original_labels = data[label_col]
    data[label_col] = original_labels.map(label_errors).fillna(original_labels)

In [13]:
"""
Check the labels
"""

mlabels = lib_dyna.MultiLabels(
    df=data,
    card_col='pocdowncard',
    well_col='NodeID',
    label_cols=['TrueLabel1', 'TrueLabel2'],
)

mlabels.remove_errors()
mlabels.merge_labels()

# Show the group counts first, then the per-label counts.
for count_table in (mlabels.get_group_counts, mlabels.get_label_counts):
    display(count_table())

Total errors found in 0 datapoints


Unnamed: 0,TrueLabel1,TrueLabel2,totalVal,pctVal
0,Fluid Pound,,3344,24.9
1,Full Pump,Wellbore Friction,2079,15.48
2,Gas Interference,,1354,10.08
3,Leak in Traveling Valve,,1310,9.75
4,Full Pump,,1273,9.48
5,Fluid Pound,Leak in Traveling Valve,752,5.6
6,Full Pump,Tubing Movement,743,5.53
7,Fluid Pound,Pump Tagging Up,458,3.41
8,Fluid Pound,Full Pump,400,2.98
9,Gas Interference,Leak in Traveling Valve,313,2.33


Unnamed: 0,totalVal,pctVal
Fluid Pound,5392,40.146
Full Pump,5101,37.979
Gas Interference,2502,18.629
Leak in Traveling Valve,2461,18.323
Wellbore Friction,2090,15.561
Pump Tagging Up,926,6.894
Tubing Movement,752,5.599
Leak in Standing Valve,205,1.526
Excessive Vibration of the Rod,116,0.864
Pump Tagging Down,5,0.037


In [14]:
"""
Adding Labeled Data to the Database
"""

# 'replace' overwrites the table contents; switch to 'append' to add rows instead.
merge_type = 'replace'

# Push the labeled frame to clean.dynalabel in the oasis-dev database.
lib_aws.AddData.add_data(
    data,
    db='oasis-dev',
    table='dynalabel',
    schema='clean',
    merge_type=merge_type,
    card_col=['pocdowncard'],
    index_col='Date',
)

if merge_type == 'replace':
    # After a replace, (re)create the unique index on ("NodeID", "Date").
    index_sql = """CREATE UNIQUE INDEX dynalabel_idx ON clean.dynalabel ("NodeID", "Date");"""
    with lib_aws.PostgresRDS(db='oasis-dev') as engine, engine.begin() as connection:
        connection.execute(index_sql)


Connected to oasis-dev DataBase
Connection Closed
Data replaceed on Table dynalabel in time 45.77s
Connected to oasis-dev DataBase
Connection Closed


## Metrics of Trained Model

Model Training can be done using the script `model_training.py`.

This model is saved in an s3 bucket in the following location `s3://et-oasis/algo/rfcDynaClassification.pkl`.

The next section will get some metrics for the saved model.

**Steps:**
- Import the labeled data
- Import the model (a new model can also be defined here to try out different configurations)
- Use the Classes in lib_metrics to get metrics

In [None]:
"""
Import the labled Data
"""
# Labeled cards ordered by well and time, plus the bounds table.
query = """SELECT * FROM clean.dynalabel ORDER BY "NodeID", "Date"; """
query_bounds = """SELECT * FROM clean.dynabounds;"""

with lib_aws.PostgresRDS(db='oasis-dev') as engine:
    data = pd.read_sql(query, engine, parse_dates=['Date'])
    bounds_raw = pd.read_sql(query_bounds, engine)

# The bounds table is keyed by its 'index' column.
bounds_df = bounds_raw.set_index('index')

for preview in (data.head(), bounds_df):
    display(preview)


In [None]:
"""
Features
"""

fea = lib_dyna.Features(
    df=data,
    card_col='pocdowncard',
    well_col='NodeID',
    label_cols=['TrueLabel1', 'TrueLabel2'],
)

fea.remove_errors()
fea.merge_labels()

# The group counts (with percentages) guide the choice of the drop threshold.
print("Use this table to select a min threshold to drop label groups(in pct)")
group_counts = fea.get_group_counts()
display(group_counts)

In [None]:
# Label groups whose share is below this percentage are dropped.
thresh = 0.5
fea.remove_labels(thresh=thresh)

# Label counts after the low-frequency groups have been removed.
label_counts = fea.get_label_counts()

# Feature matrix, normalized with the bounds table.
X = fea.get_X(
    fd_order=5,
    area=True,
    centroid=True,
    normalize_fn='df',
    norm_arg=bounds_df,
)

# Targets together with the binarizer that produced them.
Y, binarizer = fea.get_Y()

In [None]:
"""
Defining a model to get metrics for
We are using the saved model to get the metrics
Can define a new model as well, if tests need to be performed
"""

# Fetch the saved model from the 'et-oasis' bucket.
# NOTE(review): the key ends in .pkl, so this presumably unpickles the file —
# only load objects from a trusted bucket.
model_key = 'algo/rfcDynaClassification.pkl'
s3 = lib_aws.S3(bucket='et-oasis')

# import_model returns a pair; the second item (the stored binarizer) is not needed here.
rfc_model, _ = s3.import_model(model_key)

print(rfc_model)

In [None]:
# Inspect the `classes_` attribute of the binarizer returned by fea.get_Y();
# the same attribute is passed as `cl` to the testing class further down.
binarizer.classes_

In [None]:
"""
TESTING
"""
# Instantiate the testing helper with the features, targets, model,
# label counts and class names prepared in the cells above.
testing = lib_metrics.MultiLabelAlgoTesting(
    x=X,
    y=Y,
    model=rfc_model,
    counts_df=label_counts,
    cl=binarizer.classes_,
)

In [None]:
# Baseline metrics: split=0.25 is the test set split passed to quick_test,
# which returns the averaged scores and the per-class scores.
avg_scores, all_scores = testing.quick_test(split=0.25, verbose=0)

# Heading typo fixed ("Avergaed" -> "Averaged") so the printed output reads correctly.
print("Averaged Scores")
display(avg_scores)
print("Class Specific Scores")
display(all_scores)


In [None]:
# Validation Metrics
# Calls testing.kfold_validation with 4 splits and keeps the result in kf_df.
# NOTE(review): oversample_thresh=0 presumably disables oversampling — confirm in lib_metrics.
kf_df = testing.kfold_validation(splits=4, oversample_thresh=0, verbose=1)

In [None]:
# """
# Setup Bounds
# Only have to do it once 
# """

# # IMport the entire dataset
# query = """select "NodeID", "Date", pocdowncard from xspoc.card order by "NodeID", "Date";"""
# with lib_aws.PostgresRDS(db='oasis-dev') as engine:
#     data = pd.read_sql(query, engine, parse_dates=['Date'])
    
# # Remove errors and get bounds
# cf = lib_dyna.CardFunctions(df=data,
#                             card_col='pocdowncard',
#                             well_col='NodeID')
# cf.remove_errors()
# full_bounds = cf.all_bounds()
# display(full_bounds)

# # Replace the full bounds df
# lib_aws.AddData.add_data(df=full_bounds, db='oasis-dev', table='dynabounds', schema='clean',
#                          merge_type='replace', card_col=None, index_col=None)
