# Notebook Info

From the data tables that we have, we try identifying the features that matter the most for forecasting
failures.

For now the data is pulled from the `xdiag` table and failure is imported from the `failure_info` table.

Database Details:
```
# Data
database = 'oasis-prod'
schema = 'xspoc'
table = 'xdiag'

# Failure
database = 'oasis-prod'
schema = 'analysis'
table = 'failure_info'  
```

Note: The tables, especially `xdiag`, hold data from around 900 wells, so querying an entire table may take a while. Consider working on a single well or a small group of wells for the analysis.

# Imports

In [1]:
import os
import sys
# Add the parent directory to sys.path (once) — presumably so that the
# project-local `library` package imported in the next cell can be resolved.
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

In [2]:
import pandas as pd
from library import lib_aws

# Allow up to 500 rows to be shown when a frame or series is displayed.
pd.set_option('display.max_rows', 500)
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including pandas deprecation and SettingWithCopy warnings raised by later cells.
warnings.filterwarnings('ignore')

# Initial Analysis

Just to check the timestamps and how the data is spread out in both the tables

In [4]:
%%time
# Date coverage of the feature table: first and last timestamp per well.
# GROUP BY already yields exactly one row per "NodeID", so no DISTINCT is needed.
query_initial = """
SELECT
    "NodeID",
    min("Date") as min_date,
    max("Date") as max_date
FROM xspoc.xdiag
GROUP BY "NodeID"
ORDER BY "NodeID"
"""

# Querying the entire failure info
query_failures = """
SELECT 
    "NodeID",
    "Last Oil",
    "Start Date",
    "Finish Date",
    "Job Type",
    "Job Bucket",
    "Primary Symptom",
    "Secondary Symptom"
FROM
    analysis.failure_info
ORDER BY "NodeID";
"""

with lib_aws.PostgresRDS(db='oasis-prod', verbose=1) as engine:
    # parse_dates has to name columns that exist in the result set; the first
    # query returns "min_date"/"max_date", not "Date".
    data_info = pd.read_sql(query_initial, engine, parse_dates=['min_date', 'max_date'])
    failures = pd.read_sql(query_failures, engine, parse_dates=['Last Oil', 'Start Date', 'Finish Date'])

Connected to oasis-prod DataBase
Connection Closed
Wall time: 35.6 s


In [5]:
# Preview the first rows of the coverage table and of the failure table.
for title, frame in (('Data info', data_info), ('Failure info', failures)):
    print(title)
    display(frame.head())

Data info


Unnamed: 0,NodeID,min_date,max_date
0,Aagvik 1-35H,2019-06-21 15:58:34,2020-07-21 10:18:00
1,Acadia 31-25H,2019-05-27 23:33:12,2020-08-15 16:18:15
2,Acklins 12-18H,2019-05-27 23:52:43,2020-08-15 15:29:25
3,Aerabelle 5502 43-7T,2019-05-27 23:49:54,2020-08-15 02:39:51
4,Ak Strangeland 43-12T,2019-05-28 01:20:59,2020-08-15 12:27:00


Failure info


Unnamed: 0,NodeID,Last Oil,Start Date,Finish Date,Job Type,Job Bucket,Primary Symptom,Secondary Symptom
0,Aagvik 1-35H,2019-11-27,2019-12-02,2019-12-06,TUBING LEAK,TUBING,Mechanically Induced Damage,Solids in Pump
1,Aagvik 5298 41-35 2TX,2019-05-29,2019-06-04,2019-06-25,GAS LIFT,PUMP,Low Production,Blank
2,Acadia 31-25H,2018-04-11,2018-05-05,2018-05-11,TUBING LEAK,TUBING,Corrosion,Sand
3,Acadia 31-25H,2019-03-30,2019-04-10,2019-04-16,"1-1/4"" PUMP",PUMP,Corrosion,Mechanically Induced Damage
4,Acklins 6092 12-18H,2019-12-24,2020-01-02,2020-01-03,POLISH ROD BREAK,ROD,Mechanically Induced Damage,


In [6]:
# Most of the feature data starts after '2019-05-01', so failures that started
# before that date are removed from the failures frame. This helps pick wells
# for the analysis that have actually seen a failure within the data range.
fail_cut = pd.Timestamp('2019-05-01')
failures = failures.loc[failures['Start Date'].ge(fail_cut)].reset_index(drop=True)
failures.head()

Unnamed: 0,NodeID,Last Oil,Start Date,Finish Date,Job Type,Job Bucket,Primary Symptom,Secondary Symptom
0,Aagvik 1-35H,2019-11-27,2019-12-02,2019-12-06,TUBING LEAK,TUBING,Mechanically Induced Damage,Solids in Pump
1,Aagvik 5298 41-35 2TX,2019-05-29,2019-06-04,2019-06-25,GAS LIFT,PUMP,Low Production,Blank
2,Acklins 6092 12-18H,2019-12-24,2020-01-02,2020-01-03,POLISH ROD BREAK,ROD,Mechanically Induced Damage,
3,Aerabelle 5502 43-7T,2018-10-10,2019-08-13,2019-08-15,"3/4"" ROD SECTION",ROD,Mechanically Induced Damage,Dropped (X) Amount of Times
4,Alder 6092 43-8H,2019-12-23,2019-12-27,2020-01-07,"1-1/2"" PUMP",PUMP,Loose Connection,


In [7]:
# Distribution of the failures: counts per job type and per job bucket,
# followed by the number of distinct wells and job types inside each bucket.
for title, column in (("Job Type Distribution", 'Job Type'),
                      ("Job Bucket Distribution", 'Job Bucket')):
    print(title)
    display(failures[column].value_counts())

failures.groupby('Job Bucket').agg({'NodeID': ['nunique'], 'Job Type': ['nunique']})

Job Type Distribution


TUBING LEAK           133
1-1/2" PUMP            98
1" ROD SECTION         48
POLISH ROD BREAK       46
1-3/4" PUMP            35
3/4" ROD SECTION       29
7/8" ROD SECTION       21
2" PUMP                19
1-1/4" PUMP            10
SUBS (PONY ROD)         4
2-1/4 PUMP              4
GAS LIFT                4
Tubing - Body           4
Pump - Plunger          4
Polish Rod              2
BHA CHANGE              2
Pump - Barrel           2
Tubing - Unknown        1
BHA - TAC               1
ROD SINKER SECTION      1
TUBING                  1
Unknown                 1
BHA                     1
Rod - Main Body         1
Rod - Pin               1
Pump - Stuck Pump       1
Pump - Junked           1
Name: Job Type, dtype: int64

Job Bucket Distribution


PUMP      174
ROD       154
TUBING    140
BHA         6
Packer      1
Name: Job Bucket, dtype: int64

Unnamed: 0_level_0,NodeID,Job Type
Unnamed: 0_level_1,nunique,nunique
Job Bucket,Unnamed: 1_level_2,Unnamed: 2_level_2
BHA,6,2
PUMP,160,11
Packer,1,1
ROD,123,10
TUBING,125,7


In [8]:
# Well names present in the feature table and in the (filtered) failure table
data_wells = set(data_info['NodeID'])
fail_wells = set(failures['NodeID'].unique())

print("Wells with Failure:")
display(data_wells.intersection(fail_wells))  # wells that appear in both tables

# Optional: wells without a failure (at least in the failure info being used)
# print("Wells without Failure (Atleast in the failure info being used):")
# display(data_wells - fail_wells)

Wells with Failure:


{'Aagvik 1-35H',
 'Aerabelle 5502 43-7T',
 'Amazing Grace Federal 11-2H',
 'Anderson 7-18H',
 'Andre 5501 13-4H',
 'Andre 5501 14-5 3B',
 'Andre Shepherd 5501 21-5 3T',
 'Andre Shepherd 5501 21-5 5T',
 'Andrea 5502 44-7T',
 'Anvers Federal 5602 13-18H',
 'Arnold 16X-12H',
 'Arnstad 3-10H',
 'Autumn Wind State 5601 14-16B',
 'B & Rt 2958 13-25H',
 'Baffin 5601 12-18H',
 'Barenthsen 11-20H',
 'Behan 2-29H',
 'Berkner Federal 5602 43-11H',
 'Berquist 34-27H',
 'Berwick 4-2HE',
 'Betsy Federal 2758 24-29H',
 'Beulah Irene Federal 19-18H',
 'Bobby 5602 43-35H',
 'Bonita 5992 42-22H',
 'Bouvardia Federal 2658 12-12H',
 'Bowie 2958 42-21 1H',
 'Brewer 2759 13-15H',
 'Broderson 13-35H',
 'Broderson 2-27H',
 'Burleson 5502 41-7B',
 'Cade 12-19HA',
 'Carl Federal 2658 43-23H',
 'Carol 12-35H',
 'Carson Federal 2658 13-17H',
 'Ceynar 4-18HB',
 'Ceynar 4X-18H',
 'Charlie 5603 43-19H',
 'Chokecherry 2758 11-10B',
 'Christianson 5404 14-34H',
 'Conry Federal 5992 43-21 1H',
 'Contreras 5502 42-7H',


# Data Import

- Features imported from `xspoc.xdiag`

Following are the Features (Columns) to use for the initial analysis:
```
"NodeID"
"Date",
"PPRL",
"MPRL",
"FluidLoadonPump",
"PumpIntakePressure"
```


## Well Specific

In [12]:
well_name = 'Autumn Wind State 5601 14-16B'  # pick one of the wells that has a failure

# Feature rows for the selected well, ordered by time
query_well = """
SELECT 
    "NodeID",
    "Date",
    "PPRL",
    "MPRL",
    "FluidLoadonPump",
    "PumpIntakePressure"
FROM
    xspoc.xdiag
WHERE "NodeID" = '{}'
ORDER BY "NodeID", "Date";
""".format(well_name)

with lib_aws.PostgresRDS(db='oasis-prod', verbose=1) as engine:
    data_well = pd.read_sql(query_well, engine, parse_dates=['Date'])

# Failure records of that well only, re-indexed from zero
failure_well = failures.loc[failures.NodeID == well_name].reset_index(drop=True)

# Quick look at both frames
display(data_well.head())
print("Failure Info")
display(failure_well)

Connected to oasis-prod DataBase
Connection Closed


Unnamed: 0,NodeID,Date,PPRL,MPRL,FluidLoadonPump,PumpIntakePressure
0,Autumn Wind State 5601 14-16B,2019-05-28 00:41:41,31655.0,14863.0,9828.0,259.0
1,Autumn Wind State 5601 14-16B,2019-05-28 02:50:19,31998.0,14950.0,9559.0,402.0
2,Autumn Wind State 5601 14-16B,2019-05-28 05:36:49,32117.0,15030.0,9162.0,567.0
3,Autumn Wind State 5601 14-16B,2019-05-28 07:18:16,31959.0,14835.0,9945.0,241.0
4,Autumn Wind State 5601 14-16B,2019-05-28 09:02:41,32087.0,14747.0,8933.0,662.0


Failure Info


Unnamed: 0,NodeID,Last Oil,Start Date,Finish Date,Job Type,Job Bucket,Primary Symptom,Secondary Symptom
0,Autumn Wind State 5601 14-16B,2020-02-03,2020-02-05,2020-02-10,TUBING LEAK,TUBING,Mechanically Induced Damage,Fluid Pound


## Group of Wells

In [29]:
%%time
# Wells used for the group analysis.
# Every entry needs its own trailing comma: adjacent string literals are
# concatenated by Python, which would silently merge two well names into one.
well_list = [
    'Anderson 7-18H',
    'Andre 5501 14-5 3B',
    'Autumn Wind State 5601 14-16B',
    'Berwick 4-2HE',
    'Carl Federal 2658 43-23H',
    'Carson Federal 2658 13-17H',
    'Cook 5300 12-13 6B',
    'Dixon 5602 44-34H',
    'Emma 13-7H',
    'Forland 28-33H',
    'Hanson 33-28H',
    'Inez 6093 43-19H',
    'Johnsrud 5198 12-18 10T',
    'Mae 5603 43-19H',
    'Susie 15-22H',
]

# Render the well names as a comma-separated list of SQL string literals.
# Formatting a Python tuple straight into the query is fragile: a one-element
# tuple renders as ('x',) (invalid SQL) and a name containing an apostrophe is
# repr'd with double quotes, which SQL reads as an identifier.
well_literals = ", ".join("'{}'".format(name.replace("'", "''")) for name in well_list)

query_list = """
SELECT
    "NodeID",
    "Date",
    "PPRL",
    "MPRL",
    "FluidLoadonPump",
    "PumpIntakePressure"
FROM xspoc.xdiag
WHERE "NodeID" in ({})
ORDER BY "NodeID","Date"
""".format(well_literals)

with lib_aws.PostgresRDS(db='oasis-prod') as engine:
    data_list = pd.read_sql(query_list, engine, parse_dates=['Date'])

# Failure records for the selected wells only (new frame, re-indexed from zero)
failure_list = failures[failures.NodeID.isin(well_list)].reset_index(drop=True)

# info
display(data_list.head())
print("Failure info in these in these wells")
display(failure_list)

Unnamed: 0,NodeID,Date,PPRL,MPRL,FluidLoadonPump,PumpIntakePressure
0,Anderson 7-18H,2019-05-28 06:33:24,31015.0,16083.0,6070.0,15.0
1,Anderson 7-18H,2019-05-28 14:25:54,31001.0,15578.0,6070.0,15.0
2,Anderson 7-18H,2019-05-28 14:28:46,31025.0,15477.0,6070.0,15.0
3,Anderson 7-18H,2019-05-29 02:30:30,30408.0,17165.0,6070.0,15.0
4,Anderson 7-18H,2019-05-29 03:43:44,30137.0,17385.0,7247.0,501.0


Failure info in these in these wells


Unnamed: 0,NodeID,Last Oil,Start Date,Finish Date,Job Type,Job Bucket,Primary Symptom,Secondary Symptom
0,Anderson 7-18H,2019-10-08,2019-10-10,2019-10-16,POLISH ROD BREAK,ROD,1st Thread Pin,
1,Andre 5501 14-5 3B,2020-03-06,2020-03-10,2020-03-13,"1-3/4"" PUMP",PUMP,Corrosion,Abrasion - Foreign Debris
2,Autumn Wind State 5601 14-16B,2020-02-03,2020-02-05,2020-02-10,TUBING LEAK,TUBING,Mechanically Induced Damage,Fluid Pound
3,Berwick 4-2HE,2019-10-31,2019-11-05,2019-11-11,"2"" PUMP",PUMP,Scale,Salt
4,Carl Federal 2658 43-23H,2019-07-26,2019-08-08,2019-08-13,"1"" ROD SECTION",ROD,Mechanically Induced Damage,Handling
5,Carl Federal 2658 43-23H,2019-06-04,2019-07-02,2019-07-02,POLISH ROD BREAK,ROD,Mechanically Induced Damage,
6,Carl Federal 2658 43-23H,2020-02-03,2020-02-07,2020-02-07,POLISH ROD BREAK,ROD,Mechanically Induced Damage,
7,Carson Federal 2658 13-17H,2020-05-21,2020-06-11,2020-06-19,TUBING LEAK,TUBING,Corrosion,Mechanically Induced Damage
8,Cook 5300 12-13 6B,2019-12-13,2019-12-17,2019-12-19,TUBING LEAK,TUBING,Corrosion,Mechanically Induced Damage
9,Dixon 5602 44-34H,2019-09-06,2019-09-19,2019-09-19,POLISH ROD BREAK,ROD,,Unknown


Wall time: 18.8 s


## Entire Feature Data

Running the next query will import the entire dataset from `xspoc.xdiag`. It has around 3,228,303 rows and took around 14min to run the query

In [9]:
# Do not run this cell for now: it imports the entire feature dataset
# (every well in xspoc.xdiag, no NodeID filter), which is slow.
# Querying the features
query_full = """
SELECT 
    "NodeID",
    "Date",
    "PPRL",
    "MPRL",
    "FluidLoadonPump",
    "PumpIntakePressure"
FROM
    xspoc.xdiag
ORDER BY "NodeID", "Date";
"""



# Load the full result set into memory, parsing "Date" as datetime.
with lib_aws.PostgresRDS(db='oasis-prod') as engine:
    data_full = pd.read_sql(query_full, engine, parse_dates=['Date'])
    
data_full.head()

UsageError: Line magic function `%%time` not found.


## Combining

Note: the original failure info can be used. However to make it efficient we only use those wells which are present in the feature dataframe (data_well, data_list, data_full)

In [13]:
"""
Before analysing the data we need to merge the information
Transfering info from failures to data (copy of features)
Using a for loop -- may not be very efficient
"""

def fill_null(df, chk_col='PPRL', well_col='NodeID', time_col='Date'):
    """
    Add an all-null row for every calendar day on which a well has no usable data.

    The data is resampled per well to a daily frequency; each day whose daily
    maximum of `chk_col` is null is appended to the original rows as a
    placeholder row stamped at midnight of that day. This makes failures visible
    on days without data points, and has to be taken into account when running
    the analysis.

    :param df: Feature dataframe; `time_col` may be a column or already the index
    :param chk_col: Column whose daily value decides whether a day is missing
    :param well_col: Column holding the well name
    :param time_col: Timestamp column
    :return: New dataframe (input is left untouched) sorted by well and time,
             without duplicate (well, time) pairs and with a fresh RangeIndex
    """
    data = df.copy()
    # Resampling needs the timestamps in the index
    if time_col in data.columns:
        data = data.set_index(time_col)

    # One row per well and calendar day
    daily = data.groupby(well_col).resample('1D').max()
    # Depending on the pandas version the grouping column is also returned as a
    # regular column; it is already part of the index, so drop it when present.
    daily = daily.drop(columns=[well_col], errors='ignore')
    daily = daily.reset_index()  # well and timestamp back as columns

    # Days without a value in chk_col are the placeholder rows to add
    missing_days = daily[daily[chk_col].isnull()]

    data = data.reset_index()  # timestamp back as a column for the concat
    combined = pd.concat([data, missing_days], axis=0, ignore_index=True)
    combined = combined.sort_values(by=[well_col, time_col])
    combined = combined.drop_duplicates(subset=[well_col, time_col])

    return combined.reset_index(drop=True)

def failure_merge(df, failure_df, transfer_cols):
    """
    Label the rows of `df` that fall inside a failure window.

    For every record in `failure_df` the rows of the same well whose timestamp
    lies between "Last Oil" and the end of the "Finish Date" day receive that
    record's values in `transfer_cols`. All other rows keep the value 'Normal'.

    :param df: dataframe to which info is being transferred to. (Should have columns "NodeID" and "Date")
    :param failure_df: Failure info data (Should have columns "NodeID", "Last Oil" and "Finish Date")
    :param transfer_cols: Columns of `failure_df` which need to be transferred
    :return: Copy of `df` with the transferred columns added
    """
    merged = df.copy()
    cols = list(transfer_cols)
    for col in cols:
        # Everything starts as 'Normal' (even rows with NaN features). Object dtype
        # lets non-string failure values (e.g. a numeric run time) be written later.
        merged[col] = 'Normal'
        merged[col] = merged[col].astype(object)

    for i in failure_df.index:
        well = failure_df.loc[i, 'NodeID']
        t_start = failure_df.loc[i, 'Last Oil']
        # Failure dates carry no time of day (00:00:00), so the window is extended
        # to cover the whole finish day. The bound is exclusive: a row stamped at
        # midnight of the following day is not part of the failure.
        t_end = failure_df.loc[i, 'Finish Date'] + pd.Timedelta(days=1)
        in_window = (merged.NodeID == well) & (merged.Date >= t_start) & (merged.Date < t_end)
        if in_window.any():  # nothing to write when the well has no rows in the window
            merged.loc[in_window, cols] = failure_df.loc[i, cols].values

    return merged

In [16]:
# Using the single-well data here (data_well and failure_well).
# Any of the other datasets could be passed instead (e.g. data_list with failure_list).

fill_data = fill_null(data_well)  # Filling in NaNs where data was missing

# Failure columns copied onto the feature rows
transfer_col = ['Job Type', 'Job Bucket', 'Primary Symptom', 'Secondary Symptom']
data = failure_merge(fill_data, failure_well, transfer_col)

data.head()

Unnamed: 0,Date,NodeID,PPRL,MPRL,FluidLoadonPump,PumpIntakePressure,Job Type,Job Bucket,Primary Symptom,Secondary Symptom
0,2019-05-28 00:41:41,Autumn Wind State 5601 14-16B,31655.0,14863.0,9828.0,259.0,Normal,Normal,Normal,Normal
1,2019-05-28 02:50:19,Autumn Wind State 5601 14-16B,31998.0,14950.0,9559.0,402.0,Normal,Normal,Normal,Normal
2,2019-05-28 05:36:49,Autumn Wind State 5601 14-16B,32117.0,15030.0,9162.0,567.0,Normal,Normal,Normal,Normal
3,2019-05-28 07:18:16,Autumn Wind State 5601 14-16B,31959.0,14835.0,9945.0,241.0,Normal,Normal,Normal,Normal
4,2019-05-28 09:02:41,Autumn Wind State 5601 14-16B,32087.0,14747.0,8933.0,662.0,Normal,Normal,Normal,Normal


In [17]:
# Sanity check before the analysis: date range, row count and number of
# distinct 'Job Bucket' labels per well.
# Aggregations are given by name; passing the builtins `min`/`max` to `agg`
# is deprecated in recent pandas versions.
data.groupby('NodeID').agg({
    'Date': ['min', 'max', 'count'],
    'Job Bucket': ['nunique']
})

Unnamed: 0_level_0,Date,Date,Date,Job Bucket
Unnamed: 0_level_1,min,max,count,nunique
NodeID,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Autumn Wind State 5601 14-16B,2019-05-28 00:41:41,2020-08-15 15:33:14,5007,2


In [18]:
# Label counts after the merge. Add ('Primary Symptom value counts:', 'Primary Symptom')
# or the secondary symptom to the tuple below to inspect those columns as well.
for title, column in (("Job Type value counts:", 'Job Type'),
                      ("Job Bucket value counts:", 'Job Bucket')):
    print(title)
    display(data[column].value_counts())

Job Type value counts:


Normal         4994
TUBING LEAK      13
Name: Job Type, dtype: int64

Job Bucket value counts:


Normal    4994
TUBING      13
Name: Job Bucket, dtype: int64

# Visualizing 

## Well Specific Features and Failures

In [1]:
# imports
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px

In [20]:
# List every well present in the merged frame, one name per line.
print(
    'Unique wells in our Data:\n-------',
    *data['NodeID'].unique(),
    sep='\n'
)

Unique wells in our Data:
-------
Autumn Wind State 5601 14-16B


In [21]:
def plot_features(df, well_name, fail_col, feature_cols, mov_avg=None):
    """
    Draw the feature time series of one well together with its failure periods.

    Features are drawn as lines on the primary y-axis. Every label found in
    `fail_col` other than 'Normal' is drawn on the secondary y-axis as a filled
    0/1 step trace.

    :param df: The data frame we need to use (needs "NodeID" and "Date" columns)
    :param well_name: Name of the well
    :param fail_col: Failure Column to be considered
    :param feature_cols: Columns to plot as features (Should be numerical)
    :param mov_avg: Window passed to `rolling`; when given, the rolling mean of the
                    features is plotted instead of the raw values (Default: None)
    :return: Whatever `fig.show()` returns
    """
    # Rows of the requested well only
    well_df = df[df.NodeID == well_name].reset_index(drop=True)

    # Failure labels present for this well, 'Normal' excluded
    labels = well_df[fail_col].unique()
    failure_labels = labels[labels != 'Normal']

    # Time-indexed features, smoothed when a window is given
    feature_df = well_df.set_index('Date')
    if mov_avg is not None:
        feature_df = feature_df[feature_cols].rolling(mov_avg).mean()

    # Secondary y-axis carries the failure traces
    fig = make_subplots(specs=[[{"secondary_y": True}]])

    for col in feature_cols:
        feature_trace = go.Scatter(x=feature_df.index, y=feature_df[col], mode='lines', name=col)
        fig.add_trace(feature_trace, secondary_y=False)

    for label in failure_labels:
        # 1 where the row carries this label, 0 elsewhere
        indicator = well_df[fail_col].map(lambda value: 1 if value == label else 0)
        failure_trace = go.Scatter(
            x=well_df.Date,
            y=indicator,
            line={'width': 0, 'shape': 'hv'},
            fill='tozerox',
            name=label,
        )
        fig.add_trace(failure_trace, secondary_y=True)

    fig.update_xaxes(rangeslider_visible=True)
    fig.update_layout(template="seaborn", title=well_name + " with MA of :" + str(mov_avg), autosize=True)
    fig.update_yaxes(title_text="Features (KPI)", secondary_y=False)
    fig.update_yaxes(title_text="Failure", secondary_y=True)

    return fig.show()

In [25]:
# Plotting
# Change the parameters below to plot another well or failure column
plot_params = dict(
    well_name='Johnsrud 5198 12-18 10T',
    fail_col='Job Bucket',
    feature_cols=['PPRL', 'MPRL', 'FluidLoadonPump', 'PumpIntakePressure'],
    mov_avg='12H',
)

plot_features(df=data, **plot_params)

## Creating Prediction Window

In [26]:
def create_prediction_zones(df, fail_col, prediction_zone_dict):
    """
    Depending on the prediction_zone_dict will create predictions zones for failures 
    in the Failure column.
    :param df: The dataframe to extract it from
    :param fail_col: Failure column to use from the dataframe
    :param prediction_zone_dict: A dict with timedeltas for each type of Failure in fail_col
    :return Will return a Series or an Array of these Prediction Zones
    """
    
    test_data = df[['NodeID', 'Date', fail_col]].copy()
    fail_zones = test_data[fail_col]  # fail_zones will be initialized as a copy of the fail col
    
    # Getting start of predictions from fail col
    fail_dates = test_data[test_data[fail_col] != 'Normal']  # everthing other than normal is considered as a prediction
    fail_start = fail_dates[fail_dates.Date.diff().abs().fillna(pd.Timedelta('10D')) > pd.Timedelta('1d 12H')]
    fail_start.reset_index(inplace=True, drop=True)
    
    # Adding zones by iterating over each prediction start date
    for i in fail_start.index:
        temp_well = fail_start.loc[i, 'NodeID']  # well name
        zone_end_date = fail_start.loc[i, 'Date']  # prediction start date
        fail = fail_start.loc[i, fail_col]  # actual prediction class
        zone_delta = pd.Timedelta(prediction_zone_dict[fail])  # delta to subtract from the dictionary
        zone_start_date = zone_end_date - zone_delta

        bool_ = (test_data.NodeID == temp_well) & (test_data.Date < zone_end_date) & (test_data.Date >= zone_start_date)
        fail_zones[bool_] = 'fz_' + fail
        
    return fail_zones

We can use the function `create_prediction_zones` to create prediction zones for specific failures. The window for each failure type is specified as a dict:
```
pred_zone_dict = {
    'PUMP': '7 days',
    'ROD': '7 days',
    'TUBING': '7 day'
}
```

Using the function `plot_features`, these zones can be visualized on a well-specific basis.

In [32]:
# Create the prediction zones.
# The function returns a pandas Series that can be stored
# as a column of the main dataframe.
pred_zone_dict = dict(PUMP='7 days', ROD='7 days', TUBING='15 day')

pred_zone = create_prediction_zones(
    df=data,
    fail_col='Job Bucket',
    prediction_zone_dict=pred_zone_dict,
)

pred_zone.value_counts()

Normal       4873
fz_TUBING     132
TUBING          2
Name: Job Bucket, dtype: int64

In [33]:
# Visualising the prediction zones:
# store them as a column of the dataframe and pass that column as `fail_col`.
data['Pred Zones'] = pred_zone  # added to the main dataframe

# Play around with the wells
plot_params = dict(
    well_name='Autumn Wind State 5601 14-16B',
    fail_col='Pred Zones',
    feature_cols=['PPRL', 'MPRL', 'FluidLoadonPump', 'PumpIntakePressure'],
    mov_avg='12H',
)

plot_features(df=data, **plot_params)

## Clustering Prediction Zones

In [307]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [308]:
def plot_clusters(df, features, label_col, n_components=2):
    """
    Project the selected features onto principal components and scatter-plot
    them, coloured by `label_col`.

    Rows with a NaN in any selected feature or in the label are dropped, and the
    features are passed through `StandardScaler` before the PCA is fitted.

    :param df: The Dataframe which is used
    :param features: Column names as an array from df (the argument is not modified)
    :param label_col: Column to be considered as label (Color Dist)
    :param n_components: Number of principal components to plot, 2 or 3
    :return: None; the figure is shown as a side effect. For any other
             `n_components` a hint is printed and nothing is plotted.
    """
    # Validate first so nothing is fitted for an unsupported number of components
    if n_components not in (2, 3):
        return print("Choose n_components as 2 or 3")

    # Build a new column list: appending to `features` would change the caller's
    # list and add the label again on every call.
    columns = list(features) + [label_col]

    # Drop Nan's and set up data
    pca_data = df[columns].dropna()
    labels = pca_data[label_col]
    X = pca_data.drop(columns=label_col)
    X_scaled = StandardScaler().fit_transform(X)

    # PCA
    pca = PCA(n_components=n_components)
    components = pca.fit_transform(X_scaled)
    total_var = pca.explained_variance_ratio_.sum() * 100

    if n_components == 2:
        fig = px.scatter(components, x=0, y=1, color=labels)
        fig.update_layout(template="seaborn", title=f'Total Explained Variance: {total_var:.2f}%', autosize=True)
        return fig.show()

    fig = px.scatter_3d(
        components, x=0, y=1, z=2, color=labels,
        title=f'Total Explained Variance: {total_var:.2f}%',
        labels={'0': 'PC 1', '1': 'PC 2', '2': 'PC 3'}
    )
    return fig.show()
    

In [314]:
# Two-component projection of the four features, coloured by the 'Pred Zones' column
plot_clusters(df=data, 
              features=['PPRL', 'MPRL', 'FluidLoadonPump', 'PumpIntakePressure'], 
              label_col='Pred Zones', 
              n_components=2)

# Coding Challenge Data

In [17]:
# Importing Failure Data
# The workbook location can be overridden with the FAIL_INFO_PATH environment
# variable so the notebook is not tied to one machine; the original local path
# stays as the default.
fail_loc = os.environ.get(
    'FAIL_INFO_PATH',
    r"C:\Users\rai_v\OneDrive\Python Coursera\local-data\Coding Challenge Wells_ver2.xlsx",
)
fail_info = pd.read_excel(fail_loc, parse_dates=['LAST OIL - FAILURE START', 'LOE FINISH DATE'])

# Align the column names with the ones failure_merge expects
cols_rename = {
    'Well': 'NodeID',
    'LAST OIL - FAILURE START': 'Last Oil',
    'LOE FINISH DATE': 'Finish Date'
}
fail_info = fail_info.rename(columns=cols_rename)
display(fail_info.head())
display(fail_info.Components.value_counts())
wells = list(fail_info.NodeID.unique())  # Wells we will use

Unnamed: 0,NodeID,Last Oil,Finish Date,Run time (days),Components
0,Moore 5304 13-1H,2020-05-13,2020-06-05,260,Rod - Pin
1,Nelson 24-13H,2020-02-24,2020-02-27,271,Rod - Pin
2,Susie 15-22H,2020-02-15,2020-02-25,1281,Rod - Pin
3,Thornburgh 6092 44-15H,2020-01-09,2020-01-31,1344,Rod - Pin
4,McCauley 5501 14-3 2B,2019-12-21,2019-12-27,157,Rod - Pin


Rod - Pin                 5
Pump - Barrel             5
Pump - Traveling Valve    5
Pump - On - Off Tool      5
Pump - Plunger            5
Pump - Stuck Pump         5
Rod - Main Body           5
Polish Rod                5
Pump - Standing Valve     5
Rod - Coupling            5
Name: Components, dtype: int64

In [38]:
%%time
# Query the main data
# Render the well names as a comma-separated list of SQL string literals.
# Formatting a Python tuple straight into the query is fragile: a one-element
# tuple renders as ('x',) (invalid SQL) and a name containing an apostrophe is
# repr'd with double quotes, which SQL reads as an identifier.
well_literals = ", ".join("'{}'".format(str(name).replace("'", "''")) for name in wells)

query = """
select 
    "NodeID",
    "Date",
    "PPRL",
    "MPRL",
    "FluidLoadonPump",
    "PumpIntakePressure"
from xspoc.xdiag
where "NodeID" in ({})
order by "NodeID", "Date"
""".format(well_literals)

with lib_aws.PostgresRDS(db='oasis-prod') as engine:
    # parse_dates takes a list (or dict) of column names; a bare string only
    # matched "Date" through substring comparison.
    data = pd.read_sql(query, engine, parse_dates=['Date'])

# Keep rows with all three load features present, one row per (well, timestamp)
data = (
    data
    .dropna(subset=['PPRL', 'MPRL', 'FluidLoadonPump'])
    .drop_duplicates(subset=['NodeID', 'Date'])
    .reset_index(drop=True)
)

data.head()

Wall time: 17.7 s


Unnamed: 0,NodeID,Date,PPRL,MPRL,FluidLoadonPump,PumpIntakePressure
0,Andre 5501 13-4H,2019-06-05 06:33:55,26845.0,14880.0,6752.0,115.0
1,Andre 5501 13-4H,2019-06-05 08:22:05,26868.0,14827.0,6261.0,393.0
2,Andre 5501 13-4H,2019-06-05 10:09:01,26897.0,14662.0,5917.0,587.0
3,Andre 5501 13-4H,2019-06-05 11:55:58,26723.0,14660.0,5948.0,570.0
4,Andre 5501 13-4H,2019-06-05 13:42:19,27065.0,14582.0,6285.0,379.0


In [39]:
# Transferring the failure info onto the feature rows
data = fill_null(data)  # placeholder rows for the days without data
transfer_col = ['Components', 'Run time (days)']
data = (
    failure_merge(data, fail_info, transfer_col)
    .drop(columns=['Run time (days)'])
    .rename(columns={'Components': 'Failure'})
)

data.head()

Unnamed: 0,Date,NodeID,PPRL,MPRL,FluidLoadonPump,PumpIntakePressure,Failure
0,2019-06-05 06:33:55,Andre 5501 13-4H,26845.0,14880.0,6752.0,115.0,Normal
1,2019-06-05 08:22:05,Andre 5501 13-4H,26868.0,14827.0,6261.0,393.0,Normal
2,2019-06-05 10:09:01,Andre 5501 13-4H,26897.0,14662.0,5917.0,587.0,Normal
3,2019-06-05 11:55:58,Andre 5501 13-4H,26723.0,14660.0,5948.0,570.0,Normal
4,2019-06-05 13:42:19,Andre 5501 13-4H,27065.0,14582.0,6285.0,379.0,Normal


In [41]:
# Write the labelled frame to S3 as CSV, without the index column.
# NOTE(review): the bucket name suggests it is public — confirm this data may be shared there.
data.to_csv('s3://enfinite-public/sample_data/forecasting_test_data.csv', index=False)

In [42]:
# Read the same CSV back from S3, parsing "Date" as datetime — presumably a round-trip check of the export.
data_test = pd.read_csv('s3://enfinite-public/sample_data/forecasting_test_data.csv', parse_dates=['Date'])

In [43]:
# Preview the first rows of the reloaded frame
data_test.head()

Unnamed: 0,Date,NodeID,PPRL,MPRL,FluidLoadonPump,PumpIntakePressure,Failure
0,2019-06-05 06:33:55,Andre 5501 13-4H,26845.0,14880.0,6752.0,115.0,Normal
1,2019-06-05 08:22:05,Andre 5501 13-4H,26868.0,14827.0,6261.0,393.0,Normal
2,2019-06-05 10:09:01,Andre 5501 13-4H,26897.0,14662.0,5917.0,587.0,Normal
3,2019-06-05 11:55:58,Andre 5501 13-4H,26723.0,14660.0,5948.0,570.0,Normal
4,2019-06-05 13:42:19,Andre 5501 13-4H,27065.0,14582.0,6285.0,379.0,Normal
