# A notebook for Checking Available Data
## Identify wells, tops, & curves that can be used in model

------------------------

## Goals for this notebook
- 1.A. Code to identify what tops are common enough to be included
- 1.B. Code to identify what curves are common enough to be included
- 1.C. Create list of wells that include necessary tops and curves based on two steps listed above
- 1.D Document what wells were not included in training and why!
- 1.E. Write to a file a initial dataset of wells to include and tops to include.

## THINGS YOU MIGHT NEED TO CHANGE IN THIS NOTEBOOK!
1. Links to the various files!
2. Decide what the minimum number of tops that need to be present before you can work with that top!
3. Decide what the minimum number of wells that have a curve name need to be before that curve is included as a required curve to include a well in the dataset you'll work with.

## Classes & Purpose
1. InputData Class = 
2. TopsAvailable Class = 
3. CurveAvailable Class = 
4. ComboAvailable Class = 
5. ConfigurationMemory Class = Class that keeps track of configuraton choices and names/paths of intermediate files

# Import necessary libraries

In [1]:
import pandas as pd
import numpy as np
import itertools
import matplotlib.pyplot as plt
%matplotlib inline
import welly
from welly import Well
import lasio
import glob
import pickle
import math

  return f(*args, **kwds)
  return f(*args, **kwds)


In [2]:
import os
env = %env

In [3]:
from IPython.display import display
#### Had to change display options to get this to print in full!
# pd.set_option('display.height', 1000)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.options.display.max_colwidth = 100000

In [151]:
class input_data():
    """doc string"""
    def __init__(self, picks_file_path, picks_delimiter_str,path_to_logs_str):
        #### Default initiation = ('../../../SPE_006_originalData/OilSandsDB/PICKS.TXT','\t','../../../SPE_006_originalData/OilSandsDB/Logs/*.LAS')
        #### Only things that are mandatory on initiation are below
        self.picks_file_path = picks_file_path #### example = '../../../SPE_006_originalData/OilSandsDB/PICKS.TXT'
        self.picks_delimiter_str = picks_delimiter_str  #### example = '\t'
        self.picks_df = pd.read_csv(picks_file_path,delimiter=picks_delimiter_str)
        self.logs_path_to_folder = path_to_logs_str #### example = '../../../SPE_006_originalData/OilSandsDB/Logs/*.LAS'
        #### non-mandatory attributes, defaults should work for the example dataset. Can be changed with set functions below
        self.wells_file_path = '../../../SPE_006_originalData/OilSandsDB/WELLS.TXT'
        self.wells_file_path_delimiter = '\t'
        self.gis_file_path = '../../../well_lat_lng.csv'
        self.gis_file_path_delimiter = ','
        wells_wTopsCuves_toLoad = 'WellNamesWithGivenTopsCurves_defaultFileName.pkl'
        #### for logs
        self.las_folder_path = '../../../SPE_006_originalData/OilSandsDB/Logs/'
        self.well_format = '.LAS'
        #### Technically optional but often used. 
        #### GIS file is mandatory if you want to use information from nearby wells or well's general location.
        self.wells_df = None
        self.gis_df = None

    def load_wells_file(self):
        """ load wells file into pandas dataframe """
        self.wells_df = pd.read_csv(self.wells_file_path,delimiter=self.wells_file_path_delimiter)
        return self.wells_df
    
    def load_gis_file(self):
        """ load wells file into pandas dataframe """
        self.gis_df = pd.read_csv(self.gis_file_path,delimiter=self.gis_file_path_delimiter)
        return self.gis_df
        
    def set_wells_file_path(self,wells_file_path_str,wells_file_delimiter):
        """ set wells file path as attribute of object and returns wells data frame using load_well_file. Can be txt, tsv, or csv"""
        self.wells_file_path = wells_file_path_str
        self.wells_file_path_delimiter = wells_file_delimiter
        return self.load_wells_file()
    
    def set_gis_file_path(self,gis_file_path_str,gis_file_path_delimiter):
        """ set wells file path as attribute of object and returns wells data frame using load_well_file. Can be txt, tsv, or csv"""
        self.gis_file_path = gis_file_path_str
        self.gis_file_path_delimiter = gis_file_path_delimiter
        return self.load_gis_file()

In [152]:
input_data_inst = input_data('../../../SPE_006_originalData/OilSandsDB/PICKS.TXT','\t','../../../SPE_006_originalData/OilSandsDB/Logs/*.LAS')

In [10]:
input_data_inst.gis_df = "test_string"

In [11]:
input_data_inst.gis_df 

'test_string'

In [12]:
input_data_inst.set_wells_file_path('../../../SPE_006_originalData/OilSandsDB/WELLS.TXT','\t')

Unnamed: 0,SitID,UWI (AGS),UWI
0,102496,0674010812000,00/12-08-067-01W4/0
1,102497,0674020807000,00/07-08-067-02W4/0
2,102498,0674021109000,00/09-11-067-02W4/0
3,102500,0674022910000,00/10-29-067-02W4/0
4,102501,0674023406000,00/06-34-067-02W4/0
5,102503,0674030411000,00/11-04-067-03W4/0
6,102505,0674030810000,00/10-08-067-03W4/0
7,102507,0674031410000,00/10-14-067-03W4/0
8,102514,0674032810000,00/10-28-067-03W4/0
9,102517,0674033607000,00/07-36-067-03W4/0


In [13]:
input_data_inst.set_gis_file_path('../../../well_lat_lng.csv',',')

Unnamed: 0,SitID,UWI (AGS),UWI,HorID,Pick,Quality,lat,lng
0,102496,0674010812000,00/12-08-067-01W4/0,13000,475,3,54.785907,-110.129320
1,102497,0674020807000,00/07-08-067-02W4/0,13000,515,3,54.782284,-110.269446
2,102498,0674021109000,00/09-11-067-02W4/0,13000,480,3,54.785892,-110.186851
3,102500,0674022910000,00/10-29-067-02W4/0,13000,549,3,54.829624,-110.269422
4,102501,0674023406000,00/06-34-067-02W4/0,13000,529,2,54.840471,-110.224832
5,102503,0674030411000,00/11-04-067-03W4/0,13000,488.5,2,54.771449,-110.402983
6,102505,0674030810000,00/10-08-067-03W4/0,13000,501.5,2,54.785901,-110.422131
7,102507,0674031410000,00/10-14-067-03W4/0,13000,553.5,2,54.800533,-110.345762
8,102514,0674032810000,00/10-28-067-03W4/0,13000,493.5,3,54.829633,-110.396621
9,102517,0674033607000,00/07-36-067-03W4/0,13000,536.5,2,54.840441,-110.320301


In [14]:
input_data_inst.picks_df.head()

Unnamed: 0,SitID,HorID,Pick,Quality
0,102496,1000,321.0,1
1,102496,2000,,-1
2,102496,3000,,-1
3,102496,4000,,-1
4,102496,5000,438.0,2


## ConfigurationMemory Class

In [32]:
class configuration():
    """
    class to keep configuration variables you might change between runs. 
    Types of information information stored in here would include paths and names of intermediate files, 
    so not initial input files which go in the input_data class, as well as which tops or curves were mandatory in well select.
    """
    def __init__(self):
        #### intermediate files and paths
        self.csv_of_well_names_wTopsCuves__name = ''
        self.csv_of_well_names_wTopCurves__path = '.'
        #### Choices
        self.must_have_curves_list = [''] # ['ILD', 'NPHI', 'GR', 'DPHI', 'DEPT']
        self.must_have_tops__list = ['']
        #### Column string names
        self.top_name_col_in_picks_df = '' 
        self.siteID_col_in_picks_df = ''
        self.quality_col_name_in_picks_df = "Quality"
        self.picks_depth_col_in_picks_df = 'Pick'
        self.quality_items_to_skip__list = [-1,0]
        self.test = "test0"
    
    #### only keep wells that have these curves
    def set_must_have_curves(self,must_have_curves_in_list):
        """doc string goes here"""
        self.must_have_curves_list = must_have_curves_in_list
        print("must have curve list is: ",must_have_curves_in_list)
        
    def get_must_have_curves(self,must_have_curves_in_list):
        """doc string goes here"""
        return self.must_have_curves_list
    
    #### only keep wells that have these tops 
    def set_must_have_tops__list(self,must_have_tops__list):
         #[13000,14000]
        self.must_have_tops__list = must_have_tops__list
        print("set must_have_tops_list as: ",self.must_have_tops__list)
    
    def get_must_have_tops__list(self):
         #[13000,14000]
        return self.must_have_tops__list
    
    def set_quality_items_to_skip__list(self,quality_items_to_skip__list):
        self.quality_items_to_skip__list = quality_items_to_skip__list
        print("set quality_items_to_skip__list as: ",quality_items_to_skip__list)
        
    def get_quality_items_to_skip__list(self):
        return self.quality_items_to_skip__list
      
        
    #### column names in picks_df
    def set_top_name_col_in_picks_df(self,top_name_col_in_picks_df__str):
        self.top_name_col_in_picks_df = top_name_col_in_picks_df__str
        print(" set self.top_name_col_in_picks_df as: ",top_name_col_in_picks_df__str)
        
    def set_siteID_col_in_picks_df(self,sitID__str):
        self.siteID_col_in_picks_df = sitID__str
        print(" set siteID_col_in_picks_df as: ",self.siteID_col_in_picks_df)
    
    def get_siteID_col_in_picks_df(self):
        return self.siteID_col_in_picks_df
    
    def get_top_name_col_in_picks_df(self):
        return self.top_name_col_in_picks_df
    
    def set_quality_col_name_in_picks_df(self,Quality__str):
        self.quality_col_name_in_picks_df = Quality__str
        
    def get_quality_col_name_in_picks_df(self):
        return self.quality_col_name_in_picks_df
    
    def set_picks_depth_col_in_picks_df(self,picks_depth_col_in_picks_df):
        self.picks_depth_col_in_picks_df = picks_depth_col_in_picks_df
        
    def get_picks_depth_col_in_picks_df(self):
        return self.picks_depth_col_in_picks_df
    
   

In [33]:
config = configuration()

In [34]:
config.set_must_have_curves(['ILD', 'NPHI', 'GR', 'DPHI', 'DEPT'])

must have curve list is:  ['ILD', 'NPHI', 'GR', 'DPHI', 'DEPT']


In [35]:
config.set_must_have_tops__list([13000,14000])

set must_have_tops_list as:  [13000, 14000]


In [36]:
config.get_must_have_tops__list()

[13000, 14000]

In [37]:
config.set_top_name_col_in_picks_df('HorID')

 set self.top_name_col_in_picks_df as:  HorID


In [38]:
config.set_siteID_col_in_picks_df('SitID')

 set siteID_col_in_picks_df as:  SitID


In [39]:
config.set_siteID_col_in_picks_df('SitID')

 set siteID_col_in_picks_df as:  SitID


In [40]:
config.test = "test1"

In [46]:
config.test

'test2'

## Establish names of key files to write

In [433]:
#wells_wTopsCuves_toLoad = 'WellNamesWithGivenTopsCurves_20180927_vC.pkl'

## Key Variables

In [434]:
#curvesMustHave = ['ILD', 'NPHI', 'GR', 'DPHI', 'DEPT']

## Import Initial Data

#### YOU'LL WANT TO CHANGE THESE LINKS IF YOU USE DIFFERENT DATA OR CHANGE LOCATION OF NOTEBOOK OR DATA

In [435]:
#picks_dic = pd.read_csv('../../../SPE_006_originalData/OilSandsDB/PICKS_DIC.TXT',delimiter='\t')
#picks = pd.read_csv('../../../SPE_006_originalData/OilSandsDB/PICKS.TXT',delimiter='\t')
#wells = pd.read_csv('../../../SPE_006_originalData/OilSandsDB/WELLS.TXT',delimiter='\t')
#gis = pd.read_csv('../../../well_lat_lng.csv')

In [436]:
picks.head()

Unnamed: 0,SitID,HorID,Pick,Quality
0,102496,1000,321.0,1
1,102496,2000,,-1
2,102496,3000,,-1
3,102496,4000,,-1
4,102496,5000,438.0,2


In [437]:
#picks_dic

In [438]:
#wells.head()

In [439]:
#gis.head()

## Question 1: How many wells are included for each top? 

In [100]:

class TopsAvailable():
    """
    Class that uses the configuration class and data_inpunt class objects and additional 
    user input to find out the number of wells of those available that have the tops we want.
    """
    def __init__(self,input_data_obj,configuration_obj):
        """doc string goes here"""
        #### intermediate files and paths
        self.input = input_data_obj
        self.config = configuration_obj
        ####
        self.picks_df_noNullPicks = "nothing here yet, run take_out_wells_with_no_tops or set_picks_df_noNullPicks"
        self.wells_wAny_tops__list = "nothing here yet"
        
        
    def find_unique_tops_list(self):
        """doc string goes here"""
        unique_tops_list = self.input.picks_df[self.config.get_top_name_col_in_picks_df()].unique()
        print(": unique_tops_list",unique_tops_list)
        return unique_tops_list
        
    def get_must_have_tops(self):
        """doc string goes here"""
        return self.config.get_must_have_tops__list()
        #print("must have top list is: ",must_have_curves_in_list)
        
    def take_out_wells_with_no_tops(self):
        """
        function is defined to take in a picks_df and 
        exclude any wells that have no picks or are flagged as very bad quality.
        This function assumes a data structure that is
        """
        #### THIS FUNCTION ASSUMES SOME STRUCTURES THAT MIGHT NOT EXIST IN YOUR PROJECT
        #### YOU MAY HAVE TO DO THIS A DIFFERENT WAY
        print('THIS FUNCTION ASSUMES SOME STRUCTURES THAT MIGHT NOT EXIST IN YOUR PROJECT. It should work find with Mannville default data')
        #### produces dataframe with no picks that have a value of zero
        noZeroPicks = self.input.picks_df[self.input.picks_df.Pick != 0]
        #### produces dataframe that doesn't have any picks with a quality of negative one, meaning not to be trusted or present
        noNullPicks = noZeroPicks[noZeroPicks.Quality != -1]
        self.picks_df_noNullPicks = noNullPicks
        
    def get_picks_df_noNullPicks(self):
        """doc string goes here"""
        return self.picks_df_noNullPicks

    
 #### THIS ONE NEEDS EDITING AS IT REQUIRES ARGUMENTS THAT MAY NOT BE NEEDED LIKE THIS????????   
    def set_picks_df_noNullPicks(self,picks_df_noNullPicks):
        """doc string goes here"""
        if(str(type(picks_df_noNullPicks)) == "<class 'pandas.core.frame.DataFrame'>"):
            self.picks_df_noNullPicks = picks_df_noNullPicks
            print("set picks_df_noNullPicks attribute as ",picks_df_noNullPicks)
        else:
            raise ValueError("Argument picks_df_noNullPicks should be type dataframe")
            
    def get_df_of_top_counts_in_picks_df(self):
        """doc string goes here"""
        #### produces dataframe of horID and counts of non-zero,non-null picks
        pick_counts = self.picks_df_noNullPicks.groupby(self.config.get_top_name_col_in_picks_df())[self.config.siteID_col_in_picks_df].count()
        return pick_counts
    
    def get_df_wells_with_any_top(self):
        """doc string goes here"""
        #### The total number of wells with any sort of pick is:
        self.wells_wAny_tops__list = self.picks_df_noNullPicks[self.config.siteID_col_in_picks_df].unique()
        return self.wells_wAny_tops__list
    
    def get_number_wells_with_any_top(self):
        """doc string goes here"""
        if type(self.wells_wAny_tops__list) == str:
            self.get_df_wells_with_any_top()
            if type(self.wells_wAny_tops__list) == str:
                raise ValueError("expected self.wells_wAny_tops__list to be array but is type str, please run get_df_wells_with_any_top function")
            else:
                return len(self.wells_wAny_tops__list)
        else:
            return len(self.wells_wAny_tops__list)
        

    def findWellsThatHaveCertainTop(self,top,quality_items_to_skip__list):
        #### Takes in top
        #### Returns a list of wells with the given top
        #print(top)
        if self.wells_wAny_tops__list == "nothing here yet":
            raise ValueError("self.wells_wAny_tops__list has not been populated properly yet. I'll run take_out_wells_with_no_tops() & get_df_wells_with_any_top()")            
        else:
            picks = self.picks_df_noNullPicks
            Quality = self.config.get_quality_col_name_in_picks_df()
            HorID = self.config.get_top_name_col_in_picks_df()
            SitID = self.config.get_siteID_col_in_picks_df()
            Pick = self.config.get_picks_depth_col_in_picks_df()
            #####   g
            for item in quality_items_to_skip__list:
                #print("Quality = ",Quality)
                #print("picks type",type(picks),"and picks = ",picks)
                picks = picks[picks[Quality] != item]
            rows_with_picks = picks
            #rows_with_picks = picks[picks.Quality != 0]
            #rows_with_picks = rows_with_picks[rows_with_picks.Quality != -1]
            #print(rows_with_picks[0:4])\

            test = rows_with_picks.loc[rows_with_picks[HorID] == top]
            test[Pick].replace('', np.nan, inplace=True)
            #print(test)
            rows_with_that_top = list(rows_with_picks.loc[rows_with_picks[HorID] == top].dropna()[SitID].unique())
            #print("before return",rows_with_that_top)
            return rows_with_that_top

    def findWellsWithAllTopsGive(self):
        #### Takes in a list of tops
        #### Returns a list of wells that include all of those tops. If only one top occurs, well is not included
        ###
        tops = self.config.get_must_have_tops__list()
        quality_items_to_skip__list = self.config.get_quality_items_to_skip__list()
        
        #
        list_of_wells_with_tops = []
        for top in tops:
            list_of_wells_with_tops.append(self.findWellsThatHaveCertainTop(top,quality_items_to_skip__list)) 
        #print(len(list_of_wells_with_tops))
        if len(list_of_wells_with_tops) == 0:
            raise ValueError("nothing in list_of_wells_with_tops, there should be at least one item! Something bad happened.")
        elif len(list_of_wells_with_tops) == 1:
            print("returning list of wells names that have the required tops. The length of list is :",len(list_of_wells_with_tops[0])," If this number is too small, consider changing the required tops in the configuration object.")
            return list_of_wells_with_tops[0]
        else:
            for eachlist in list_of_wells_with_tops[1:]:
                list_of_wells_with_tops =list(set(list_of_wells_with_tops[0]).intersection(eachlist))
            print("returning list of wells names that have the required tops. The length of list is :",len(list_of_wells_with_tops)," If this number is too small, consider changing the required tops in the configuration object.")
            return list_of_wells_with_tops
            #list_of_wells_with_tops =list(set(list_of_wells_with_tops[0]).intersection(list_of_wells_with_tops[1]))
    
    def run_all(self):
        unique_tops = self.find_unique_tops_list()
        print("The list of unique tops is: ",unique_tops)
        print("The list of required tops from the configuration object that was used as an argument are: ",self.get_must_have_tops())
        print("This will, of course, exclude wells with no tops.")
        self.take_out_wells_with_no_tops()
        print("The counts for each top in the dataset are: ")
        top_counts = self.get_df_of_top_counts_in_picks_df()
        print(top_counts)
        print("and the total number of wells with any tops is: ",self.get_number_wells_with_any_top())
        wells_with_tops = tops.findWellsWithAllTopsGive()
        print("The length of the list with all the names of the wells that have the required tops is",len(wells_with_tops), " and it will be what is returned")
        return wells_with_tops

In [79]:
tops = TopsAvailable(input_data_inst,config)

In [80]:
tops.config.test = "test2"

In [81]:
tops.config.test

'test2'

In [82]:
tops.find_unique_tops_list()

: unique_tops_list [ 1000  2000  3000  4000  5000  6000  7000  9000  9500 10000 11000 12000
 13000 14000]


array([ 1000,  2000,  3000,  4000,  5000,  6000,  7000,  9000,  9500,
       10000, 11000, 12000, 13000, 14000])

In [83]:
#input_data_.picks_df.HorID.unique()

In [84]:
#### THIS SHOULD PRODUCE AN ERROR, testing error return functionality of this function here
#tops.set_picks_df_noNullPicks("this is a string")

In [85]:
tops.take_out_wells_with_no_tops()

THIS FUNCTION ASSUMES SOME STRUCTURES THAT MIGHT NOT EXIST IN YOUR PROJECT. It should work find with Mannville default data


In [88]:
tops_counts = tops.get_df_of_top_counts_in_picks_df()

In [89]:
tops_counts

HorID
1000     1903
2000      517
3000      531
4000      597
5000     2188
6000      461
7000     2191
9000     2184
9500     2184
10000    2187
11000    2184
12000    2182
13000    2184
14000    2169
Name: SitID, dtype: int64

In [90]:
tops.get_number_wells_with_any_top()

2193

In [91]:
test = tops.findWellsWithAllTopsGive()

returning list of wells names that have the required tops. The length of list is : 1926  If this number is too small, consider changing the required tops in the configuration object.


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


In [92]:
len(test)

1926

In [93]:
new_tops = TopsAvailable(input_data_inst,config)

In [101]:
wells_with_required_tops = new_tops.run_all()

: unique_tops_list [ 1000  2000  3000  4000  5000  6000  7000  9000  9500 10000 11000 12000
 13000 14000]
The list of unique tops is:  [ 1000  2000  3000  4000  5000  6000  7000  9000  9500 10000 11000 12000
 13000 14000]
The list of required tops from the configuration object that was used as an argument are:  [13000, 14000]
This will, of course, exclude wells with no tops.
THIS FUNCTION ASSUMES SOME STRUCTURES THAT MIGHT NOT EXIST IN YOUR PROJECT. It should work find with Mannville default data
The counts for each top in the dataset are: 
HorID
1000     1903
2000      517
3000      531
4000      597
5000     2188
6000      461
7000     2191
9000     2184
9500     2184
10000    2187
11000    2184
12000    2182
13000    2184
14000    2169
Name: SitID, dtype: int64
and the total number of wells with any tops is:  2193
returning list of wells names that have the required tops. The length of list is : 1926  If this number is too small, consider changing the required tops in the configurat

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


In [102]:
wells_with_required_tops[0]

[106501,
 106503,
 106507,
 106508,
 106512,
 163860,
 106518,
 106519,
 122903,
 106524,
 106526,
 106527,
 106529,
 106532,
 106533,
 106534,
 106537,
 106538,
 106540,
 106544,
 106547,
 106548,
 106552,
 106553,
 106554,
 106555,
 106556,
 106558,
 114753,
 106562,
 114754,
 114755,
 114756,
 114758,
 106569,
 114761,
 163919,
 114768,
 163923,
 106580,
 114772,
 114773,
 163925,
 114779,
 114783,
 114784,
 114789,
 122982,
 106600,
 106602,
 114795,
 114799,
 106608,
 114801,
 114805,
 106617,
 114810,
 114811,
 106621,
 106626,
 114819,
 114822,
 114829,
 123024,
 114833,
 114834,
 163985,
 106644,
 106645,
 114836,
 114838,
 163986,
 106649,
 106650,
 114842,
 114843,
 114844,
 163993,
 114847,
 106656,
 114850,
 106659,
 106661,
 106662,
 106663,
 114855,
 106665,
 114857,
 106669,
 114861,
 114862,
 114864,
 106673,
 106674,
 114865,
 106676,
 114869,
 106679,
 106680,
 106681,
 106682,
 114871,
 114872,
 106685,
 114873,
 106687,
 114879,
 114887,
 106698,
 106700,
 106701,
 

In [103]:
vars(config)

{'csv_of_well_names_wTopsCuves__name': '',
 'csv_of_well_names_wTopCurves__path': '.',
 'must_have_curves_list': ['ILD', 'NPHI', 'GR', 'DPHI', 'DEPT'],
 'must_have_tops__list': [13000, 14000],
 'top_name_col_in_picks_df': 'HorID',
 'siteID_col_in_picks_df': 'SitID',
 'quality_col_name_in_picks_df': 'Quality',
 'picks_depth_col_in_picks_df': 'Pick',
 'quality_items_to_skip__list': [-1, 0],
 'test': 'test2'}

In [108]:
vars(new_tops.config)

{'csv_of_well_names_wTopsCuves__name': '',
 'csv_of_well_names_wTopCurves__path': '.',
 'must_have_curves_list': ['ILD', 'NPHI', 'GR', 'DPHI', 'DEPT'],
 'must_have_tops__list': [13000, 14000],
 'top_name_col_in_picks_df': 'HorID',
 'siteID_col_in_picks_df': 'SitID',
 'quality_col_name_in_picks_df': 'Quality',
 'picks_depth_col_in_picks_df': 'Pick',
 'quality_items_to_skip__list': [-1, 0],
 'test': 'test2'}

each class should be configuration & objects. Configuration should include any flags, variables, file paths or other configuration. Input should be dicts, objects, dataframes passed in

We should try to only have one function edit one attribute if at all possible.

output in line above is well ID

In [510]:
#picks_dic

In [511]:
#listOfTops = picks_dic.HorID.unique()

In [512]:
#listOfTops

### We'll use the fact that absent picks are categorized as -1 in terms of quality to exclude those and then count the rest that remain. You input files might require a different methodology!

In [513]:
# #### produces dataframe with no picks that have a value of zero
# noZeroPicks = picks[picks.Pick != 0]
# #### produces dataframe that doesn't have any picks with a quality of negative one, meaning not to be trusted or present
# noNullPicks = noZeroPicks[noZeroPicks.Quality != -1]
# #### produces dataframe of horID and counts of non-zero,non-null picks
# pick_counts = noNullPicks.groupby('HorID').SitID.count()

In [514]:
# pick_counts

In [515]:
# picks

In [516]:
#### The total number of wells with any sort of pick is:
# wells_with_picks_array = picks.SitID.unique()
# print("number of wells with picks of some sort is: ",len(wells_with_picks_array))

### A human decision is required to determine the minimum umber of tops needed to do anything with that top. For our purposes, we'll limit to those with at least 1900

### We're most interested in wells that have both the Top and Base McMurry picks, so let's see how many wells have both and get that list of wells.

In [517]:
# topsMustHave = [13000,14000]

#### Idea for this task:
- Make a list of wells for each top in the topsMustHave list
- Find the wells that exist in all of the lists

In [534]:
def findWellsThatHaveCertainTop(top):
    #### Takes in top
    #### Returns a list of wells with the given top
    #print(top)
    rows_with_picks = picks[picks.Quality != 0]
    rows_with_picks = rows_with_picks[rows_with_picks.Quality != -1]
    #print(rows_with_picks[0:4])\
    
    test = rows_with_picks.loc[rows_with_picks['HorID'] == top]
    test['Pick'].replace('', np.nan, inplace=True)
    print(test)
    rows_with_that_top = list(rows_with_picks.loc[rows_with_picks['HorID'] == top].dropna().SitID.unique())
    #print("before return",rows_with_that_top)
    return rows_with_that_top

In [535]:
def findWellsWithAllTopsGive(tops):
    #### Takes in a list of tops
    #### Returns a list of wells that include all of those tops. If only one top occurs, well is not included
    list_of_wells_with_tops = []
    for top in tops:
        list_of_wells_with_tops.append(findWellsThatHaveCertainTop(top))
    print(len(list_of_wells_with_tops))
    list_of_wells_with_tops =list(set(list_of_wells_with_tops[0]).intersection(list_of_wells_with_tops[1]))
    return list_of_wells_with_tops

In [536]:
# wells_with_all_given_tops = findWellsWithAllTopsGive(topsMustHave)

In [537]:
# len(wells_with_all_given_tops)

In [538]:
# wells_with_all_given_tops

------------------------

# Import the logs and see how common each curve name is

In [163]:
class TopsAvailable():
    """
    Class that uses the configuration class and data_inpunt class objects and additional 
    user input to find out the number of wells of those available that have the tops we want.
    """
    def __init__(self,input_data_obj,configuration_obj):
        """doc string goes here"""
        #### intermediate files and paths
        self.input = input_data_obj
        self.config = configuration_obj
        ####
        self.full_well_path = las_path+'*'+well_format
        self.objectOfCurves = None # populates from findAllCurvesInGivenWells()
        #self.picks_df_noNullPicks = "nothing here yet, run take_out_wells_with_no_tops or set_picks_df_noNullPicks"
        #self.wells_wAny_tops__list = "nothing here yet"
        #### self.input.las_folder_path
        #### self.input.well_format
        
    def findAllCurvesInGivenWells(self):
        objectOfCurves = {}
        las_folder_path = self.input.las_folder_path
        well_format = self.input.well_format
        path = las_folder_path+'*'+well_format
        print("loading all wells in the path = ",path)
        for fn in glob.glob(path):
            las = lasio.read(fn, ignore_data=True)
            mnemonics = [c.mnemonic for c in las.curves]
            fnShort = fn.replace(las_folder_path,"")
            objectOfCurves[fnShort] = mnemonics
        #print(fn + '\n\t' + '\n\t'.join(mnemonics))
        self.objectOfCurves = objectOfCurves
        return objectOfCurves
    
    def countsOfCurves(self):
        if self.objectOfCurves == None:
            self.findAllCurvesInGivenWells()
        else:
            pass
        listOfListOfCurves = objectOfCurves.values()
        startList = []
        for listI in listOfListOfCurves:
            startList = startList+listI
        uniq_CurvesList = set(startList)
        countsOfCurves = {}
        for eachCurve in uniq_CurvesList:
            countsOfCurves[eachCurve] = startList.count(eachCurve)
        print("counts of curves are: ",countsOfCurves)
        return countsOfCurves
    
    
    def run_all(self):
        """
        runs all included functions and returns a ____ of wells with the requested log curves
        """
        
        
        return wells_with_requested_logs

In [164]:
topsInst = TopsAvailable(input_data_inst,config)

In [165]:
countsOfCurves = topsInst.countsOfCurves()

loading all wells in the path =  ../../../SPE_006_originalData/OilSandsDB/Logs/*.LAS


Header section Parameter regexp=~P was not found.


counts of curves are:  {'IL': 2, 'GR:1': 1, 'RT': 1, 'GR:2': 1, 'DPHI:1': 1, 'DT': 14, 'GR': 2169, 'DPHI': 1917, 'LITH': 1, 'LLD': 2, 'DENS': 4, 'DEPT': 2164, 'DPHI:2': 1, 'SN': 1, 'PHID': 8, 'LLS': 1, 'ILD:2': 1, 'SFL': 3, 'PHIN': 4, 'SNP': 2, 'RESD': 6, 'RHOB': 132, 'ILM': 6, 'COND': 3, 'DEPTH': 7, 'SP': 14, 'NPHI': 2008, 'CALI': 783, 'SFLU': 6, 'ILD:1': 1, 'DELT': 98, 'ILD': 2154}


In [166]:
countsOfCurves

{'IL': 2,
 'GR:1': 1,
 'RT': 1,
 'GR:2': 1,
 'DPHI:1': 1,
 'DT': 14,
 'GR': 2169,
 'DPHI': 1917,
 'LITH': 1,
 'LLD': 2,
 'DENS': 4,
 'DEPT': 2164,
 'DPHI:2': 1,
 'SN': 1,
 'PHID': 8,
 'LLS': 1,
 'ILD:2': 1,
 'SFL': 3,
 'PHIN': 4,
 'SNP': 2,
 'RESD': 6,
 'RHOB': 132,
 'ILM': 6,
 'COND': 3,
 'DEPTH': 7,
 'SP': 14,
 'NPHI': 2008,
 'CALI': 783,
 'SFLU': 6,
 'ILD:1': 1,
 'DELT': 98,
 'ILD': 2154}

In [144]:
def findAllCurvesInGivenWells(las_folder_path,well_format):
    objectOfCurves = {}
    path = las_folder_path+'*'+well_format
    print(path)
    for fn in glob.glob(path):
        las = lasio.read(fn, ignore_data=True)
        mnemonics = [c.mnemonic for c in las.curves]
        fnShort = fn.replace(las_folder_path,"")
        objectOfCurves[fnShort] = mnemonics
    #print(fn + '\n\t' + '\n\t'.join(mnemonics))
    return objectOfCurves
    


In [145]:
#las_path = '../../../SPE_006_originalData/OilSandsDB/Logs/*.LAS'
las_folder_path = '../../../SPE_006_originalData/OilSandsDB/Logs/'
well_format = '.LAS'
full_las_path = las_path+'*'+well_format
full_las_path

'../../../SPE_006_originalData/OilSandsDB/Logs/*.LAS'

In [146]:
objectOfCurves = findAllCurvesInGivenWells(las_folder_path,well_format)

../../../SPE_006_originalData/OilSandsDB/Logs/*.LAS


Header section Parameter regexp=~P was not found.


In [150]:
print(objectOfCurves['00-10-32-080-20W4-0.LAS'])

['DEPT', 'ILD', 'NPHI', 'DPHI', 'GR', 'CALI']


In [28]:
def countsOfCurves(objectOfCurves):
    listOfListOfCurves = objectOfCurves.values()
    startList = []
    for listI in listOfListOfCurves:
        startList = startList+listI
    uniq_CurvesList = set(startList)
    countsOfCurves = {}
    for eachCurve in uniq_CurvesList:
        countsOfCurves[eachCurve] = startList.count(eachCurve)
    return countsOfCurves

In [29]:
countsOfCurves = countsOfCurves(objectOfCurves)

In [30]:
countsOfCurves

{'NPHI': 2008,
 'ILD:2': 1,
 'DPHI:2': 1,
 'LLS': 1,
 'DENS': 4,
 'PHID': 8,
 'ILD': 2154,
 'LLD': 2,
 'DPHI:1': 1,
 'DT': 14,
 'GR:2': 1,
 'SFL': 3,
 'ILM': 6,
 'COND': 3,
 'PHIN': 4,
 'SFLU': 6,
 'RT': 1,
 'RHOB': 132,
 'SNP': 2,
 'DPHI': 1917,
 'DEPTH': 7,
 'GR:1': 1,
 'SN': 1,
 'DEPT': 2164,
 'GR': 2169,
 'ILD:1': 1,
 'RESD': 6,
 'DELT': 98,
 'LITH': 1,
 'IL': 2,
 'SP': 14,
 'CALI': 783}

### One thing to note is that there are some curves with slightly different names that might be the exact same thing.
For example, GR:1 might be identical to GR or it might different enough we wouldn't want to treat them the same way. I don't have information on that, so I'll just skip the GR:1 gamma-ray wells.
    
### However, there are seven wells with DEPTH instead of DEPT. I'll include those for now as that's just a spelling difference but I will need to remember to change the name of that column later when I import !

### Let's set the minimum number of wells we want to have the common curves to be 1900. If your dataset is different, you'll likely want to change this number

In [31]:
minNumberCurves = 2000

In [32]:
def getCurvesInMinNumberOfWells(minNumberCurves,countsOfCurves):
    #### Takes in a minmum number of wells that need to have specific curves and an object where keys are curve names and values is the count of that curves across all wells.
    #### Returns an array of curve names that are found in at least the given number of wells.
    curvePlusCountArray = countsOfCurves.items()
    onlyPlentifulCurvesArray = []
    for curve in curvePlusCountArray:
        if curve[1] > minNumberCurves:
            onlyPlentifulCurvesArray.append(curve[0])
    return onlyPlentifulCurvesArray

In [33]:
plentifulCurves = getCurvesInMinNumberOfWells(minNumberCurves,countsOfCurves)

In [34]:
plentifulCurves

['NPHI', 'ILD', 'DEPT', 'GR']

### Now lets find all the wells that have all of those curves!

In [35]:
def findWellsWithCertainCurves(objectOfCurves,plentifulCurves):
    #### Function takes in an object with keys that are well names and values that are all curves in that well and as the second argument an array of plentiful curves expected to be in every well
    #### Function returns an array of wells that have the specified curves in the second argument.
    wellsWithWantedCurves = []
    for eachWell in objectOfCurves.keys():
        if set(plentifulCurves).issubset(objectOfCurves[eachWell]):
            wellsWithWantedCurves.append(eachWell)
    return wellsWithWantedCurves

In [36]:
wellsWithNeededCurvesList = findWellsWithCertainCurves(objectOfCurves,plentifulCurves)

In [37]:
print("number of wells with all the required curves is",len(wellsWithNeededCurvesList))

number of wells with all the required curves is 2000


### NOTE! when we import the wells for real, we should add in the wells that have DEPTH instead of DEPT and rename the curve to DEPT!
Those wells are....

In [38]:
print(plentifulCurves)

['NPHI', 'ILD', 'DEPT', 'GR']


In [39]:
def getCurvesListWithDifferentCurveName(originalCurveList,origCurve,newCurve):
    #### Takes in list of curves, curve name to be replaced, and curve name to replace with.
    #### Returns a list with the orginal and new curve names switched in the given curve list
    plentifulCurves_wDEPTH = originalCurveList.copy()
    plentifulCurves_wDEPTH.remove(origCurve)
    plentifulCurves_wDEPTH.append(newCurve)
    return plentifulCurves_wDEPTH

In [40]:
newCurveList = getCurvesListWithDifferentCurveName(plentifulCurves,'DEPT','DEPTH')
newCurveList

['NPHI', 'ILD', 'GR', 'DEPTH']

In [41]:
wellsWithNeededCurvesListButDEPTHinsteadDEPT = findWellsWithCertainCurves(objectOfCurves,newCurveList)
print("number of wells with all the required curves but DEPTH instead of DEPT is",len(wellsWithNeededCurvesListButDEPTHinsteadDEPT))

number of wells with all the required curves but DEPTH instead of DEPT is 0


Hmmm, zero? Let's see if we can get those 7 wells that we know have DEPTH instead of DEPT to appear if we reduce the other curve names?

In [42]:
wellsWithNeededCurvesListButDEPTHinsteadDEPT = findWellsWithCertainCurves(objectOfCurves,['GR','DEPTH'])
print("number of wells with all the required curves but DEPTH instead of DEPT is",len(wellsWithNeededCurvesListButDEPTHinsteadDEPT))

number of wells with all the required curves but DEPTH instead of DEPT is 7


In [43]:
wellsWithNeededCurvesListButDEPTHinsteadDEPT = findWellsWithCertainCurves(objectOfCurves,['GR','DEPT'])
print("number of wells with all the required curves but DEPTH instead of DEPT is",len(wellsWithNeededCurvesListButDEPTHinsteadDEPT))

number of wells with all the required curves but DEPTH instead of DEPT is 2162


In [44]:
wellsWithNeededCurvesListButDEPTHinsteadDEPT = findWellsWithCertainCurves(objectOfCurves,['ILD', 'NPHI', 'GR','DEPT'])
print("number of wells with all the required curves but DEPTH instead of DEPT is",len(wellsWithNeededCurvesListButDEPTHinsteadDEPT))

number of wells with all the required curves but DEPTH instead of DEPT is 2000


In [45]:
wellsWithNeededCurvesListButDEPTHinsteadDEPT = findWellsWithCertainCurves(objectOfCurves,['ILD', 'GR', 'DPHI','DEPT'])
print("number of wells with all the required curves but DEPTH instead of DEPT is",len(wellsWithNeededCurvesListButDEPTHinsteadDEPT))

number of wells with all the required curves but DEPTH instead of DEPT is 1911


In [46]:
wellsWithNeededCurvesListButDEPTHinsteadDEPT = findWellsWithCertainCurves(objectOfCurves,['ILD', 'GR', 'DEPT'])
print("number of wells with all the required curves but DEPTH instead of DEPT is",len(wellsWithNeededCurvesListButDEPTHinsteadDEPT))

number of wells with all the required curves but DEPTH instead of DEPT is 2153


In [47]:
wellsWithNeededCurvesListButDEPTHinsteadDEPT = findWellsWithCertainCurves(objectOfCurves,['ILD', 'GR', 'DEPTH'])
print("number of wells with all the required curves but DEPTH instead of DEPT is",len(wellsWithNeededCurvesListButDEPTHinsteadDEPT))

number of wells with all the required curves but DEPTH instead of DEPT is 0


In [48]:
wellsWithNeededCurvesListButDEPTHinsteadDEPT = findWellsWithCertainCurves(objectOfCurves,['ILD', 'NPHI', 'GR', 'DPHI', 'DEPT'])
print("number of wells with all the required curves but DEPTH instead of DEPT is",len(wellsWithNeededCurvesListButDEPTHinsteadDEPT))

number of wells with all the required curves but DEPTH instead of DEPT is 1848


In [49]:
wellsWithNeededCurvesListButDEPTHinsteadDEPT = findWellsWithCertainCurves(objectOfCurves,['ILD', 'NPHI', 'GR', 'DEPT'])
print("number of wells with all the required curves but DEPTH instead of DEPT is",len(wellsWithNeededCurvesListButDEPTHinsteadDEPT))

number of wells with all the required curves but DEPTH instead of DEPT is 2000


#### Analysis

It appears like the number of wells available if we only use wells that have 'ILD', 'NPHI', 'GR', 'DPHI' plus a depth curve is 1848 vs. 2153 if we use only GR and ILD and depth.

and

It appears like the number of wells available if we only use wells that have 'ILD', 'NPHI', 'GR', plus a depth curve is 2000 vs. 2153 if we use only GR and ILD and depth.

In prevous runs, only ILD and GR were treated as mandatory. IT is probably worth it to try both ways, population one would have ~2150 wells and population two would have only ~1850 wells but two density logs that related to porisity, NPHI and DPHI.

Some notes on DPHI and NPHI logs <a href="http://www.pe.tamu.edu/blasingame/data/z_zCourse_Archive/P663_10B/P663_Schechter_Notes/PETE_663_DEN_NEUTR.pdf">here</a>.

#### USing the variable `curvesMustHave` that is set at top of this notebook!!!!

In [83]:
wellsWithNeededCurvesList_real = findWellsWithCertainCurves(objectOfCurves,curvesMustHave)
print("number of wells with all the required curves but DEPTH instead of DEPT is",len(wellsWithNeededCurvesList_real))

number of wells with all the required curves but DEPTH instead of DEPT is 1848


In [84]:
wellsWithNeededCurvesList_real

['00-07-27-073-18W4-0.LAS',
 '00-10-16-092-19W4-0.LAS',
 '00-07-35-078-10W4-0.LAS',
 'AA-15-36-096-11W4-0.LAS',
 '00-07-11-078-02W5-0.LAS',
 '00-10-26-076-09W4-0.LAS',
 '00-10-08-067-17W4-0.LAS',
 '00-15-20-072-21W4-0.LAS',
 '00-11-04-092-18W4-0.LAS',
 '00-07-36-075-23W4-0.LAS',
 '00-06-32-069-04W4-0.LAS',
 '00-11-09-079-15W4-0.LAS',
 '00-08-29-077-10W4-0.LAS',
 'AA-15-14-101-13W4-0.LAS',
 '00-10-20-075-18W4-0.LAS',
 '00-10-27-078-07W4-0.LAS',
 '00-10-08-076-06W4-0.LAS',
 '00-10-09-071-26W4-0.LAS',
 '00-10-03-094-21W4-0.LAS',
 '00-13-26-080-22W4-0.LAS',
 '00-14-03-093-19W4-0.LAS',
 '00-06-07-073-06W4-0.LAS',
 '00-11-16-095-18W4-0.LAS',
 'AA-10-23-082-18W4-0.LAS',
 '00-06-32-073-09W4-0.LAS',
 'AA-12-12-099-15W4-0.LAS',
 '00-05-25-081-03W5-0.LAS',
 '00-10-23-081-20W4-0.LAS',
 '00-11-26-078-01W5-0.LAS',
 'AB-10-18-096-10W4-0.LAS',
 '00-15-31-084-14W4-0.LAS',
 '00-11-29-074-01W4-0.LAS',
 'AA-10-18-098-10W4-0.LAS',
 '00-10-30-071-12W4-0.LAS',
 '00-11-33-077-17W4-0.LAS',
 '00-11-18-074-08W4-

# Make list of wells that includes both the minimum required curves & minimum required tops

In [85]:
wells_with_all_given_tops

[106501,
 106503,
 106507,
 106508,
 106512,
 163860,
 106518,
 106519,
 122903,
 106524,
 106526,
 106527,
 106529,
 106532,
 106533,
 106534,
 106537,
 106538,
 106540,
 106544,
 106547,
 106548,
 106552,
 106553,
 106554,
 106555,
 106556,
 106558,
 114753,
 106562,
 114754,
 114755,
 114756,
 114758,
 106569,
 114761,
 163919,
 114768,
 163923,
 106580,
 114772,
 114773,
 163925,
 114779,
 114783,
 114784,
 114789,
 122982,
 106600,
 106602,
 114795,
 114799,
 106608,
 114801,
 114805,
 106617,
 114810,
 114811,
 106621,
 106626,
 114819,
 114822,
 114829,
 123024,
 114833,
 114834,
 163985,
 106644,
 106645,
 114836,
 114838,
 163986,
 106649,
 106650,
 114842,
 114843,
 114844,
 163993,
 114847,
 106656,
 114850,
 106659,
 106661,
 106662,
 106663,
 114855,
 106665,
 114857,
 106669,
 114861,
 114862,
 114864,
 106673,
 106674,
 114865,
 106676,
 114869,
 106679,
 106680,
 106681,
 106682,
 114871,
 114872,
 106685,
 114873,
 106687,
 114879,
 114887,
 106698,
 106700,
 106701,
 

In [86]:
print(len(wells_with_all_given_tops))

1926


In [87]:
wellsWithNeededCurvesList_real

['00-07-27-073-18W4-0.LAS',
 '00-10-16-092-19W4-0.LAS',
 '00-07-35-078-10W4-0.LAS',
 'AA-15-36-096-11W4-0.LAS',
 '00-07-11-078-02W5-0.LAS',
 '00-10-26-076-09W4-0.LAS',
 '00-10-08-067-17W4-0.LAS',
 '00-15-20-072-21W4-0.LAS',
 '00-11-04-092-18W4-0.LAS',
 '00-07-36-075-23W4-0.LAS',
 '00-06-32-069-04W4-0.LAS',
 '00-11-09-079-15W4-0.LAS',
 '00-08-29-077-10W4-0.LAS',
 'AA-15-14-101-13W4-0.LAS',
 '00-10-20-075-18W4-0.LAS',
 '00-10-27-078-07W4-0.LAS',
 '00-10-08-076-06W4-0.LAS',
 '00-10-09-071-26W4-0.LAS',
 '00-10-03-094-21W4-0.LAS',
 '00-13-26-080-22W4-0.LAS',
 '00-14-03-093-19W4-0.LAS',
 '00-06-07-073-06W4-0.LAS',
 '00-11-16-095-18W4-0.LAS',
 'AA-10-23-082-18W4-0.LAS',
 '00-06-32-073-09W4-0.LAS',
 'AA-12-12-099-15W4-0.LAS',
 '00-05-25-081-03W5-0.LAS',
 '00-10-23-081-20W4-0.LAS',
 '00-11-26-078-01W5-0.LAS',
 'AB-10-18-096-10W4-0.LAS',
 '00-15-31-084-14W4-0.LAS',
 '00-11-29-074-01W4-0.LAS',
 'AA-10-18-098-10W4-0.LAS',
 '00-10-30-071-12W4-0.LAS',
 '00-11-33-077-17W4-0.LAS',
 '00-11-18-074-08W4-

In [88]:
print(len(wellsWithNeededCurvesList_real))

1848


### These two lists are different. One is SITEID the other is LAS file name. We'll convert them in the function below and find the ones in common and returnt that as a new list of wells.

In [89]:
new_wells = wells.set_index('SitID').T.to_dict('list')
new_wells

{102496: ['0674010812000', '00/12-08-067-01W4/0'],
 102497: ['0674020807000', '00/07-08-067-02W4/0'],
 102498: ['0674021109000', '00/09-11-067-02W4/0'],
 102500: ['0674022910000', '00/10-29-067-02W4/0'],
 102501: ['0674023406000', '00/06-34-067-02W4/0'],
 102503: ['0674030411000', '00/11-04-067-03W4/0'],
 102505: ['0674030810000', '00/10-08-067-03W4/0'],
 102507: ['0674031410000', '00/10-14-067-03W4/0'],
 102514: ['0674032810000', '00/10-28-067-03W4/0'],
 102517: ['0674033607000', '00/07-36-067-03W4/0'],
 102518: ['0674040210000', '00/10-02-067-04W4/0'],
 102565: ['0674040710000', '00/10-07-067-04W4/0'],
 102577: ['0674042411000', '00/11-24-067-04W4/0'],
 102581: ['0674043307000', '00/07-33-067-04W4/0'],
 102583: ['0674050313000', '00/13-03-067-05W4/0'],
 102589: ['0674051606000', '00/06-16-067-05W4/0'],
 102592: ['0674052507000', '00/07-25-067-05W4/0'],
 102596: ['0674053310000', '00/10-33-067-05W4/0'],
 102597: ['0674060115000', '00/15-01-067-06W4/0'],
 102606: ['0674061706000', '00/

In [90]:
def findWellsWithGivenTopsCurves(wells,wells_with_all_given_tops,wellsWithNeededCurvesList_real):
    new_wells = wells.set_index('SitID').T.to_dict('list')
    #print("new_wells",new_wells)
    for key in new_wells:
        new_wells[key].append(new_wells[key][1].replace("/","-")+".LAS") 
    print("new_wells",new_wells)
    print(len(new_wells))
    new_wells_with_all_given_tops = []
    for well in wells_with_all_given_tops:
        new_wells_with_all_given_tops.append(new_wells[well][2])
    return list(set(new_wells_with_all_given_tops).intersection(wellsWithNeededCurvesList_real))

In [91]:
WellsWithGivenTopsCurves = findWellsWithGivenTopsCurves(wells,wells_with_all_given_tops,wellsWithNeededCurvesList_real)

new_wells {102496: ['0674010812000', '00/12-08-067-01W4/0', '00-12-08-067-01W4-0.LAS'], 102497: ['0674020807000', '00/07-08-067-02W4/0', '00-07-08-067-02W4-0.LAS'], 102498: ['0674021109000', '00/09-11-067-02W4/0', '00-09-11-067-02W4-0.LAS'], 102500: ['0674022910000', '00/10-29-067-02W4/0', '00-10-29-067-02W4-0.LAS'], 102501: ['0674023406000', '00/06-34-067-02W4/0', '00-06-34-067-02W4-0.LAS'], 102503: ['0674030411000', '00/11-04-067-03W4/0', '00-11-04-067-03W4-0.LAS'], 102505: ['0674030810000', '00/10-08-067-03W4/0', '00-10-08-067-03W4-0.LAS'], 102507: ['0674031410000', '00/10-14-067-03W4/0', '00-10-14-067-03W4-0.LAS'], 102514: ['0674032810000', '00/10-28-067-03W4/0', '00-10-28-067-03W4-0.LAS'], 102517: ['0674033607000', '00/07-36-067-03W4/0', '00-07-36-067-03W4-0.LAS'], 102518: ['0674040210000', '00/10-02-067-04W4/0', '00-10-02-067-04W4-0.LAS'], 102565: ['0674040710000', '00/10-07-067-04W4/0', '00-10-07-067-04W4-0.LAS'], 102577: ['0674042411000', '00/11-24-067-04W4/0', '00-11-24-067-04

## List of wells in LAS filename to be used

In [92]:
WellsWithGivenTopsCurves

['00-10-14-078-18W4-0.LAS',
 'AA-10-30-097-07W4-0.LAS',
 'AA-10-33-097-06W4-0.LAS',
 '00-12-12-074-02W5-0.LAS',
 '00-10-01-073-09W4-0.LAS',
 '00-14-09-096-15W4-0.LAS',
 '00-11-32-086-22W4-0.LAS',
 'AA-06-08-082-09W4-0.LAS',
 '00-11-07-074-15W4-0.LAS',
 'AA-03-02-096-15W4-0.LAS',
 '00-07-10-075-11W4-0.LAS',
 'AA-10-29-080-07W4-0.LAS',
 '00-06-22-082-12W4-0.LAS',
 '00-11-36-072-19W4-0.LAS',
 'AA-10-26-090-16W4-0.LAS',
 '00-10-25-075-15W4-0.LAS',
 '00-11-04-067-03W4-0.LAS',
 '00-11-25-081-17W4-0.LAS',
 'AA-04-03-090-07W4-0.LAS',
 '00-05-14-069-05W4-0.LAS',
 '00-07-32-076-15W4-0.LAS',
 '00-11-17-080-15W4-0.LAS',
 'AA-10-26-082-22W4-0.LAS',
 '00-06-05-084-06W4-0.LAS',
 'AA-07-23-086-07W4-0.LAS',
 '00-07-26-076-12W4-0.LAS',
 '00-10-16-079-09W4-0.LAS',
 '00-11-31-078-05W5-0.LAS',
 '00-11-35-079-12W4-0.LAS',
 '00-10-08-067-03W4-0.LAS',
 '00-15-19-083-06W4-0.LAS',
 '00-11-10-067-18W4-0.LAS',
 '00-07-19-071-26W4-0.LAS',
 '00-07-31-070-14W4-0.LAS',
 '00-06-23-083-18W4-0.LAS',
 '00-11-12-087-22W4-

In [93]:
print(len(WellsWithGivenTopsCurves))

1601


## Write to file

In [94]:
with open(wells_wTopsCuves_toLoad, 'wb') as f:
    pickle.dump(WellsWithGivenTopsCurves, f)

### This hasn't yet checked for other circumstances that may prevent wells from being used, for example:
1. Wells have malformed LAS files
2. Wells don't have nearby neighbors to use for certain feature calculations.

### FUTURE WORK !!!!!

In [95]:
#### Try to include one or two wells that have different curve names using 
# def useDiffColNamesToFillInNA(dataframeOfWells,colReplaceList):
#     """
#     Takes in two arguments,
#     Argument one is a dataframe of multiple wells
#     Argument two is a list of lists. Where each sub-list is a  pair of column names. 
#     The right col is used to fill in NANs where they exist in left column.
#     The function returns a dataframe of wells with the NANs in certain columns replaced based on input arguments.
#     Example = [[ColA,ColB],[ColF,ColG],[ColZ,ColE]]
#     """
#     for each in colReplaceList:
#         print("each",each)
#         dataframeOfWells[each[0]].fillna(dataframeOfWells[each[1]], inplace=True)
#     return dataframeOfWells

In [96]:
### list of sub-lists. Items on left are replaced with volumns from right column if left column has a NaN
# colReplaceList = [['DEPT','DEPTH'],['GR','GR:1'],['GR','GR:2']]

In [97]:
#df_all_wells_wKNN_DEPTHtoDEPT = useDiffColNamesToFillInNA(df_all_wells_wKNN,colReplaceList)