Note: download the data from https://drive.google.com/drive/folders/1EgDN57LDuvlZAwr5-eHWB5CTJ7K9HpDP

Credit to this repo: https://github.com/LukasMosser/geolink_dataset

## Data Disclaimer

All the data serving as input to these notebooks was generously donated by GEOLINK  
and is licensed under CC BY-SA 4.0.

If you use their data please reference their dataset properly to give them credit for their contribution.

In [1]:
import lasio
import matplotlib.pyplot as plt
%matplotlib inline
import os
from tqdm.auto import tqdm
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn import preprocessing
from operator import itemgetter

# in and out directories

In [17]:

# Input: folder holding the GEOLINK .las well logs (the spreadsheets read below
# sit one level above it).
data_locations = Path("../../data/raw/geolink_dataset/GEOLINK North sea wells with Lithology interpretation/GEOLINK_Lithology and wells NORTH SEA")
# Output: folder that receives every parquet file written by this notebook.
interim_locations = Path("../../data/processed/geolink_norge_dataset/")
# to_parquet does not create missing parent folders, so make sure the output
# folder exists before any cell tries to write into it.
interim_locations.mkdir(parents=True, exist_ok=True)

# load and save as parquet

In [15]:
# Lithology code table: the column names are on the second spreadsheet row
# (header=1) and the final row is dropped before use.
lithology_xlsx = data_locations/'../Lithology code data.xlsx'
lithology_sheet = pd.read_excel(lithology_xlsx, header=1)
df_lithology = lithology_sheet.iloc[:-1]
# Store the codes as numbers; the well-log cell uses them as lookup keys.
df_lithology['Abbreviation'] = pd.to_numeric(df_lithology['Abbreviation'])
df_lithology.to_parquet(
    interim_locations/'geolink_norge_lithology.parquet',
    compression='gzip',
)
df_lithology

Unnamed: 0,Lithology,Color,Lithology Attribute,Abbreviation
0,Aeolian Sandstone,LightYellow,CrossBedded Sand,35
1,Anhydrite,Light Magenta,Anhydrite,22
2,Argillaceous Limestone,Dodger Blue,Chalk,12
3,Arkose,LightGoldenrod,Gravel,36
4,Basement,Salmon,Intrusive,23
5,Biogenic Ooze,DarkYellow,Sandy Shale,25
6,Calcareous Cement,Cyan,Sandy Limestone,16
7,Calcareous Debris Flow,Turquoise,Breccia,31
8,Calcareous Shale,DarkCyan,Calcareous Shale,14
9,Carnallite,Magenta,Halite,33


In [18]:
# Stratigraphic picks spreadsheet: column names are on the first row.
picks_xlsx = data_locations/'../NPD stratigraphic picks north sea.xlsx'
df_picks = pd.read_excel(picks_xlsx, header=0)
# Keep a parquet copy next to the other processed tables.
df_picks.to_parquet(
    interim_locations/'geolink_norge_picks.parquet',
    compression='gzip',
)

In [21]:
# Well-head tables: three CSV exports (exploration, development, other) that
# are expected two folders above the LAS directory.
well_head_dir = data_locations.parent.parent
well_head_files = [
    well_head_dir/'wellbore_exploration_all.csv',
    well_head_dir/'wellbore_development_all.csv',
    well_head_dir/'wellbore_other_all.csv',
]

# Check all inputs up front so the error names every missing file at once
# instead of stopping inside pandas on the first one.
# NOTE(review): these CSVs are not mentioned in the download note at the top of
# the notebook; presumably they come from a separate export - confirm the source.
missing_well_head_files = [str(p) for p in well_head_files if not p.exists()]
if missing_well_head_files:
    raise FileNotFoundError(
        'Missing well-head CSV file(s): ' + ', '.join(missing_well_head_files)
    )

# ignore_index gives the combined table one unique running index rather than
# repeating each file's own 0..n row labels.
df_well_heads = pd.concat(
    [pd.read_csv(p) for p in well_head_files],
    ignore_index=True,
)
df_well_heads.to_parquet(interim_locations/'norge_well_heads.parquet', compression='gzip')

FileNotFoundError: [Errno 2] File ../../data/raw/geolink_dataset/GEOLINK North sea wells with Lithology interpretation/GEOLINK_Lithology and wells NORTH SEA/../../wellbore_exploration_all.csv does not exist: '../../data/raw/geolink_dataset/GEOLINK North sea wells with Lithology interpretation/GEOLINK_Lithology and wells NORTH SEA/../../wellbore_exploration_all.csv'

## Las files

We can now read the LAS files, convert each one to a dataframe, and combine them into a format that is better suited for ML tasks.

In [20]:
# Build the combined raw well-log table once and cache it as parquet. On later
# runs the cache is loaded instead, so `df_all` is defined either way and the
# cleaning cell below works after a kernel restart.
raw_logs_path = interim_locations/'geolink_norge_well_logs_raw.parquet'

if raw_logs_path.exists():
    df_all = pd.read_parquet(raw_logs_path)
else:
    # load las files
    files = sorted(data_locations.glob('*.las'))
    if not files:
        # pd.concat on an empty list fails with an unhelpful
        # "No objects to concatenate"; name the folder instead.
        raise FileNotFoundError(f'No .las files found in {data_locations}')

    well_dataframes = []
    for f in tqdm(files):
        df = lasio.read(f).df()
        # Tag every row with its well, taken from the file name without extension.
        df['Well'] = f.stem
        well_dataframes.append(df)

    df_all = pd.concat(well_dataframes)

    df_all['Well'] = df_all['Well'].astype('category')

    # Name lithology: swap the numeric codes for the names in the lithology table.
    litho_dict = df_lithology.set_index('Abbreviation')['Lithology'].to_dict()
    df_all['LITHOLOGY_GEOLINK'] = df_all['LITHOLOGY_GEOLINK'].replace(litho_dict).astype('category')

    # unique index: move the per-file index into a regular column and number the
    # rows 0..n across all wells.
    df_all = df_all.reset_index()

    df_all.to_parquet(raw_logs_path, compression='gzip')

HBox(children=(FloatProgress(value=0.0, max=223.0), HTML(value='')))




## Clean las files

In [183]:
# Clean. Must have lithology. Remove logs which are present less than half the time.
df_all_clean2 = df_all.dropna(subset=['LITHOLOGY_GEOLINK'])
print('nans', df_all_clean2.isna().mean().sort_values())

# `thresh` is the minimum number of non-null values a column needs to be kept.
# It is derived from the frame being filtered (not from a result of an earlier
# run of this cell), so the cell works on a fresh kernel: a log is kept only if
# it is present in at least half of the labelled rows.
min_present = (len(df_all_clean2) + 1) // 2
df_all_clean1 = df_all_clean2.dropna(axis=1, thresh=min_present)
print(f'kept {len(df_all_clean1.columns)/len(df_all_clean2.columns):%} cols')

# Of the remaining columns, keep only rows where every log has a value.
df_all_clean = df_all_clean1.dropna(axis=0)
df_all_clean.to_parquet(interim_locations/'geolink_norge_well_logs_clean.parquet', compression='gzip')
print(f'kept {len(df_all_clean)/len(df_all_clean2):%} rows')
df_all_clean

DEPT                 0.000000
Well                 0.000000
LITHOLOGY_GEOLINK    0.000000
GR                   0.010837
RDEP                 0.022700
CALI                 0.050810
DTC                  0.053175
RMED                 0.062857
RHOB                 0.067850
DRHO                 0.142370
NPHI                 0.235756
SP                   0.300878
RSHA                 0.316865
BS                   0.554892
PEF                  0.662368
DTE                  0.662811
ROP                  0.817933
DCAL                 0.825841
DTS                  0.869548
RMIC                 0.900082
URAN                 0.922957
THOR                 0.922972
MUDWEIGHT            0.957331
SGR                  0.958548
RXO                  0.966190
DT                   0.970601
RHOM                 0.980990
CGR                  0.996408
TGAS                 0.998670
dtype: float64
kept 44.827586% cols
kept 45.335716% rows


Unnamed: 0,DEPT,LITHOLOGY_GEOLINK,CALI,DRHO,NPHI,RHOB,GR,DTC,RDEP,SP,RSHA,RMED,Well
11302,2215.917725,Shaly Silt,14.438001,0.010000,0.447725,2.363000,60.285748,134.253601,0.737006,50.570400,0.785088,0.785088,15_9-12
11303,2216.070068,Shaly Silt,14.633000,0.006000,0.432095,2.340000,63.250000,129.101868,0.741000,50.820000,0.840000,0.840000,15_9-12
11304,2216.222412,Shaly Silt,14.813001,0.006000,0.410166,2.314000,61.405998,122.476944,0.752000,50.820000,0.858000,0.858000,15_9-12
11305,2216.375000,Shaly Silt,14.383001,-0.001000,0.414143,2.293000,62.561596,116.908607,0.739962,51.070000,0.857046,0.857046,15_9-12
11306,2216.527344,Shaly Silt,14.202999,0.001000,0.439923,2.275000,61.691055,115.390953,0.715966,51.070602,0.886082,0.886082,15_9-12
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3134951,2723.585938,Anhydrite,9.157000,0.141856,0.000000,2.692753,15.846580,53.005497,1504.687866,59.057552,14.382000,1453.242920,7_3-1
3134952,2723.738525,Anhydrite,9.197000,0.148906,0.000000,2.695477,16.209375,53.386738,1514.930054,58.484932,14.387000,1471.283691,7_3-1
3134953,2723.890869,Anhydrite,9.236000,0.148817,0.000000,2.701625,16.638577,53.277893,1522.655029,58.483002,14.390000,1473.072998,7_3-1
3134954,2724.043213,Anhydrite,9.262000,0.150856,0.000000,2.702774,16.997316,52.928886,1530.355835,58.843422,14.390000,1473.072998,7_3-1


In [184]:
# Summarise the cleaned table: column dtypes, non-null counts and memory use.
df_all_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 748470 entries, 11302 to 3134955
Data columns (total 13 columns):
 #   Column             Non-Null Count   Dtype   
---  ------             --------------   -----   
 0   DEPT               748470 non-null  float64 
 1   LITHOLOGY_GEOLINK  748470 non-null  category
 2   CALI               748470 non-null  float64 
 3   DRHO               748470 non-null  float64 
 4   NPHI               748470 non-null  float64 
 5   RHOB               748470 non-null  float64 
 6   GR                 748470 non-null  float64 
 7   DTC                748470 non-null  float64 
 8   RDEP               748470 non-null  float64 
 9   SP                 748470 non-null  float64 
 10  RSHA               748470 non-null  float64 
 11  RMED               748470 non-null  float64 
 12  Well               748470 non-null  category
dtypes: category(2), float64(11)
memory usage: 70.7 MB


# Load

In [200]:
# Round-trip check: read the cleaned parquet back from disk and index it by
# well name and depth.
df = (
    pd.read_parquet(interim_locations/'geolink_norge_well_logs_clean.parquet')
    .set_index(['Well', 'DEPT'])
)

Unnamed: 0_level_0,Unnamed: 1_level_0,LITHOLOGY_GEOLINK,CALI,DRHO,NPHI,RHOB,GR,DTC,RDEP,SP,RSHA,RMED
Well,DEPT,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
15_9-12,2215.917725,Shaly Silt,14.438001,0.010000,0.447725,2.363000,60.285748,134.253601,0.737006,50.570400,0.785088,0.785088
15_9-12,2216.070068,Shaly Silt,14.633000,0.006000,0.432095,2.340000,63.250000,129.101868,0.741000,50.820000,0.840000,0.840000
15_9-12,2216.222412,Shaly Silt,14.813001,0.006000,0.410166,2.314000,61.405998,122.476944,0.752000,50.820000,0.858000,0.858000
15_9-12,2216.375000,Shaly Silt,14.383001,-0.001000,0.414143,2.293000,62.561596,116.908607,0.739962,51.070000,0.857046,0.857046
15_9-12,2216.527344,Shaly Silt,14.202999,0.001000,0.439923,2.275000,61.691055,115.390953,0.715966,51.070602,0.886082,0.886082
...,...,...,...,...,...,...,...,...,...,...,...,...
7_3-1,2723.585938,Anhydrite,9.157000,0.141856,0.000000,2.692753,15.846580,53.005497,1504.687866,59.057552,14.382000,1453.242920
7_3-1,2723.738525,Anhydrite,9.197000,0.148906,0.000000,2.695477,16.209375,53.386738,1514.930054,58.484932,14.387000,1471.283691
7_3-1,2723.890869,Anhydrite,9.236000,0.148817,0.000000,2.701625,16.638577,53.277893,1522.655029,58.483002,14.390000,1473.072998
7_3-1,2724.043213,Anhydrite,9.262000,0.150856,0.000000,2.702774,16.997316,52.928886,1530.355835,58.843422,14.390000,1473.072998
