# SPE DSEAT Datathon 

### Importing libraries

In [1]:
from pathlib import Path

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

Matplotlib is building the font cache; this may take a moment.


### Loading Datasets

In [4]:
# Folder that holds the raw datathon CSV files. Keeping it in one constant (and
# relative to the notebook's working directory) lets teammates run the notebook
# without editing every read_csv call.
# NOTE(review): assumes the notebook is run from the folder containing the CSVs
# (previously hard-coded as "C:/Users/osuol/Desktop/SPE DSEAT") - adjust DATA_DIR
# if the data lives elsewhere.
DATA_DIR = Path(".")

# Daily per-well production records.
well_df = pd.read_csv(DATA_DIR / "spe_africa_dseats_datathon_2025_wells_dataset.csv")
# One row of reference properties per reservoir.
reservoir_df = pd.read_csv(DATA_DIR / "reservoir_info.csv")

### Previewing the Dataset

In [5]:
# Report the (rows, columns) shape of the wells table, then list each column's
# non-null count and dtype to spot missing values and mis-typed columns.
print("Shape of the dataset: ", well_df.shape)
well_df.info()

Shape of the dataset:  (7955, 13)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7955 entries, 0 to 7954
Data columns (total 13 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   PROD_DATE                             7955 non-null   object 
 1   WELL_NAME                             7955 non-null   object 
 2   ON_STREAM_HRS                         7955 non-null   float64
 3   BOTTOMHOLE_FLOWING_PRESSURE (PSI)     7955 non-null   object 
 4   DOWNHOLE_TEMPERATURE (deg F)          7955 non-null   float64
 5   ANNULUS_PRESS (PSI)                   7955 non-null   object 
 6   CHOKE_SIZE (%)                        7955 non-null   float64
 7   WELL_HEAD_PRESSURE (PSI)              7955 non-null   object 
 8   WELL_HEAD_TEMPERATURE (deg F)         7955 non-null   float64
 9   CUMULATIVE_OIL_PROD (STB)             7955 non-null   object 
 10  CUMULATIVE_FORMATION_GAS_PROD (MSCF)  7955 non-nul

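#### The info output shows that there are no missing values in the well dataset: every column's non-null count equals the total number of entries (7955). Note that several numeric columns are stored as `object` (text), so this only rules out empty cells, not values that fail to parse as numbers.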

In [6]:
# Display the first rows of the wells table to inspect the raw value formats.
well_df.head()

Unnamed: 0,PROD_DATE,WELL_NAME,ON_STREAM_HRS,BOTTOMHOLE_FLOWING_PRESSURE (PSI),DOWNHOLE_TEMPERATURE (deg F),ANNULUS_PRESS (PSI),CHOKE_SIZE (%),WELL_HEAD_PRESSURE (PSI),WELL_HEAD_TEMPERATURE (deg F),CUMULATIVE_OIL_PROD (STB),CUMULATIVE_FORMATION_GAS_PROD (MSCF),CUMULATIVE_TOTAL_GAS_PROD (MSCF),CUMULATIVE_WATER_PROD (BBL)
0,15-Feb-14,Well_#1,0.0,4050,189.866,0,1.17951,482.46,50.864,0,0,0,0
1,16-Feb-14,Well_#1,0.0,3961,189.945,0,2.9944,328.601,47.668,0,0,0,0
2,17-Feb-14,Well_#1,0.0,3961,190.004,0,1.90349,387.218,48.962,0,0,0,0
3,18-Feb-14,Well_#1,0.0,3964,190.02,0,0.0,308.98,46.636,0,0,0,0
4,19-Feb-14,Well_#1,0.0,3965,190.107,0,30.2076,196.057,47.297,0,0,0,0


In [7]:
# Display the first rows of the reservoir reference table.
reservoir_df.head()

Unnamed: 0,Reservoir Name,Initial Reservoir Pressure (PSI),Bubble Point Pressure (PSI),Current Average Reservoir Pressure (PSI),Solution Gas-Oil-Ratio (SCF/BBL),Formation Volume Factor (RB/STB)
0,ACHI,3500,3300,2700,800,1.2
1,KEMA,4200,4000,3900,600,1.45
2,MAKO,3500,3500,3000,500,1.15
3,DEPU,2800,2800,2400,1200,1.37
4,JANI,4500,4300,4200,1000,1.3


In [8]:
# List each reservoir column's non-null count and dtype.
reservoir_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 6 columns):
 #   Column                                    Non-Null Count  Dtype  
---  ------                                    --------------  -----  
 0   Reservoir Name                            5 non-null      object 
 1   Initial Reservoir Pressure (PSI)          5 non-null      object 
 2   Bubble Point Pressure (PSI)               5 non-null      object 
 3   Current Average Reservoir Pressure (PSI)  5 non-null      object 
 4   Solution Gas-Oil-Ratio (SCF/BBL)          5 non-null      object 
 5   Formation Volume Factor (RB/STB)          5 non-null      float64
dtypes: float64(1), object(5)
memory usage: 372.0+ bytes


#### The info output likewise shows that there are no missing values in the reservoir dataset: every column's non-null count equals the total number of entries (5).

### Cleaning The Data
#### Some of the numeric columns in both datasets were read as objects (text) and need to be converted to numeric types.
#### The production date column also needs to be converted to datetime.
#### The next cells perform these conversions.


In [9]:
def convert_to_float(df, columns):
    """Convert text-formatted numeric columns of ``df`` to numeric dtypes.

    Thousands separators (``,``) are stripped before parsing. Columns that are
    already numeric are passed through untouched, so the function can safely be
    re-run on a frame that has already been converted.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are converted. It is modified in place.
    columns : iterable of str
        Names of the columns to convert.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned so callers can reassign it.

    Notes
    -----
    Values that cannot be parsed become ``NaN`` (``errors='coerce'``). The
    resulting dtype is whatever ``pd.to_numeric`` infers, so a column of whole
    numbers ends up as an integer dtype rather than float.
    """
    for col in columns:
        values = df[col]
        if not pd.api.types.is_numeric_dtype(values):
            # Cast every element to text first: ``.str`` alone raises on
            # non-string columns and yields NaN for any real number sitting in
            # a mixed-type object column, which would silently drop data.
            values = values.astype(str).str.replace(',', '', regex=False)
        df[col] = pd.to_numeric(values, errors='coerce')
    return df

In [10]:
# Columns in well_df that were read as text (they contain thousands separators)
# and must be converted to numbers.
well_float_cols = [
    'BOTTOMHOLE_FLOWING_PRESSURE (PSI)',
    'ANNULUS_PRESS (PSI)',
    'WELL_HEAD_PRESSURE (PSI)',
    'CUMULATIVE_OIL_PROD (STB)',
    'CUMULATIVE_FORMATION_GAS_PROD (MSCF)',
    'CUMULATIVE_TOTAL_GAS_PROD (MSCF)',
    'CUMULATIVE_WATER_PROD (BBL)'
]

# Columns in reservoir_df that were read as text and must be converted to numbers.
reservoir_float_cols = [
    'Initial Reservoir Pressure (PSI)',
    'Bubble Point Pressure (PSI)',
    'Current Average Reservoir Pressure (PSI)',
    'Solution Gas-Oil-Ratio (SCF/BBL)'
]

# Count missing values before converting, so that anything the conversion turns
# into NaN can be told apart from values that were already missing.
well_missing_before = well_df[well_float_cols].isna().sum()
reservoir_missing_before = reservoir_df[reservoir_float_cols].isna().sum()

# Applying the function
well_df = convert_to_float(well_df, well_float_cols)
reservoir_df = convert_to_float(reservoir_df, reservoir_float_cols)

# The conversion coerces unparsable text to NaN without raising; report any
# such values instead of letting them slip into the cleaned files unnoticed.
well_coerced = well_df[well_float_cols].isna().sum() - well_missing_before
reservoir_coerced = reservoir_df[reservoir_float_cols].isna().sum() - reservoir_missing_before
for frame_label, coerced_counts in (("well_df", well_coerced), ("reservoir_df", reservoir_coerced)):
    newly_missing = coerced_counts[coerced_counts > 0]
    if not newly_missing.empty:
        print(f"WARNING: values in {frame_label} could not be parsed and became NaN:")
        print(newly_missing.to_string())

In [11]:
# Parse PROD_DATE text such as "15-Feb-14" (day, abbreviated month name,
# two-digit year) into datetime64 using an explicit format.
# NOTE(review): %b depends on the locale's month abbreviations and %y expands
# two-digit years by a fixed pivot - fine for the 2014 dates in the preview;
# confirm this holds for the whole column.
well_df['PROD_DATE'] = pd.to_datetime(well_df['PROD_DATE'], format='%d-%b-%y')

In [12]:
# Re-inspect both tables after cleaning: the converted columns should now show
# numeric / datetime dtypes, and the non-null counts should be unchanged.
well_df.info()
reservoir_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7955 entries, 0 to 7954
Data columns (total 13 columns):
 #   Column                                Non-Null Count  Dtype         
---  ------                                --------------  -----         
 0   PROD_DATE                             7955 non-null   datetime64[ns]
 1   WELL_NAME                             7955 non-null   object        
 2   ON_STREAM_HRS                         7955 non-null   float64       
 3   BOTTOMHOLE_FLOWING_PRESSURE (PSI)     7955 non-null   int64         
 4   DOWNHOLE_TEMPERATURE (deg F)          7955 non-null   float64       
 5   ANNULUS_PRESS (PSI)                   7955 non-null   float64       
 6   CHOKE_SIZE (%)                        7955 non-null   float64       
 7   WELL_HEAD_PRESSURE (PSI)              7955 non-null   float64       
 8   WELL_HEAD_TEMPERATURE (deg F)         7955 non-null   float64       
 9   CUMULATIVE_OIL_PROD (STB)             7955 non-null   int64         
 10  

### Saving the cleaned files to share with the team

In [14]:
# Write the cleaned wells table to the working directory, omitting the index.
# CSV stores only text, so PROD_DATE must be re-parsed on load
# (e.g. pd.read_csv(..., parse_dates=['PROD_DATE'])).
well_df.to_csv('cleaned_wells_dataset.csv', index=False)

In [16]:
# Write the cleaned reservoir table to the working directory, omitting the index.
reservoir_df.to_csv('cleaned_reservoir_dataset.csv', index=False)

In [13]:
# Shell escape: print the version of the git client visible to the notebook.
# NOTE(review): environment check unrelated to the analysis - consider removing
# this cell before sharing the notebook.
!git --version

git version 2.43.0.windows.1


In [19]:
# NOTE(review): `git c` is not a complete git subcommand as written (possibly a
# truncated command or a local alias), and the recorded run failed with an
# index.lock error - remove or replace this cell before sharing the notebook.
!git c

fatal: Unable to create 'C:/Users/osuol/Desktop/SPE DSEAT/.git/index.lock': File exists.

Another git process seems to be running in this repository, e.g.
an editor opened by 'git commit'. Please make sure all processes
are terminated then try again. If it still fails, a git process
may have crashed in this repository earlier:
remove the file manually to continue.
