# SPE DSEAT Datathon 

### Importing libraries

In [None]:
import os
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

### Loading Datasets

In [None]:
# Directory holding the datathon CSV files. Set the SPE_DSEAT_DATA_DIR
# environment variable to point at a different location; the fallback keeps
# the original path so existing setups continue to work unchanged.
DATA_DIR = Path(os.environ.get("SPE_DSEAT_DATA_DIR", "C:/Users/osuol/Desktop/SPE DSEAT"))

# Daily well records and per-reservoir properties, read as-is from CSV.
well_df = pd.read_csv(DATA_DIR / "spe_africa_dseats_datathon_2025_wells_dataset.csv")
reservoir_df = pd.read_csv(DATA_DIR / "reservoir_info.csv")

### Preprocessing the Dataset

In [None]:
# Preview the first rows of the wells table.
well_df.head()

In [None]:
# Report the (rows, columns) shape, then the per-column dtype / non-null summary.
print(f"Dimensions: {well_df.shape}")
well_df.info()

The info output shows that there are no missing values in the well dataset, since the non-null count of every column equals the total number of entries.

In [None]:
# Preview the first rows of the reservoir table.
reservoir_df.head()

In [None]:
# Per-column dtype and non-null counts for the reservoir table.
reservoir_df.info()

The info output also shows that there are no missing values in the reservoir dataset, since the non-null count of every column equals the total number of entries.

### Cleaning The Data
I observed that some of the numeric columns in the datasets are read as objects, although they should be numeric.
The production date also needs to be converted to datetime.
The next cells convert these columns to the correct types.


In [None]:
# Helper for converting comma-formatted text columns to numeric values.
def convert_to_float(df, columns):
    """Convert comma-formatted text columns of ``df`` to numeric values.

    Thousands separators (``,``) are stripped from string entries before
    parsing. Entries that cannot be parsed become ``NaN`` (``errors='coerce'``).
    Columns that are already numeric are left unchanged, so the function can
    be applied repeatedly (for example when a notebook cell is re-run).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to convert. It is modified in place.
    columns : iterable of str
        Names of the columns to convert.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenient reassignment.
    """
    for col in columns:
        values = df[col]
        if not pd.api.types.is_numeric_dtype(values):
            # Strip separators from string entries only: the `.str` accessor
            # raises on non-text columns and would blank out numbers that sit
            # next to text in a mixed object column.
            values = values.map(
                lambda v: v.replace(',', '') if isinstance(v, str) else v
            )
        df[col] = pd.to_numeric(values, errors='coerce')
    return df

In [None]:
# Columns of well_df to convert to numeric with convert_to_float
# (pressures and cumulative production volumes).
well_float_cols = [
    'BOTTOMHOLE_FLOWING_PRESSURE (PSI)',
    'ANNULUS_PRESS (PSI)',
    'WELL_HEAD_PRESSURE (PSI)',
    'CUMULATIVE_OIL_PROD (STB)',
    'CUMULATIVE_FORMATION_GAS_PROD (MSCF)',
    'CUMULATIVE_TOTAL_GAS_PROD (MSCF)',
    'CUMULATIVE_WATER_PROD (BBL)'
]

# Columns of reservoir_df to convert to numeric with convert_to_float
# (reservoir pressures and solution gas-oil ratio).
reservoir_float_cols = [
    'Initial Reservoir Pressure (PSI)',
    'Bubble Point Pressure (PSI)',
    'Current Average Reservoir Pressure (PSI)',
    'Solution Gas-Oil-Ratio (SCF/BBL)'
]

# Apply the conversion; the names well_df / reservoir_df are rebound to the
# frames returned by convert_to_float.
well_df = convert_to_float(well_df, well_float_cols)
reservoir_df = convert_to_float(reservoir_df, reservoir_float_cols)

In [None]:
# Convert the production date to datetime. The explicit format expects
# day, abbreviated month name and two-digit year, e.g. "07-Apr-14".
well_df['PROD_DATE'] = pd.to_datetime(well_df['PROD_DATE'], format='%d-%b-%y')

In [None]:
# Confirm the dtype changes in both tables.
well_df.info()
reservoir_df.info()

In [None]:
# Check for duplicates: count rows that repeat an earlier row in every column.
well_df.duplicated().sum()

In [None]:
# Summary statistics for the wells table after type conversion.
well_df.describe()

In [None]:
# Save the cleaned wells table to a CSV in the current working directory,
# without writing the row index.
well_df.to_csv('cleaned_wells_dataset.csv', index=False)

In [None]:
# Save the cleaned reservoir table to a CSV in the current working directory,
# without writing the row index.
reservoir_df.to_csv('cleaned_reservoir_dataset.csv', index=False)

## DATASET EXPLORATION

Identified the number of unique wells and their names to confirm the dataset contains all 20 wells for analysis.

In [None]:
# Print the number of distinct well names, then display the names themselves.
print("The total number of wells is: ", well_df['WELL_NAME'].nunique())
well_df['WELL_NAME'].unique()

### Understanding the well trends
Plotted cumulative oil production for all 20 wells to compare production trends across the dataset.

In [None]:
# Select the rows that belong to Well_#1.
well_1 = well_df[well_df['WELL_NAME'].eq('Well_#1')]

# Draw cumulative oil production against date on an explicit Axes,
# write the figure to disk, then display it.
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(well_1['PROD_DATE'], well_1['CUMULATIVE_OIL_PROD (STB)'])
ax.set_xlabel('Date')
ax.set_ylabel('Cumulative Oil Production (STB)')
ax.set_title('Cumulative Oil Production for Well_#1')
ax.grid(True)
fig.savefig('well_1_oil_prod.png')
plt.show()

In [None]:
# Select the rows that belong to Well_#2.
well_2 = well_df[well_df['WELL_NAME'].eq('Well_#2')]

# Draw cumulative oil production against date on an explicit Axes,
# write the figure to disk as a PNG, then display it.
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(well_2['PROD_DATE'], well_2['CUMULATIVE_OIL_PROD (STB)'])
ax.set_xlabel('Date')
ax.set_ylabel('Cumulative Oil Production (STB)')
ax.set_title('Cumulative Oil Production for Well_#2')
ax.grid(True)
fig.savefig('well_2_oil_prod.png')
plt.show()

In [None]:
# Plot every well on one shared Axes so production trends can be compared.
fig, ax = plt.subplots(figsize=(10, 6))

# A single groupby pass yields each well's rows directly instead of
# re-filtering the whole table once per well. sort=False keeps the wells in
# first-appearance order, matching the legend order given by unique().
for well, well_data in well_df.groupby('WELL_NAME', sort=False):
    ax.plot(well_data['PROD_DATE'], well_data['CUMULATIVE_OIL_PROD (STB)'], label=well)

ax.set_xlabel('Date')
ax.set_ylabel('Cumulative Oil Production (STB)')
ax.set_title('Cumulative Oil Production for All Wells')
# Anchor the legend outside the plotting area, to the right of the axes.
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
ax.grid(True)
fig.tight_layout()
fig.savefig('all_wells_oil_prod.png')
plt.show()

In [None]:
# NOTE(review): `git add .` stages every change under the current directory, which may include the CSV and PNG files written by earlier cells — confirm that is intended.
!git add .

In [None]:
# Shell command: commits whatever is currently staged, with a fixed message, each time this cell runs.
!git commit -m "started EDA"

In [None]:
# NOTE(review): pushes to branch `main` on remote `origin` every time this cell runs — consider running version-control commands outside the notebook.
!git push origin main