In [1]:
import pandas as pd
import numpy as np

# STEP 1: Read the wells dataset and the reservoir info table from CSV.
# Both paths are relative, so the files must sit in the working directory.
wells_df, reservoir_df = (
    pd.read_csv(csv_name)
    for csv_name in (
        "spe_africa_dseats_datathon_2025_wells_dataset.csv",
        "reservoir_info.csv",
    )
)

# Clean numeric columns for consistency and computations.
# Markers (compared upper-cased, after stripping) that are treated as missing.
MISSING_TOKENS = ("", "NA", "N/A")


def to_clean_float(series):
    """Return ``series`` as floats after removing text formatting noise.

    Steps, in order:
      1. cast every value to ``str`` (existing NaN becomes the text "nan",
         which ``float`` parses back to NaN),
      2. drop thousands separators (",") and stray double quotes,
      3. strip surrounding whitespace,
      4. mask values matching ``MISSING_TOKENS`` (case-insensitive) as NaN,
      5. cast to ``float``.

    Raises:
        ValueError: if a value is still not parseable as a float after the
            steps above. This is deliberate, so unexpected text in a numeric
            column is not silently discarded.
    """
    text = (
        series.astype(str)
        # regex=False: treat the comma and the quote as literal characters.
        .str.replace(",", "", regex=False)
        .str.replace('"', "", regex=False)
        .str.strip()
    )
    is_missing = text.str.upper().isin(MISSING_TOKENS)
    return text.mask(is_missing).astype(float)


# Reservoir-level columns to convert
reservoir_numeric_cols = [
    'Initial Reservoir Pressure (PSI)',
    'Bubble Point Pressure (PSI)',
    'Current Average Reservoir Pressure (PSI)',
    'Solution Gas-Oil-Ratio (SCF/BBL)',
    'Formation Volume Factor (RB/STB)'
]

for col in reservoir_numeric_cols:
    reservoir_df[col] = to_clean_float(reservoir_df[col])

# Well-level columns to convert
wells_numeric_cols = [
    'BOTTOMHOLE_FLOWING_PRESSURE (PSI)',
    'DOWNHOLE_TEMPERATURE (deg F)',
    'ANNULUS_PRESS (PSI)',
    'CHOKE_SIZE (%)',
    'WELL_HEAD_PRESSURE (PSI)',
    'WELL_HEAD_TEMPERATURE (deg F)',
    'CUMULATIVE_OIL_PROD (STB)',
    'CUMULATIVE_FORMATION_GAS_PROD (MSCF)',
    'CUMULATIVE_TOTAL_GAS_PROD (MSCF)',
    'CUMULATIVE_WATER_PROD (BBL)',
    'ON_STREAM_HRS'
]

for col in wells_numeric_cols:
    wells_df[col] = to_clean_float(wells_df[col])

# Parse production dates written as dd-mmm-yy; any value that does not match
# the format is coerced to NaT rather than raising.
wells_df["PROD_DATE"] = pd.to_datetime(
    wells_df["PROD_DATE"],
    errors="coerce",
    format="%d-%b-%y",
)

# Restrict the analysis to the first 20 distinct, non-null well names and keep
# an independent copy of their rows.
well_names_20 = pd.unique(wells_df["WELL_NAME"].dropna())[:20]
wells_df_top20 = wells_df.loc[wells_df["WELL_NAME"].isin(well_names_20)].copy()

# Status report: two summary lines, then the first three rows of the subset.
print(
    "Step 1 completed successfully. Data cleaned and ready for analysis.",
    f"Total unique wells loaded: {len(well_names_20)}",
    "Here’s a preview of the parsed and cleaned data:",
    wells_df_top20.head(3),
    sep="\n",
)

Step 1 completed successfully. Data cleaned and ready for analysis.
Total unique wells loaded: 20
Here’s a preview of the parsed and cleaned data:
   PROD_DATE WELL_NAME  ON_STREAM_HRS  BOTTOMHOLE_FLOWING_PRESSURE (PSI)  \
0 2014-02-15   Well_#1            0.0                             4050.0   
1 2014-02-16   Well_#1            0.0                             3961.0   
2 2014-02-17   Well_#1            0.0                             3961.0   

   DOWNHOLE_TEMPERATURE (deg F)  ANNULUS_PRESS (PSI)  CHOKE_SIZE (%)  \
0                       189.866                  0.0         1.17951   
1                       189.945                  0.0         2.99440   
2                       190.004                  0.0         1.90349   

   WELL_HEAD_PRESSURE (PSI)  WELL_HEAD_TEMPERATURE (deg F)  \
0                   482.460                         50.864   
1                   328.601                         47.668   
2                   387.218                         48.962   

   CUMULAT

In [2]:
from scipy.stats import linregress

# Tunable parameters for the trend classification.
MIN_TREND_POINTS = 5    # fewer valid rate points than this -> "Insufficient Data"
SLOPE_THRESHOLD = 1.0   # slopes within +/- this value are reported as "Stable"


def classify_production_trend(
    well_data,
    slope_threshold=SLOPE_THRESHOLD,
    min_points=MIN_TREND_POINTS,
):
    """Classify one well's oil-rate trend from its cumulative production.

    Args:
        well_data: rows for a single well; must contain the columns
            "PROD_DATE" (datetime) and "CUMULATIVE_OIL_PROD (STB)".
        slope_threshold: a fitted slope above ``+slope_threshold`` is
            "Increasing", below ``-slope_threshold`` is "Decreasing",
            anything in between is "Stable".
        min_points: minimum number of valid rate points needed for a fit.

    Returns:
        One of "Increasing", "Decreasing", "Stable" or "Insufficient Data".
    """
    # Rows without a parsed date cannot be placed on the time axis.
    # mergesort is stable, so rows sharing a date keep their file order.
    history = (
        well_data
        .dropna(subset=["PROD_DATE"])
        .sort_values("PROD_DATE", kind="mergesort")
    )

    days = (history["PROD_DATE"] - history["PROD_DATE"].min()).dt.days
    elapsed_days = days.diff()

    # Rate per day = change in cumulative oil / days elapsed since the previous
    # row. Dividing by elapsed days keeps gaps in the record from showing up as
    # artificially high "daily" rates.
    oil_rate = history["CUMULATIVE_OIL_PROD (STB)"].diff() / elapsed_days

    # Keep strictly positive rates over a strictly positive time step. The
    # elapsed > 0 condition drops duplicate-date rows and also guarantees that
    # the x values passed to the regression are all distinct.
    valid = (elapsed_days > 0) & (oil_rate > 0)

    if valid.sum() < min_points:
        return "Insufficient Data"

    # Linear regression: days since first record vs. daily oil rate.
    slope = linregress(days[valid], oil_rate[valid]).slope

    if slope > slope_threshold:
        return "Increasing"
    if slope < -slope_threshold:
        return "Decreasing"
    return "Stable"


# One classification record per selected well, in the order of well_names_20.
prod_trends = [
    {
        "WELL_NAME": well,
        "Production Trend": classify_production_trend(
            wells_df[wells_df["WELL_NAME"] == well]
        ),
    }
    for well in well_names_20
]

# Final production trend classification
prod_trend_df = pd.DataFrame(prod_trends)

# Display
print("Step 5: Production Curve Trend Classification")
print(prod_trend_df)

Step 5: Production Curve Trend Classification
   WELL_NAME Production Trend
0    Well_#1       Decreasing
1    Well_#2           Stable
2    Well_#3       Increasing
3    Well_#4           Stable
4    Well_#5       Decreasing
5    Well_#6       Increasing
6    Well_#7       Decreasing
7    Well_#8       Decreasing
8    Well_#9       Decreasing
9   Well_#10           Stable
10  Well_#11           Stable
11  Well_#12           Stable
12  Well_#13       Increasing
13  Well_#14       Decreasing
14  Well_#15           Stable
15  Well_#16           Stable
16  Well_#17           Stable
17  Well_#18           Stable
18  Well_#19           Stable
19  Well_#20       Decreasing
