In [None]:
# ---------------------------------------------------------------
# Pontem Analytics Competition - Starter Notebook
# OLS Regression Using Statsmodels
# Predicting BORE_OIL_VOL from AVG_WHP_P
# ---------------------------------------------------------------

import pandas as pd
import statsmodels.formula.api as smf

# 1. Load training data
# latin1 is passed explicitly as the encoding fix for this CSV.
train_url = "https://raw.githubusercontent.com/BivinSadler/Pontem-Analytics-Comp/main/Train%20Volve%20production%20data.csv"
train = pd.read_csv(filepath_or_buffer=train_url, encoding="latin1")
print("Training data loaded:", train.shape)


In [None]:
# 2. Clean numeric columns (strip commas -> make numeric)
numeric_cols = ["BORE_OIL_VOL", "AVG_WHP_P"]

for col in numeric_cols:
    # Cast to str first so the .str accessor is available regardless of the
    # dtype the column was read with, then drop every literal comma.
    without_commas = train[col].astype(str).str.replace(",", "", regex=False)
    # errors="coerce" turns anything that still is not a number into NaN
    # instead of raising.
    train[col] = pd.to_numeric(without_commas, errors="coerce")


In [None]:
# 3. Fit OLS model
# Response BORE_OIL_VOL is regressed on the single predictor AVG_WHP_P,
# using the cleaned training frame.
ols_spec = smf.ols(formula="BORE_OIL_VOL ~ AVG_WHP_P", data=train)
model = ols_spec.fit()
print(model.summary())


In [None]:
# 4. Load holdout data
holdout_url = "https://raw.githubusercontent.com/BivinSadler/Pontem-Analytics-Comp/main/CompetitionVolveNoOil.csv"
holdout = pd.read_csv(filepath_or_buffer=holdout_url, encoding="latin1")
print("Holdout data loaded:", holdout.shape)

# Clean the predictor exactly as was done for training:
# cast to str, drop literal commas, then coerce to numeric (bad values -> NaN).
whp_text = holdout["AVG_WHP_P"].astype(str)
whp_text = whp_text.str.replace(",", "", regex=False)
holdout["AVG_WHP_P"] = pd.to_numeric(whp_text, errors="coerce")


In [None]:
# 5. Predict on holdout
# The fitted formula model reads the AVG_WHP_P column from the holdout frame.
preds = model.predict(holdout)

# Surface missing predictions instead of letting them slip silently into the
# submission: holdout AVG_WHP_P values that were coerced to NaN during cleaning
# cannot be scored by the model and may come back as NaN here.
n_missing_preds = int(pd.isna(preds).sum())
if n_missing_preds:
    print(f"Warning: {n_missing_preds} of {len(preds)} holdout predictions are NaN")

In [None]:
# 6. Build submission file
# Two columns: the holdout row ID and the model's prediction for that row.
submission = holdout[["ID"]].assign(Preds=preds)
print(submission.head())

In [None]:

# 7. Save submission CSV
# index=False keeps the DataFrame's row index out of the written file.
submission.to_csv(path_or_buf="MySubmission.csv", index=False)
print("Saved MySubmission.csv")
