# Oil Well Project

## Preparing Data

In [61]:
# imports
from pathlib import Path

import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score,  roc_auc_score, roc_curve
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, root_mean_squared_error
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
from sklearn.neighbors import NearestNeighbors
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.tree import DecisionTreeClassifier

All imports needed for this notebook.

In [62]:
# Folder that holds the three geo_data CSV files (raw string keeps the backslashes literal)
data_dir = r"C:\Users\alexi\Desktop\Coding Projects\Oil-Well-Project\Oil-well-Project"

# One DataFrame per file, read in the order geo_data_0, geo_data_1, geo_data_2
data0, data1, data2 = (
    pd.read_csv(data_dir + "\\" + f"geo_data_{region}.csv") for region in range(3)
)

Load the three CSV files, one per dataset.

In [63]:
# Print column dtypes, non-null counts and memory usage for each dataset
for frame in (data0, data1, data2):
    frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6250 entries, 0 to 6249
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 6250 non-null   object 
 1   f0                 6250 non-null   float64
 2   f1                 6250 non-null   float64
 3   f2                 6250 non-null   float64
 4   product            6250 non-null   float64
 5   predicted_product  6250 non-null   float64
dtypes: float64(5), object(1)
memory usage: 293.1+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6250 entries, 0 to 6249
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 6250 non-null   object 
 1   f0                 6250 non-null   float64
 2   f1                 6250 non-null   float64
 3   f2                 6250 non-null   float64
 4   product            6250 non-null   float64
 5   predicted

In [64]:
# Count the rows of data0 that exactly repeat an earlier row
duplicate_rows0 = data0.duplicated()
duplicate_rows0.sum()

np.int64(0)

In [65]:
# Count the rows of data1 that exactly repeat an earlier row
duplicate_rows1 = data1.duplicated()
duplicate_rows1.sum()

np.int64(0)

In [66]:
# Count the rows of data2 that exactly repeat an earlier row
duplicate_rows2 = data2.duplicated()
duplicate_rows2.sum()

np.int64(0)

None of the three DataFrames has missing values or duplicate rows.

# Train and Test Models

In [67]:
# Hold out 25% of every dataset for validation; the fixed seed makes the split repeatable
split_options = {"test_size": 0.25, "random_state": 50}
data0_train, data0_val = train_test_split(data0, **split_options)
data1_train, data1_val = train_test_split(data1, **split_options)
data2_train, data2_val = train_test_split(data2, **split_options)

# Show the shape of each train/validation part
{
    label: {"Train": train_part.shape, "Val": val_part.shape}
    for label, train_part, val_part in (
        ("data0", data0_train, data0_val),
        ("data1", data1_train, data1_val),
        ("data2", data2_train, data2_val),
    )
}


{'data0': {'Train': (4687, 6), 'Val': (1563, 6)},
 'data1': {'Train': (4687, 6), 'Val': (1563, 6)},
 'data2': {'Train': (4687, 6), 'Val': (1563, 6)}}

A 75/25 training/validation split is applied to all three datasets.

In [68]:
# Function to train a linear regression model
def train_and_predict_linear(train_df, val_df, features=("f0", "f1", "f2"), target="product"):
    """Fit a linear regression on ``train_df`` and evaluate it on ``val_df``.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Rows used to fit the model; must contain the ``features`` and ``target`` columns.
    val_df : pandas.DataFrame
        Held-out rows used for prediction and scoring; same required columns.
    features : sequence of str, default ("f0", "f1", "f2")
        Names of the input columns.
    target : str, default "product"
        Name of the column to predict.

    Returns
    -------
    predict : numpy.ndarray
        Model predictions for ``val_df``. The target is passed as a
        single-column DataFrame, so scikit-learn returns a 2-D array with
        one column.
    mse : float
        Mean squared error between ``val_df[target]`` and ``predict``.
    """
    feature_cols = list(features)
    x_train = train_df[feature_cols]
    y_train = train_df[[target]]  # double brackets keep the target 2-D (return shape unchanged)
    x_val = val_df[feature_cols]
    y_val = val_df[[target]]

    model = LinearRegression()
    model.fit(x_train, y_train)
    predict = model.predict(x_val)
    # mean_squared_error must be imported from sklearn.metrics in the imports
    # cell; without that import this line raises NameError on a fresh kernel.
    mse = mean_squared_error(y_val, predict)

    return predict, mse

# Fit one linear model per dataset and keep its validation predictions and MSE
(pred0, mse0), (pred1, mse1), (pred2, mse2) = (
    train_and_predict_linear(train_part, val_part)
    for train_part, val_part in (
        (data0_train, data0_val),
        (data1_train, data1_val),
        (data2_train, data2_val),
    )
)

# Show the validation MSE of each dataset
{
    label: {"mse": error}
    for label, error in zip(("data0", "data1", "data2"), (mse0, mse1, mse2))
}

{'data0': {'mse': 1342.1088591959174},
 'data1': {'mse': 0.7871894002204988},
 'data2': {'mse': 1629.7954328349622}}

In [69]:
# Add predictions to copies of the validation sets.
# np.ravel flattens the single-column 2-D prediction arrays so each one is
# stored as an ordinary column.
data0_val_results = data0_val.copy()
data0_val_results['predicted_product'] = np.ravel(pred0)

data1_val_results = data1_val.copy()
data1_val_results['predicted_product'] = np.ravel(pred1)

data2_val_results = data2_val.copy()
data2_val_results['predicted_product'] = np.ravel(pred2)

# Save to dedicated prediction files.
# The earlier version wrote to geo_data_N.csv, the same files the notebook
# reads as raw input, so every run replaced the source data with its own 25%
# validation split.
# NOTE(review): if the geo_data_N.csv files were already overwritten by an
# earlier run, restore them from the original source before re-running.
output_dir = Path(r"C:\Users\alexi\Desktop\Coding Projects\Oil-Well-Project\Oil-well-Project")
prediction_files = {
    "data0_predictions.csv": data0_val_results,
    "data1_predictions.csv": data1_val_results,
    "data2_predictions.csv": data2_val_results,
}
for filename, results in prediction_files.items():
    results.to_csv(output_dir / filename, index=False)

# Return filenames (taken from the mapping above, so they always match what was written)
list(prediction_files)


['data0_predictions.csv', 'data1_predictions.csv', 'data2_predictions.csv']

In [70]:
# Preview the first five validation rows together with their predictions
data0_val_results.head(n=5)

Unnamed: 0,id,f0,f1,f2,product,predicted_product
1268,WZPDo,-0.326487,0.906874,-0.810567,27.35135,58.080219
3878,8fDKM,0.11509,0.242267,-1.064132,86.35895,68.327217
5778,fqOSx,1.044243,-0.373455,0.826999,135.979981,93.476445
787,LnihI,0.53693,-0.499839,1.585133,101.234625,98.744247
4849,9Fhwy,-0.066038,0.941207,0.418507,49.847911,66.458058


I trained a linear regression model on each of the 3 datasets and copied the validation predictions into 3 new data sets.

In [71]:
# Mean predicted volume and validation RMSE for each dataset
results_sumary = {
    label: {
        "average_predicted_volume": np.mean(predictions),
        "rsme": root_mean_squared_error(val_part['product'], predictions),
    }
    for label, val_part, predictions in (
        ("data0", data0_val, pred0),
        ("data1", data1_val, pred1),
        ("data2", data2_val, pred2),
    )
}

results_sumary

{'data0': {'average_predicted_volume': np.float64(91.95891333964379),
  'rsme': 36.63480393281664},
 'data1': {'average_predicted_volume': np.float64(71.08403146347835),
  'rsme': 0.8872369470555759},
 'data2': {'average_predicted_volume': np.float64(95.17129510459765),
  'rsme': 40.37072494809775}}

I've computed the average predicted volume and the RMSE for the 3 models, and that offers some insight. data0 shows moderate error with an RMSE of 36.63, data1 shows excellent performance with an RMSE of 0.89, and data2 shows the worst RMSE at 40.37. Linear regression works great on data1, but the other two don't perform as well. The regions in data0, data1 and data2 have average predicted volumes of 91.96, 71.08 and 95.17 respectively, which offers insight into the 'typical product' value of each region. It might be a good idea to try a more complex model on data0 and data2, but for now this is what we will work with.