Import required libraries.

In [164]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

1. Prepare the data.

In [165]:
region_0 = pd.read_csv('datasets/geo_data_0.csv')
region_1 = pd.read_csv('datasets/geo_data_1.csv')
region_2 = pd.read_csv('datasets/geo_data_2.csv')

In [166]:
region_0.head()

Unnamed: 0,id,f0,f1,f2,product
0,txEyH,0.705745,-0.497823,1.22117,105.280062
1,2acmU,1.334711,-0.340164,4.36508,73.03775
2,409Wp,1.022732,0.15199,1.419926,85.265647
3,iJLyR,-0.032172,0.139033,2.978566,168.620776
4,Xdl7t,1.988431,0.155413,4.751769,154.036647


In [177]:
region_2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 5 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   id       100000 non-null  object 
 1   f0       100000 non-null  float64
 2   f1       100000 non-null  float64
 3   f2       100000 non-null  float64
 4   product  100000 non-null  float64
dtypes: float64(4), object(1)
memory usage: 3.8+ MB


There are no missing values.

**Feature Selection:**

For machine learning models, all data has to be numerical. Therefore I will drop the `id` column, which has an object datatype. It is also not significant in learning and predicting.

2. Train and test the model for each region.

In [168]:
def data_splitter(data):
    data_train, data_valid = train_test_split(data, test_size=0.25, random_state=0)
    features_valid = data_valid.drop(['id', 'product'], axis=1)
    target_valid = data_valid['product']
    features_train = data_train.drop(['id', 'product'], axis=1)
    target_train = data_train['product']
    return features_train, features_valid, target_train, target_valid

In [202]:
def linear_reg(features_train, features_valid, target_train, target_valid):
    model = LinearRegression()
    model.fit(features_train, target_train)
    valid_predictions = model.predict(features_valid)
    mse = mean_squared_error(target_valid, valid_predictions)
    rmse = mse ** 0.5
    print("RMSE:", rmse)
    print('Average volume:', valid_predictions.mean())
    return valid_predictions

- `region_0`:

In [203]:
features_train_0, features_valid_0, target_train_0, target_valid_0 = data_splitter(region_0)

predictions_0 = linear_reg(features_train_0, features_valid_0, target_train_0, target_valid_0)

RMSE: 37.48100896950594
Average volume: 92.27144852242301


In [214]:
type(predictions_0)

pandas.core.series.Series

In [237]:
predictions_0 = pd.Series(predictions_0, index=target_valid_0.index, name='pred')

`region_1`:

In [171]:
features_train_1, features_valid_1, target_train_1, target_valid_1 = data_splitter(region_1)

predictions_1 = linear_reg(features_train_1, features_valid_1, target_train_1, target_valid_1)

RMSE: 0.8872573052219321
Average volume: 69.15162398290752


`region_2`:

In [172]:
features_train_2, features_valid_2, target_train_2, target_valid_2 = data_splitter(region_2)

linear_reg(features_train_2, features_valid_2, target_train_2, target_valid_2)

RMSE: 40.31290686044374
Average volume: 94.70753129105672


**I can observe that the higher the average volume of reserves, the higher the RMSE.**

**This could be because of the magnitude of the mean square error that is reflected in the square root.**

3. Profit calculations

In [178]:
cost = 10 ** 8
product_revenue = 4500
number_of_wells = 200

In [187]:
#calculate the volume of reserves sufficient for developing a new well without losses
min_volume = (cost / product_revenue) / number_of_wells
print(min_volume)

111.11111111111111


In [184]:
avg_vol_0 = region_0['product'].mean()
print(avg_vol_0)

92.49999999999976


In [185]:
avg_vol_1 = region_1['product'].mean()
print(avg_vol_1)

68.82500000002561


In [186]:
avg_vol_2 = region_2['product'].mean()
print(avg_vol_2)

95.00000000000041


**The voulume of reserves reauired to develop a new well without losses is 111.11**

**The average volumes of the reserves per region are below the minimum volume required, but region 2 has the highest average volume of 95.**

4. A function to calculate profit

In [253]:
# the function calculates profit from a set of selected oil wells and predictions
def profit(target, predictions, wells):
    subsample_valid = target.sample(n=500, replace=True, random_state=0)
    predictions = pd.Series(predictions)
    predictions = predictions.reindex(target.index)  
    predictions_sorted = predictions.sort_values(ascending=False)
    selected = subsample_valid.loc[predictions_sorted.index][:wells]
    return (selected.sum() * product_revenue) - (cost / number_of_wells)

In [254]:
profit(target_valid_0, predictions_0, number_of_wells)

KeyError: 'Passing list-likes to .loc or [] with any missing labels is no longer supported, see https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike'

In [236]:
pd.Series?