# Machine Learning to Predict Best Profit for an Oil Company

## Download and Prepare the Data. Explain the Procedure

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from scipy import stats as st
import warnings
warnings.filterwarnings('ignore')

state = np.random.RandomState(12345)

### Dataset 1

In [2]:
#Geo_data_0

data_0 = pd.read_csv('/datasets/geo_data_0.csv')

display(data_0.head())
display(data_0.info())
display(data_0.describe())

Unnamed: 0,id,f0,f1,f2,product
0,txEyH,0.705745,-0.497823,1.22117,105.280062
1,2acmU,1.334711,-0.340164,4.36508,73.03775
2,409Wp,1.022732,0.15199,1.419926,85.265647
3,iJLyR,-0.032172,0.139033,2.978566,168.620776
4,Xdl7t,1.988431,0.155413,4.751769,154.036647


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 5 columns):
id         100000 non-null object
f0         100000 non-null float64
f1         100000 non-null float64
f2         100000 non-null float64
product    100000 non-null float64
dtypes: float64(4), object(1)
memory usage: 3.8+ MB


None

Unnamed: 0,f0,f1,f2,product
count,100000.0,100000.0,100000.0,100000.0
mean,0.500419,0.250143,2.502647,92.5
std,0.871832,0.504433,3.248248,44.288691
min,-1.408605,-0.848218,-12.088328,0.0
25%,-0.07258,-0.200881,0.287748,56.497507
50%,0.50236,0.250252,2.515969,91.849972
75%,1.073581,0.700646,4.715088,128.564089
max,2.362331,1.343769,16.00379,185.364347


In [3]:
#Check for missing values

data_0.isnull().sum()

id         0
f0         0
f1         0
f2         0
product    0
dtype: int64

In [4]:
#Check for duplicates

data_0.duplicated().sum()

0

In [5]:
#Label target and features for model

features_0 = data_0.drop(['product', 'id'], axis=1)
target_0 = data_0['product']

### Dataset 2

In [6]:
#Geo_data_1

data_1 = pd.read_csv('/datasets/geo_data_1.csv')

display(data_1.head())
display(data_1.info())
display(data_1.describe())

Unnamed: 0,id,f0,f1,f2,product
0,kBEdx,-15.001348,-8.276,-0.005876,3.179103
1,62mP7,14.272088,-3.475083,0.999183,26.953261
2,vyE1P,6.263187,-5.948386,5.00116,134.766305
3,KcrkZ,-13.081196,-11.506057,4.999415,137.945408
4,AHL4O,12.702195,-8.147433,5.004363,134.766305


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 5 columns):
id         100000 non-null object
f0         100000 non-null float64
f1         100000 non-null float64
f2         100000 non-null float64
product    100000 non-null float64
dtypes: float64(4), object(1)
memory usage: 3.8+ MB


None

Unnamed: 0,f0,f1,f2,product
count,100000.0,100000.0,100000.0,100000.0
mean,1.141296,-4.796579,2.494541,68.825
std,8.965932,5.119872,1.703572,45.944423
min,-31.609576,-26.358598,-0.018144,0.0
25%,-6.298551,-8.267985,1.000021,26.953261
50%,1.153055,-4.813172,2.011479,57.085625
75%,8.621015,-1.332816,3.999904,107.813044
max,29.421755,18.734063,5.019721,137.945408


In [7]:
#Check for missing values

data_1.isnull().sum()

id         0
f0         0
f1         0
f2         0
product    0
dtype: int64

In [8]:
#Check for duplicates

data_1.duplicated().sum()

0

In [9]:
#Label target and features for model

features_1 = data_1.drop(['product', 'id'], axis=1)
target_1 = data_1['product']

### Dataset 3

In [10]:
#Geo_data_2

data_2 = pd.read_csv('/datasets/geo_data_2.csv')

display(data_2.head())
display(data_2.info())
display(data_2.describe())

Unnamed: 0,id,f0,f1,f2,product
0,fwXo0,-1.146987,0.963328,-0.828965,27.758673
1,WJtFt,0.262778,0.269839,-2.530187,56.069697
2,ovLUW,0.194587,0.289035,-5.586433,62.87191
3,q6cA6,2.23606,-0.55376,0.930038,114.572842
4,WPMUX,-0.515993,1.716266,5.899011,149.600746


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 5 columns):
id         100000 non-null object
f0         100000 non-null float64
f1         100000 non-null float64
f2         100000 non-null float64
product    100000 non-null float64
dtypes: float64(4), object(1)
memory usage: 3.8+ MB


None

Unnamed: 0,f0,f1,f2,product
count,100000.0,100000.0,100000.0,100000.0
mean,0.002023,-0.002081,2.495128,95.0
std,1.732045,1.730417,3.473445,44.749921
min,-8.760004,-7.08402,-11.970335,0.0
25%,-1.162288,-1.17482,0.130359,59.450441
50%,0.009424,-0.009482,2.484236,94.925613
75%,1.158535,1.163678,4.858794,130.595027
max,7.238262,7.844801,16.739402,190.029838


In [11]:
#Check for missing values

data_2.isnull().sum()

id         0
f0         0
f1         0
f2         0
product    0
dtype: int64

In [12]:
#Check for duplicates

data_2.duplicated().sum()

0

In [13]:
#Label target and features for model

features_2 = data_2.drop(['product', 'id'], axis=1)
target_2 = data_2['product']

We have three datasets, all with 100000 entries and five columns labeled id, f0, f1, f2, and product. There were no duplicated rows. We have two data types: float and object. The mean value of products in dataset 1 and dataset 3 are similar, whereas dataset 2 is smaller. Columns f0, f1, and f2 were labeled at features and the product column was labeled as the target for each dataset.

## Train and Test the Model for Each Region

### Split the Data into a Training Set and Validation Set at a Ratio of 75:25

In [14]:
#Geo_data_0

features_train_0, features_valid_0, target_train_0, target_valid_0 = train_test_split(
    features_0, target_0, test_size=0.25, random_state=state)

display(len(features_train_0))
display(len(features_valid_0))

75000

25000

In [15]:
#Geo_data_1

features_train_1, features_valid_1, target_train_1, target_valid_1 = train_test_split(
    features_1, target_1, test_size=0.25, random_state=state)

display(len(features_train_1))
display(len(features_valid_1))

75000

25000

In [16]:
#Geo_data_2

features_train_2, features_valid_2, target_train_2, target_valid_2 = train_test_split(
    features_2, target_2, test_size=0.25, random_state=state)

display(len(features_train_2))
display(len(features_valid_2))

75000

25000

### Train the Model and Make Predictions for the Validation Set. Print the Average Volume of Predicted Reserves and Model RMSE

In [17]:
#geo_data_0
model = LinearRegression()
model.fit(features_train_0, target_train_0)
predict_valid_0 = model.predict(features_valid_0)
mse = mean_squared_error(target_valid_0, predict_valid_0)
print('RMSE:', mse ** 0.5)
print('R2:', model.score(features_valid_0, target_valid_0))
print('Average volume of predicted reserves:', predict_valid_0.mean())

RMSE: 37.5794217150813
R2: 0.27994321524487786
Average volume of predicted reserves: 92.59256778438038


In [18]:
#geo_data_1
model = LinearRegression()
model.fit(features_train_1, target_train_1)
predict_valid_1 = model.predict(features_valid_1)
mse = mean_squared_error(target_valid_1, predict_valid_1)
print('RMSE:', mse ** 0.5)
print('R2:', model.score(features_valid_1, target_valid_1))
print('Average volume of predicted reserves:', predict_valid_1.mean())

RMSE: 0.889736773768064
R2: 0.9996264922748638
Average volume of predicted reserves: 68.76995145799754


In [19]:
#geo_data_2
model = LinearRegression()
model.fit(features_train_2, target_train_2)
predict_valid_2 = model.predict(features_valid_2)
mse = mean_squared_error(target_valid_2, predict_valid_2)
print('RMSE:', mse ** 0.5)
print('R2:', model.score(features_valid_2, target_valid_2))
print('Average volume of predicted reserves:', predict_valid_2.mean())

RMSE: 39.958042459521614
R2: 0.20261508041163934
Average volume of predicted reserves: 95.087528122523


In [20]:
#Cross validation check on linear regression model

#geo_data_0
scores_0 = cross_val_score(model,features_0,target_0)
final_score_0 = sum(scores_0) / len(scores_0)

#geo_data_1
scores_1 = cross_val_score(model,features_1,target_1)
final_score_1 = sum(scores_1) / len(scores_1)

#geo_data_2
scores_2 = cross_val_score(model, features_2, target_2)
final_score_2 = sum(scores_2) / len(scores_2)

print('Score geo_data_0:', final_score_0)
print('Score geo_data_1:', final_score_1)
print('Score geo_data_2:', final_score_2)

Score geo_data_0: 0.2756742735466014
Score geo_data_1: 0.9996243886479773
Score geo_data_2: 0.1987398624233513


### Analyze the Results

## Prepare the Profit Calculation

### Store all Key Values for Calculations in Separate Variables

In [21]:
n_points_all = 500
n_points = 200
budget_total = 100000000
income_per_volume = 4500
threshold_risk_loss = 0.025
budget_per_barrel = budget_total / n_points
unit_of_volume =1000

### Calculate the Volume of Reserves Sufficient for Developing a New Well without Losses. Compare the Obtained Value with the Average Volume of Reserves in Each Region.

In [22]:
new_well_volume = budget_total/n_points/income_per_volume

print('The volume of reserves sufficienct for developing a new well without losses is {:.2f}'.format(new_well_volume))
print('Average volume of reserves in geo_0: {:.2f}'.format(predict_valid_0.mean()))
print('Average volume of reserves in geo_1: {:.2f}'.format(predict_valid_1.mean()))
print('Average volume of reserves in geo_2: {:.2f}'.format(predict_valid_2.mean()))

The volume of reserves sufficienct for developing a new well without losses is 111.11
Average volume of reserves in geo_0: 92.59
Average volume of reserves in geo_1: 68.77
Average volume of reserves in geo_2: 95.09


### Provide the Findings About the Preparation for Profit Calculation Step

The average predicted volumes for all three regions are less than the volume of reserves sufficient for developing an new well without losses. The volume for developing a new well without loss is now our baseline for finding sufficient wells.

## Write a Function to Calculate Profit from a Set of Selected Oil Wells and Model Predictions

In [23]:
#Revenue function
def profit(target, predicted, count):
    target = pd.Series(target)
    predicted = pd.Series(predicted)
    predicted_sorted = predicted.reset_index(drop=True).sort_values(ascending=False)
    selected = target.reset_index(drop=True).iloc[predicted_sorted.index][:count]
    result = selected.sum() * income_per_volume - budget_total
    return result

### Pick the Wells with the Highest Values of Predictions

In [24]:
#Highest values of predictions function
def highest_prediction(target, predicted, count):
    target = pd.Series(target)
    predicted = pd.Series(predicted)
    highest_reserves_volume_predicted = predicted.sort_values(ascending=False)
    selected_reserves_volume_target = target[highest_reserves_volume_predicted.index][:count]
    return selected_reserves_volume_target.sum() * unit_of_volume

In [25]:
#Highest value prediction in geo_data_0
print('The highest predicition value in the region is:', highest_prediction(target_valid_0, predict_valid_0, 200))

The highest predicition value in the region is: 3915200.050298473


In [26]:
#Highest value prediction in geo_data_1
print('The highest predicition value in the region is:', highest_prediction(target_valid_1, predict_valid_1, 200))

The highest predicition value in the region is: 2790699.1806501863


In [27]:
#Highest value prediction in geo_data_2
print('The highest predicition value in the region is:', highest_prediction(target_valid_2, predict_valid_2, 200))

The highest predicition value in the region is: 4689948.711653393


We created a function to find the highest value of predictions by region. Based on our findings region 1, geo_data_0, has the highest value of predictions. 

The region with the highest target values is region 3, geo_data_2. This does not match up with the highest region for predictions.

### Provide Findings: Suggest a Region for Oil Wells' Development and Justify the Choice. Calculate the Profit for the Obtained Volume of Reserves

In [28]:
#Profit from geo_data_0
profit(target_valid_0, predict_valid_0, 200)

33208260.431398526

In [29]:
#Profit from geo_data_1
profit(target_valid_1, predict_valid_1, 200)

24150866.966815114

In [30]:
#Profit from geo_data_2
profit(target_valid_2, predict_valid_2, 200)

25399159.45842947

The region with the highest profit is region 1, geo_data_0 with 33 million USD. The lowest profit region is region 2, geo_data_1 with 24 million USD.

## Calculate Risks and Profit for Each Region

### Use the Bootstrapping Technique with 1000 Samples to find the Distribution of Profit. Find average profit, 95% confidence interval and risk of losses. Loss is negative profit, calculate it as a probability and then express as a percentage.

In [31]:
#Bootstrapping technique for geo_data_0

values = []
target_valid_0 = pd.Series(target_valid_0)
predict_valid_0 = pd.Series(predict_valid_0)
for i in range(1000):
    target_subsample_0 = target_valid_0.reset_index(drop=True).sample(replace=True, random_state=state, n=500)
    predict_subsample_0 = predict_valid_0.reset_index(drop=True)[target_subsample_0.index]
    values.append(profit(target_subsample_0, predict_subsample_0, 200))
        
values = pd.Series(values)
average_profit = values.mean()
lower = values.quantile(0.025)
upper = values.quantile(0.9775)
risk_of_losses = ((values < 0).sum() / len(values) * 100)
    
print('Average Profit:', average_profit)
print('95% Confidence Interval: (', lower, ',', upper, ')')
print('Risk of Loss:', risk_of_losses)

Average Profit: 3942844.113405507
95% Confidence Interval: ( -694505.6685672167 , 9198756.807998639 )
Risk of Loss: 6.1


In [32]:
#Bootstrapping technique for geo_data_1

values = []
target_valid_1 = pd.Series(target_valid_1)
predict_valid_1 = pd.Series(predict_valid_1)
for i in range(1000):
    target_subsample_1 = target_valid_1.reset_index(drop=True).sample(replace=True, random_state=state, n=500)
    predict_subsample_1 = predict_valid_1.reset_index(drop=True)[target_subsample_1.index]
    values.append(profit(target_subsample_1, predict_subsample_1, 200))
        
values = pd.Series(values)
average_profit = values.mean()
lower = values.quantile(0.025)
upper = values.quantile(0.975)
risk_of_losses = ((values < 0).sum() / len(values) * 100)
    
print('Average Profit:', average_profit)
print('95% Confidence Interval: (', lower, ',', upper, ')')
print('Risk of Loss:', risk_of_losses)

Average Profit: 4547363.590973578
95% Confidence Interval: ( 610718.1129769981 , 8586910.606938029 )
Risk of Loss: 0.7000000000000001


In [33]:
#Bootstrapping technique for geo_data_2

values = []
target_valid_2 = pd.Series(target_valid_2)
predict_valid_2 = pd.Series(predict_valid_2)
for i in range(1000):
    target_subsample_2 = target_valid_2.reset_index(drop=True).sample(replace=True, random_state=state, n=500)
    predict_subsample_2 = predict_valid_2.reset_index(drop=True)[target_subsample_2.index]
    values.append(profit(target_subsample_2, predict_subsample_2, 200))
        
values = pd.Series(values)
average_profit = values.mean()
lower = values.quantile(0.025)
upper = values.quantile(0.975)
risk_of_losses = ((values < 0).sum() / len(values) * 100)
    
print('Average Profit:', average_profit)
print('95% Confidence Interval: (', lower, ',', upper, ')')
print('Risk of Loss:', risk_of_losses)

Average Profit: 3536640.975248346
95% Confidence Interval: ( -1626509.4694229267 , 8497782.039717663 )
Risk of Loss: 7.6


### Provide findings: suggest a region for development of oil wells and justify the choice.

We obtain a confidence interval with negative values in both region 1, geo_data_0 and region 3, geo_data_2. Region 2, geo_data_1 had the best confidence interval. The smallest percent of risk loss was geo_data_1, with a value of 0.7. Based on our findings the best region for our oil wells is region 2, geo_data_1.