### Sprint 9 Project

### Summary:

### Step 1: Download and Prepare the Data. Explain the Procedure

In [179]:
import pandas as pd

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression

from sklearn.model_selection import RandomizedSearchCV

from sklearn.pipeline import Pipeline

from sklearn.preprocessing import StandardScaler

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

from sklearn.model_selection import cross_val_score

import numpy as np

In [180]:
r0 = pd.read_csv('geo_data_0.csv')

In [181]:
r0.head()

Unnamed: 0,id,f0,f1,f2,product
0,txEyH,0.705745,-0.497823,1.22117,105.280062
1,2acmU,1.334711,-0.340164,4.36508,73.03775
2,409Wp,1.022732,0.15199,1.419926,85.265647
3,iJLyR,-0.032172,0.139033,2.978566,168.620776
4,Xdl7t,1.988431,0.155413,4.751769,154.036647


In [182]:
r1 = pd.read_csv('geo_data_1.csv')

In [183]:
r1.head()

Unnamed: 0,id,f0,f1,f2,product
0,kBEdx,-15.001348,-8.276,-0.005876,3.179103
1,62mP7,14.272088,-3.475083,0.999183,26.953261
2,vyE1P,6.263187,-5.948386,5.00116,134.766305
3,KcrkZ,-13.081196,-11.506057,4.999415,137.945408
4,AHL4O,12.702195,-8.147433,5.004363,134.766305


In [184]:
r1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 5 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   id       100000 non-null  object 
 1   f0       100000 non-null  float64
 2   f1       100000 non-null  float64
 3   f2       100000 non-null  float64
 4   product  100000 non-null  float64
dtypes: float64(4), object(1)
memory usage: 3.8+ MB


In [185]:
r1.isna().sum()

id         0
f0         0
f1         0
f2         0
product    0
dtype: int64

In [186]:
r2 = pd.read_csv('geo_data_2.csv')

In [187]:
r2.head()

Unnamed: 0,id,f0,f1,f2,product
0,fwXo0,-1.146987,0.963328,-0.828965,27.758673
1,WJtFt,0.262778,0.269839,-2.530187,56.069697
2,ovLUW,0.194587,0.289035,-5.586433,62.87191
3,q6cA6,2.23606,-0.55376,0.930038,114.572842
4,WPMUX,-0.515993,1.716266,5.899011,149.600746


In [188]:
features_r0 = r0.drop(['id', 'product'], axis=1)

target_r0 = r0['product']

In [189]:
features_train_r0, features_valid_r0, target_train_r0, target_valid_r0 = train_test_split(features_r0, target_r0, test_size=0.25, random_state=12345)


In [190]:
features_r1 = r1.drop(['id', 'product'], axis=1)

target_r1 = r1['product']

In [191]:
features_train_r1, features_valid_r1, target_train_r1, target_valid_r1 = train_test_split(features_r1, target_r1, test_size=0.25, random_state=12345)

In [192]:
features_r2 = r2.drop(['id', 'product'], axis=1)

target_r2 = r2['product']

In [193]:
features_train_r2, features_valid_r2, target_train_r2, target_valid_r2 = train_test_split(features_r2, target_r2, test_size=0.25, random_state=12345)


Certainly! Although LinearRegression in scikit-learn does not have many hyperparameters to tune, it’s good practice to understand how to build a parameter grid and leverage tools like RandomizedSearchCV for hyperparameter optimization. 

For LinearRegression, the only parameters you might consider tuning are related to feature normalization or preprocessing steps rather than the model itself. However, if you want to practice the process, here’s how you could set it up:

Step-by-Step: Setting Up RandomizedSearchCV for LinearRegression
Import Necessary Libraries:
Create a Parameter Grid:
Set Up RandomizedSearchCV:
Fit the Model:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Step 1: Create a sample features and target dataframe (Replace with your actual data)
features = pd.DataFrame({'f0': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                         'f1': [10, 9, 8, 7, 6, 5, 4, 3, 2, 1],
                         'f2': [5, 4, 3, 2, 1, 5, 4, 3, 2, 1]})
target = pd.DataFrame({'product': [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000]})

# Step 2: Split the data
X_train, X_val, y_train, y_val = train_test_split(features, target, test_size=0.25, random_state=42)

# Step 3: Define a pipeline including feature scaling and the regression model
model = Pipeline([
    ('scaler', StandardScaler()),
    ('regressor', LinearRegression())
])

# Step 4: Create a parameter grid. Note that LinearRegression doesn't have many parameters, 
# but for educational purposes, let's assume we want to toggle normalization in our scaler.
param_grid = {
    'scaler__with_mean': [True, False],
    'scaler__with_std': [True, False]
}

# Step 5: Set up RandomizedSearchCV
random_search = RandomizedSearchCV(model, param_distributions=param_grid, 
                                   n_iter=10, cv=5, random_state=42, n_jobs=-1)

# Step 6: Fit the model
random_search.fit(X_train, y_train)

# Output the best parameters and best score
print("Best Parameters:", random_search.best_params_)
print("Best Score:", random_search.best_score_)



### Step 2. Train and Test the Model for Each Region.

#### Region 0 Model

In [194]:
r0_model = Pipeline([('scaler', StandardScaler()), ('regressor', LinearRegression())])

In [195]:
param_grid = {'scaler__with_mean': [True, False], 'scaler__with_std': [True, False]}

In [196]:
r0_lr_rsearch = RandomizedSearchCV(estimator=r0_model, param_distributions=param_grid, n_iter=10, cv=5, random_state=12345, n_jobs=1)

In [197]:
r0_lr_rsearch.fit(features_train_r0, target_train_r0)



In [198]:
r0_lr_bparams = r0_lr_rsearch.best_params_

r0_lr_bscore = r0_lr_rsearch.best_score_

print("Region 0 Linear Regression Best Hyperparameters:", r0_lr_bparams)

print('Region 0 Linear Regression Best Score:', r0_lr_bscore)

Region 0 Linear Regression Best Hyperparameters: {'scaler__with_std': True, 'scaler__with_mean': True}
Region 0 Linear Regression Best Score: 0.27408653371000735


In [199]:
r0_model = Pipeline([('scaler', StandardScaler(with_std=True, with_mean=True,)), ('regressor', LinearRegression())])

In [200]:
r0_scores = cross_val_score(r1_model, features_train_r0, target_train_r0, cv=5, scoring='neg_mean_squared_error')

r0_rmse_scores = (-r0_scores) ** 0.5

r0_rmse = r0_rmse_scores.mean()

print("Linear Regression Model RMSE:", r0_rmse)

Linear Regression Model RMSE: 37.73238188617927


In [201]:
r0_model.fit(features_train_r0, target_train_r0)

In [202]:
r0_predict = r0_model.predict(features_valid_r1)

In [203]:
r0_target_valid_actual = target_valid_r0

In [204]:
r0_results = pd.DataFrame({'Predictions': r0_predict, 'Actual Validation Set': r0_target_valid_actual})

r0_results.to_csv('predictions_vs_acutal.csv', index=False)

In [205]:
r0_apr = r0_predict.mean()

print("Region 1 Average Predicted Reserves:", r0_apr)

print("Linear Regression Region 1 Model RMSE", r0_rmse)

Region 1 Average Predicted Reserves: 166.21596826485813
Linear Regression Region 1 Model RMSE 37.73238188617927


#### Region 1 Model

In [206]:
r1_model = Pipeline([('scaler', StandardScaler()), ('regressor', LinearRegression())])

In [207]:
r1_lr_rsearch = RandomizedSearchCV(estimator=r1_model, param_distributions=param_grid, n_iter=10, cv=5, random_state=12345, n_jobs=1)

In [208]:
r1_lr_rsearch.fit(features_valid_r1, target_valid_r1)



In [209]:
r1lr_bparams = r1_lr_rsearch.best_params_
r1_lr_bscore = r1_lr_rsearch.best_score_

print("Linear Regression Region 2 Best Hyperparameters:", r1lr_bparams)

print("Linear Regression Region 2 Best Score:", r1_lr_bscore)

Linear Regression Region 2 Best Hyperparameters: {'scaler__with_std': True, 'scaler__with_mean': True}
Linear Regression Region 2 Best Score: 0.9996232414326588


In [210]:
r1_model = Pipeline([('scaler', StandardScaler(with_std=True, with_mean=True,)), ('regressor', LinearRegression())])

In [211]:
r1_model.fit(features_train_r1, target_train_r1)

In [212]:
r1_scores = cross_val_score(r1_model, features_train_r1, target_train_r1, cv=5, scoring='neg_mean_squared_error')

r1_rmse_scores = (-r1_scores) ** 0.5

r1_rmse = r1_rmse_scores.mean()

print("Linear Regression Region 1 Model RMSE:", r1_rmse)

Linear Regression Region 1 Model RMSE: 0.8895409511809378


In [213]:
r1_model.fit(features_train_r1, target_train_r1)

In [214]:
r1_predict = r1_model.predict(features_valid_r1)

In [215]:
r1_apr = r1_predict.mean()

r1_apr

print("Region 1 Average Predicted Reserves:", r1_apr)
print("Linear Regression Region 1 Model RMSE:", r1_rmse)

Region 1 Average Predicted Reserves: 68.728546895446
Linear Regression Region 1 Model RMSE: 0.8895409511809378


#### Region 2 Model

In [216]:
r2_model = Pipeline([('scaler', StandardScaler()), ('regressor', LinearRegression())])

In [217]:
r2_lr_rsearch = RandomizedSearchCV(estimator=r2_model, param_distributions=param_grid, n_iter=10, cv=5, random_state=12345, n_jobs=1)

In [218]:
r2_lr_rsearch.fit(features_train_r2, target_train_r2)



In [219]:
r2_bparams = r2_lr_rsearch.best_params_
r2_bscore = r2_lr_rsearch.best_score_

print("Linear Regression Region 2 Model Best Hyperparameters:", r2_bparams)
print("Linear Regression Model Region 2 Best Score:", r2_bscore)

Linear Regression Region 2 Model Best Hyperparameters: {'scaler__with_std': True, 'scaler__with_mean': True}
Linear Regression Model Region 2 Best Score: 0.19648225543040282


In [220]:
r2_model = Pipeline([('scaler', StandardScaler(with_std=True, with_mean=True,)), ('regressor', LinearRegression())])

In [221]:
r2_scores = cross_val_score(r2_model, features_train_r2, target_train_r2, cv=5, scoring='neg_mean_squared_error')

r2_rmse_scores = (-r2_scores) ** 0.5

r2_rmse = r2_rmse_scores.mean()

print("Linear Regression Region 3 Model RMSE:", r2_rmse)

Linear Regression Region 3 Model RMSE: 40.06576061701094


In [222]:
r2_model.fit(features_train_r2, target_train_r2)

In [223]:
r2_predict = r2_model.predict(features_valid_r2)

In [224]:
r2_apr = r2_predict.mean()

print("Region 2 Average Predicted Reserves:", r2_apr)
print("Linear Regression Model Region 2 RMSE:", r2_rmse)

Region 2 Average Predicted Reserves: 94.96504596800489
Linear Regression Model Region 2 RMSE: 40.06576061701094


3.1 Store All the Key Values in Separate Variables

In [225]:
budget = 100000000

revenue_per_barrel = 4500

cost_per_well = budget/200

3.2 Calculate the volume of reserves sufficient for developing a new well without losses

In [226]:
cost_per_well

500000.0

In [227]:
sufficient_volume = cost_per_well/revenue_per_barrel

sufficient_volume

111.11111111111111

In [228]:
r0.head()

Unnamed: 0,id,f0,f1,f2,product
0,txEyH,0.705745,-0.497823,1.22117,105.280062
1,2acmU,1.334711,-0.340164,4.36508,73.03775
2,409Wp,1.022732,0.15199,1.419926,85.265647
3,iJLyR,-0.032172,0.139033,2.978566,168.620776
4,Xdl7t,1.988431,0.155413,4.751769,154.036647


In [229]:
r0_valid_df = features_valid_r0.copy()
r0_valid_df['actual_valid'] = target_valid_r0
r0_valid_df['predicted'] = r0_predict

r0_valid_df.head()

Unnamed: 0,f0,f1,f2,actual_valid,predicted
71751,0.94897,-0.057547,2.095727,10.038645,122.572087
80493,0.992974,0.206671,-0.142278,114.551489,319.102042
2655,1.199854,-0.563356,-1.852991,132.603635,129.008314
53233,0.691422,-0.433347,0.564974,169.072125,293.718938
91141,0.420772,0.972638,0.73619,122.32518,163.196051


In [230]:
r1_valid_df = features_valid_r1.copy()
r1_valid_df['acutal_valid'] = target_valid_r1
r1_valid_df['predicted'] = r1_predict

r1_valid_df.head()

Unnamed: 0,f0,f1,f2,acutal_valid,predicted
71751,-0.371866,-1.862494,3.00221,80.859783,82.663314
80493,9.015122,-13.881455,1.995363,53.906522,54.431786
2655,-6.507568,-4.817448,1.003449,30.132364,29.74876
53233,14.560845,-10.667755,1.995175,53.906522,53.552133
91141,6.090476,-4.494723,0.013815,0.0,1.243856


In [231]:
r2_valid_df = features_valid_r2.copy()
r2_valid_df['actual_valid'] = target_valid_r2
r2_valid_df['predicted'] = r2_predict

r2_valid_df.head()

Unnamed: 0,f0,f1,f2,actual_valid,predicted
71751,-1.444717,-3.861599,2.225805,61.212375,93.599633
80493,-1.418617,1.276544,-0.976433,41.850118,75.105159
2655,-4.587649,-0.413199,1.649268,57.776581,90.066809
53233,1.871584,1.619101,4.273555,100.053761,105.162375
91141,-2.028785,4.128167,6.089547,109.897122,115.30331


#### Revenue Function

In [232]:
#Revenue Function
def calc_profit(valid_df, n_top_wells=200, barrel_price=4500, budget=100e6):
    predicted_sorted = valid_df.sort_values(by='predicted', ascending=False)
    
    top_well = predicted_sorted.head(n_top_wells)
    
    total_revenue = top_well['predicted'].sum() * barrel_price
    
    profit = total_revenue - budget
    
    return profit

In [233]:
r0_profit = calc_profit(r0_valid_df)

print(f"Region 0 Estimated Profit: ${r0_profit:.2f}") 

Region 0 Estimated Profit: $231826965.16


In [234]:
r0_reserves = r0_profit / revenue_per_barrel

print(f"Region 0 Estimated Volume of Reserves According to Predicitons: {int(r0_reserves)}")

Region 0 Estimated Volume of Reserves According to Predicitons: 51517


In [235]:
r1_profit = calc_profit(r1_valid_df)

print(f"Region 1 Estimated Profit: ${r1_profit:.2f}")

Region 1 Estimated Profit: $24857120.52


In [236]:
r1_reserves = r1_profit / revenue_per_barrel

print(f"Region 1 Estimated Volume of Reserves According to Predictions: {int(r1_reserves)}")

Region 1 Estimated Volume of Reserves According to Predictions: 5523


In [237]:
r2_profit = calc_profit(r2_valid_df)

print(f"Region 2 Estimated Profit:${r2_profit:.2f}")

Region 2 Estimated Profit:$33217543.96


In [238]:
r2_reserves = r2_profit / revenue_per_barrel 

print(f"Region 2 Estimated Volume of Reserves According to Predicitons: {int(r2_reserves)}")

Region 2 Estimated Volume of Reserves According to Predicitons: 7381


#### Bootstrapping

In [239]:
r0_predicted = r0_valid_df['predicted']
r1_predicted = r1_valid_df['predicted']
r2_predicted = r2_valid_df['predicted']


r0_actual = r0_valid_df['actual_valid']
r1_acutal = r1_valid_df['acutal_valid']
r2_actual = r2_valid_df['actual_valid']

In [240]:
def bootstrap_sample(predicted, actual_valid, num_samples=1000, sample_size=200):
    profits = []
    
    for i in range(num_samples):
        indicies_sample = np.random.choice(len(predicted), size=sample_size, replace=True)
        predicted_sample = predicted.iloc[indicies_sample] if isinstance(predicted, pd.Series) else predicted[indices_sample]
        actual_sample = actual_valid.iloc[indicies_sample] if isinstance(actual_valid, pd.Series) else actual_valid[indices_sample]
        
        sorted_id = np.argsort(predicted_sample)[-200:]
        top_predictions = predicted_sample.iloc[sorted_id] if isinstance(predicted_sample, pd.Series) else predicted_sample[sorted_id]
        top_actual_values = actual_sample.iloc[sorted_id] if isinstance(actual_sample, pd.Series) else actual_sample[sorted_id]
        
        valid_df = pd.DataFrame({'predicted': top_predictions, 'actual': top_actual_values})
        
        profit = calc_profit(valid_df)
        
       
        profits.append(profit)
        
    return profits

In [241]:
bootstrap_0 = bootstrap_sample(r0_predicted, r0_actual)


r0_average = np.mean(bootstrap_0)

In [242]:
r0_average

49550534.39936189

In [243]:
r0_confidence_interval = np.percentile(bootstrap_0, [2.5, 97.5])

r0_confidence_interval

array([40116393.71010908, 58601274.28203373])

In [244]:
r0_risk_of_loss = np.mean((np.array(bootstrap_0) < 0)) * 100

r0_risk_of_loss

0.0

In [251]:
bootstrap_1 = bootstrap_sample(r1_predicted, r1_acutal)

In [252]:
r1_average = np.mean(bootstrap_1)

r1_average

-38208236.593815096

In [253]:
r1_risk_of_loss = np.mean((np.array(bootstrap_1) < 0)) * 100

r1_risk_of_loss

100.0

In [254]:
bootstrap_2 = bootstrap_sample(r2_predicted, r2_actual)

r2_average = np.mean(bootstrap_2)

r2_average

-14498163.343799882

In [255]:
r2_risk_of_loss = np.mean((np.array(bootstrap_2) < 0)) * 100

r2_risk_of_loss

100.0