In [None]:
# !mkdir datasets
# !unzip '/content/drive/MyDrive/new_hackathon_all_over_worker.zip' -d datasets

In [None]:
# !pip install --upgrade lightgbm

In [None]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
from scipy.stats import *

In [None]:
### Reading train and test datasets
# All input CSVs sit in the same unzipped folder; keep the prefix in one place.
DATA_DIR     = '/content/datasets/new_hackathon_all_over_worker'
well_df      = pd.read_csv(f'{DATA_DIR}/well_details.csv')
train_df     = pd.read_csv(f'{DATA_DIR}/public_train.csv')
test_df      = pd.read_csv(f'{DATA_DIR}/private_input.csv')
well_depth   = pd.read_csv(f'{DATA_DIR}/intake_depth v2.csv')

print(f'Shape of train data before removal of future values:- {train_df.shape}')
print(f'Shape of test data before removal of future values:- {test_df.shape}')
print(f'Shape of well depth before removal of future values:- {well_depth.shape}')
print('\n')

## Removing future values from both the train and test data:
## rows without a PUMP_ID_PHASE are dropped from train/test, and rows without a
## Tubing_Pull_Date are dropped from the well-depth table.
train_df     = train_df.dropna(subset=['PUMP_ID_PHASE'])
test_df      = test_df.dropna(subset=['PUMP_ID_PHASE'])
well_depth   = well_depth.dropna(subset=['Tubing_Pull_Date'])
# Only the well key and its intake depth are used downstream.
well_depth   = well_depth[['Well_Number','Intake_Depth']]

print(f'Shape of train data after removal of future values:- {train_df.shape}')
print(f'Shape of test data after removal of future values:- {test_df.shape}')
# This line reports the filtered table, so it must say "after" (it previously said "before").
print(f'Shape of well depth after removal of future values:- {well_depth.shape}')

## Attach the well_details columns to every remaining row via Well_Number.
train_df     = train_df.merge(well_df,on=['Well_Number'],how='left')
test_df      = test_df.merge(well_df,on=['Well_Number'],how='left')


Shape of train data before removal of future values:- (1734579, 17)
Shape of test data before removal of future values:- (726517, 15)
Shape of well depth before removal of future values:- (3752, 4)


Shape of train data after removal of future values:- (1117926, 17)
Shape of test data after removal of future values:- (477424, 15)
Shape of well depth before removal of future values:- (2552, 2)


### Creating some extra features based on data understanding

In [None]:
# The same three difference features are added, in the same order, to train and test:
#   capacity minus rate (gas and water) and gas pressure minus pump pressure.
for frame in (train_df, test_df):
  frame['Difference_actual_pred_gas']   = frame['Gas_Capacity'] - frame['Gas_Rate']
  frame['Difference_actual_pred_water'] = frame['Water_Capacity'] - frame['Water_Rate']
  frame['Difference_bw_outer_inner']    = frame['Gas_Pressure'] - frame['Pump_Pressure']

In [None]:
# train_df['Gas_Capacity_Rate']=np.where(train_df['Gas_Rate']!=0,train_df['Gas_Capacity']/train_df['Gas_Rate'],np.NaN)
# train_df['Water_Capacity_Rate']=np.where(train_df['Water_Rate']!=0,train_df['Water_Capacity']/train_df['Water_Rate'],np.NaN)
# train_df['Gas_Pump_Pressure']=np.where(train_df['Pump_Pressure']!=0,train_df['Gas_Pressure']/train_df['Pump_Pressure'],np.NaN)
# train_df['Pump_Pressure_Speed']=np.where(train_df['Pump_Speed']!=0,train_df['Pump_Pressure']/train_df['Pump_Speed'],np.NaN)

# test_df['Gas_Capacity_Rate']=np.where(test_df['Gas_Rate']!=0,test_df['Gas_Capacity']/test_df['Gas_Rate'],np.NaN)
# test_df['Water_Capacity_Rate']=np.where(test_df['Water_Rate']!=0,test_df['Water_Capacity']/test_df['Water_Rate'],np.NaN)
# test_df['Gas_Pump_Pressure']=np.where(test_df['Pump_Pressure']!=0,test_df['Gas_Pressure']/test_df['Pump_Pressure'],np.NaN)
# test_df['Pump_Pressure_Speed']=np.where(test_df['Pump_Speed']!=0,test_df['Pump_Pressure']/test_df['Pump_Speed'],np.NaN)


## Build Configuration

In [None]:
# Columns that get rolled up per group: the raw readings followed by the three
# derived difference features.
numerical_aggregation_columns = [
    # raw readings
    'Gas_Rate', 'Water_Rate', 'Gas_Capacity', 'Water_Capacity',
    'Gas_Pressure', 'Pump_Pressure', 'Pump_Speed', 'Pump_Torque', 'Pump_Volumetric_Eff',
    'Downhole_Gauge_Fluid_Level_Above_Sensor', 'Pump_Torque.1',
    # derived differences
    'Difference_actual_pred_gas', 'Difference_actual_pred_water', 'Difference_bw_outer_inner',
    # currently disabled ratio features:
    #  'Gas_Capacity_Rate','Water_Capacity_Rate','Gas_Pump_Pressure','Pump_Pressure_Speed'
]

# Number of consecutive slices ("phases") each group's values are cut into.
batches = 5

# Aggregations computed for every column, both per phase and over the full group.
aggregations = ['min', 'max', 'mean', 'std', 'skew', 'nunique']

# 1-based phase numbers for which phase-level aggregates are emitted.
phase2consider = [5]

In [None]:
def compute_slope(y_values):
    '''
        Computes the slope of a least-squares line fitted to a list of
        time-indexed feature values, using the positions 1..n as the x-axis.

        Returns NaN when the slope cannot be computed (for example when the
        input is empty or is not a sized sequence).
    '''
    try:
      x_positions = list(range(1, len(y_values) + 1))
      return linregress(x_positions, y_values).slope
    except Exception:
      # Best effort: any computation failure becomes NaN. `Exception` (rather
      # than a bare except) lets KeyboardInterrupt/SystemExit propagate, so a
      # long-running groupby can still be interrupted.
      return np.nan

def compute_mad(y_values):
    '''
        Computes the mean absolute deviation (around the mean) of the given
        observations, ignoring NaN entries.

        Returns NaN for empty or non-numeric input.
    '''
    try:
      values = np.asarray(y_values, dtype=float)
      if values.size == 0:
        return np.nan
      # The result was previously computed but never returned, so the function
      # always produced None.
      return np.nanmean(np.absolute(values - np.nanmean(values)))
    except Exception:
      # Best effort: inputs that cannot be treated as numbers yield NaN.
      return np.nan

In [None]:
def get_phasewise_aggregation(input_column,colname,aggregation,groupby_index):
  '''
    Computes one aggregation over selected "phases" of a group's values.

    The values are cut into `batches` consecutive slices of
    len(input_column)//batches items each; the last slice also takes any
    remainder. Only the 1-based slice numbers listed in `phase2consider`
    produce an output entry. `batches` and `phase2consider` are read from the
    module-level configuration.

    The inputs are :-
    input_column   - The values of one group (anything sliceable with a length)
    colname        - Name of the aggregated column (used in the output keys)
    aggregation    - The aggregation name: 'skew', 'slope', 'mad', 'nunique', or a
                     suffix of a numpy nan-reducer ('min','max','mean','std', ...)
    groupby_index  - Name of the groupby key (used in the output keys)

    The output is a dictionary of the form
    {'<groupby_index>_<colname>_phase_<n>_<aggregation>': value}.
  '''
  batch_size = len(input_column)//batches
  phases     = {}

  for phase_no in range(1, batches+1):
    if phase_no not in phase2consider:
      continue

    start = (phase_no-1)*batch_size
    if phase_no != batches:
      subset_array = np.array(input_column[start:start+batch_size])
    else:
      # The last phase absorbs the remainder left by the integer division.
      subset_array = np.array(input_column[start:])

    key = f'{groupby_index}_{colname}_phase_{phase_no}_{aggregation}'

    if aggregation == 'skew':
      if len(subset_array)<=1:
        phases[key] = np.nan
      else:
        # Depending on the SciPy version, skew(..., nan_policy="omit") returns a
        # masked 0-d array or a plain scalar; np.ma.getdata handles both. The
        # builtin float replaces np.float, which was removed in NumPy 1.24.
        phases[key] = float(np.ma.getdata(skew(subset_array,nan_policy="omit")))

    elif aggregation == 'slope':
      phases[key] = compute_slope(subset_array)

    elif aggregation == 'mad':
      phases[key] = compute_mad(subset_array)

    elif aggregation == 'nunique':
      # np.unique does not drop NaN, so missing values count towards the total.
      phases[key] = len(np.unique(subset_array))

    else:
      # Look up np.nanmin / np.nanmax / np.nanmean / np.nanstd ... by name
      # instead of eval-ing a string.
      reducer = getattr(np, f'nan{aggregation}', None)
      try:
        phases[key] = reducer(subset_array)
      except Exception:
        # Unknown aggregation name or an empty slice: fall back to NaN.
        phases[key] = np.nan

  return phases


def call_phasewise_results(groupby_index,colname,aggregation,train=True):
  '''
      Builds, for one column and one aggregation, a frame keyed by `groupby_index`
      that holds the phase-level values from 'get_phasewise_aggregation' next to
      the same aggregation computed over each group's full data.
      `train` selects the global train_df (True) or test_df (False) as the source.
  '''
  source_df = train_df if train else test_df

  # Every group yields a dict of phase values; the dicts are expanded into columns.
  per_group = source_df.groupby([groupby_index]).agg(
      {colname: lambda values: get_phasewise_aggregation(values,colname,aggregation,groupby_index)}
  ).reset_index()
  expanded_phases = per_group[colname].apply(pd.Series)
  phasewise_df    = pd.concat([per_group.drop([colname], axis=1), expanded_phases], axis=1)

  # Whole-group aggregate: 'slope' uses the custom helper, everything else is
  # handed to pandas by name.
  if aggregation == 'slope':
    full_df = source_df.groupby([groupby_index]).agg({colname: lambda values: compute_slope(values)})
  else:
    full_df = source_df.groupby([groupby_index]).agg({colname: aggregation})

  # The double underscore after 'Full_data' is part of the established column names.
  full_df.columns = [f'Full_data__{groupby_index}_{name}_{aggregation}' for name in full_df.columns]
  full_df = full_df.reset_index()

  return phasewise_df.merge(full_df,on=[groupby_index],how='left')
  


## Aggregations based on Pump-ID for train data

In [None]:
# Pump-level roll-ups for the train data: one frame per column (all aggregations
# joined on PUMP_ID_PHASE), then all per-column frames joined together.
train_rollups = pd.DataFrame()
for cols in numerical_aggregation_columns:
  print(f'Aggregations to be performed on Column :- {cols}')
  df = pd.DataFrame()
  for agg in aggregations:
    print(f'Aggregation performed :- {agg}')
    agg_result = call_phasewise_results('PUMP_ID_PHASE',cols,agg,True)
    # An empty frame means nothing has been collected yet for this column.
    df = agg_result if df.shape[0]==0 else df.merge(agg_result,on=['PUMP_ID_PHASE'],how='left')
    del agg_result

  train_rollups = df if train_rollups.shape[0]==0 else train_rollups.merge(df,on=['PUMP_ID_PHASE'],how='left')
  print(f'Train Data shape {train_rollups.shape}')
  print('-------------------------------------------------------')



Aggregations to be performed on Column :- Gas_Rate
Aggregation performed :- min
Aggregation performed :- max
Aggregation performed :- mean
Aggregation performed :- std
Aggregation performed :- skew
Aggregation performed :- nunique
Train Data shape (1779, 13)
-------------------------------------------------------
Aggregations to be performed on Column :- Water_Rate
Aggregation performed :- min
Aggregation performed :- max
Aggregation performed :- mean
Aggregation performed :- std
Aggregation performed :- skew
Aggregation performed :- nunique
Train Data shape (1779, 25)
-------------------------------------------------------
Aggregations to be performed on Column :- Gas_Capacity
Aggregation performed :- min
Aggregation performed :- max
Aggregation performed :- mean
Aggregation performed :- std
Aggregation performed :- skew
Aggregation performed :- nunique
Train Data shape (1779, 37)
-------------------------------------------------------
Aggregations to be performed on Column :- Water_C

## Aggregations based on Well Number for train data 

In [None]:
# Well-level roll-ups for the train data: one frame per column (all aggregations
# joined on Well_Number), then all per-column frames joined together.
train_rollups_wells = pd.DataFrame()
for cols in numerical_aggregation_columns:
  print(f'Aggregations to be performed on Column :- {cols}')
  df = pd.DataFrame()
  for agg in aggregations:
    print(f'Aggregation performed :- {agg}')
    agg_result = call_phasewise_results('Well_Number',cols,agg,True)
    # An empty frame means nothing has been collected yet for this column.
    df = agg_result if df.shape[0]==0 else df.merge(agg_result,on=['Well_Number'],how='left')
    del agg_result

  train_rollups_wells = df if train_rollups_wells.shape[0]==0 else train_rollups_wells.merge(df,on=['Well_Number'],how='left')
  print(f'Train Data shape {train_rollups_wells.shape}')
  print('-------------------------------------------------------')



Aggregations to be performed on Column :- Gas_Rate
Aggregation performed :- min
Aggregation performed :- max
Aggregation performed :- mean
Aggregation performed :- std
Aggregation performed :- skew
Aggregation performed :- nunique
Train Data shape (849, 13)
-------------------------------------------------------
Aggregations to be performed on Column :- Water_Rate
Aggregation performed :- min
Aggregation performed :- max
Aggregation performed :- mean
Aggregation performed :- std
Aggregation performed :- skew
Aggregation performed :- nunique
Train Data shape (849, 25)
-------------------------------------------------------
Aggregations to be performed on Column :- Gas_Capacity
Aggregation performed :- min
Aggregation performed :- max
Aggregation performed :- mean
Aggregation performed :- std
Aggregation performed :- skew
Aggregation performed :- nunique
Train Data shape (849, 37)
-------------------------------------------------------
Aggregations to be performed on Column :- Water_Capa

## Aggregations based on Pump ID for test data

In [None]:
# Pump-level roll-ups for the TEST data: one frame per column (all aggregations
# joined on PUMP_ID_PHASE), then all per-column frames joined together.
test_rollups = pd.DataFrame()
for cols in numerical_aggregation_columns:
  print(f'Aggregations to be performed on Column :- {cols}')
  df = pd.DataFrame()
  for agg in aggregations:
    print(f'Aggregation performed :- {agg}')
    df1 = call_phasewise_results('PUMP_ID_PHASE',cols,agg,False)
    # The first aggregation seeds the per-column frame; later ones are joined on the key.
    df = df1 if df.shape[0]==0 else df.merge(df1,on=['PUMP_ID_PHASE'],how='left')
    del df1

  test_rollups = df if test_rollups.shape[0]==0 else test_rollups.merge(df,on=['PUMP_ID_PHASE'],how='left')
  # This cell processes the test set; the message previously said "Train Data shape".
  print(f'Test Data shape {test_rollups.shape}')
  print('-------------------------------------------------------')



Aggregations to be performed on Column :- Gas_Rate
Aggregation performed :- min
Aggregation performed :- max
Aggregation performed :- mean
Aggregation performed :- std
Aggregation performed :- skew
Aggregation performed :- nunique
Train Data shape (819, 13)
-------------------------------------------------------
Aggregations to be performed on Column :- Water_Rate
Aggregation performed :- min
Aggregation performed :- max
Aggregation performed :- mean
Aggregation performed :- std
Aggregation performed :- skew
Aggregation performed :- nunique
Train Data shape (819, 25)
-------------------------------------------------------
Aggregations to be performed on Column :- Gas_Capacity
Aggregation performed :- min
Aggregation performed :- max
Aggregation performed :- mean
Aggregation performed :- std
Aggregation performed :- skew
Aggregation performed :- nunique
Train Data shape (819, 37)
-------------------------------------------------------
Aggregations to be performed on Column :- Water_Capa

## Aggregations based on Well Number for test data

In [None]:
# Well-level roll-ups for the TEST data: one frame per column (all aggregations
# joined on Well_Number), then all per-column frames joined together.
test_rollups_wells = pd.DataFrame()
for cols in numerical_aggregation_columns:
  print(f'Aggregations to be performed on Column :- {cols}')
  df = pd.DataFrame()
  for agg in aggregations:
    print(f'Aggregation performed :- {agg}')
    df1 = call_phasewise_results('Well_Number',cols,agg,False)
    # The first aggregation seeds the per-column frame; later ones are joined on the key.
    df = df1 if df.shape[0]==0 else df.merge(df1,on=['Well_Number'],how='left')
    del df1

  test_rollups_wells = df if test_rollups_wells.shape[0]==0 else test_rollups_wells.merge(df,on=['Well_Number'],how='left')
  # This cell processes the test set; the message previously said "Train Data shape".
  print(f'Test Data shape {test_rollups_wells.shape}')
  print('-------------------------------------------------------')



Aggregations to be performed on Column :- Gas_Rate
Aggregation performed :- min
Aggregation performed :- max
Aggregation performed :- mean
Aggregation performed :- std
Aggregation performed :- skew
Aggregation performed :- nunique
Train Data shape (363, 13)
-------------------------------------------------------
Aggregations to be performed on Column :- Water_Rate
Aggregation performed :- min
Aggregation performed :- max
Aggregation performed :- mean
Aggregation performed :- std
Aggregation performed :- skew
Aggregation performed :- nunique
Train Data shape (363, 25)
-------------------------------------------------------
Aggregations to be performed on Column :- Gas_Capacity
Aggregation performed :- min
Aggregation performed :- max
Aggregation performed :- mean
Aggregation performed :- std
Aggregation performed :- skew
Aggregation performed :- nunique
Train Data shape (363, 37)
-------------------------------------------------------
Aggregations to be performed on Column :- Water_Capa

## Aggregating intake depth per well (merged into the final datasets in the next section)

In [None]:
# Per-well aggregates of Intake_Depth, one column per aggregation name.
count = 0
for aggs in aggregations:
  print(f'Aggregation on :- {aggs}')
  first_pass = (count == 0)

  # The first aggregation always runs and seeds the frame; later ones skip 'slope'.
  if first_pass or aggs != 'slope':
    df1         = well_depth.groupby('Well_Number').agg({'Intake_Depth':aggs})
    df1.columns = [aggs+'_'+colname for colname in df1.columns]
    df1         = df1.reset_index()
    df          = df1 if first_pass else df.merge(df1,on=['Well_Number'],how='left')

  count += 1

well_depth_df = df
print('Well_depth Aggregation data shape:- ',well_depth_df.shape)

Aggregation on :- min
Aggregation on :- max
Aggregation on :- mean
Aggregation on :- std
Aggregation on :- skew
Aggregation on :- nunique
Well_depth Aggregation data shape:-  (1226, 7)


## Creation of final train and test dataset to be used for predictions

In [None]:
# One row per PUMP_ID_PHASE: the first complete (no missing value) row of the
# selected columns. The train version also carries the target and HR.
train_well_info = (
    train_df[['PUMP_ID_PHASE','Well_Number','Volumetric_Capacity','HR','Date']]
    .dropna()
    .drop_duplicates(subset=['PUMP_ID_PHASE'])
)
test_well_info = (
    test_df[['PUMP_ID_PHASE','Well_Number','Date']]
    .dropna()
    .drop_duplicates(subset=['PUMP_ID_PHASE'])
)

# Pump roll-ups -> pump info -> well details -> well roll-ups -> intake-depth aggregates.
train_data = (
    train_rollups
    .merge(train_well_info,on=['PUMP_ID_PHASE'],how='left')
    .merge(well_df,on=['Well_Number'],how='left')
    .merge(train_rollups_wells,on=['Well_Number'],how='left')
    .merge(well_depth_df,on=['Well_Number'],how='left')
)
test_data = (
    test_rollups
    .merge(test_well_info,on=['PUMP_ID_PHASE'],how='left')
    .merge(well_df,on=['Well_Number'],how='left')
    .merge(test_rollups_wells,on=['Well_Number'],how='left')
    .merge(well_depth_df,on=['Well_Number'],how='left')
)

print(train_data.shape,test_data.shape)

(1779, 352) (819, 350)


## Saving the datasets to be used for model training part

In [None]:
# train_data.to_csv('/content/drive/MyDrive/all_over_worker/train_data.csv',index=False)
# print('Train written')
# test_data.to_csv('/content/drive/MyDrive/all_over_worker/test_data.csv',index=False)
# print('test written')

Train written
test written


In [None]:
# Scratch cell: evaluates a constant expression; nothing is assigned or reused.
1+2

3

# Stacking Regression models 

## Regression model 1  on the trained dataset (lightgbm model)

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import *
from sklearn.model_selection import *
import lightgbm as lgb

In [None]:
# Work on copies of the assembled datasets.
train_df, test_df = train_data.copy(), test_data.copy()

# Identifier, date and Completion_Type columns are excluded from the model inputs;
# the target and HR are additionally removed from the train features.
cols2drop = ['PUMP_ID_PHASE','Date','Well_Number','Completion_Type']
target_y  = train_df['Volumetric_Capacity'].values
train_df  = train_df.drop(columns=cols2drop+['HR','Volumetric_Capacity'])
test_df   = test_df.drop(columns=cols2drop)

# Keep only features whose share of missing values in train is at most 60%.
a         = train_df.isnull().sum()/train_df.shape[0]
cols      = a[a<=0.6].index.tolist()
train_df, test_df = train_df[cols], test_df[cols]
print(train_df.shape,test_df.shape)


#### Optional post-processing: snap predictions onto the set of volumetric
#### capacity values that occur in the train data.

unique_vals_target = list(set(target_y))
def get_nearest_value_possible(value):
  '''
      Snaps a prediction onto the closest value in unique_vals_target.
      When two candidates are equally close, the one listed first wins.
  '''
  return min(unique_vals_target, key=lambda candidate: np.abs(value-candidate))

# Feature matrices and target as plain arrays.
train, test, train_y = train_df.values, test_df.values, target_y

# Out-of-fold predictions (raw and snapped) and the fold-averaged test prediction.
oof_pred_1         = np.zeros(len(train))
oof_pred_nearest_1 = np.zeros(len(train))
y_pred_1           = np.zeros(len(test))
n_splits           = 20
kf                 = KFold(n_splits=n_splits, shuffle=True, random_state=42)

for fold, (tr_ind, val_ind) in enumerate(kf.split(train, train_y)):
  X_train, y_train = train[tr_ind], train_y[tr_ind]
  X_val, y_val     = train[val_ind], train_y[val_ind]

  # A new, identically configured model is fitted on every fold.
  model = lgb.LGBMRegressor(random_state=42,reg_lambda=4,n_estimators=85,learning_rate=0.1,min_split_gain=0.7)
  model.fit(X_train,y_train)

  val_pred_1                  = model.predict(X_val)
  val_pred_nearest_1          = list(map(get_nearest_value_possible, val_pred_1))
  oof_pred_1[val_ind]         = val_pred_1
  oof_pred_nearest_1[val_ind] = val_pred_nearest_1

  for label, fold_pred in (('MAE(actual):- ', val_pred_1), ('MAE(nearest):- ', val_pred_nearest_1)):
    print(label,mean_absolute_error(y_val,fold_pred),' fold ',fold+1)
  print('\n')

  # Every fold contributes an equal share to the final test prediction.
  y_pred_1 += model.predict(test) / n_splits

print('OOF MAE(actual):- ',mean_absolute_error(train_y,oof_pred_1))
print('OOF MAE(nearest):- ',mean_absolute_error(train_y,oof_pred_nearest_1))
y_pred_1_nearest = list(map(get_nearest_value_possible, y_pred_1))

(1779, 310) (819, 310)
MAE(actual):-  11.02164031109445  fold  1
MAE(nearest):-  11.07865168539326  fold  1


MAE(actual):-  11.198847102819606  fold  2
MAE(nearest):-  11.101123595505618  fold  2


MAE(actual):-  10.700184052522097  fold  3
MAE(nearest):-  10.584269662921349  fold  3


MAE(actual):-  11.431109266763498  fold  4
MAE(nearest):-  11.292134831460674  fold  4


MAE(actual):-  9.602636503146465  fold  5
MAE(nearest):-  9.337078651685394  fold  5


MAE(actual):-  8.34788728063881  fold  6
MAE(nearest):-  8.247191011235955  fold  6


MAE(actual):-  13.131982334304656  fold  7
MAE(nearest):-  13.067415730337078  fold  7


MAE(actual):-  10.062869525428514  fold  8
MAE(nearest):-  9.741573033707866  fold  8


MAE(actual):-  10.191282941508655  fold  9
MAE(nearest):-  10.280898876404494  fold  9


MAE(actual):-  10.154863443605931  fold  10
MAE(nearest):-  10.044943820224718  fold  10


MAE(actual):-  11.323018904758401  fold  11
MAE(nearest):-  11.617977528089888  fold  11


MA

## Regression Model 2

In [None]:
# Work on copies of the assembled datasets.
train_df, test_df = train_data.copy(), test_data.copy()

# Identifier, date and Completion_Type columns are excluded from the model inputs;
# the target and HR are additionally removed from the train features.
cols2drop = ['PUMP_ID_PHASE','Date','Well_Number','Completion_Type']
target_y  = train_df['Volumetric_Capacity'].values
train_df  = train_df.drop(columns=cols2drop+['HR','Volumetric_Capacity'])
test_df   = test_df.drop(columns=cols2drop)

# Keep only features whose share of missing values in train is at most 60%.
a         = train_df.isnull().sum()/train_df.shape[0]
cols      = a[a<=0.6].index.tolist()
train_df, test_df = train_df[cols], test_df[cols]
print(train_df.shape,test_df.shape)


#### Optional post-processing: snap predictions onto the set of volumetric
#### capacity values that occur in the train data.

unique_vals_target = list(set(target_y))
def get_nearest_value_possible(value):
  '''
      Snaps a prediction onto the closest value in unique_vals_target.
      When two candidates are equally close, the one listed first wins.
  '''
  return min(unique_vals_target, key=lambda candidate: np.abs(value-candidate))

# Feature matrices and target as plain arrays.
train, test, train_y = train_df.values, test_df.values, target_y

# Out-of-fold predictions (raw and snapped) and the fold-averaged test prediction.
oof_pred_2         = np.zeros(len(train))
oof_pred_nearest_2 = np.zeros(len(train))
y_pred_2           = np.zeros(len(test))
n_splits           = 25
kf                 = KFold(n_splits=n_splits, shuffle=True, random_state=42)

for fold, (tr_ind, val_ind) in enumerate(kf.split(train, train_y)):
  X_train, y_train = train[tr_ind], train_y[tr_ind]
  X_val, y_val     = train[val_ind], train_y[val_ind]

  # A new, identically configured model is fitted on every fold.
  model = lgb.LGBMRegressor(random_state=42,reg_lambda=4,n_estimators=90,learning_rate=0.1)
  model.fit(X_train,y_train)

  val_pred_2                  = model.predict(X_val)
  val_pred_nearest_2          = list(map(get_nearest_value_possible, val_pred_2))
  oof_pred_2[val_ind]         = val_pred_2
  oof_pred_nearest_2[val_ind] = val_pred_nearest_2

  for label, fold_pred in (('MAE(actual):- ', val_pred_2), ('MAE(nearest):- ', val_pred_nearest_2)):
    print(label,mean_absolute_error(y_val,fold_pred),' fold ',fold+1)
  print('\n')

  # Every fold contributes an equal share to the final test prediction.
  y_pred_2 += model.predict(test) / n_splits

print('OOF MAE(actual):- ',mean_absolute_error(train_y,oof_pred_2))
print('OOF MAE(nearest):- ',mean_absolute_error(train_y,oof_pred_nearest_2))
y_pred_2_nearest = list(map(get_nearest_value_possible, y_pred_2))

(1779, 310) (819, 310)
MAE(actual):-  10.664258194497632  fold  1
MAE(nearest):-  10.541666666666666  fold  1


MAE(actual):-  11.003884521159746  fold  2
MAE(nearest):-  11.097222222222221  fold  2


MAE(actual):-  10.815009328989238  fold  3
MAE(nearest):-  11.01388888888889  fold  3


MAE(actual):-  10.998950601239937  fold  4
MAE(nearest):-  10.88888888888889  fold  4


MAE(actual):-  10.87031560838809  fold  5
MAE(nearest):-  10.619718309859154  fold  5


MAE(actual):-  8.961160682627824  fold  6
MAE(nearest):-  8.816901408450704  fold  6


MAE(actual):-  8.489178409074842  fold  7
MAE(nearest):-  8.76056338028169  fold  7


MAE(actual):-  10.192022909362448  fold  8
MAE(nearest):-  10.014084507042254  fold  8


MAE(actual):-  12.622211286169161  fold  9
MAE(nearest):-  12.098591549295774  fold  9


MAE(actual):-  10.605616047459911  fold  10
MAE(nearest):-  11.028169014084508  fold  10


MAE(actual):-  10.238427467380275  fold  11
MAE(nearest):-  10.549295774647888  fold  11


MA

## Regression Model 3 

In [None]:
# Work on copies of the assembled datasets.
train_df, test_df = train_data.copy(), test_data.copy()

# Identifier, date and Completion_Type columns are excluded from the model inputs;
# the target and HR are additionally removed from the train features.
cols2drop = ['PUMP_ID_PHASE','Date','Well_Number','Completion_Type']
target_y  = train_df['Volumetric_Capacity'].values
train_df  = train_df.drop(columns=cols2drop+['HR','Volumetric_Capacity'])
test_df   = test_df.drop(columns=cols2drop)

# Keep only features whose share of missing values in train is at most 60%.
a         = train_df.isnull().sum()/train_df.shape[0]
cols      = a[a<=0.6].index.tolist()
train_df, test_df = train_df[cols], test_df[cols]
print(train_df.shape,test_df.shape)


#### Optional post-processing: snap predictions onto the set of volumetric
#### capacity values that occur in the train data.

unique_vals_target = list(set(target_y))
def get_nearest_value_possible(value):
  '''
      Snaps a prediction onto the closest value in unique_vals_target.
      When two candidates are equally close, the one listed first wins.
  '''
  return min(unique_vals_target, key=lambda candidate: np.abs(value-candidate))

# Feature matrices and target as plain arrays.
train, test, train_y = train_df.values, test_df.values, target_y

# Out-of-fold predictions (raw and snapped) and the fold-averaged test prediction.
oof_pred_3         = np.zeros(len(train))
oof_pred_nearest_3 = np.zeros(len(train))
y_pred_3           = np.zeros(len(test))
n_splits           = 20
kf                 = KFold(n_splits=n_splits, shuffle=True, random_state=42)

for fold, (tr_ind, val_ind) in enumerate(kf.split(train, train_y)):
  X_train, y_train = train[tr_ind], train_y[tr_ind]
  X_val, y_val     = train[val_ind], train_y[val_ind]

  # A new, identically configured model is fitted on every fold.
  model = lgb.LGBMRegressor(random_state=42,reg_lambda=5,n_estimators=120,learning_rate=0.095)
  model.fit(X_train,y_train)

  val_pred_3                  = model.predict(X_val)
  val_pred_nearest_3          = list(map(get_nearest_value_possible, val_pred_3))
  oof_pred_3[val_ind]         = val_pred_3
  oof_pred_nearest_3[val_ind] = val_pred_nearest_3

  for label, fold_pred in (('MAE(actual):- ', val_pred_3), ('MAE(nearest):- ', val_pred_nearest_3)):
    print(label,mean_absolute_error(y_val,fold_pred),' fold ',fold+1)
  print('\n')

  # Every fold contributes an equal share to the final test prediction.
  y_pred_3 += model.predict(test) / n_splits

print('OOF MAE(actual):- ',mean_absolute_error(train_y,oof_pred_3))
print('OOF MAE(nearest):- ',mean_absolute_error(train_y,oof_pred_nearest_3))
y_pred_3_nearest = list(map(get_nearest_value_possible, y_pred_3))

(1779, 310) (819, 310)
MAE(actual):-  11.395144228104598  fold  1
MAE(nearest):-  11.112359550561798  fold  1


MAE(actual):-  11.529244742045917  fold  2
MAE(nearest):-  11.55056179775281  fold  2


MAE(actual):-  10.58312438600045  fold  3
MAE(nearest):-  10.584269662921349  fold  3


MAE(actual):-  11.895056974754377  fold  4
MAE(nearest):-  11.932584269662922  fold  4


MAE(actual):-  9.250872728361756  fold  5
MAE(nearest):-  9.303370786516854  fold  5


MAE(actual):-  8.501605290069929  fold  6
MAE(nearest):-  8.52808988764045  fold  6


MAE(actual):-  12.556392231600984  fold  7
MAE(nearest):-  12.235955056179776  fold  7


MAE(actual):-  10.349385413494481  fold  8
MAE(nearest):-  10.179775280898877  fold  8


MAE(actual):-  10.061239534138839  fold  9
MAE(nearest):-  10.123595505617978  fold  9


MAE(actual):-  10.438162679256981  fold  10
MAE(nearest):-  10.539325842696629  fold  10


MAE(actual):-  10.576390808953688  fold  11
MAE(nearest):-  10.50561797752809  fold  11


MA

## Regression model 4 

In [None]:
# Work on copies of the assembled datasets.
train_df, test_df = train_data.copy(), test_data.copy()

# Identifier, date and Completion_Type columns are excluded from the model inputs;
# the target and HR are additionally removed from the train features.
cols2drop = ['PUMP_ID_PHASE','Date','Well_Number','Completion_Type']
target_y  = train_df['Volumetric_Capacity'].values
train_df  = train_df.drop(columns=cols2drop+['HR','Volumetric_Capacity'])
test_df   = test_df.drop(columns=cols2drop)

# Keep only features whose share of missing values in train is at most 60%.
a         = train_df.isnull().sum()/train_df.shape[0]
cols      = a[a<=0.6].index.tolist()
train_df, test_df = train_df[cols], test_df[cols]
print(train_df.shape,test_df.shape)


#### Optional post-processing: snap predictions onto the set of volumetric
#### capacity values that occur in the train data.

unique_vals_target = list(set(target_y))
def get_nearest_value_possible(value):
  '''
      Snaps a prediction onto the closest value in unique_vals_target.
      When two candidates are equally close, the one listed first wins.
  '''
  return min(unique_vals_target, key=lambda candidate: np.abs(value-candidate))

# Feature matrices and target as plain arrays.
train, test, train_y = train_df.values, test_df.values, target_y

# Out-of-fold predictions (raw and snapped) and the fold-averaged test prediction.
oof_pred_4         = np.zeros(len(train))
oof_pred_nearest_4 = np.zeros(len(train))
y_pred_4           = np.zeros(len(test))
n_splits           = 20
kf                 = KFold(n_splits=n_splits, shuffle=True, random_state=42)

for fold, (tr_ind, val_ind) in enumerate(kf.split(train, train_y)):
  X_train, y_train = train[tr_ind], train_y[tr_ind]
  X_val, y_val     = train[val_ind], train_y[val_ind]

  # A new, identically configured model is fitted on every fold.
  model = lgb.LGBMRegressor(random_state=42,reg_lambda=2,n_estimators=70,learning_rate=0.1)
  model.fit(X_train,y_train)

  val_pred_4                  = model.predict(X_val)
  val_pred_nearest_4          = list(map(get_nearest_value_possible, val_pred_4))
  oof_pred_4[val_ind]         = val_pred_4
  oof_pred_nearest_4[val_ind] = val_pred_nearest_4

  for label, fold_pred in (('MAE(actual):- ', val_pred_4), ('MAE(nearest):- ', val_pred_nearest_4)):
    print(label,mean_absolute_error(y_val,fold_pred),' fold ',fold+1)
  print('\n')

  # Every fold contributes an equal share to the final test prediction.
  y_pred_4 += model.predict(test) / n_splits

print('OOF MAE(actual):- ',mean_absolute_error(train_y,oof_pred_4))
print('OOF MAE(nearest):- ',mean_absolute_error(train_y,oof_pred_nearest_4))
y_pred_4_nearest = list(map(get_nearest_value_possible, y_pred_4))

(1779, 310) (819, 310)
MAE(actual):-  11.43256258861148  fold  1
MAE(nearest):-  11.415730337078651  fold  1


MAE(actual):-  10.289939805783831  fold  2
MAE(nearest):-  10.134831460674157  fold  2


MAE(actual):-  10.97030722537915  fold  3
MAE(nearest):-  10.887640449438202  fold  3


MAE(actual):-  11.635595885272505  fold  4
MAE(nearest):-  11.584269662921349  fold  4


MAE(actual):-  10.006759253862688  fold  5
MAE(nearest):-  10.235955056179776  fold  5


MAE(actual):-  8.092256376496739  fold  6
MAE(nearest):-  8.067415730337078  fold  6


MAE(actual):-  12.656343545121945  fold  7
MAE(nearest):-  12.561797752808989  fold  7


MAE(actual):-  10.320370817981  fold  8
MAE(nearest):-  10.01123595505618  fold  8


MAE(actual):-  10.216996478511115  fold  9
MAE(nearest):-  10.191011235955056  fold  9


MAE(actual):-  10.541343462490815  fold  10
MAE(nearest):-  10.359550561797754  fold  10


MAE(actual):-  10.4417843049313  fold  11
MAE(nearest):-  10.179775280898877  fold  11


MAE(

## Regression Model 5 

In [None]:
# Work on copies of the assembled datasets.
train_df, test_df = train_data.copy(), test_data.copy()

# Identifier, date and Completion_Type columns are excluded from the model inputs;
# the target and HR are additionally removed from the train features.
cols2drop = ['PUMP_ID_PHASE','Date','Well_Number','Completion_Type']
target_y  = train_df['Volumetric_Capacity'].values
train_df  = train_df.drop(columns=cols2drop+['HR','Volumetric_Capacity'])
test_df   = test_df.drop(columns=cols2drop)

# Keep only features whose share of missing values in train is at most 60%.
a         = train_df.isnull().sum()/train_df.shape[0]
cols      = a[a<=0.6].index.tolist()
train_df, test_df = train_df[cols], test_df[cols]
print(train_df.shape,test_df.shape)


#### Optional post-processing: snap predictions onto the set of volumetric
#### capacity values that occur in the train data.

unique_vals_target = list(set(target_y))
def get_nearest_value_possible(value):
  '''
      Snap a prediction onto the closest value found in `unique_vals_target`.

      `unique_vals_target` is read from module scope. When two candidates are
      equally close, the one that comes first in that list is returned.
  '''
  return min(unique_vals_target, key=lambda candidate: np.abs(value-candidate))

# Plain arrays of the prepared frames; the folds below index them positionally.
train     = train_df.values
test      = test_df.values
train_y   = target_y

n_splits  = 20
kf        = KFold(n_splits=n_splits, shuffle=True, random_state=42)

# Out-of-fold predictions (raw and snapped to a known target value) and the
# test prediction, which is accumulated as the mean over all fold models.
oof_pred_5         = np.zeros(len(train))
oof_pred_nearest_5 = np.zeros(len(train))
y_pred_5           = np.zeros(len(test))

for fold_no, (tr_ind, val_ind) in enumerate(kf.split(train, train_y), start=1):
  model = lgb.LGBMRegressor(
      random_state=42,
      reg_lambda=4,
      n_estimators=85,
      learning_rate=0.1,
      min_split_gain=0.7,
      num_leaves=64,
  )
  model.fit(train[tr_ind], train_y[tr_ind])

  # Score the held-out part of this fold and store it in the OOF vectors.
  y_val              = train_y[val_ind]
  val_pred_5         = model.predict(train[val_ind])
  val_pred_nearest_5 = list(map(get_nearest_value_possible, val_pred_5))
  oof_pred_5[val_ind]         = val_pred_5
  oof_pred_nearest_5[val_ind] = val_pred_nearest_5

  print('MAE(actual):- ',mean_absolute_error(y_val,val_pred_5),' fold ',fold_no)
  print('MAE(nearest):- ',mean_absolute_error(y_val,val_pred_nearest_5),' fold ',fold_no)
  print('\n')

  # Each fold model contributes 1/n_splits of the final test prediction.
  y_pred_5 += model.predict(test) / (n_splits)

print('OOF MAE(actual):- ',(mean_absolute_error(train_y,oof_pred_5)))
print('OOF MAE(nearest):- ',(mean_absolute_error(train_y,oof_pred_nearest_5)))
y_pred_5_nearest = list(map(get_nearest_value_possible, y_pred_5))

(1779, 310) (819, 310)
MAE(actual):-  11.14234637529691  fold  1
MAE(nearest):-  11.03370786516854  fold  1


MAE(actual):-  11.355056998271237  fold  2
MAE(nearest):-  11.269662921348315  fold  2


MAE(actual):-  10.791457536259037  fold  3
MAE(nearest):-  10.96629213483146  fold  3


MAE(actual):-  11.240879493617667  fold  4
MAE(nearest):-  11.179775280898877  fold  4


MAE(actual):-  9.561029400232767  fold  5
MAE(nearest):-  9.707865168539326  fold  5


MAE(actual):-  8.645751821890231  fold  6
MAE(nearest):-  8.629213483146067  fold  6


MAE(actual):-  12.698895541942777  fold  7
MAE(nearest):-  12.606741573033707  fold  7


MAE(actual):-  10.159406383186049  fold  8
MAE(nearest):-  10.168539325842696  fold  8


MAE(actual):-  10.043956061060003  fold  9
MAE(nearest):-  10.067415730337078  fold  9


MAE(actual):-  10.243767416783394  fold  10
MAE(nearest):-  10.01123595505618  fold  10


MAE(actual):-  10.509478673971023  fold  11
MAE(nearest):-  10.44943820224719  fold  11


MAE

## Building meta estimator based on 5 models

In [None]:
# Level-2 training table: for every base model its out-of-fold prediction, both
# snapped to a known target value ("rounded") and raw. Column order matters,
# because the meta learner later consumes this frame as a positional array.
meta_train_df = pd.DataFrame({
    'Model_1_rounded': oof_pred_nearest_1,
    'Model1':          oof_pred_1,
    'Model_2_rounded': oof_pred_nearest_2,
    'Model2':          oof_pred_2,
    'Model_3_rounded': oof_pred_nearest_3,
    'Model3':          oof_pred_3,
    'Model_4_rounded': oof_pred_nearest_4,
    'Model4':          oof_pred_4,
    'Model_5_rounded': oof_pred_nearest_5,
    'Model5':          oof_pred_5,
    # Despite its name this column holds the true target, not a prediction.
    'predictions':     target_y,
})
meta_train_df

Unnamed: 0,Model_1_rounded,Model1,Model_2_rounded,Model2,Model_3_rounded,Model3,Model_4_rounded,Model4,Model_5_rounded,Model5,predictions
0,23.0,22.589578,20.0,20.513345,22.0,21.642098,20.0,19.636059,27.0,26.148603,33.0
1,30.0,28.713606,30.0,30.081619,32.0,31.399080,32.0,31.442301,30.0,30.124243,40.0
2,34.0,33.838661,40.0,40.223275,32.0,31.765988,33.0,32.659530,32.0,31.643641,33.0
3,40.0,39.724618,38.0,38.339837,40.0,39.230205,40.0,39.477287,42.0,41.693242,40.0
4,23.0,22.538562,24.0,23.729805,23.0,22.685801,24.0,23.965227,30.0,29.160956,33.0
...,...,...,...,...,...,...,...,...,...,...,...
1774,35.0,34.760135,33.0,32.860413,35.0,35.336018,38.0,37.379422,33.0,33.160091,33.0
1775,24.0,24.401393,32.0,31.039997,27.0,25.851944,24.0,23.727127,30.0,30.359318,33.0
1776,14.0,13.848111,16.0,16.341234,14.0,13.958283,14.0,13.876858,13.0,13.282738,8.0
1777,13.0,11.748397,13.0,12.686304,10.0,10.870914,15.0,15.374368,14.0,14.090929,13.0


In [None]:
# Level-2 test table: the fold-averaged test prediction of every base model,
# snapped ("rounded") and raw, in the same column order as the training table.
meta_test_df = pd.DataFrame({
    'Model_1_rounded': y_pred_1_nearest,
    'Model1':          y_pred_1,
    'Model_2_rounded': y_pred_2_nearest,
    'Model2':          y_pred_2,
    'Model_3_rounded': y_pred_3_nearest,
    'Model3':          y_pred_3,
    'Model_4_rounded': y_pred_4_nearest,
    'Model4':          y_pred_4,
    'Model_5_rounded': y_pred_5_nearest,
    'Model5':          y_pred_5,
})

meta_test_df

Unnamed: 0,Model_1_rounded,Model1,Model_2_rounded,Model2,Model_3_rounded,Model3,Model_4_rounded,Model4,Model_5_rounded,Model5
0,32.0,31.511952,32.0,31.717288,32.0,32.169069,32.0,31.196630,34.0,34.068797
1,20.0,18.595537,20.0,19.537480,20.0,19.210186,20.0,19.771103,16.0,17.490034
2,16.0,17.126426,16.0,17.514147,16.0,17.130394,16.0,17.743207,15.0,15.241458
3,24.0,24.036883,24.0,23.863544,23.0,23.273367,24.0,23.760759,24.0,23.904662
4,20.0,19.939279,22.0,21.585843,20.0,20.378041,20.0,20.930252,20.0,19.466718
...,...,...,...,...,...,...,...,...,...,...
814,22.0,21.850285,23.0,22.666561,22.0,22.146739,22.0,21.646112,23.0,22.946442
815,23.0,22.865520,22.0,22.195187,22.0,22.307984,23.0,22.627338,24.0,23.522002
816,30.0,30.492347,30.0,30.727879,32.0,31.929486,32.0,31.136113,33.0,32.919980
817,24.0,24.128590,27.0,25.937842,24.0,24.828542,24.0,24.711077,27.0,26.678236


In [None]:
from sklearn.linear_model import ElasticNet

# Split the level-2 training table into the target (stored in the
# 'predictions' column) and the base-model features.
target_y  = meta_train_df['predictions'].values
train_df  = meta_train_df.drop(['predictions'],axis=1)

# Both frames are handed to the model as bare .values arrays, so features are
# matched purely by position. Selecting the test columns by name, in train
# order, guarantees they line up and raises a KeyError if one is missing
# instead of silently feeding misaligned features to the meta learner.
test_df   = meta_test_df[train_df.columns]

print(train_df.shape,test_df.shape)


#### Used only when the final output has to be restricted to the fixed set of
#### volumetric capacities that occur in the train data.

unique_vals_target = list(set(target_y))
def get_nearest_value_possible(value):
  '''
      Return the entry of `unique_vals_target` with the smallest absolute
      distance to `value`.

      `unique_vals_target` is looked up in module scope; on an exact tie the
      entry that appears first in that list is returned.
  '''
  return min(unique_vals_target, key=lambda candidate: np.abs(value-candidate))

# Plain arrays of the level-2 frames; the folds below index them positionally.
train     = train_df.values
test      = test_df.values
train_y   = target_y

n_splits  = 20
kf        = KFold(n_splits=n_splits, shuffle=True, random_state=42)

# Out-of-fold predictions of the meta learner (raw and snapped) and its test
# prediction, accumulated as the mean over all fold models.
oof_pred         = np.zeros(len(train))
oof_pred_nearest = np.zeros(len(train))
y_pred           = np.zeros(len(test))

for fold_no, (tr_ind, val_ind) in enumerate(kf.split(train, train_y), start=1):
  model = ElasticNet(
      random_state=42,
      alpha=1,
      l1_ratio=0.5,
      max_iter=1000,
      selection='random',
  )
  model.fit(train[tr_ind], train_y[tr_ind])

  # Score the held-out part of this fold and store it in the OOF vectors.
  y_val            = train_y[val_ind]
  val_pred         = model.predict(train[val_ind])
  val_pred_nearest = list(map(get_nearest_value_possible, val_pred))
  oof_pred[val_ind]         = val_pred
  oof_pred_nearest[val_ind] = val_pred_nearest

  print('MAE(actual):- ',mean_absolute_error(y_val,val_pred),' fold ',fold_no)
  print('MAE(nearest):- ',mean_absolute_error(y_val,val_pred_nearest),' fold ',fold_no)
  print('\n')

  # Each fold model contributes 1/n_splits of the final test prediction.
  y_pred += model.predict(test) / (n_splits)

print('OOF MAE(actual):- ',(mean_absolute_error(train_y,oof_pred)))
print('OOF MAE(nearest):- ',(mean_absolute_error(train_y,oof_pred_nearest)))
# From here on y_pred is a list of snapped values rather than the raw array.
y_pred = list(map(get_nearest_value_possible, y_pred))

(1779, 10) (819, 10)
MAE(actual):-  11.088227602515747  fold  1
MAE(nearest):-  10.910112359550562  fold  1


MAE(actual):-  11.018739324695796  fold  2
MAE(nearest):-  10.876404494382022  fold  2


MAE(actual):-  10.644863369329695  fold  3
MAE(nearest):-  10.49438202247191  fold  3


MAE(actual):-  11.22547578150398  fold  4
MAE(nearest):-  11.235955056179776  fold  4


MAE(actual):-  9.712081882250574  fold  5
MAE(nearest):-  9.617977528089888  fold  5


MAE(actual):-  8.25123792842528  fold  6
MAE(nearest):-  8.168539325842696  fold  6


MAE(actual):-  12.473143494444724  fold  7
MAE(nearest):-  12.280898876404494  fold  7


MAE(actual):-  10.199415962585679  fold  8
MAE(nearest):-  9.898876404494382  fold  8


MAE(actual):-  10.132742392987609  fold  9
MAE(nearest):-  10.134831460674157  fold  9


MAE(actual):-  10.21366630180158  fold  10
MAE(nearest):-  10.044943820224718  fold  10


MAE(actual):-  10.816029555231164  fold  11
MAE(nearest):-  10.674157303370787  fold  11


MAE(a

## Output preparation

In [None]:
# Submission frame holding the final (snapped) meta-learner predictions.
sub_df = pd.DataFrame({'Volumetric_Capacity': y_pred})
sub_df['Volumetric_Capacity'].describe()

count    819.000000
mean      38.637363
std       23.363276
min        8.000000
25%       23.000000
50%       32.000000
75%       45.000000
max      150.000000
Name: Volumetric_Capacity, dtype: float64

In [None]:
# Re-attach the identifier columns to the predictions. sub_df was built from a
# plain list and therefore has a fresh 0..n-1 index, while assigning a Series
# to a column aligns on index labels. Resetting the test frame's index makes
# the match strictly positional (the order in which predictions were produced)
# instead of yielding NaN / shifted ids whenever test_data is not 0..n-1 indexed.
test_df               = test_data.reset_index(drop=True)
assert len(test_df) == len(sub_df), 'number of predictions does not match number of test rows'
sub_df['pump_id']     = test_df['PUMP_ID_PHASE']
sub_df['Date']        = test_df['Date']
sub_df['Well_Number'] = test_df['Well_Number']
# Column order expected in the exported file.
output_df_reg       = sub_df[['pump_id','Well_Number','Date','Volumetric_Capacity']]

In [None]:
# Write the submission without the positional index, then display it.
output_df_reg.to_csv(path_or_buf='nearest_rectified_meta_data_elastc_net.csv', index=False)
output_df_reg

Unnamed: 0,pump_id,Well_Number,Date,Volumetric_Capacity
0,9.0,AGB005,2019-07-30,32.0
1,13.0,AGQ001,2014-06-12,20.0
2,14.0,AGQ001,2017-12-11,16.0
3,33.0,AGQ007,2014-08-22,24.0
4,34.0,AGQ007,2017-05-08,20.0
...,...,...,...,...
814,3720.0,ZUP197,2018-05-16,22.0
815,3730.0,ZUP201,2018-05-18,23.0
816,3743.0,ZUP207,2019-03-02,32.0
817,3744.0,ZUP207,2019-07-27,24.0
