In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from rankfm.rankfm import RankFM

In [2]:
well_data = pd.read_csv("wellspublic.csv")

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
well_data.head()

Unnamed: 0,API_WellNo,Cnty,Hole,SideTrck,Completion,Well_Name,Company_name,Operator_number,Well_Type,Map_Symbol,...,Depth_Fee,Spacing,Spacing_Acres,Integration,Dt_Hearing,Dt_Mod,LINK,Location_Verified,GeneralWellType,GeneralWellStatus
0,31001010720000,1,1072,0,0,Finch 1,Corona Typewriter,9372,NL,O,...,0.0,,,,,2019-12-13 12:13:04.407000000,http://www.dec.ny.gov/cfmx/extapps/GasOil/sear...,NO,Other Well,Unplugged Well
1,31001010730000,1,1073,0,0,Hilton James 1,Belmont Quadrangle Drilling,9034,NL,O,...,0.0,Exempt from Title 5; variance needed from 6 NY...,,,,2017-08-01 15:24:16,http://www.dec.ny.gov/cfmx/extapps/GasOil/sear...,NO,Other Well,Unplugged Well
2,31001210070000,1,21007,0,0,1,Turf Western Ave. Inc.,1903,TH,OP,...,375.0,Exempt - not an oil or gas well,,,,2017-07-25 14:20:53,http://www.dec.ny.gov/cfmx/extapps/GasOil/sear...,NO,Geothermal Well,Plugged Well
3,31001210080000,1,21008,0,0,2,Turf Western Ave. Inc.,1903,TH,OP,...,375.0,Exempt - not an oil or gas well,,,,2017-07-26 09:23:40,http://www.dec.ny.gov/cfmx/extapps/GasOil/sear...,NO,Geothermal Well,Plugged Well
4,31001210090000,1,21009,0,0,3,Turf Western Ave. Inc.,1903,TH,OP,...,375.0,Exempt - not an oil or gas well,,,,2019-11-06 15:40:48.017000000,http://www.dec.ny.gov/cfmx/extapps/GasOil/sear...,NO,Geothermal Well,Plugged Well


In [4]:
## Select limited dataset and remove nan values
limited_data = well_data[['Operator_number','API_WellNo','Surface_Longitude','Surface_latitude']].dropna()

In [5]:
## Remove zero values
limited_data = limited_data[limited_data['Surface_Longitude'] < 0]

In [6]:
## Shuffle
limited_data = limited_data.sample(frac = 1).reset_index()

In [7]:
class Well_Database:
    
    def __init__(self, dataset):
        self.data = dataset
        
    def getDataset(self):
        return self.data
    
    def lookupAPI(self, api):
        return self.data.loc[self.data['API_WellNo'] == api].iloc[0]
    
    def lookup(self, welldata):
        return self.lookupAPI(welldata['API_WellNo'])
    
    def datasetShuffle(self):
        self.data = self.data.sample(frac=1)
        
    def getWellSimilarity(self, api1, api2, verbose = False):
        
        well_1 = self.lookupAPI(api1)
        
        well_2 = self.lookupAPI(api2)
        
        # Check how different these are in elevation
        
        # Altitude 
        
        alt_1 = well_1['Surface_latitude']
        alt_2 = well_2['Surface_latitude']

        alt_dif = pow((alt_1 - alt_2),2)
        
        if (verbose):
            print("Original altitude: ", well_1['Surface_latitude'])
            print("Predicted altitude: ", well_2['Surface_latitude'])
            print("square Error: ", alt_dif)
            
        # Longitude
        long_1 = well_1['Surface_Longitude']
        long_2 = well_2['Surface_Longitude']

        long_dif = pow((alt_1 - alt_2),2)
        
        if (verbose):
            print("Original Longitude: ", well_1['Surface_Longitude'])
            print("Predicted Longitude: ", well_2['Surface_Longitude'])
            print("square Error: ", long_dif)
            
        return alt_dif + long_dif
        

In [8]:
wellDatabase = Well_Database(limited_data)

In [15]:
def leave_n_out(n, wellDatabase, verbose = False):

    wellDatabase.datasetShuffle()
        
    holdout_data = wellDatabase.getDataset()[:n]
    
    remaining_data = wellDatabase.getDataset()[n:]
    
    # Interactions consist of [user_id,item_id]
    interactions = remaining_data[['Operator_number','API_WellNo']]
    
    # Item_features consist of [item_id,Longitude,Latitude]
    item_features = remaining_data[['API_WellNo','Surface_Longitude','Surface_latitude']]
    
    # Model Creation
    model = RankFM(factors=20, loss='warp', max_samples=20, alpha=0.01, sigma=0.1, learning_rate=0.1, learning_schedule='invscaling')
    
    model.fit(interactions = interactions, item_features = item_features, epochs = 20, verbose = False)
    
    # Prediction interactions.shape[0]
    
    recs = model.recommend(holdout_data['Operator_number'], n_items = 10)
    
    #print(recs)
    
    error_sum = 0
    
    for i in range(0,n):
        rec = recs.iloc[i]
        
        holdout = holdout_data.iloc[i]
        
        if (verbose):
            print(rec[0])
            print(holdout['API_WellNo'])
            
        error_sum += wellDatabase.getWellSimilarity(holdout['API_WellNo'], rec[0], verbose)
        
        if (verbose):
            print("--------")
            
    return error_sum

In [17]:
print(leave_n_out(5, wellDatabase,verbose = True))

31009643660000
31009644620000.0
Original altitude:  42.03582
Predicted altitude:  42.03861
square Error:  7.784099999985523e-06
Original Longitude:  -78.59001
Predicted Longitude:  -78.58857
square Error:  7.784099999985523e-06
--------
31037055420000
31037263690000.0
Original altitude:  42.912095
Predicted altitude:  42.89235
square Error:  0.00038986502500001365
Original Longitude:  -78.448825
Predicted Longitude:  -78.44128
square Error:  0.00038986502500001365
--------
31003241810000
31009123750000.0
Original altitude:  42.00558
Predicted altitude:  42.02822
square Error:  0.0005125695999997986
Original Longitude:  -78.55880999999998
Predicted Longitude:  -77.80244999999998
square Error:  0.0005125695999997986
--------
31003700180000
31003604000000.0
Original altitude:  42.085586
Predicted altitude:  42.088245
square Error:  7.070281000006909e-06
Original Longitude:  -78.113894
Predicted Longitude:  -78.17660499999998
square Error:  7.070281000006909e-06
--------
31003121180000
310