In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
from pathlib import Path
import warnings

warnings.filterwarnings('ignore')

data_path = Path().cwd() / 'data'

In [2]:
test_sheet = 'test_points_extracted.xls'
test_sites = pd.read_excel(data_path / test_sheet, engine='calamine')

test_sites.drop(columns=[col for col in test_sites.columns if 'NEAR_FID' in col], inplace=True)

print(test_sites.shape)
test_sites.head()

(12067, 12)


Unnamed: 0,OBJECTID_1,OBJECTID,x,y,NEAR_DIST_Canals,NEAR_DIST_River_Net,NEAR_DIST_Coastal,NEAR_DIST_Chert,Elevation,Wetness,Temp,Slope
0,1,3353,15.726739,43.02339,325051.049972,92375.044496,109.719805,27245.870159,70.596275,0,16.799999,207
1,2,3357,15.74306,43.02339,326171.018246,91240.568066,521.790859,25924.985926,227.165344,0,16.4,243
2,3,3362,15.759382,43.02339,327292.421342,90111.311563,128.050137,24605.088261,41.27142,0,17.1,236
3,4,3410,15.759382,43.032675,326739.480379,89566.947218,238.350808,24494.10294,58.482124,0,16.5,232
4,5,183,16.510161,42.763406,376552.349498,77429.587911,45.335448,792.107078,22.511099,0,17.0,242


In [3]:
def clean_waw(df: pd.DataFrame):
    wcols = [col for col in df.columns if 'WAW' in col]
    waw = df[wcols + ['OBJECTID']]
    df.drop(columns=wcols, inplace=True)
    
    waw['Wetness'] = waw[waw.drop(['OBJECTID'], axis=1).columns].apply(
        lambda x: ''.join(x.dropna().astype(str)), 
        axis=1
    )
    
    waw = waw[waw['Wetness'] != ''][['OBJECTID', 'Wetness']]
    waw['Wetness'] = waw['Wetness'].astype(float).astype(np.uint8)
    return pd.merge(df, waw, on='OBJECTID', how='inner')

In [4]:
if any('WAW' in col for col in test_sites.columns):
    test_cleaned = clean_waw(test_sites)

else:
    test_cleaned = test_sites.copy()

In [5]:
display(test_cleaned.isna().sum().to_frame().T)

test_cleaned.dropna(inplace=True)

display(test_cleaned.isna().sum().to_frame().T)


Unnamed: 0,OBJECTID_1,OBJECTID,x,y,NEAR_DIST_Canals,NEAR_DIST_River_Net,NEAR_DIST_Coastal,NEAR_DIST_Chert,Elevation,Wetness,Temp,Slope
0,0,0,0,0,0,0,0,0,0,0,1,0


Unnamed: 0,OBJECTID_1,OBJECTID,x,y,NEAR_DIST_Canals,NEAR_DIST_River_Net,NEAR_DIST_Coastal,NEAR_DIST_Chert,Elevation,Wetness,Temp,Slope
0,0,0,0,0,0,0,0,0,0,0,0,0


In [6]:
test_cleaned.to_csv(data_path / f"{test_sheet.split('.')[0]}_cleaned.csv", index=False)

In [7]:
known_sites = pd.read_excel(data_path / 'known_sites_augmented.xls', engine='calamine')

known_sites.rename(columns={
        i: i.replace('sites_XYTableToPoint_', '') for i in known_sites.columns if 'sites_XYTableToPoint_' in i
    }, inplace=True)

known_sites.rename(columns={
    'Elevation__Masl_': 'Elevation',
    'Dd_ns': 'y',
    'Dd_ew': 'x',
    }, inplace=True)

known_sites.drop(columns=['Dd', 'Dms'] + [col for col in known_sites.columns if 'NEAR_FID' in col], inplace=True)

print(known_sites.shape)
known_sites.head()

(47, 39)


Unnamed: 0,OBJECTID,Site_Name,Geographical_Region,Geographical_Location,Elevation,Period_New,Site_Type,y,x,NEAR_DIST_Chert,...,WAW_2018_010m_E49N21_03035_v020,WAW_2018_010m_E49N22_03035_v020,WAW_2018_010m_E49N24_03035_v020,WAW_2018_010m_E49N25_03035_v020,WAW_2018_010m_E50N21_03035_v020,WAW_2018_010m_E50N22_03035_v020,WAW_2018_010m_E50N24_03035_v020,WAW_2018_010m_E50N25_03035_v020,Slope,Temp
0,1,Abri Kontija 002,Istra,Limski kanal,46.0,UP,RS,45.1375,13.718611,247901.167568,...,,,,,,,,,204,15.7
1,2,Abri Šebrn,,,750.0,MES,RS,45.337712,14.162687,235748.803581,...,,,,,,,,,241,11.0
2,3,Brjgućeva Loza 1 (Loza),Istra,Kastav,510.0,MES,C,45.467778,14.242222,241293.250685,...,,,,,,,,,247,12.4
3,4,Bukovac,,,864.0,UP,C,45.346569,14.756238,204549.089663,...,,,,,,,,,245,9.7
4,5,Campanož,,,,MP,O,44.849045,13.89999,215799.109406,...,,,,,,,,,250,15.7


In [8]:
known_sites_cleaned = clean_waw(known_sites).drop(columns=['Geographical_Region', 'Geographical_Location', 'Period_New', 'Site_Type'])

cols = known_sites_cleaned.columns
known_sites_cleaned = known_sites_cleaned[cols[:3].to_list() + [cols[-2]] + cols[3:-2].to_list() + [cols[-1]]]

print(known_sites_cleaned.shape)
known_sites_cleaned.head()

(47, 13)


Unnamed: 0,OBJECTID,Site_Name,Elevation,Temp,y,x,NEAR_DIST_Chert,NEAR_DIST_Canals,NEAR_DIST_River_Net,NEAR_DIST_Coastal,Elevation_Raster,Slope,Wetness
0,1,Abri Kontija 002,46.0,15.7,45.1375,13.718611,247901.167568,80993.524258,73640.08705,7612.808936,46.136692,204,0
1,2,Abri Šebrn,750.0,11.0,45.337712,14.162687,235748.803581,98999.561639,44437.69062,9921.595682,732.561707,241,0
2,3,Brjgućeva Loza 1 (Loza),510.0,12.4,45.467778,14.242222,241293.250685,101016.428031,34876.914519,14119.067154,432.515076,247,0
3,4,Bukovac,864.0,9.7,45.346569,14.756238,204549.089663,143240.912311,13416.969148,15491.977574,791.070129,245,0
4,5,Campanož,,15.7,44.849045,13.89999,215799.109406,114332.595817,94426.986888,2230.743465,43.70879,250,0


In [9]:
known_sites_cleaned.Elevation.fillna(known_sites_cleaned.Elevation_Raster, inplace=True)
known_sites_cleaned.Elevation.fillna(0, inplace=True)

known_sites_cleaned.drop(columns=['Elevation_Raster'], inplace=True)

known_sites_cleaned['Is_Site'] = 1

known_sites_cleaned

Unnamed: 0,OBJECTID,Site_Name,Elevation,Temp,y,x,NEAR_DIST_Chert,NEAR_DIST_Canals,NEAR_DIST_River_Net,NEAR_DIST_Coastal,Slope,Wetness,Is_Site
0,1,Abri Kontija 002,46.0,15.7,45.1375,13.718611,247901.167568,80993.524258,73640.08705,7612.808936,204,0,1
1,2,Abri Šebrn,750.0,11.0,45.337712,14.162687,235748.803581,98999.561639,44437.69062,9921.595682,241,0,1
2,3,Brjgućeva Loza 1 (Loza),510.0,12.4,45.467778,14.242222,241293.250685,101016.428031,34876.914519,14119.067154,247,0,1
3,4,Bukovac,864.0,9.7,45.346569,14.756238,204549.089663,143240.912311,13416.969148,15491.977574,245,0,1
4,5,Campanož,43.70879,15.7,44.849045,13.89999,215799.109406,114332.595817,94426.986888,2230.743465,250,0,1
5,6,Cerovačke pećine,624.0,10.9,44.274628,15.884619,55816.576622,230558.124333,13364.401168,27505.569553,207,0,1
6,7,Giljanovići (Karanušići),154.0,14.4,43.940833,16.426389,8763.044136,263072.737065,24402.718149,43623.324044,250,4,1
7,8,Glavičica,295.0,14.6,43.749998,16.666664,16125.218641,283521.439121,128.942963,28440.487128,250,0,1
8,9,Gospodska špilja,430.0,13.9,43.983887,16.436379,12714.582519,258279.688979,27729.070919,48470.385061,241,0,1
9,10,Igraliste-Dolac,16.0,16.4,44.142032,14.841416,108346.908403,208949.862435,62981.253902,242.634165,250,0,1


# Modeling

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [11]:
print(f'known columns: {known_sites_cleaned.columns}')
print(f'test columns: {test_cleaned.columns}')

known columns: Index(['OBJECTID', 'Site_Name', 'Elevation', 'Temp', 'y', 'x',
       'NEAR_DIST_Chert', 'NEAR_DIST_Canals', 'NEAR_DIST_River_Net',
       'NEAR_DIST_Coastal', 'Slope', 'Wetness', 'Is_Site'],
      dtype='object')
test columns: Index(['OBJECTID_1', 'OBJECTID', 'x', 'y', 'NEAR_DIST_Canals',
       'NEAR_DIST_River_Net', 'NEAR_DIST_Coastal', 'NEAR_DIST_Chert',
       'Elevation', 'Wetness', 'Temp', 'Slope'],
      dtype='object')


In [12]:
known = known_sites_cleaned.drop(columns=['OBJECTID', 'x', 'y', 'Site_Name'])
test = test_cleaned.drop(columns=['OBJECTID', 'x', 'y'])

col_order = ['Elevation', 'Wetness', 'Temp', 'Slope', 'NEAR_DIST_Chert', 'NEAR_DIST_Canals', 'NEAR_DIST_River_Net', 'NEAR_DIST_Coastal']

known = known[col_order + ['Is_Site']]
test = test[col_order]

test.reset_index(drop=True, inplace=True)

In [13]:
known.head()

Unnamed: 0,Elevation,Wetness,Temp,Slope,NEAR_DIST_Chert,NEAR_DIST_Canals,NEAR_DIST_River_Net,NEAR_DIST_Coastal,Is_Site
0,46.0,0,15.7,204,247901.167568,80993.524258,73640.08705,7612.808936,1
1,750.0,0,11.0,241,235748.803581,98999.561639,44437.69062,9921.595682,1
2,510.0,0,12.4,247,241293.250685,101016.428031,34876.914519,14119.067154,1
3,864.0,0,9.7,245,204549.089663,143240.912311,13416.969148,15491.977574,1
4,43.70879,0,15.7,250,215799.109406,114332.595817,94426.986888,2230.743465,1


In [14]:
test.head()

Unnamed: 0,Elevation,Wetness,Temp,Slope,NEAR_DIST_Chert,NEAR_DIST_Canals,NEAR_DIST_River_Net,NEAR_DIST_Coastal
0,70.596275,0,16.799999,207,27245.870159,325051.049972,92375.044496,109.719805
1,227.165344,0,16.4,243,25924.985926,326171.018246,91240.568066,521.790859
2,41.27142,0,17.1,236,24605.088261,327292.421342,90111.311563,128.050137
3,58.482124,0,16.5,232,24494.10294,326739.480379,89566.947218,238.350808
4,22.511099,0,17.0,242,792.107078,376552.349498,77429.587911,45.335448


In [15]:
test.loc[(test['Elevation'] > 400), 'Is_Site'] = 0
test.loc[(test['Elevation'] <= 400), 'Is_Site'] = 1
# test.loc[(test['Elevation'] < 50), 'Is_Site'] = 0


test.loc[(test['NEAR_DIST_Chert'] > 20000), 'Is_Site'] = 0
test.loc[(test['NEAR_DIST_Coastal'] > 6000), 'Is_Site'] = 0
# test.loc[(test['NEAR_DIST_Canals'] < 20000), 'Is_Site'] = 0
# test.loc[(test['NEAR_DIST_River_Net'] < 20000), 'Is_Site'] = 0


test['Is_Site'] = test['Is_Site'].astype(np.uint8)

test.to_csv(data_path / 'test_labeled.csv', index=False)

test['Is_Site'].value_counts()

Is_Site
0    10769
1     1297
Name: count, dtype: int64

In [24]:
rng = np.random.RandomState(42)

rng_selection = test[test['Is_Site'] == 0]
rng_nums = rng.choice(
    a=rng_selection.index, 
    size=int(test['Is_Site'].value_counts()[0]*0.25), 
    replace=False
)

test_cut = test.drop(index=rng_nums) # pyright: ignore[reportArgumentType, reportCallIssue]

rng_sites = rng_selection.loc[rng_nums]

rng_sites

Using 47 known sites and 2692 test sites


Unnamed: 0,Elevation,Wetness,Temp,Slope,NEAR_DIST_Chert,NEAR_DIST_Canals,NEAR_DIST_River_Net,NEAR_DIST_Coastal,Is_Site
10201,868.111755,0,10.3,227,50516.797211,227447.907927,23578.697991,38417.827401,0
2710,1000.587036,0,10.8,250,10880.125622,304070.169667,8273.789065,7257.978740,0
3760,128.218292,0,15.7,250,37963.694584,271789.640569,12318.356676,15085.628359,0
11874,871.318420,0,8.8,237,84059.422445,196449.533050,11519.405670,46241.989604,0
8711,1508.136230,0,6.7,220,84557.992290,226704.128842,20583.756526,10018.917593,0
...,...,...,...,...,...,...,...,...,...
11507,1073.714111,0,8.3,243,103370.098982,197740.912516,13131.627590,34807.141754,0
5137,242.868927,0,15.4,250,24446.587675,282173.813416,15584.815017,14221.968205,0
5636,194.009247,0,15.4,249,37673.744094,271551.983357,11008.663261,16100.979953,0
1229,574.519165,0,13.6,227,45009.285463,310628.729875,13009.213543,9000.795195,0


In [25]:
known_rng = pd.concat([known, rng_sites], ignore_index=True)

print(f'Using a combination of {known.shape[0]} known sites and {rng_sites.shape[0]} test sites for {known_rng.shape[0]} total test sites.')

known_rng

Using a combination of 47 known sites and 2692 test sites for 2739 total test sites.


Unnamed: 0,Elevation,Wetness,Temp,Slope,NEAR_DIST_Chert,NEAR_DIST_Canals,NEAR_DIST_River_Net,NEAR_DIST_Coastal,Is_Site
0,46.000000,0,15.7,204,247901.167568,80993.524258,73640.087050,7612.808936,1
1,750.000000,0,11.0,241,235748.803581,98999.561639,44437.690620,9921.595682,1
2,510.000000,0,12.4,247,241293.250685,101016.428031,34876.914519,14119.067154,1
3,864.000000,0,9.7,245,204549.089663,143240.912311,13416.969148,15491.977574,1
4,43.708790,0,15.7,250,215799.109406,114332.595817,94426.986888,2230.743465,1
...,...,...,...,...,...,...,...,...,...
2734,1073.714111,0,8.3,243,103370.098982,197740.912516,13131.627590,34807.141754,0
2735,242.868927,0,15.4,250,24446.587675,282173.813416,15584.815017,14221.968205,0
2736,194.009247,0,15.4,249,37673.744094,271551.983357,11008.663261,16100.979953,0
2737,574.519165,0,13.6,227,45009.285463,310628.729875,13009.213543,9000.795195,0


In [18]:
X_train, X_test, y_train, y_test = train_test_split(known_rng.drop(columns=['Is_Site']), known_rng['Is_Site'], test_size=0.25, stratify=known_rng['Is_Site'], random_state=42)

In [19]:
model = xgb.XGBClassifier(tree_method='hist', early_stopping_rounds=10, random_state=42)

model.fit(X_train, y_train, eval_set=[(X_test, y_test)])

y_pred = model.predict(X_test)
print(f'Accuracy: {accuracy_score(y_test, y_pred.round())*100}%')

[0]	validation_0-logloss:0.12936
[1]	validation_0-logloss:0.10372
[2]	validation_0-logloss:0.08569
[3]	validation_0-logloss:0.07389
[4]	validation_0-logloss:0.06501
[5]	validation_0-logloss:0.05993
[6]	validation_0-logloss:0.05570
[7]	validation_0-logloss:0.05328
[8]	validation_0-logloss:0.05139
[9]	validation_0-logloss:0.05050
[10]	validation_0-logloss:0.04940
[11]	validation_0-logloss:0.04873
[12]	validation_0-logloss:0.04897
[13]	validation_0-logloss:0.04863
[14]	validation_0-logloss:0.04914
[15]	validation_0-logloss:0.04927
[16]	validation_0-logloss:0.04994
[17]	validation_0-logloss:0.05018
[18]	validation_0-logloss:0.05051
[19]	validation_0-logloss:0.05108
[20]	validation_0-logloss:0.05180
[21]	validation_0-logloss:0.05259
[22]	validation_0-logloss:0.05347
Accuracy: 99.12408759124088%


In [20]:
outcome = test_cut.drop(columns=['Is_Site'])
outcome['prediction'] = model.predict(outcome)

outcome

Unnamed: 0,Elevation,Wetness,Temp,Slope,NEAR_DIST_Chert,NEAR_DIST_Canals,NEAR_DIST_River_Net,NEAR_DIST_Coastal,prediction
1,227.165344,0,16.400000,243,25924.985926,326171.018246,91240.568066,521.790859,0
2,41.271420,0,17.100000,236,24605.088261,327292.421342,90111.311563,128.050137,0
4,22.511099,0,17.000000,242,792.107078,376552.349498,77429.587911,45.335448,0
5,49.590641,0,16.600000,208,243.292773,375700.235383,76428.133607,123.823975,0
6,81.126198,0,16.799999,236,10315.981303,347492.299810,78096.612940,222.726689,0
...,...,...,...,...,...,...,...,...,...
12057,524.999023,0,10.400000,249,92329.473216,184186.566290,2017.980010,59107.544574,0
12058,690.593689,0,9.600000,226,93312.495370,183167.134218,1904.276416,59917.023072,0
12061,551.129211,0,10.300000,250,90956.384346,184989.356662,2271.270992,59132.748107,0
12062,659.387390,0,9.800000,222,91942.567948,183977.412603,3259.110714,59923.992081,0


In [21]:
outcome['prediction'].value_counts()

prediction
0    9363
1      11
Name: count, dtype: int64

In [22]:
outcome_linked = outcome.merge(test_cleaned[['OBJECTID', 'x', 'y']], left_index=True, right_index=True)
outcome_linked

Unnamed: 0,Elevation,Wetness,Temp,Slope,NEAR_DIST_Chert,NEAR_DIST_Canals,NEAR_DIST_River_Net,NEAR_DIST_Coastal,prediction,OBJECTID,x,y
1,227.165344,0,16.400000,243,25924.985926,326171.018246,91240.568066,521.790859,0,3357,15.743060,43.023390
2,41.271420,0,17.100000,236,24605.088261,327292.421342,90111.311563,128.050137,0,3362,15.759382,43.023390
4,22.511099,0,17.000000,242,792.107078,376552.349498,77429.587911,45.335448,0,183,16.510161,42.763406
5,49.590641,0,16.600000,208,243.292773,375700.235383,76428.133607,123.823975,0,184,16.510161,42.772691
6,81.126198,0,16.799999,236,10315.981303,347492.299810,78096.612940,222.726689,0,3620,16.004201,42.967679
...,...,...,...,...,...,...,...,...,...,...,...,...
12057,524.999023,0,10.400000,249,92329.473216,184186.566290,2017.980010,59107.544574,0,25450,15.987880,44.676145
12058,690.593689,0,9.600000,226,93312.495370,183167.134218,1904.276416,59917.023072,0,25451,15.987880,44.685430
12061,551.129211,0,10.300000,250,90956.384346,184989.356662,2271.270992,59132.748107,0,25454,16.004201,44.666859
12062,659.387390,0,9.800000,222,91942.567948,183977.412603,3259.110714,59923.992081,0,25455,16.004201,44.676145


In [23]:
outcome_linked.to_csv(data_path / 'test_predictions.csv', index=False)
outcome_linked[outcome_linked['prediction'] == 1].to_csv(data_path / 'test_predictions_sites.csv', index=False)