In [72]:
import numpy as np
import pandas as pd
import xgboost as xgb
from pathlib import Path
import warnings

# Silence every warning category for the rest of the session.
# NOTE(review): this blanket filter also hides pandas deprecation and
# SettingWithCopy warnings raised further down — consider narrowing it.
warnings.simplefilter('ignore')

# Input spreadsheets are read from the `data` folder under the working directory.
data_path = Path.cwd() / 'data'

In [73]:
# Load the candidate grid points, discard the NEAR_FID helper columns and
# expose the RASTERVALU column under the name used downstream ('Elevation').
test_sites = (
    pd.read_excel(data_path / 'test_points_250_augmented.xls', engine='calamine')
    .pipe(lambda frame: frame.drop(
        columns=[name for name in frame.columns if 'NEAR_FID' in name]
    ))
    .rename(columns={'RASTERVALU': 'Elevation'})
)

print(test_sites.shape)
test_sites.head()

(62500, 31)


Unnamed: 0,OBJECTID,x,y,NEAR_DIST_Canals,NEAR_DIST_River_Net,NEAR_DIST_Coastal,WAW_2018_010m_E45N24_03035_v020,WAW_2018_010m_E46N23_03035_v020,WAW_2018_010m_E46N24_03035_v020,WAW_2018_010m_E46N25_03035_v020,...,WAW_2018_010m_E49N21_03035_v020,WAW_2018_010m_E49N22_03035_v020,WAW_2018_010m_E49N24_03035_v020,WAW_2018_010m_E49N25_03035_v020,WAW_2018_010m_E50N21_03035_v020,WAW_2018_010m_E50N22_03035_v020,WAW_2018_010m_E50N24_03035_v020,WAW_2018_010m_E50N25_03035_v020,Elevation,NEAR_DIST_Chert
0,1,13.47,42.9,207093.263943,46226.866869,34265.278322,,,,,...,,,,,,,,,684.918945,211843.966356
1,2,13.47,42.911124,205993.497244,47071.385234,33931.780558,,,,,...,,,,,,,,,714.755615,211730.468091
2,3,13.47,42.922249,204895.278855,46452.892775,33639.451597,,,,,...,,,,,,,,,643.646973,211624.123177
3,4,13.47,42.933373,203798.633851,45363.52284,33361.113072,,,,,...,,,,,,,,,721.60791,211524.942443
4,5,13.47,42.944498,202703.587813,44281.842118,33053.372197,,,,,...,,,,,,,,,668.03302,211432.936014


In [74]:
def clean_waw(df: pd.DataFrame) -> pd.DataFrame:
    """Collapse the per-tile ``WAW`` columns into a single ``Wetness`` column.

    Every column whose name contains ``'WAW'`` is treated as one tile of the
    same layer. For each row the first non-missing tile value (in column
    order) becomes ``Wetness``; rows without any tile value are dropped.

    The input frame is never modified, so a cell calling this function can be
    re-run safely. Rows that carry a value in more than one tile use the first
    value instead of a string concatenation of all of them (which produced
    unparsable text such as ``'0.00.0'``).

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing zero or more ``WAW`` columns with numeric content.

    Returns
    -------
    pd.DataFrame
        A new frame with a fresh ``RangeIndex`` holding the non-``WAW``
        columns of ``df`` followed by ``Wetness`` (``uint8``), restricted to
        the rows that had at least one tile value. With no ``WAW`` columns at
        all the result has no rows.

    Raises
    ------
    ValueError
        If a tile column contains text that cannot be parsed as a number.
    """
    wcols = [col for col in df.columns if 'WAW' in col]
    # drop() without inplace returns a new frame; the caller's data stays intact.
    cleaned = df.drop(columns=wcols)

    if wcols:
        # Make every tile numeric first so object-typed, all-empty tiles do
        # not get in the way of the row-wise back-fill.
        tiles = df[wcols].apply(pd.to_numeric)
        # After back-filling along the row, the first column holds the first
        # non-missing value of that row (NaN when every tile is empty).
        wetness = tiles.bfill(axis=1).iloc[:, 0]
    else:
        wetness = pd.Series(np.nan, index=df.index, dtype=float)

    # Positional mask: robust even when the index contains duplicate labels.
    has_value = wetness.notna().to_numpy()
    cleaned = cleaned.loc[has_value].copy()
    cleaned['Wetness'] = wetness.to_numpy(dtype=float)[has_value].astype(np.uint8)
    return cleaned.reset_index(drop=True)

In [75]:
# Hand clean_waw its own copy so `test_sites` keeps its WAW columns no matter
# how the helper treats its argument; this keeps the cell re-runnable.
test_cleaned = clean_waw(test_sites.copy())
test_cleaned

Unnamed: 0,OBJECTID,x,y,NEAR_DIST_Canals,NEAR_DIST_River_Net,NEAR_DIST_Coastal,Elevation,NEAR_DIST_Chert,Wetness
0,2305,14.638835,42.900000,264085.099158,56892.951994,54210.207719,,117051.620795,255
1,2306,14.638835,42.911124,263214.289583,57499.260687,54760.100889,,116870.090727,255
2,2307,14.638835,42.922249,262346.407780,58124.344920,55294.040077,,116701.364290,255
3,2308,14.638835,42.933373,261481.482935,58768.598461,55827.978331,,116545.497169,255
4,2309,14.638835,42.944498,260619.544523,59431.570137,56361.915634,,116402.541098,255
...,...,...,...,...,...,...,...,...,...
54504,62496,17.750000,45.625502,105027.866058,17814.128968,231236.263512,180.351395,221800.705712,0
54505,62497,17.750000,45.636627,104132.352826,16739.232836,232037.548592,142.631439,222980.357186,0
54506,62498,17.750000,45.647751,103243.877082,15687.704093,232842.641684,194.935699,224160.622618,0
54507,62499,17.750000,45.658876,102362.622161,14660.426707,233651.503463,204.853287,225341.492399,0


In [76]:
# Missing values per column as a one-row table, before imputation.
display(test_cleaned.isnull().sum().to_frame().T)

# Replace every remaining NaN with 0.
# NOTE(review): per the table above only Elevation has gaps, so this sets
# those elevations to 0 — confirm that is the intended imputation.
test_cleaned = test_cleaned.fillna(value=0)

# Same table after imputation; every count should now be zero.
display(test_cleaned.isnull().sum().to_frame().T)


Unnamed: 0,OBJECTID,x,y,NEAR_DIST_Canals,NEAR_DIST_River_Net,NEAR_DIST_Coastal,Elevation,NEAR_DIST_Chert,Wetness
0,0,0,0,0,0,0,20101,0,0


Unnamed: 0,OBJECTID,x,y,NEAR_DIST_Canals,NEAR_DIST_River_Net,NEAR_DIST_Coastal,Elevation,NEAR_DIST_Chert,Wetness
0,0,0,0,0,0,0,0,0,0


In [77]:
# Load the known sites and normalise the column names in one pass:
#   1. strip the 'sites_XYTableToPoint_' prefix,
#   2. map the elevation / coordinate columns onto the names used for the grid,
#   3. drop the Dd / Dms columns and every NEAR_FID helper column.
known_sites = (
    pd.read_excel(data_path / 'known_sites_augmented.xls', engine='calamine')
    .rename(columns=lambda name: name.replace('sites_XYTableToPoint_', ''))
    .rename(columns={
        'Elevation__Masl_': 'Elevation',
        'Dd_ns': 'y',
        'Dd_ew': 'x',
    })
    .pipe(lambda frame: frame.drop(
        columns=['Dd', 'Dms'] + [name for name in frame.columns if 'NEAR_FID' in name]
    ))
)

print(known_sites.shape)
known_sites.head()

(47, 37)


Unnamed: 0,OBJECTID,Site_Name,Geographical_Region,Geographical_Location,Elevation,Period_New,Site_Type,y,x,NEAR_DIST_Chert,...,WAW_2018_010m_E48N25_03035_v020,WAW_2018_010m_E48N26_03035_v020,WAW_2018_010m_E49N21_03035_v020,WAW_2018_010m_E49N22_03035_v020,WAW_2018_010m_E49N24_03035_v020,WAW_2018_010m_E49N25_03035_v020,WAW_2018_010m_E50N21_03035_v020,WAW_2018_010m_E50N22_03035_v020,WAW_2018_010m_E50N24_03035_v020,WAW_2018_010m_E50N25_03035_v020
0,1,Abri Kontija 002,Istra,Limski kanal,46.0,UP,RS,45.1375,13.718611,247901.167568,...,,,,,,,,,,
1,2,Abri Šebrn,,,750.0,MES,RS,45.337712,14.162687,235748.803581,...,,,,,,,,,,
2,3,Brjgućeva Loza 1 (Loza),Istra,Kastav,510.0,MES,C,45.467778,14.242222,241293.250685,...,,,,,,,,,,
3,4,Bukovac,,,864.0,UP,C,45.346569,14.756238,204549.089663,...,,,,,,,,,,
4,5,Campanož,,,,MP,O,44.849045,13.89999,215799.109406,...,,,,,,,,,,


In [78]:
# Give clean_waw a private copy so `known_sites` keeps its WAW columns and
# this cell can be re-run; then drop the descriptive columns not used below.
known_sites_cleaned = clean_waw(known_sites.copy()).drop(
    columns=['Geographical_Region', 'Geographical_Location', 'Period_New', 'Site_Type']
)

# Move RASTERVALU directly behind Elevation so the two can be compared side by
# side. The columns are located by name instead of by position, so the order
# stays right even if the spreadsheet gains or loses a column.
cols = [col for col in known_sites_cleaned.columns if col != 'RASTERVALU']
cols.insert(cols.index('Elevation') + 1, 'RASTERVALU')
known_sites_cleaned = known_sites_cleaned[cols]

print(known_sites_cleaned.shape)
known_sites_cleaned.head()

(47, 11)


Unnamed: 0,OBJECTID,Site_Name,Elevation,RASTERVALU,y,x,NEAR_DIST_Chert,NEAR_DIST_Canals,NEAR_DIST_River_Net,NEAR_DIST_Coastal,Wetness
0,1,Abri Kontija 002,46.0,46.136692,45.1375,13.718611,247901.167568,80993.524258,73640.08705,7612.808936,0
1,2,Abri Šebrn,750.0,732.561707,45.337712,14.162687,235748.803581,98999.561639,44437.69062,9921.595682,0
2,3,Brjgućeva Loza 1 (Loza),510.0,432.515076,45.467778,14.242222,241293.250685,101016.428031,34876.914519,14119.067154,0
3,4,Bukovac,864.0,791.070129,45.346569,14.756238,204549.089663,143240.912311,13416.969148,15491.977574,0
4,5,Campanož,,43.70879,44.849045,13.89999,215799.109406,114332.595817,94426.986888,2230.743465,0


In [79]:
known_sites_cleaned = (
    known_sites_cleaned
    # Prefer the recorded elevation, fall back to RASTERVALU where it is
    # missing, and to 0 where both are missing. The result is assigned back
    # explicitly: a chained `df.col.fillna(..., inplace=True)` acts on a
    # temporary object under pandas Copy-on-Write and leaves the frame unchanged.
    .assign(Elevation=lambda frame: frame['Elevation']
            .fillna(frame['RASTERVALU'])
            .fillna(0))
    .drop(columns='RASTERVALU')
    # NOTE(review): every known site is labelled 0 here, so the target has a
    # single class and the classifier cannot learn anything. The column name
    # suggests these rows should be positives — confirm, and add negative
    # (background) samples before switching the label.
    .assign(Is_Site=0)
)

known_sites_cleaned

Unnamed: 0,OBJECTID,Site_Name,Elevation,y,x,NEAR_DIST_Chert,NEAR_DIST_Canals,NEAR_DIST_River_Net,NEAR_DIST_Coastal,Wetness,Is_Site
0,1,Abri Kontija 002,46.0,45.1375,13.718611,247901.167568,80993.524258,73640.08705,7612.808936,0,0
1,2,Abri Šebrn,750.0,45.337712,14.162687,235748.803581,98999.561639,44437.69062,9921.595682,0,0
2,3,Brjgućeva Loza 1 (Loza),510.0,45.467778,14.242222,241293.250685,101016.428031,34876.914519,14119.067154,0,0
3,4,Bukovac,864.0,45.346569,14.756238,204549.089663,143240.912311,13416.969148,15491.977574,0,0
4,5,Campanož,43.70879,44.849045,13.89999,215799.109406,114332.595817,94426.986888,2230.743465,0,0
5,6,Cerovačke pećine,624.0,44.274628,15.884619,55816.576622,230558.124333,13364.401168,27505.569553,0,0
6,7,Giljanovići (Karanušići),154.0,43.940833,16.426389,8763.044136,263072.737065,24402.718149,43623.324044,4,0
7,8,Glavičica,295.0,43.749998,16.666664,16125.218641,283521.439121,128.942963,28440.487128,0,0
8,9,Gospodska špilja,430.0,43.983887,16.436379,12714.582519,258279.688979,27729.070919,48470.385061,0,0
9,10,Igraliste-Dolac,16.0,44.142032,14.841416,108346.908403,208949.862435,62981.253902,242.634165,0,0


# Modeling

In [80]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [81]:
# Feature columns, in the order shared by the training and scoring frames.
col_order = [
    'Elevation',
    'Wetness',
    'NEAR_DIST_Chert',
    'NEAR_DIST_Canals',
    'NEAR_DIST_River_Net',
    'NEAR_DIST_Coastal',
]

# Remove identifiers / coordinates, then keep the features (plus the label
# for the known sites) in the agreed order.
known = (
    known_sites_cleaned
    .drop(columns=['OBJECTID', 'x', 'y', 'Site_Name'])
    .loc[:, col_order + ['Is_Site']]
)
test = (
    test_cleaned
    .drop(columns=['OBJECTID', 'x', 'y'])
    .loc[:, col_order]
)

In [82]:
# Preview of the training frame (features + Is_Site label).
known.head(n=5)

Unnamed: 0,Elevation,Wetness,NEAR_DIST_Chert,NEAR_DIST_Canals,NEAR_DIST_River_Net,NEAR_DIST_Coastal,Is_Site
0,46.0,0,247901.167568,80993.524258,73640.08705,7612.808936,0
1,750.0,0,235748.803581,98999.561639,44437.69062,9921.595682,0
2,510.0,0,241293.250685,101016.428031,34876.914519,14119.067154,0
3,864.0,0,204549.089663,143240.912311,13416.969148,15491.977574,0
4,43.70879,0,215799.109406,114332.595817,94426.986888,2230.743465,0


In [83]:
# Preview of the scoring frame (features only).
test.head(n=5)

Unnamed: 0,Elevation,Wetness,NEAR_DIST_Chert,NEAR_DIST_Canals,NEAR_DIST_River_Net,NEAR_DIST_Coastal
0,0.0,255,117051.620795,264085.099158,56892.951994,54210.207719
1,0.0,255,116870.090727,263214.289583,57499.260687,54760.100889
2,0.0,255,116701.36429,262346.40778,58124.34492,55294.040077
3,0.0,255,116545.497169,261481.482935,58768.598461,55827.978331
4,0.0,255,116402.541098,260619.544523,59431.570137,56361.915634


In [84]:
# Hold out 20% of the known sites for evaluation; the seed keeps the split fixed.
train_known, test_known = train_test_split(known, test_size=0.2, random_state=42)

# Separate the feature matrix from the Is_Site label for each partition.
X_train, y_train = train_known.drop(columns=['Is_Site']), train_known['Is_Site']
X_test, y_test = test_known.drop(columns=['Is_Site']), test_known['Is_Site']

In [85]:
# A classifier trained on one class predicts that class for every input, which
# makes the accuracy below trivially perfect. Say so explicitly (with print,
# because warnings are globally silenced in this notebook).
if y_train.nunique() < 2:
    print(
        f'WARNING: training target has a single class {y_train.unique().tolist()}; '
        'the accuracy below is not informative.'
    )

model = xgb.XGBClassifier()

model.fit(X_train, y_train)

# predict() already returns class labels, so no rounding is required.
y_pred = model.predict(X_test)
print(f'Accuracy: {accuracy_score(y_test, y_pred)*100}%')

Accuracy: 100.0%


In [86]:
# Score every grid point and attach the predicted label as a new column;
# `assign` returns a new frame, so `test` itself stays feature-only.
outcome = test.assign(prediction=model.predict(test))

# Distinct values per column — a single distinct `prediction` means the model
# returned the same label for every point.
outcome.nunique().to_frame().T

Unnamed: 0,Elevation,Wetness,NEAR_DIST_Chert,NEAR_DIST_Canals,NEAR_DIST_River_Net,NEAR_DIST_Coastal,prediction
0,34241,45,54509,54509,54422,48454,1
