In [102]:
import numpy as np
import pandas as pd
import xgboost as xgb
from pathlib import Path
import warnings

# Suppress every warning for the remainder of the session.
# NOTE(review): this also hides pandas deprecation and chained-assignment
# warnings raised by later cells.
warnings.filterwarnings(action='ignore')

# All input files are read from a `data` folder under the current working directory.
data_path = Path.cwd() / 'data'

In [103]:
# Load the candidate points table from the data folder.
test_sites = pd.read_excel(data_path / 'test_points_250_augmented.xls', engine='calamine')

# Discard every column whose name contains 'NEAR_FID' in a single drop.
test_sites = test_sites.drop(
    columns=[name for name in test_sites.columns if 'NEAR_FID' in name]
)

print(test_sites.shape)
test_sites.head()

(62500, 30)


Unnamed: 0,OBJECTID,x,y,NEAR_DIST_Canals,NEAR_DIST_River_Net,NEAR_DIST_Coastal,WAW_2018_010m_E45N24_03035_v020,WAW_2018_010m_E46N23_03035_v020,WAW_2018_010m_E46N24_03035_v020,WAW_2018_010m_E46N25_03035_v020,...,WAW_2018_010m_E48N26_03035_v020,WAW_2018_010m_E49N21_03035_v020,WAW_2018_010m_E49N22_03035_v020,WAW_2018_010m_E49N24_03035_v020,WAW_2018_010m_E49N25_03035_v020,WAW_2018_010m_E50N21_03035_v020,WAW_2018_010m_E50N22_03035_v020,WAW_2018_010m_E50N24_03035_v020,WAW_2018_010m_E50N25_03035_v020,RASTERVALU
0,1,13.47,42.9,207093.263943,46226.866869,34265.278322,,,,,...,,,,,,,,,,684.918945
1,2,13.47,42.911124,205993.497244,47071.385234,33931.780558,,,,,...,,,,,,,,,,714.755615
2,3,13.47,42.922249,204895.278855,46452.892775,33639.451597,,,,,...,,,,,,,,,,643.646973
3,4,13.47,42.933373,203798.633851,45363.52284,33361.113072,,,,,...,,,,,,,,,,721.60791
4,5,13.47,42.944498,202703.587813,44281.842118,33053.372197,,,,,...,,,,,,,,,,668.03302


In [104]:
def clean_waw(df: pd.DataFrame) -> pd.DataFrame:
    """Collapse the ``WAW`` columns of ``df`` into a single ``Wetness`` column.

    Every column whose name contains ``'WAW'`` is treated as a candidate
    source of the wetness value for a row. For each row the first non-null
    value (in column order) is kept; rows with no value in any ``WAW``
    column are dropped from the result.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing an ``OBJECTID`` column and any number of ``WAW``
        columns. The frame is left unmodified, so the calling cell can be
        re-run safely.

    Returns
    -------
    pd.DataFrame
        A new frame holding the non-``WAW`` columns of ``df`` followed by
        ``Wetness`` (dtype ``uint8``), restricted to rows that had a wetness
        value. The index is a fresh ``RangeIndex`` produced by the merge.

    Raises
    ------
    KeyError
        If ``df`` has no ``OBJECTID`` column.
    """
    wcols = [col for col in df.columns if 'WAW' in col]

    if wcols:
        # Back-filling along the row pulls the first non-null value into the
        # first column. Values stay numeric, so a row with more than one
        # populated WAW column cannot turn into a malformed concatenated string.
        wetness = df[wcols].bfill(axis=1).iloc[:, 0]
    else:
        # No WAW columns at all: no row has a wetness value.
        wetness = pd.Series(np.nan, index=df.index, dtype=float)

    waw = pd.DataFrame({'OBJECTID': df['OBJECTID'], 'Wetness': wetness})
    waw = waw[waw['Wetness'].notna()]
    # NOTE(review): the uint8 cast assumes values lie in 0..255 — confirm.
    waw = waw.assign(Wetness=waw['Wetness'].astype(float).astype(np.uint8))

    # Drop on a copy so the caller's frame keeps its WAW columns.
    return pd.merge(df.drop(columns=wcols), waw, on='OBJECTID', how='inner')

In [105]:
# Merge the WAW columns of the candidate points into one Wetness column.
test_cleaned = test_sites.pipe(clean_waw)
test_cleaned

Unnamed: 0,OBJECTID,x,y,NEAR_DIST_Canals,NEAR_DIST_River_Net,NEAR_DIST_Coastal,RASTERVALU,Wetness
0,2305,14.638835,42.900000,264085.099158,56892.951994,54210.207719,,255
1,2306,14.638835,42.911124,263214.289583,57499.260687,54760.100889,,255
2,2307,14.638835,42.922249,262346.407780,58124.344920,55294.040077,,255
3,2308,14.638835,42.933373,261481.482935,58768.598461,55827.978331,,255
4,2309,14.638835,42.944498,260619.544523,59431.570137,56361.915634,,255
...,...,...,...,...,...,...,...,...
54504,62496,17.750000,45.625502,105027.866058,17814.128968,231236.263512,180.351395,0
54505,62497,17.750000,45.636627,104132.352826,16739.232836,232037.548592,142.631439,0
54506,62498,17.750000,45.647751,103243.877082,15687.704093,232842.641684,194.935699,0
54507,62499,17.750000,45.658876,102362.622161,14660.426707,233651.503463,204.853287,0


In [106]:
# Per-column count of missing values before imputation, shown as one row.
display(test_cleaned.isna().sum().to_frame().T)

# Replace every remaining missing value with 0.
test_cleaned = test_cleaned.fillna(0)

# Same summary after imputation.
display(test_cleaned.isna().sum().to_frame().T)


Unnamed: 0,OBJECTID,x,y,NEAR_DIST_Canals,NEAR_DIST_River_Net,NEAR_DIST_Coastal,RASTERVALU,Wetness
0,0,0,0,0,0,0,20101,0


Unnamed: 0,OBJECTID,x,y,NEAR_DIST_Canals,NEAR_DIST_River_Net,NEAR_DIST_Coastal,RASTERVALU,Wetness
0,0,0,0,0,0,0,0,0


In [107]:
# Load the known sites table and normalise its column names in one chain.
known_sites = (
    pd.read_excel(data_path / 'known_sites_augmented.xls', engine='calamine')
    # Strip the 'sites_XYTableToPoint_' prefix from every column name that carries it.
    .rename(columns=lambda name: name.replace('sites_XYTableToPoint_', ''))
    .rename(columns={'Elevation__Masl_': 'Elevation'})
    # Remove 'Dd', 'Dms' and every column whose name contains 'NEAR_FID'.
    .pipe(lambda frame: frame.drop(
        columns=['Dd', 'Dms'] + [name for name in frame.columns if 'NEAR_FID' in name]
    ))
)

print(known_sites.shape)
known_sites.head()

(47, 37)


Unnamed: 0,OBJECTID,Site_Name,Geographical_Region,Geographical_Location,Elevation,Period_New,Site_Type,Dd_ns,Dd_ew,NEAR_DIST_Chert,...,WAW_2018_010m_E48N25_03035_v020,WAW_2018_010m_E48N26_03035_v020,WAW_2018_010m_E49N21_03035_v020,WAW_2018_010m_E49N22_03035_v020,WAW_2018_010m_E49N24_03035_v020,WAW_2018_010m_E49N25_03035_v020,WAW_2018_010m_E50N21_03035_v020,WAW_2018_010m_E50N22_03035_v020,WAW_2018_010m_E50N24_03035_v020,WAW_2018_010m_E50N25_03035_v020
0,1,Abri Kontija 002,Istra,Limski kanal,46.0,UP,RS,45.1375,13.718611,247901.167568,...,,,,,,,,,,
1,2,Abri Šebrn,,,750.0,MES,RS,45.337712,14.162687,235748.803581,...,,,,,,,,,,
2,3,Brjgućeva Loza 1 (Loza),Istra,Kastav,510.0,MES,C,45.467778,14.242222,241293.250685,...,,,,,,,,,,
3,4,Bukovac,,,864.0,UP,C,45.346569,14.756238,204549.089663,...,,,,,,,,,,
4,5,Campanož,,,,MP,O,44.849045,13.89999,215799.109406,...,,,,,,,,,,


In [108]:
# Merge the WAW columns into Wetness, then drop the two geographical text columns.
known_sites_cleaned = (
    known_sites
    .pipe(clean_waw)
    .drop(columns=['Geographical_Region', 'Geographical_Location'])
)

print(known_sites_cleaned.shape)
known_sites_cleaned.head()

(47, 13)


Unnamed: 0,OBJECTID,Site_Name,Elevation,Period_New,Site_Type,Dd_ns,Dd_ew,NEAR_DIST_Chert,NEAR_DIST_Canals,NEAR_DIST_River_Net,NEAR_DIST_Coastal,RASTERVALU,Wetness
0,1,Abri Kontija 002,46.0,UP,RS,45.1375,13.718611,247901.167568,80993.524258,73640.08705,7612.808936,46.136692,0
1,2,Abri Šebrn,750.0,MES,RS,45.337712,14.162687,235748.803581,98999.561639,44437.69062,9921.595682,732.561707,0
2,3,Brjgućeva Loza 1 (Loza),510.0,MES,C,45.467778,14.242222,241293.250685,101016.428031,34876.914519,14119.067154,432.515076,0
3,4,Bukovac,864.0,UP,C,45.346569,14.756238,204549.089663,143240.912311,13416.969148,15491.977574,791.070129,0
4,5,Campanož,,MP,O,44.849045,13.89999,215799.109406,114332.595817,94426.986888,2230.743465,43.70879,0


In [109]:
# Place RASTERVALU directly after Elevation so the two columns sit side by side.
# The move is done by column name rather than by position, so re-running this
# cell leaves the order unchanged instead of shuffling a different column each time.
cols = known_sites_cleaned.columns
ordered_cols = [name for name in cols if name != 'RASTERVALU']
ordered_cols.insert(ordered_cols.index('Elevation') + 1, 'RASTERVALU')
known_sites_cleaned = known_sites_cleaned[ordered_cols]
known_sites_cleaned.head()

Unnamed: 0,OBJECTID,Site_Name,Elevation,RASTERVALU,Period_New,Site_Type,Dd_ns,Dd_ew,NEAR_DIST_Chert,NEAR_DIST_Canals,NEAR_DIST_River_Net,NEAR_DIST_Coastal,Wetness
0,1,Abri Kontija 002,46.0,46.136692,UP,RS,45.1375,13.718611,247901.167568,80993.524258,73640.08705,7612.808936,0
1,2,Abri Šebrn,750.0,732.561707,MES,RS,45.337712,14.162687,235748.803581,98999.561639,44437.69062,9921.595682,0
2,3,Brjgućeva Loza 1 (Loza),510.0,432.515076,MES,C,45.467778,14.242222,241293.250685,101016.428031,34876.914519,14119.067154,0
3,4,Bukovac,864.0,791.070129,UP,C,45.346569,14.756238,204549.089663,143240.912311,13416.969148,15491.977574,0
4,5,Campanož,,43.70879,MP,O,44.849045,13.89999,215799.109406,114332.595817,94426.986888,2230.743465,0


In [110]:
# Per-column count of missing values, shown as a single row.
(
    known_sites_cleaned
    .isna()
    .sum()
    .to_frame()
    .T
)

Unnamed: 0,OBJECTID,Site_Name,Elevation,RASTERVALU,Period_New,Site_Type,Dd_ns,Dd_ew,NEAR_DIST_Chert,NEAR_DIST_Canals,NEAR_DIST_River_Net,NEAR_DIST_Coastal,Wetness
0,0,0,12,5,0,0,0,0,0,0,0,0,0


In [111]:
# Fill missing Elevation values from RASTERVALU on the same row.
# The column is assigned back explicitly: an in-place fillna on the
# attribute-accessed column is a chained assignment, which does not update
# the frame when pandas Copy-on-Write is active.
known_sites_cleaned = known_sites_cleaned.assign(
    Elevation=known_sites_cleaned['Elevation'].fillna(known_sites_cleaned['RASTERVALU'])
)

known_sites_cleaned

Unnamed: 0,OBJECTID,Site_Name,Elevation,RASTERVALU,Period_New,Site_Type,Dd_ns,Dd_ew,NEAR_DIST_Chert,NEAR_DIST_Canals,NEAR_DIST_River_Net,NEAR_DIST_Coastal,Wetness
0,1,Abri Kontija 002,46.0,46.136692,UP,RS,45.1375,13.718611,247901.167568,80993.524258,73640.08705,7612.808936,0
1,2,Abri Šebrn,750.0,732.561707,MES,RS,45.337712,14.162687,235748.803581,98999.561639,44437.69062,9921.595682,0
2,3,Brjgućeva Loza 1 (Loza),510.0,432.515076,MES,C,45.467778,14.242222,241293.250685,101016.428031,34876.914519,14119.067154,0
3,4,Bukovac,864.0,791.070129,UP,C,45.346569,14.756238,204549.089663,143240.912311,13416.969148,15491.977574,0
4,5,Campanož,43.70879,43.70879,MP,O,44.849045,13.89999,215799.109406,114332.595817,94426.986888,2230.743465,0
5,6,Cerovačke pećine,624.0,596.050293,UP,C,44.274628,15.884619,55816.576622,230558.124333,13364.401168,27505.569553,0
6,7,Giljanovići (Karanušići),154.0,370.460022,MP,O,43.940833,16.426389,8763.044136,263072.737065,24402.718149,43623.324044,4
7,8,Glavičica,295.0,297.810455,"UP, MES",O,43.749998,16.666664,16125.218641,283521.439121,128.942963,28440.487128,0
8,9,Gospodska špilja,430.0,424.83606,"UP, MES",C,43.983887,16.436379,12714.582519,258279.688979,27729.070919,48470.385061,0
9,10,Igraliste-Dolac,16.0,16.647577,MP,O,44.142032,14.841416,108346.908403,208949.862435,62981.253902,242.634165,0
