In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
from pathlib import Path
import warnings

warnings.filterwarnings('ignore')

data_path = Path().cwd() / 'data'

In [2]:
test_sheet = 'test_points_spec_extracted.xls'
test_sites = pd.read_excel(data_path / test_sheet, engine='calamine')

test_sites.drop(columns=[col for col in test_sites.columns if 'NEAR_FID' in col], inplace=True)

test_sites.rename(columns={
        'RASTERVALU': 'Elevation'
    }, inplace=True)


print(test_sites.shape)
test_sites.head()

(25459, 31)


Unnamed: 0,OBJECTID,x,y,NEAR_DIST_Canals,NEAR_DIST_River_Net,NEAR_DIST_Coastal,NEAR_DIST_Chert,WAW_2018_010m_E45N24_03035_v020,WAW_2018_010m_E46N23_03035_v020,WAW_2018_010m_E46N24_03035_v020,...,WAW_2018_010m_E48N26_03035_v020,WAW_2018_010m_E49N21_03035_v020,WAW_2018_010m_E49N22_03035_v020,WAW_2018_010m_E49N24_03035_v020,WAW_2018_010m_E49N25_03035_v020,WAW_2018_010m_E50N21_03035_v020,WAW_2018_010m_E50N22_03035_v020,WAW_2018_010m_E50N24_03035_v020,WAW_2018_010m_E50N25_03035_v020,Elevation
0,1,16.216378,42.75412,374896.416513,86925.777533,21529.164668,24078.397439,,,,...,,,,,,,,,,
1,2,16.232699,42.75412,376010.324268,86329.484792,20193.162744,22746.485743,,,,...,,,,,,,,,,
2,3,16.24902,42.75412,377125.53209,85749.632286,18857.185726,21415.079367,,,,...,,,,,,,,,,
3,4,16.265341,42.75412,378242.028437,85186.555754,17521.239367,20084.27886,,,,...,,,,,,,,,,
4,5,16.281663,42.75412,379359.801958,84640.589997,16185.331232,18754.213174,,,,...,,,,,,,,,,


In [3]:
def clean_waw(df: pd.DataFrame):
    wcols = [col for col in df.columns if 'WAW' in col]
    waw = df[wcols + ['OBJECTID']]
    df.drop(columns=wcols, inplace=True)
    
    waw['Wetness'] = waw[waw.drop(['OBJECTID'], axis=1).columns].apply(
        lambda x: ''.join(x.dropna().astype(str)), 
        axis=1
    )
    
    waw = waw[waw['Wetness'] != ''][['OBJECTID', 'Wetness']]
    waw['Wetness'] = waw['Wetness'].astype(float).astype(np.uint8)
    return pd.merge(df, waw, on='OBJECTID', how='inner')

In [4]:
test_cleaned = clean_waw(test_sites)
test_cleaned

Unnamed: 0,OBJECTID,x,y,NEAR_DIST_Canals,NEAR_DIST_River_Net,NEAR_DIST_Coastal,NEAR_DIST_Chert,Elevation,Wetness
0,1,16.216378,42.754120,374896.416513,86925.777533,21529.164668,24078.397439,,255
1,2,16.232699,42.754120,376010.324268,86329.484792,20193.162744,22746.485743,,255
2,3,16.249020,42.754120,377125.532090,85749.632286,18857.185726,21415.079367,,255
3,4,16.265341,42.754120,378242.028437,85186.555754,17521.239367,20084.278860,,255
4,5,16.281663,42.754120,379359.801958,84640.589997,16185.331232,18754.213174,,255
...,...,...,...,...,...,...,...,...,...
25355,25455,16.004201,44.676145,184989.356662,2271.270992,59132.748107,90956.384346,551.129211,0
25356,25456,16.004201,44.685430,183977.412603,3259.110714,59923.992081,91942.567948,659.387390,0
25357,25457,16.020522,44.657574,186768.663311,41.657071,58440.290816,88604.594675,337.125427,0
25358,25458,16.020522,44.666859,185754.992778,1064.431584,59204.693096,89592.956286,400.164185,0


In [5]:
display(test_cleaned.isna().sum().to_frame().T)

test_cleaned.dropna(inplace=True)

display(test_cleaned.isna().sum().to_frame().T)


Unnamed: 0,OBJECTID,x,y,NEAR_DIST_Canals,NEAR_DIST_River_Net,NEAR_DIST_Coastal,NEAR_DIST_Chert,Elevation,Wetness
0,0,0,0,0,0,0,0,10208,0


Unnamed: 0,OBJECTID,x,y,NEAR_DIST_Canals,NEAR_DIST_River_Net,NEAR_DIST_Coastal,NEAR_DIST_Chert,Elevation,Wetness
0,0,0,0,0,0,0,0,0,0


In [6]:
test_cleaned.to_csv(data_path / f'{test_sheet}_cleaned.csv', index=False)

In [7]:
known_sites = pd.read_excel(data_path / 'known_sites_augmented.xls', engine='calamine')

known_sites.rename(columns={
        i: i.replace('sites_XYTableToPoint_', '') for i in known_sites.columns if 'sites_XYTableToPoint_' in i
    }, inplace=True)

known_sites.rename(columns={
    'Elevation__Masl_': 'Elevation',
    'Dd_ns': 'y',
    'Dd_ew': 'x',
    }, inplace=True)

known_sites.drop(columns=['Dd', 'Dms'] + [col for col in known_sites.columns if 'NEAR_FID' in col], inplace=True)

print(known_sites.shape)
known_sites.head()

(47, 37)


Unnamed: 0,OBJECTID,Site_Name,Geographical_Region,Geographical_Location,Elevation,Period_New,Site_Type,y,x,NEAR_DIST_Chert,...,WAW_2018_010m_E48N25_03035_v020,WAW_2018_010m_E48N26_03035_v020,WAW_2018_010m_E49N21_03035_v020,WAW_2018_010m_E49N22_03035_v020,WAW_2018_010m_E49N24_03035_v020,WAW_2018_010m_E49N25_03035_v020,WAW_2018_010m_E50N21_03035_v020,WAW_2018_010m_E50N22_03035_v020,WAW_2018_010m_E50N24_03035_v020,WAW_2018_010m_E50N25_03035_v020
0,1,Abri Kontija 002,Istra,Limski kanal,46.0,UP,RS,45.1375,13.718611,247901.167568,...,,,,,,,,,,
1,2,Abri Šebrn,,,750.0,MES,RS,45.337712,14.162687,235748.803581,...,,,,,,,,,,
2,3,Brjgućeva Loza 1 (Loza),Istra,Kastav,510.0,MES,C,45.467778,14.242222,241293.250685,...,,,,,,,,,,
3,4,Bukovac,,,864.0,UP,C,45.346569,14.756238,204549.089663,...,,,,,,,,,,
4,5,Campanož,,,,MP,O,44.849045,13.89999,215799.109406,...,,,,,,,,,,


In [8]:
known_sites_cleaned = clean_waw(known_sites).drop(columns=['Geographical_Region', 'Geographical_Location', 'Period_New', 'Site_Type'])

cols = known_sites_cleaned.columns
known_sites_cleaned = known_sites_cleaned[cols[:3].to_list() + [cols[-2]] + cols[3:-2].to_list() + [cols[-1]]]

print(known_sites_cleaned.shape)
known_sites_cleaned.head()

(47, 11)


Unnamed: 0,OBJECTID,Site_Name,Elevation,RASTERVALU,y,x,NEAR_DIST_Chert,NEAR_DIST_Canals,NEAR_DIST_River_Net,NEAR_DIST_Coastal,Wetness
0,1,Abri Kontija 002,46.0,46.136692,45.1375,13.718611,247901.167568,80993.524258,73640.08705,7612.808936,0
1,2,Abri Šebrn,750.0,732.561707,45.337712,14.162687,235748.803581,98999.561639,44437.69062,9921.595682,0
2,3,Brjgućeva Loza 1 (Loza),510.0,432.515076,45.467778,14.242222,241293.250685,101016.428031,34876.914519,14119.067154,0
3,4,Bukovac,864.0,791.070129,45.346569,14.756238,204549.089663,143240.912311,13416.969148,15491.977574,0
4,5,Campanož,,43.70879,44.849045,13.89999,215799.109406,114332.595817,94426.986888,2230.743465,0


In [9]:
known_sites_cleaned.Elevation.fillna(known_sites_cleaned.RASTERVALU, inplace=True)
known_sites_cleaned.Elevation.fillna(0, inplace=True)

known_sites_cleaned.drop(columns='RASTERVALU', inplace=True)

known_sites_cleaned['Is_Site'] = 0

known_sites_cleaned

Unnamed: 0,OBJECTID,Site_Name,Elevation,y,x,NEAR_DIST_Chert,NEAR_DIST_Canals,NEAR_DIST_River_Net,NEAR_DIST_Coastal,Wetness,Is_Site
0,1,Abri Kontija 002,46.0,45.1375,13.718611,247901.167568,80993.524258,73640.08705,7612.808936,0,0
1,2,Abri Šebrn,750.0,45.337712,14.162687,235748.803581,98999.561639,44437.69062,9921.595682,0,0
2,3,Brjgućeva Loza 1 (Loza),510.0,45.467778,14.242222,241293.250685,101016.428031,34876.914519,14119.067154,0,0
3,4,Bukovac,864.0,45.346569,14.756238,204549.089663,143240.912311,13416.969148,15491.977574,0,0
4,5,Campanož,43.70879,44.849045,13.89999,215799.109406,114332.595817,94426.986888,2230.743465,0,0
5,6,Cerovačke pećine,624.0,44.274628,15.884619,55816.576622,230558.124333,13364.401168,27505.569553,0,0
6,7,Giljanovići (Karanušići),154.0,43.940833,16.426389,8763.044136,263072.737065,24402.718149,43623.324044,4,0
7,8,Glavičica,295.0,43.749998,16.666664,16125.218641,283521.439121,128.942963,28440.487128,0,0
8,9,Gospodska špilja,430.0,43.983887,16.436379,12714.582519,258279.688979,27729.070919,48470.385061,0,0
9,10,Igraliste-Dolac,16.0,44.142032,14.841416,108346.908403,208949.862435,62981.253902,242.634165,0,0


# Modeling

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [11]:
known = known_sites_cleaned.drop(columns=['OBJECTID', 'x', 'y', 'Site_Name'])
test = test_cleaned.drop(columns=['OBJECTID', 'x', 'y'])

col_order = ['Elevation', 'Wetness', 'NEAR_DIST_Chert', 'NEAR_DIST_Canals',	'NEAR_DIST_River_Net', 'NEAR_DIST_Coastal']

known = known[col_order + ['Is_Site']]
test = test[col_order]

In [12]:
known.head()

Unnamed: 0,Elevation,Wetness,NEAR_DIST_Chert,NEAR_DIST_Canals,NEAR_DIST_River_Net,NEAR_DIST_Coastal,Is_Site
0,46.0,0,247901.167568,80993.524258,73640.08705,7612.808936,0
1,750.0,0,235748.803581,98999.561639,44437.69062,9921.595682,0
2,510.0,0,241293.250685,101016.428031,34876.914519,14119.067154,0
3,864.0,0,204549.089663,143240.912311,13416.969148,15491.977574,0
4,43.70879,0,215799.109406,114332.595817,94426.986888,2230.743465,0


In [13]:
test.head()

Unnamed: 0,Elevation,Wetness,NEAR_DIST_Chert,NEAR_DIST_Canals,NEAR_DIST_River_Net,NEAR_DIST_Coastal
182,22.511099,0,792.107078,376552.349498,77429.587911,45.335448
183,49.590641,0,243.292773,375700.235383,76428.133607,123.823975
515,22.92296,0,5785.026601,367912.519382,76214.050216,62.423273
544,76.798393,0,4213.535219,364598.470742,73602.265456,177.921794
548,134.343857,0,5871.037123,364798.267839,72905.089059,238.423757


In [14]:
train_known, test_known = train_test_split(known, test_size=0.2, random_state=42)

X_train = train_known.drop(columns='Is_Site')
y_train = train_known['Is_Site']

X_test = test_known.drop(columns='Is_Site')
y_test = test_known['Is_Site']

In [15]:
model = xgb.XGBClassifier()

model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print(f'Accuracy: {accuracy_score(y_test, y_pred.round())*100}%')

Accuracy: 100.0%


In [16]:
outcome = test.copy()
outcome['prediction'] = model.predict(outcome)
outcome.nunique().to_frame().T

Unnamed: 0,Elevation,Wetness,NEAR_DIST_Chert,NEAR_DIST_Canals,NEAR_DIST_River_Net,NEAR_DIST_Coastal,prediction
0,15071,6,15152,15152,15148,15152,1
