In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
from pathlib import Path
import warnings

# Silence every warning category for the rest of the notebook.
warnings.filterwarnings(action='ignore')

# All spreadsheets and CSV outputs live in ./data under the working directory.
data_path = Path.cwd().joinpath('data')

In [2]:
# Read the spreadsheet of candidate points.
test_sheet = 'test_points_extracted.xls'
test_sites = pd.read_excel(data_path / test_sheet, engine='calamine')

# Discard every column whose name contains 'NEAR_FID'.
near_fid_cols = [name for name in test_sites.columns if 'NEAR_FID' in name]
test_sites = test_sites.drop(columns=near_fid_cols)

print(test_sites.shape)
test_sites.head()

(12067, 12)


Unnamed: 0,OBJECTID_1,OBJECTID,x,y,NEAR_DIST_Canals,NEAR_DIST_River_Net,NEAR_DIST_Coastal,NEAR_DIST_Chert,Elevation,Wetness,Temp,Slope
0,1,3353,15.726739,43.02339,325051.049972,92375.044496,109.719805,27245.870159,70.596275,0,16.799999,207
1,2,3357,15.74306,43.02339,326171.018246,91240.568066,521.790859,25924.985926,227.165344,0,16.4,243
2,3,3362,15.759382,43.02339,327292.421342,90111.311563,128.050137,24605.088261,41.27142,0,17.1,236
3,4,3410,15.759382,43.032675,326739.480379,89566.947218,238.350808,24494.10294,58.482124,0,16.5,232
4,5,183,16.510161,42.763406,376552.349498,77429.587911,45.335448,792.107078,22.511099,0,17.0,242


In [3]:
def clean_waw(df: pd.DataFrame) -> pd.DataFrame:
    """Collapse all ``WAW`` columns of ``df`` into a single ``Wetness`` column.

    Every column whose name contains ``'WAW'`` is treated as a source of the
    same value. For each row the first non-null value among those columns
    becomes ``Wetness``; rows with no value in any of them are dropped by the
    inner merge on ``OBJECTID``.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with an ``OBJECTID`` column and zero or more ``WAW`` columns.

    Returns
    -------
    pd.DataFrame
        A new frame without the ``WAW`` columns and with ``Wetness`` stored as
        ``np.uint8``. The input frame is left unmodified. When ``df`` has no
        ``WAW`` columns a plain copy is returned.

    Raises
    ------
    KeyError
        If ``WAW`` columns are present but ``OBJECTID`` is missing.
    ValueError
        If a selected wetness value cannot be converted to ``float``.
    """
    wcols = [col for col in df.columns if 'WAW' in col]
    if not wcols:
        # Nothing to collapse; hand back a copy so callers can mutate freely.
        return df.copy()

    # Back-filling along the row pulls the first non-null value into the
    # first column. Unlike joining the string forms, this stays valid when
    # more than one WAW column holds a value for the same row.
    first_valid = df[wcols].bfill(axis=1).iloc[:, 0]

    waw = pd.DataFrame({'OBJECTID': df['OBJECTID'], 'Wetness': first_valid})
    waw = waw.dropna(subset=['Wetness'])
    waw['Wetness'] = waw['Wetness'].astype(float).astype(np.uint8)

    # drop() returns a new frame, so the caller's ``df`` keeps its WAW columns.
    return pd.merge(df.drop(columns=wcols), waw, on='OBJECTID', how='inner')

In [4]:
# Collapse the WAW columns only when the sheet actually has them;
# otherwise continue with a copy of the raw frame.
has_waw = any('WAW' in name for name in test_sites.columns)
test_cleaned = clean_waw(test_sites) if has_waw else test_sites.copy()

# Rows with a missing value in any column are discarded.
test_cleaned = test_cleaned.dropna()

cleaned_name = f"{test_sheet.split('.')[0]}_cleaned.csv"
test_cleaned.to_csv(data_path / cleaned_name, index=False)

In [5]:
known_sites = pd.read_excel(data_path / 'known_sites_augmented.xls', engine='calamine')

# Strip the source-table prefix from every column name that carries it.
known_sites = known_sites.rename(columns={
    name: name.replace('sites_XYTableToPoint_', '')
    for name in known_sites.columns
    if 'sites_XYTableToPoint_' in name
})

# Align the remaining names with the ones used by the test sheet.
known_sites = known_sites.rename(columns={'Elevation__Masl_': 'Elevation', 'Dd_ns': 'y', 'Dd_ew': 'x'})

# The NEAR_FID lookup is done after renaming so it sees the final column names.
near_fid_cols = [name for name in known_sites.columns if 'NEAR_FID' in name]
known_sites = known_sites.drop(columns=['Dd', 'Dms'] + near_fid_cols)

print(known_sites.shape)
known_sites.head()

(47, 39)


Unnamed: 0,OBJECTID,Site_Name,Geographical_Region,Geographical_Location,Elevation,Period_New,Site_Type,y,x,NEAR_DIST_Chert,...,WAW_2018_010m_E49N21_03035_v020,WAW_2018_010m_E49N22_03035_v020,WAW_2018_010m_E49N24_03035_v020,WAW_2018_010m_E49N25_03035_v020,WAW_2018_010m_E50N21_03035_v020,WAW_2018_010m_E50N22_03035_v020,WAW_2018_010m_E50N24_03035_v020,WAW_2018_010m_E50N25_03035_v020,Slope,Temp
0,1,Abri Kontija 002,Istra,Limski kanal,46.0,UP,RS,45.1375,13.718611,247901.167568,...,,,,,,,,,204,15.7
1,2,Abri Šebrn,,,750.0,MES,RS,45.337712,14.162687,235748.803581,...,,,,,,,,,241,11.0
2,3,Brjgućeva Loza 1 (Loza),Istra,Kastav,510.0,MES,C,45.467778,14.242222,241293.250685,...,,,,,,,,,247,12.4
3,4,Bukovac,,,864.0,UP,C,45.346569,14.756238,204549.089663,...,,,,,,,,,245,9.7
4,5,Campanož,,,,MP,O,44.849045,13.89999,215799.109406,...,,,,,,,,,250,15.7


In [6]:
# Collapse the WAW columns into ``Wetness`` and drop the descriptive
# columns that are not used as model inputs.
known_sites_cleaned = clean_waw(known_sites).drop(
    columns=['Geographical_Region', 'Geographical_Location', 'Period_New', 'Site_Type']
)

# Positional reorder: keep the first three columns, move the second-to-last
# column right behind them, and keep the last column last.
cols = known_sites_cleaned.columns
known_sites_cleaned = known_sites_cleaned[cols[:3].to_list() + [cols[-2]] + cols[3:-2].to_list() + [cols[-1]]]

# Fill missing elevations from the raster-derived column, then fall back to 0.
# Assigning the result back to the column is required: calling
# ``fillna(..., inplace=True)`` on ``frame.Elevation`` is chained assignment
# and leaves the frame untouched when pandas Copy-on-Write is active.
known_sites_cleaned['Elevation'] = (
    known_sites_cleaned['Elevation']
    .fillna(known_sites_cleaned['Elevation_Raster'])
    .fillna(0)
)

known_sites_cleaned = known_sites_cleaned.drop(columns=['Elevation_Raster'])

# Every row of this table is a confirmed site, i.e. a positive example.
known_sites_cleaned['Is_Site'] = 1

# Modeling

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [8]:
# Show both column sets side by side before they are aligned for modeling.
known_columns = known_sites_cleaned.columns
test_columns = test_cleaned.columns

print(f'known columns: {known_columns}')
print(f'test columns: {test_columns}')

known columns: Index(['OBJECTID', 'Site_Name', 'Elevation', 'Temp', 'y', 'x',
       'NEAR_DIST_Chert', 'NEAR_DIST_Canals', 'NEAR_DIST_River_Net',
       'NEAR_DIST_Coastal', 'Slope', 'Wetness', 'Is_Site'],
      dtype='object')
test columns: Index(['OBJECTID_1', 'OBJECTID', 'x', 'y', 'NEAR_DIST_Canals',
       'NEAR_DIST_River_Net', 'NEAR_DIST_Coastal', 'NEAR_DIST_Chert',
       'Elevation', 'Wetness', 'Temp', 'Slope'],
      dtype='object')


In [9]:
# Feature columns shared by both tables, in the order used for modeling.
col_order = [
    'Elevation',
    'Wetness',
    'Temp',
    'Slope',
    'NEAR_DIST_Chert',
    'NEAR_DIST_Canals',
    'NEAR_DIST_River_Net',
    'NEAR_DIST_Coastal',
]

# Identifier and coordinate columns are removed; the known sites keep their label.
known = known_sites_cleaned.drop(columns=['OBJECTID', 'x', 'y', 'Site_Name'])[col_order + ['Is_Site']]

# The test frame gets a fresh 0..n-1 index after the column selection.
test = (
    test_cleaned
    .drop(columns=['OBJECTID', 'x', 'y'])[col_order]
    .reset_index(drop=True)
)

In [10]:
# Preview the first rows of the known-site feature table (features + Is_Site).
known.head()

Unnamed: 0,Elevation,Wetness,Temp,Slope,NEAR_DIST_Chert,NEAR_DIST_Canals,NEAR_DIST_River_Net,NEAR_DIST_Coastal,Is_Site
0,46.0,0,15.7,204,247901.167568,80993.524258,73640.08705,7612.808936,1
1,750.0,0,11.0,241,235748.803581,98999.561639,44437.69062,9921.595682,1
2,510.0,0,12.4,247,241293.250685,101016.428031,34876.914519,14119.067154,1
3,864.0,0,9.7,245,204549.089663,143240.912311,13416.969148,15491.977574,1
4,43.70879,0,15.7,250,215799.109406,114332.595817,94426.986888,2230.743465,1


In [11]:
# Preview the first rows of the test feature table (same feature columns, no label yet).
test.head()

Unnamed: 0,Elevation,Wetness,Temp,Slope,NEAR_DIST_Chert,NEAR_DIST_Canals,NEAR_DIST_River_Net,NEAR_DIST_Coastal
0,70.596275,0,16.799999,207,27245.870159,325051.049972,92375.044496,109.719805
1,227.165344,0,16.4,243,25924.985926,326171.018246,91240.568066,521.790859
2,41.27142,0,17.1,236,24605.088261,327292.421342,90111.311563,128.050137
3,58.482124,0,16.5,232,24494.10294,326739.480379,89566.947218,238.350808
4,22.511099,0,17.0,242,792.107078,376552.349498,77429.587911,45.335448


In [12]:
# Rule-based labelling thresholds, in the same units as the columns they
# are compared against.
MAX_SITE_ELEVATION = 400
MAX_CHERT_DISTANCE = 20000
MAX_COASTAL_DISTANCE = 6000

# A point is labelled 1 only if its elevation is at most the limit and it is
# not farther than the limits from chert and from the coast; everything else
# is labelled 0. Building one boolean mask avoids passing through a
# float/NaN ``Is_Site`` column before the integer cast.
# Additional rules (elevation below 50, canal / river distance below 20000)
# were tried and are intentionally not applied.
is_candidate = (
    (test['Elevation'] <= MAX_SITE_ELEVATION)
    & ~(test['NEAR_DIST_Chert'] > MAX_CHERT_DISTANCE)
    & ~(test['NEAR_DIST_Coastal'] > MAX_COASTAL_DISTANCE)
)
test['Is_Site'] = is_candidate.astype(np.uint8)

test.to_csv(data_path / 'test_labeled.csv', index=False)

test['Is_Site'].value_counts()

Is_Site
0    10769
1     1297
Name: count, dtype: int64

In [13]:
# Share of the 0-labelled test rows that is moved into the modeling set.
RNG_MIX = 0.15

rng = np.random.RandomState(42)

# Only rows labelled 0 are eligible for sampling.
rng_selection = test[test['Is_Site'] == 0]
n_draw = int(test['Is_Site'].value_counts()[0] * RNG_MIX)
rng_nums = rng.choice(a=rng_selection.index, size=n_draw, replace=False)

# The sampled rows become negative examples and are removed from the frame
# that is scored later on.
rng_sites = rng_selection.loc[rng_nums]
test_cut = test.drop(index=rng_nums)  # pyright: ignore[reportArgumentType, reportCallIssue]

rng_sites

Unnamed: 0,Elevation,Wetness,Temp,Slope,NEAR_DIST_Chert,NEAR_DIST_Canals,NEAR_DIST_River_Net,NEAR_DIST_Coastal,Is_Site
10201,868.111755,0,10.3,227,50516.797211,227447.907927,23578.697991,38417.827401,0
2710,1000.587036,0,10.8,250,10880.125622,304070.169667,8273.789065,7257.978740,0
3760,128.218292,0,15.7,250,37963.694584,271789.640569,12318.356676,15085.628359,0
11874,871.318420,0,8.8,237,84059.422445,196449.533050,11519.405670,46241.989604,0
8711,1508.136230,0,6.7,220,84557.992290,226704.128842,20583.756526,10018.917593,0
...,...,...,...,...,...,...,...,...,...
2311,465.544861,0,13.4,208,10432.919609,302537.713005,6398.125074,2151.944003,0
2839,707.264038,0,12.3,249,23652.193136,283080.073445,25417.582881,18265.969730,0
8445,768.924561,0,11.7,228,56354.697072,235173.402322,7235.766463,22763.312425,0
8247,336.521606,0,13.6,232,55296.573784,238449.069253,4932.336045,21988.659310,0


In [14]:
# Stack the known sites and the sampled 0-labelled rows into one frame
# with a fresh index.
known_rng = pd.concat([known, rng_sites], ignore_index=True)

n_known, n_sampled, n_total = len(known), len(rng_sites), len(known_rng)
print(f'Using a combination of {n_known} known sites and {n_sampled} test sites for {n_total} total test sites.')

known_rng

Using a combination of 47 known sites and 1615 test sites for 1662 total test sites.


Unnamed: 0,Elevation,Wetness,Temp,Slope,NEAR_DIST_Chert,NEAR_DIST_Canals,NEAR_DIST_River_Net,NEAR_DIST_Coastal,Is_Site
0,46.000000,0,15.7,204,247901.167568,80993.524258,73640.087050,7612.808936,1
1,750.000000,0,11.0,241,235748.803581,98999.561639,44437.690620,9921.595682,1
2,510.000000,0,12.4,247,241293.250685,101016.428031,34876.914519,14119.067154,1
3,864.000000,0,9.7,245,204549.089663,143240.912311,13416.969148,15491.977574,1
4,43.708790,0,15.7,250,215799.109406,114332.595817,94426.986888,2230.743465,1
...,...,...,...,...,...,...,...,...,...
1657,465.544861,0,13.4,208,10432.919609,302537.713005,6398.125074,2151.944003,0
1658,707.264038,0,12.3,249,23652.193136,283080.073445,25417.582881,18265.969730,0
1659,768.924561,0,11.7,228,56354.697072,235173.402322,7235.766463,22763.312425,0
1660,336.521606,0,13.6,232,55296.573784,238449.069253,4932.336045,21988.659310,0


In [15]:
# Stratified 75/25 split so both parts keep the same share of each label.
labels = known_rng['Is_Site']
features = known_rng.drop(columns=['Is_Site'])

X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.25,
    stratify=labels,
    random_state=42,
)

In [16]:
# Histogram-based gradient-boosted classifier; training stops once the
# evaluation metric has not improved for 10 consecutive rounds.
model = xgb.XGBClassifier(tree_method='hist', early_stopping_rounds=10, random_state=42)
model.fit(X_train, y_train, eval_set=[(X_test, y_test)])

# NOTE(review): the split used for early stopping is the same one the accuracy
# is computed on, so this figure is an optimistic estimate.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy*100}%')

[0]	validation_0-logloss:0.15420
[1]	validation_0-logloss:0.12833
[2]	validation_0-logloss:0.10840
[3]	validation_0-logloss:0.09575
[4]	validation_0-logloss:0.08873
[5]	validation_0-logloss:0.08426
[6]	validation_0-logloss:0.07978
[7]	validation_0-logloss:0.07566
[8]	validation_0-logloss:0.07257
[9]	validation_0-logloss:0.07214
[10]	validation_0-logloss:0.06884
[11]	validation_0-logloss:0.06903
[12]	validation_0-logloss:0.06720
[13]	validation_0-logloss:0.06554
[14]	validation_0-logloss:0.06434
[15]	validation_0-logloss:0.06564
[16]	validation_0-logloss:0.06614
[17]	validation_0-logloss:0.06739
[18]	validation_0-logloss:0.06728
[19]	validation_0-logloss:0.06689
[20]	validation_0-logloss:0.06726
[21]	validation_0-logloss:0.06674
[22]	validation_0-logloss:0.06666
[23]	validation_0-logloss:0.06780
[24]	validation_0-logloss:0.06871
Accuracy: 98.3173076923077%


In [17]:
# Score the remaining test rows; the rule-based label is removed so the
# model only sees the feature columns.
features_cut = test_cut.drop(columns=['Is_Site'])
outcome = features_cut.assign(prediction=model.predict(features_cut))

outcome

Unnamed: 0,Elevation,Wetness,Temp,Slope,NEAR_DIST_Chert,NEAR_DIST_Canals,NEAR_DIST_River_Net,NEAR_DIST_Coastal,prediction
1,227.165344,0,16.400000,243,25924.985926,326171.018246,91240.568066,521.790859,0
2,41.271420,0,17.100000,236,24605.088261,327292.421342,90111.311563,128.050137,1
4,22.511099,0,17.000000,242,792.107078,376552.349498,77429.587911,45.335448,1
5,49.590641,0,16.600000,208,243.292773,375700.235383,76428.133607,123.823975,1
6,81.126198,0,16.799999,236,10315.981303,347492.299810,78096.612940,222.726689,1
...,...,...,...,...,...,...,...,...,...
12060,515.450317,0,10.600000,243,89971.225908,186001.517251,1283.281393,58349.022212,0
12061,551.129211,0,10.300000,250,90956.384346,184989.356662,2271.270992,59132.748107,0
12062,659.387390,0,9.800000,222,91942.567948,183977.412603,3259.110714,59923.992081,0
12063,337.125427,0,11.300000,216,88604.594675,186768.663311,41.657071,58440.290816,0


In [18]:
# Number of scored rows per predicted class.
outcome['prediction'].value_counts()

prediction
0    10357
1       94
Name: count, dtype: int64

In [19]:
# Re-attach OBJECTID and coordinates to the scored rows.
# ``outcome`` carries labels from the 0..n-1 index given to ``test`` after it
# was cut from ``test_cleaned``. ``test_cleaned`` may still carry gapped labels
# left by ``dropna``. Resetting its index restores the row-for-row
# correspondence, so the index join cannot pair a prediction with the
# coordinates of a neighbouring point.
site_ids = test_cleaned[['OBJECTID', 'x', 'y']].reset_index(drop=True)
outcome_linked = outcome.merge(site_ids, left_index=True, right_index=True, validate='one_to_one')

# Every scored row must find exactly one identifier row.
assert len(outcome_linked) == len(outcome), 'prediction rows lost while linking identifiers'

outcome_linked

Unnamed: 0,Elevation,Wetness,Temp,Slope,NEAR_DIST_Chert,NEAR_DIST_Canals,NEAR_DIST_River_Net,NEAR_DIST_Coastal,prediction,OBJECTID,x,y
1,227.165344,0,16.400000,243,25924.985926,326171.018246,91240.568066,521.790859,0,3357,15.743060,43.023390
2,41.271420,0,17.100000,236,24605.088261,327292.421342,90111.311563,128.050137,1,3362,15.759382,43.023390
4,22.511099,0,17.000000,242,792.107078,376552.349498,77429.587911,45.335448,1,183,16.510161,42.763406
5,49.590641,0,16.600000,208,243.292773,375700.235383,76428.133607,123.823975,1,184,16.510161,42.772691
6,81.126198,0,16.799999,236,10315.981303,347492.299810,78096.612940,222.726689,1,3620,16.004201,42.967679
...,...,...,...,...,...,...,...,...,...,...,...,...
12060,515.450317,0,10.600000,243,89971.225908,186001.517251,1283.281393,58349.022212,0,25453,16.004201,44.657574
12061,551.129211,0,10.300000,250,90956.384346,184989.356662,2271.270992,59132.748107,0,25454,16.004201,44.666859
12062,659.387390,0,9.800000,222,91942.567948,183977.412603,3259.110714,59923.992081,0,25455,16.004201,44.676145
12063,337.125427,0,11.300000,216,88604.594675,186768.663311,41.657071,58440.290816,0,25456,16.004201,44.685430


In [20]:
# Write all scored points, then only those predicted as sites; the sampling
# share is part of both file names.
predicted_sites = outcome_linked[outcome_linked['prediction'] == 1]

outcome_linked.to_csv(data_path / f'test_predictions_{RNG_MIX}.csv', index=False)
predicted_sites.to_csv(data_path / f'test_predictions_sites_{RNG_MIX}.csv', index=False)