# UJIIndoorLoc

In [None]:

import os
import subprocess
import sys

# requirements.txt is expected in the directory the notebook is started from.
requirements_path = os.path.join(os.getcwd(), 'requirements.txt')

# Run pip through the interpreter of this kernel and pass the arguments as a
# list: no shell is involved, so a path containing spaces or shell
# metacharacters is handled correctly, and the packages land in the
# environment the notebook actually uses.
pip_result = subprocess.run(
    [sys.executable, '-m', 'pip', 'install', '-r', requirements_path],
    check=False,
)

# Only claim success when pip really succeeded.
if pip_result.returncode == 0:
    print("All dependencies have been installed.")
else:
    print(f"pip install failed with exit code {pip_result.returncode}; see the output above.")

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

#import sklearn measures




# 1. Read Data into Dataframe

In [None]:
# Folder holding the UJIIndoorLoc CSV files, relative to this notebook.
datafolder = "../../datasets/UJIIndoorLoc"

trainfile, validfile = (
    os.path.join(datafolder, csv_name)
    for csv_name in ("trainingData.csv", "validationData.csv")
)

# Load both files; the validation CSV is used as the test set below.
train_data, test_data = pd.read_csv(trainfile), pd.read_csv(validfile)
print("Training data shape:", train_data.shape)
test_data.head()




## 1.2 Split Target and Feature

In [None]:
# Summarise the non-signal columns of both files as strings (count, unique,
# top, freq). Only the last expression of a notebook cell is rendered, so the
# two summaries are printed explicitly instead of being silently discarded.
meta_columns = ['FLOOR', 'BUILDINGID', 'SPACEID', 'RELATIVEPOSITION', 'USERID', 'PHONEID']
print(train_data[meta_columns].astype(str).describe(include=['object']))
print(test_data[meta_columns].astype(str).describe(include=['object']))

## X and Y-data split
# Features are the WAP columns (the first 520 columns of each file).
X_train = train_data.iloc[:, :520]
X_test = test_data.iloc[:, :520]

# Targets: FLOOR and BUILDINGID for both sets; SPACEID is additionally taken
# from the training data for the finer-grained SpaceID experiments.
y_train = train_data[['FLOOR', 'BUILDINGID', 'SPACEID']].copy()
y_test = test_data[['FLOOR', 'BUILDINGID']].copy()
y_sID = y_train.copy()

# Build the location codes by plain string concatenation (no separators):
#   y_sID            -> BUILDINGID + FLOOR + SPACEID
#   y_train / y_test -> BUILDINGID + FLOOR
y_sID['LOC_CODE'] = (
    y_sID['BUILDINGID'].astype(str)
    + y_sID['FLOOR'].astype(str)
    + y_sID['SPACEID'].astype(str)
)
y_train['LOC_CODE'] = y_train['BUILDINGID'].astype(str) + y_train['FLOOR'].astype(str)
y_test['LOC_CODE'] = y_test['BUILDINGID'].astype(str) + y_test['FLOOR'].astype(str)

# Keep only the combined column, stored as a categorical.
y_train_init = y_train[['LOC_CODE']].astype('category')
y_test_init = y_test[['LOC_CODE']].astype('category')
y_sID = y_sID[['LOC_CODE']].astype('category')

# Sanity check: label and feature frames must have the same number of rows.
y_sID.shape, X_train.shape


# Exploratory Data Analysis

Replace the placeholder value 100 with NaN and show the distribution.

In [None]:
from sklearn.decomposition import PCA

X_raw_train = X_train
X_raw_test = X_test

# Replace the value 100 with NaN so those cells count as "no reading" in the
# statistics below and can be imputed later.
# NOTE(review): 100 is presumably the dataset's "WAP not detected" marker --
# confirm against the UJIIndoorLoc description.
X_raw_train = X_raw_train.replace(to_replace=100, value=np.nan)
X_raw_test = X_raw_test.replace(to_replace=100, value=np.nan)

X_raw_combined = pd.concat([X_raw_train, X_raw_test], axis=0)
# pd.concat only keeps the categorical dtype when both inputs have identical
# categories; otherwise the column silently becomes object and the later
# `.cat.codes` calls fail. Re-casting is a no-op in the good case.
Y_raw_combined = pd.concat([y_train_init, y_test_init], axis=0).astype('category')

# New train/test split drawn from the combined dataset ("tot_split").
X_train_tot_split, X_test_tot_split, Y_train_tot_split, Y_test_tot_split = train_test_split(
    X_raw_combined, Y_raw_combined, test_size=0.2, random_state=42, stratify=Y_raw_combined
)

# Train/validation split of the original training set with the location codes
# that include SPACEID. The split is built from the NaN-marked data with the
# gaps filled by -110 (the value make_preprocessor imputes). Splitting the
# untouched X_train instead would leave the 100 placeholder in place, making
# every later fillna/imputer a no-op and feeding 100 into PCA and the models.
# Row selection is unchanged: same length, labels and random_state.
X_sID_filled = X_raw_train.fillna(-110)
X_train_split_sID, X_test_split_sID, Y_train_split_sID, Y_test_split_sID = train_test_split(
    X_sID_filled, y_sID, test_size=0.2, random_state=42, stratify=y_sID
)

# PCA on the SpaceID split, keeping enough components for 95 % of the variance.
pca = PCA(n_components=0.95, random_state=42)
X_train_pca = pca.fit_transform(X_train_split_sID)
X_test_pca = pca.transform(X_test_split_sID)
print(f"Original number of features: {X_train_split_sID.shape[1]}")
print(f"Reduced number of features after PCA: {X_train_pca.shape[1]}")

# NaN-free copies for algorithms that require no missing values (the split is
# already filled, so these are plain copies kept for the existing names).
X_train_noNAN = X_train_split_sID.fillna(-110)
X_test_noNAN = X_test_split_sID.fillna(-110)

# Convert the PCA results back to DataFrames for consistency.
X_train_pca = pd.DataFrame(X_train_pca)
X_test_pca = pd.DataFrame(X_test_pca)

# Show shapes to verify consistency.
X_raw_train.shape, X_raw_test.shape, X_train_split_sID.shape, X_test_split_sID.shape, Y_train_split_sID.shape, Y_test_split_sID.shape, X_train_pca.shape, X_test_pca.shape

### Initial Test Set

In [None]:
# Per sample: how many WAP columns hold a reading (non-NaN cells per row),
# computed for the original files and for the re-drawn combined split.
waps_in_range_train = X_raw_train.notnull().sum(axis=1)
waps_in_range_test = X_raw_test.notnull().sum(axis=1)
waps_in_range_tot_train_split = X_train_tot_split.notnull().sum(axis=1)
waps_in_range_tot_test_split = X_test_tot_split.notnull().sum(axis=1)

# Violin plot for the original test (validation) file.
fig, ax = plt.subplots()
sns.violinplot(waps_in_range_test, ax=ax, label='Test Data')
ax.set_xlabel("Number of APs in range")
ax.legend()

# Summary statistics are rendered as the cell output.
waps_in_range_test.describe()

### Initial Train Set

In [None]:
# Violin plot of the number of WAPs with a reading per sample in the original
# training file.
fig, ax = plt.subplots(1, 1)

# The legend entry must name the data that is plotted: this is the training
# set (the label previously read 'Test Data', copied from the cell above).
sns.violinplot(waps_in_range_train, ax=ax, label='Train Data')
ax.set_xlabel("Number of APs in range")
ax.legend()

# Summary statistics are rendered as the cell output.
waps_in_range_train.describe()

### New Test Set

In [None]:
# Violin plot of the per-sample WAP count for the test part of the re-drawn
# combined split.
fig, ax = plt.subplots()
sns.violinplot(waps_in_range_tot_test_split, ax=ax, label='Test Data')
ax.set_xlabel("Number of APs in range")
ax.legend()

# Summary statistics are rendered as the cell output.
waps_in_range_tot_test_split.describe()

### New Train Set

In [None]:
# Violin plot of the per-sample WAP count for the training part of the
# re-drawn combined split.
fig, ax = plt.subplots()
sns.violinplot(waps_in_range_tot_train_split, ax=ax, label='Train Data')
ax.set_xlabel("Number of APs in range")
ax.legend()

# Summary statistics are rendered as the cell output.
waps_in_range_tot_train_split.describe()

### Distribution of the non-NaN values in the initial training set

In [None]:
# Flatten the training feature table into one long Series of readings.
X_stack = X_raw_train.stack(future_stack=True)

# Plot only the cells that hold a value; missing readings are dropped first.
observed_readings = X_stack.dropna()
sns.histplot(observed_readings, kde=False)

# Summary statistics of the flattened readings are rendered as the cell output.
X_stack.describe()

## Do Preprocessing

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer


def make_preprocessor(scale: bool = False, *, fill_value: float = -110):
    """Build the feature preprocessor used in front of every classifier.

    Missing WAP readings are replaced by a constant and, optionally, all
    columns are standardized afterwards. Use it in a Pipeline as
    ``('preprocessor', make_preprocessor(scale=...))``.

    Args:
        scale: If True, apply ``StandardScaler`` after imputation; if False
            (default), only impute.
        fill_value: Constant written into missing cells. Defaults to -110,
            the value used throughout this notebook.

    Returns:
        An unfitted ``ColumnTransformer`` that applies the transformer to all
        input columns.
    """
    imputer = SimpleImputer(strategy="constant", fill_value=fill_value)

    if scale:
        transformer = Pipeline([
            ("imp", imputer),
            ("scaler", StandardScaler()),
        ])
    else:
        transformer = imputer

    # slice(None) selects every column; nothing is left over for `remainder`.
    return ColumnTransformer(
        [("all_waps", transformer, slice(None))],
        remainder="drop",
    )

## Grid Search for "best" parameters

In [None]:
# Silence the noisy "pkg_resources is deprecated" warning.
import os
import warnings

# `message` is a regular expression that must match the START of the warning
# text. The previous pattern began with a single "." and therefore required
# one extra character before "pkg_resources", so it never matched; ".*" on
# both sides matches the text wherever it occurs.
warnings.filterwarnings(
    "ignore",
    message=r".*pkg_resources is deprecated as an API.*",
    category=UserWarning,
)

# Also export the filter through the environment (kept unless PYTHONWARNINGS
# is already set) so that newly started Python processes pick it up.
os.environ.setdefault("PYTHONWARNINGS", "ignore:pkg_resources is deprecated as an API:UserWarning")

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
import numpy as np

from classifierConfigs import get_classifier_configs, get_scorings, model_requires_int_labels
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

## Reload classifierConfigs so that the names bound below come from a freshly
## re-executed module rather than from the copy imported earlier.
import importlib
import classifierConfigs as cc

cc = importlib.reload(cc)

# Re-bind the helper names to the reloaded module's functions.
get_classifier_configs, get_scorings, model_requires_int_labels = (
    cc.get_classifier_configs,
    cc.get_scorings,
    cc.model_requires_int_labels,
)





# The grid search runs on the original files: the NaN-marked training data is
# used for fitting/cross-validation and the NaN-marked validation file for the
# final report. Labels are the 1-D LOC_CODE categorical Series.
X_train = X_raw_train
X_val = X_raw_test
y_train = y_train_init['LOC_CODE'].astype('category')
y_val = y_test_init['LOC_CODE'].astype('category')

# Integer-coded labels for models that require them (e.g., some XGBoost
# multiclass setups). The validation codes are derived from the TRAINING
# categories so that a code means the same label in both vectors; labels that
# do not occur in the training data are encoded as -1.
y_train_int = y_train.cat.codes
y_val_int = y_val.cat.set_categories(y_train.cat.categories).cat.codes

# Task selection: 'multiclass' or 'binary'
task = 'multiclass'

# Multi-metric scoring and classifier configurations for the selected task.
scoring = get_scorings(multiclass=(task == 'multiclass'))
refit_metric = "f1_macro"
configs = get_classifier_configs(task=task)

# Output folder for the per-model cross-validation tables.
results_dir = "artifacts/gridsearch"
os.makedirs(results_dir, exist_ok=True)
all_results = []

for name, cfg in configs.items():
    print(f"Running Grid Search for {name}...")

    # Standardize the features only for models whose name starts with "knn".
    scale_TF = name.lower().startswith('knn')
    needs_int_labels = task == 'multiclass' and model_requires_int_labels(name)

    pipe = Pipeline(steps=[
        ('preprocessor', make_preprocessor(scale=scale_TF)),
        ('classifier', cfg['model']),
    ])

    # Route every hyper-parameter to the 'classifier' step of the pipeline.
    param_grid = {
        f'classifier__{param}': values
        for param, values in cfg['param_grid'].items()
    }

    gs = GridSearchCV(
        estimator=pipe,
        param_grid=param_grid,
        scoring=scoring,
        refit=refit_metric,
        n_jobs=1,
        verbose=3,
        cv=3,
    )

    # Fit on integer codes only when the model requires them.
    if needs_int_labels:
        y_fit = y_train_int
    else:
        y_fit = y_train
    gs.fit(X_train, y_fit)

    # Per-model CV table, best configuration first when a rank column exists.
    res_df = pd.DataFrame(gs.cv_results_)
    rank_col = next(
        (
            candidate
            for candidate in (f'rank_test_{refit_metric}', 'rank_test_score')
            if candidate in res_df.columns
        ),
        None,
    )
    if rank_col is not None:
        res_df = res_df.sort_values(rank_col)
    res_df.insert(0, 'model', name)
    res_df.insert(1, 'scoring', refit_metric)
    res_df['preproc_scaled'] = scale_TF
    out_path = os.path.join(results_dir, f"{name.replace(' ', '')}_cv_results{refit_metric}.csv")
    res_df.to_csv(out_path, index=False)
    all_results.append(res_df)

    print(f"Best parameters for {name}: {gs.best_params_} | Best {refit_metric}: {gs.best_score_:.4f}")

    # Predict on the validation set with the refitted best estimator.
    y_val_pred = gs.predict(X_val)
    if needs_int_labels:
        # Translate integer codes back to labels via the training categories;
        # anything that is not a usable integer code is reported as its string.
        categories = y_train.cat.categories

        def _decode(code):
            if isinstance(code, (int, np.integer)) and code < len(categories):
                return categories[code]
            return str(code)

        y_val_pred = pd.Series(y_val_pred).map(_decode).values

    # Hand plain array-like labels to the report.
    y_val_labels = getattr(y_val, 'values', y_val)

    print(f"Validation Classification Report for {name}:\n")
    print(classification_report(y_val_labels, y_val_pred, zero_division=0))
    print("-" * 80)

# One combined table across all models, written only if at least one ran.
if all_results:
    combined_path = os.path.join(results_dir, f"UJIIndoorLoc_ALL_cv_results_{refit_metric}.csv")
    pd.concat(all_results, ignore_index=True).to_csv(combined_path, index=False)

## KNN-Classifier using Parameters from Grid Search
#### single run for preprocessing differences

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score

# Fit the impute + standardize preprocessor on the SpaceID training split and
# apply the fitted transform to the held-out split.
print("Preparing data with SpaceID \n")
X_preprocessor = make_preprocessor(scale=True)
X_std_train = X_preprocessor.fit_transform(X_train_split_sID)
X_std_test = X_preprocessor.transform(X_test_split_sID)

# Integer codes of the categorical LOC_CODE labels.
y_train_codes, y_test_codes = (
    labels['LOC_CODE'].cat.codes
    for labels in (Y_train_split_sID, Y_test_split_sID)
)

# -----------------------------------------------------------------------------
# KNN with SpaceID labels and standardized features
# -----------------------------------------------------------------------------
knn_clf = KNeighborsClassifier(n_neighbors=2, n_jobs=-1, p=1, weights='distance')
knn_clf.fit(X_std_train, y_train_codes)

# Predict the held-out split and score the predictions.
y_pred_knn = knn_clf.predict(X_std_test)
accuracy_knn = accuracy_score(y_test_codes, y_pred_knn)
f1_knn = f1_score(y_test_codes, y_pred_knn, average='weighted')

print("=============================================================================")
print("With Scaling Results:")
print(f"\nKNN Accuracy: {accuracy_knn:.4f}")
print(f"KNN F1 Score: {f1_knn:.4f}")


# -----------------------------------------------------------------------------
# KNN with SpaceID labels, imputation only (no standardization)
# -----------------------------------------------------------------------------
X_woscaling = make_preprocessor(scale=False)
X_woscl_train = X_woscaling.fit_transform(X_train_split_sID)
X_woscl_test = X_woscaling.transform(X_test_split_sID)

# Same hyper-parameters as the scaled run so only the preprocessing differs.
knn_clf_wo_scaling = KNeighborsClassifier(n_neighbors=2, n_jobs=-1, p=1, weights='distance')
knn_clf_wo_scaling.fit(X_woscl_train, y_train_codes)

# Predict the held-out split and score the predictions.
y_pred_knn_wo_scaling = knn_clf_wo_scaling.predict(X_woscl_test)
accuracy_knn_wo_scaling = accuracy_score(y_test_codes, y_pred_knn_wo_scaling)
f1_knn_wo_scaling = f1_score(y_test_codes, y_pred_knn_wo_scaling, average='weighted')

print(
    "=============================================================================",
    "Without Scaling Results:",
    f"\nKNN without Scaling Accuracy: {accuracy_knn_wo_scaling:.4f}",
    f"KNN without Scaling F1 Score: {f1_knn_wo_scaling:.4f}",
    sep="\n",
)


# -----------------------------------------------------------------------------
# KNN on the combined-data split with labels that do not include SpaceID
# -----------------------------------------------------------------------------
X_preprocessor_new_split = make_preprocessor(scale=True)
X_std_train_wospaceID = X_preprocessor_new_split.fit_transform(X_train_tot_split)
X_std_test_wospaceID = X_preprocessor_new_split.transform(X_test_tot_split)

# Integer codes of the categorical LOC_CODE labels of the combined split.
y_train_codes_wospaceID, y_test_codes_wospaceID = (
    labels['LOC_CODE'].cat.codes
    for labels in (Y_train_tot_split, Y_test_tot_split)
)

knn_clf_wospaceID = KNeighborsClassifier(n_neighbors=2, n_jobs=-1, p=1, weights='distance')
knn_clf_wospaceID.fit(X_std_train_wospaceID, y_train_codes_wospaceID)

# Predict the held-out part and score the predictions.
y_pred_knn_wospaceID = knn_clf_wospaceID.predict(X_std_test_wospaceID)
accuracy_knn_wospaceID = accuracy_score(y_test_codes_wospaceID, y_pred_knn_wospaceID)
f1_knn_wospaceID = f1_score(y_test_codes_wospaceID, y_pred_knn_wospaceID, average='weighted')

print("\n")
print(
    "=============================================================================",
    "KNN Results without SpaceID:",
    "=============================================================================",
    "Without SpaceID Results:",
    f"\nKNN without SpaceID Accuracy: {accuracy_knn_wospaceID:.4f}",
    f"KNN without SpaceID F1 Score: {f1_knn_wospaceID:.4f}\n",
    sep="\n",
)

# -----------------------------------------------------------------------------
# KNN on the initial split: train on the training file, test on the
# validation file
# -----------------------------------------------------------------------------
X_train_init = X_raw_train
X_test_init = X_raw_test

X_preprocessor_init = make_preprocessor(scale=True)
X_std_train_init = X_preprocessor_init.fit_transform(X_train_init)
X_std_test_init = X_preprocessor_init.transform(X_test_init)

# y_train_init / y_test_init are single-column DataFrames. Select the column
# so the estimator and the metrics receive 1-D label vectors; passing the
# DataFrame makes scikit-learn emit a DataConversionWarning about a
# column-vector y.
y_train_init_labels = y_train_init['LOC_CODE']
y_test_init_labels = y_test_init['LOC_CODE']

knn_clf_init = KNeighborsClassifier(n_neighbors=2, n_jobs=-1, p=1, weights='distance')
knn_clf_init.fit(X_std_train_init, y_train_init_labels)

# Predict on test set
y_pred_knn_init = knn_clf_init.predict(X_std_test_init)
# Evaluate
accuracy_knn_init = accuracy_score(y_test_init_labels, y_pred_knn_init)
f1_knn_init = f1_score(y_test_init_labels, y_pred_knn_init, average='weighted')
print("\n")
print("=============================================================================")
print("With initial Test/Train-Split Results:")
print(f"\nKNN without SpaceID Accuracy: {accuracy_knn_init:.4f}")
print(f"KNN without SpaceID F1 Score: {f1_knn_init:.4f}\n")

import os

# Collect the four KNN runs of this cell in one table and write it to CSV.
results = {
    'Model': ['KNN with SpaceID and Scaling', 'KNN with SpaceID without Scaling', 'KNN without SpaceID', 'KNN initial split'],
    'Accuracy': [accuracy_knn, accuracy_knn_wo_scaling, accuracy_knn_wospaceID, accuracy_knn_init],
    'F1 Score': [f1_knn, f1_knn_wo_scaling, f1_knn_wospaceID, f1_knn_init]
}
results_df = pd.DataFrame(results)

# The 'artifacts' folder is otherwise only created by the grid-search cell;
# create it here so this cell also works when that cell was skipped.
os.makedirs('artifacts', exist_ok=True)
results_df.to_csv('artifacts/knn_sgl_ujiindoorloc_results.csv', index=False)
print("Results have been saved to 'artifacts/knn_sgl_ujiindoorloc_results.csv'")

## Random Forest using parameters from Grid Search
#### single run for preprocessing differences

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score

# Work on the SpaceID train/validation split created earlier in the notebook
# (X_train_split_sID, X_test_split_sID, Y_train_split_sID, Y_test_split_sID).
X_train, X_val = X_train_split_sID, X_test_split_sID
y_train = Y_train_split_sID['LOC_CODE'].astype('category')
y_val = Y_test_split_sID['LOC_CODE'].astype('category')

# Integer codes of the categorical labels; these are what the forest is
# trained and scored on.
y_train_int, y_val_int = y_train.cat.codes, y_val.cat.codes

# -----------------------------------------------------------------------------
# Random Forest with SpaceID
# -----------------------------------------------------------------------------
print("Training Random Forest with SpaceID...")
clf = RandomForestClassifier(n_estimators=500, random_state=42, n_jobs=-1, max_features='log2')
clf.fit(X_train, y_train_int)

# Predict the held-out split and score the predictions.
y_pred = clf.predict(X_val)
accuracy = accuracy_score(y_val_int, y_pred)
f1 = f1_score(y_val_int, y_pred, average='weighted')

print(
    "=============================================================================",
    "Random Forest with SpaceID Results:",
    f"\nRandom Forest Accuracy with SpaceID: {accuracy:.4f}",
    f"Random Forest F1 Score with SpaceID: {f1:.4f}\n",
    sep="\n",
)

# -----------------------------------------------------------------------------
# Random Forest without SpaceID, on the combined-data split
# (X_train_tot_split / X_test_tot_split from earlier in the notebook)
# -----------------------------------------------------------------------------
# Imputation only; the "std" in the variable names is historical -- no
# scaling is applied here.
X_preprocessor_clf = make_preprocessor(scale=False)
X_std_train_wospaceID = X_preprocessor_clf.fit_transform(X_train_tot_split)
X_std_test_wospaceID = X_preprocessor_clf.transform(X_test_tot_split)

# Integer codes of the categorical LOC_CODE labels of the combined split.
y_train_codes_wospaceID, y_test_codes_wospaceID = (
    labels['LOC_CODE'].cat.codes
    for labels in (Y_train_tot_split, Y_test_tot_split)
)

print("\n")
print("Training Random Forest without SpaceID...")

clf_wo_spaceID = RandomForestClassifier(n_estimators=500, random_state=42, n_jobs=-1, max_features='log2')
clf_wo_spaceID.fit(X_std_train_wospaceID, y_train_codes_wospaceID)

# Predict the held-out part and score the predictions.
y_pred_wo_spaceID = clf_wo_spaceID.predict(X_std_test_wospaceID)
accuracy_wo_spaceID = accuracy_score(y_test_codes_wospaceID, y_pred_wo_spaceID)
f1_wo_spaceID = f1_score(y_test_codes_wospaceID, y_pred_wo_spaceID, average='weighted')

print(
    "=============================================================================",
    "Random Forest without SpaceID Results:",
    f"\nRandom Forest without SpaceID Accuracy: {accuracy_wo_spaceID:.4f}",
    f"Random Forest without SpaceID F1 Score: {f1_wo_spaceID:.4f}",
    sep="\n",
)


# Random Forest on the initial split: train on the training file, test on the
# validation file (imputation only, no scaling).
print("\n")
print("Training Random Forest with initial Split...")
X_preprocessor_init_clf = make_preprocessor(scale=False)
X_std_train_init = X_preprocessor_init_clf.fit_transform(X_raw_train)
X_std_test_init = X_preprocessor_init_clf.transform(X_raw_test)
y_train_init_clf = y_train_init['LOC_CODE'].astype('category')
y_test_init_clf = y_test_init['LOC_CODE'].astype('category')
y_train_init_codes = y_train_init_clf.cat.codes
# The two label Series were made categorical independently, so their integer
# codes only agree when both happen to contain exactly the same set of labels.
# Encode the test labels with the TRAINING categories so a code always means
# the same location; labels unseen in training become -1 and count as errors.
y_test_init_codes = y_test_init_clf.cat.set_categories(y_train_init_clf.cat.categories).cat.codes

clf_init = RandomForestClassifier(n_estimators=500, random_state=42, n_jobs=-1, max_features='log2')
clf_init.fit(X_std_train_init, y_train_init_codes)
y_pred_init = clf_init.predict(X_std_test_init)
accuracy_init = accuracy_score(y_test_init_codes, y_pred_init)
f1_init = f1_score(y_test_init_codes, y_pred_init, average='weighted')
print("=============================================================================")
print("Random Forest with initial Split Results:")
print(f"\nRandom Forest with initial Split Accuracy: {accuracy_init:.4f}")
print(f"Random Forest with initial Split F1 Score: {f1_init:.4f}\n")

## XGBoost using parameters from Grid Search
#### single run for preprocessing differences

In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score


# Work on the SpaceID train/validation split created earlier in the notebook
# (X_train_split_sID, X_test_split_sID, Y_train_split_sID, Y_test_split_sID).
X_train, X_val = X_train_split_sID, X_test_split_sID
y_train = Y_train_split_sID['LOC_CODE'].astype('category')
y_val = Y_test_split_sID['LOC_CODE'].astype('category')

# Integer-coded labels for models that require them (e.g., some XGBoost
# multiclass setups).
y_train_int, y_val_int = y_train.cat.codes, y_val.cat.codes

# -----------------------------------------------------------------------------
# XGBoost with SpaceID
# -----------------------------------------------------------------------------
print("Training XGBoost with SpaceID...")
xgb_clf = XGBClassifier(eval_metric='mlogloss', objective='multi:softprob', random_state=42, n_jobs=-1, learning_rate=0.1, n_estimators=500, max_depth=10)
xgb_clf.fit(X_train, y_train_int)

# Predict the held-out split and score the predictions.
# NOTE: despite its name, f1_xgb_wo_scaling holds the F1 score of this
# SpaceID run; the name is kept because it is a notebook-level variable.
y_pred_xgb = xgb_clf.predict(X_val)
accuracy_xgb = accuracy_score(y_val_int, y_pred_xgb)
f1_xgb_wo_scaling = f1_score(y_val_int, y_pred_xgb, average='weighted')

print(
    "=============================================================================",
    "XGBoost with SpaceID Results:",
    f"\nXGBoost Accuracy with SpaceID: {accuracy_xgb:.4f}",
    f"XGBoost F1 Score with SpaceID: {f1_xgb_wo_scaling:.4f}\n",
    sep="\n",
)

# -----------------------------------------------------------------------------
# XGBoost without SpaceID, on the combined-data split
# (X_train_tot_split / X_test_tot_split from earlier in the notebook)
# -----------------------------------------------------------------------------
# Imputation only; the "std" in the variable names is historical -- no
# scaling is applied here.
X_preprocessor_woSpaceID_xgb_clf = make_preprocessor(scale=False)
X_std_train_wospaceID = X_preprocessor_woSpaceID_xgb_clf.fit_transform(X_train_tot_split)
X_std_test_wospaceID = X_preprocessor_woSpaceID_xgb_clf.transform(X_test_tot_split)

# Integer codes of the categorical LOC_CODE labels of the combined split.
y_train_codes_wospaceID, y_test_codes_wospaceID = (
    labels['LOC_CODE'].cat.codes
    for labels in (Y_train_tot_split, Y_test_tot_split)
)

print("Training XGBoost without SpaceID...")

xgb_clf_wo_spaceID = XGBClassifier(eval_metric='mlogloss', objective='multi:softprob', random_state=42, learning_rate=0.1, n_estimators=500, max_depth=10, n_jobs=-1)
xgb_clf_wo_spaceID.fit(X_std_train_wospaceID, y_train_codes_wospaceID)

# Predict the held-out part and score the predictions.
y_pred_xgb_wo_spaceID = xgb_clf_wo_spaceID.predict(X_std_test_wospaceID)
accuracy_xgb_wo_spaceID = accuracy_score(y_test_codes_wospaceID, y_pred_xgb_wo_spaceID)
f1_xgb_wo_spaceID = f1_score(y_test_codes_wospaceID, y_pred_xgb_wo_spaceID, average='weighted')

print(
    "=============================================================================",
    "XGBoost without SpaceID Results:",
    f"\nXGBoost without SpaceID Accuracy: {accuracy_xgb_wo_spaceID:.4f}",
    f"XGBoost without SpaceID F1 Score: {f1_xgb_wo_spaceID:.4f}",
    sep="\n",
)



# XGBoost on the initial split: train on the training file, test on the
# validation file (imputation only, no scaling).
print("\n")
print("Training XGBoost with initial Split...")
X_preprocessor_init_xgb_clf = make_preprocessor(scale=False)
X_std_train_init = X_preprocessor_init_xgb_clf.fit_transform(X_raw_train)
X_std_test_init = X_preprocessor_init_xgb_clf.transform(X_raw_test)
y_train_init_xgb_clf = y_train_init['LOC_CODE'].astype('category')
y_test_init_xgb_clf = y_test_init['LOC_CODE'].astype('category')
y_train_init_codes = y_train_init_xgb_clf.cat.codes
# The two label Series were made categorical independently, so their integer
# codes only agree when both happen to contain exactly the same set of labels.
# Encode the test labels with the TRAINING categories so a code always means
# the same location; labels unseen in training become -1 and count as errors.
y_test_init_codes = y_test_init_xgb_clf.cat.set_categories(y_train_init_xgb_clf.cat.categories).cat.codes

xgb_clf_init = XGBClassifier(eval_metric='mlogloss', objective='multi:softprob', random_state=42, learning_rate=0.1, n_estimators=500, max_depth=10, n_jobs=-1)
xgb_clf_init.fit(X_std_train_init, y_train_init_codes)
y_pred_init = xgb_clf_init.predict(X_std_test_init)
accuracy_init = accuracy_score(y_test_init_codes, y_pred_init)
f1_init = f1_score(y_test_init_codes, y_pred_init, average='weighted')
print("=============================================================================")
print("XGBoost with initial Split Results:")
print(f"\nXGBoost with initial Split Accuracy: {accuracy_init:.4f}")
print(f"XGBoost with initial Split F1 Score: {f1_init:.4f}\n")