In [1]:
import pandas as pd

# Load the processed model-input samples and preview the first rows.
INPUT_CSV = "../../data/processed/model_input_samples.csv"
df = pd.read_csv(INPUT_CSV)
df.head()

Unnamed: 0,LONGITUDE,LATITUDE,SOURCE,gravity_iso_residual,gravity_cscba,gravity_cscba_1vd,mag_uc_1_2km,mag_uc_2_4km,mag_uc_4_8km,mag_uc_8_12km,mag_uc_12_16km,radio_K_pct,radio_Th_ppm,radio_U_ppm,radio_Th_K_ratio,radio_U_K_ratio,radio_U_Th_ratio,LABEL
0,134.324653,-27.294063,blank_area,-204.018005,-426.38882,-309.04285,-19.665098,-33.036575,-46.971394,-30.871347,-21.795435,0.759041,9.374757,1.196567,12.357069,1.57767,0.127658,0
1,148.050504,-32.937903,positive,93.298981,-186.38301,-2674.9626,-44.330212,23.895699,107.432144,82.715973,57.587337,1.154886,8.526972,1.396929,7.434864,1.215049,0.16399,1
2,119.0271,-22.9757,other_deposit,-200.687836,-739.63226,-1203.5088,-442.748383,-354.920288,-211.401382,-84.787079,-53.003239,0.126347,3.217272,0.57241,16.086359,2.862052,0.178862,0
3,121.464232,-23.649192,blank_area,-163.918274,-592.99493,-536.7037,18.632324,31.867907,50.295372,35.592964,25.073792,0.18421,3.271043,0.3693,16.355215,1.846498,0.113002,0
4,142.4699,-35.1686,other_deposit,-81.172989,-139.28423,-507.28357,-0.266748,-0.374094,-0.820666,-0.999457,-1.124462,0.413259,4.779623,0.953914,11.560739,2.311097,0.200272,0


### Formatting and Column Standardization

- Ensured all column names use lowercase with underscores (`snake_case`) for consistency.
- Converted categorical variables (e.g., `source`) to Pandas `category` dtype.
- Ensured the target column `label` is of integer type.
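- No one-hot encoding was applied at this stage: the tree-based models used (e.g., XGBoost, Random Forest) do not require dummy variables. Note that native categorical support depends on the implementation (e.g., XGBoost needs `enable_categorical=True`, and scikit-learn's Random Forest expects numeric inputs), so `source` must still be encoded, or dropped, if it is used as a feature.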

In [2]:
# Step 1: Standardize Column Names
# Trim surrounding whitespace, lowercase everything, then turn spaces into
# underscores so every header follows the same snake_case convention.
clean_names = df.columns.str.strip()
clean_names = clean_names.str.lower()
clean_names = clean_names.str.replace(" ", "_")
df.columns = clean_names
print("Standardized columns:", list(df.columns))

Standardized columns: ['longitude', 'latitude', 'source', 'gravity_iso_residual', 'gravity_cscba', 'gravity_cscba_1vd', 'mag_uc_1_2km', 'mag_uc_2_4km', 'mag_uc_4_8km', 'mag_uc_8_12km', 'mag_uc_12_16km', 'radio_k_pct', 'radio_th_ppm', 'radio_u_ppm', 'radio_th_k_ratio', 'radio_u_k_ratio', 'radio_u_th_ratio', 'label']


In [5]:
# Step 2: Check and Correct Data Types
print("Before type check:\n", df.dtypes)

# Desired dtypes, applied in order: the target column becomes an integer and
# 'source' becomes a categorical (useful if it is used later).
target_dtypes = {"label": int, "source": "category"}
for column_name, dtype in target_dtypes.items():
    df[column_name] = df[column_name].astype(dtype)

print("\nAfter type check:\n", df.dtypes)


Before type check:
 longitude                float64
latitude                 float64
source                  category
gravity_iso_residual     float64
gravity_cscba            float64
gravity_cscba_1vd        float64
mag_uc_1_2km             float64
mag_uc_2_4km             float64
mag_uc_4_8km             float64
mag_uc_8_12km            float64
mag_uc_12_16km           float64
radio_k_pct              float64
radio_th_ppm             float64
radio_u_ppm              float64
radio_th_k_ratio         float64
radio_u_k_ratio          float64
radio_u_th_ratio         float64
label                      int32
dtype: object

After type check:
 longitude                float64
latitude                 float64
source                  category
gravity_iso_residual     float64
gravity_cscba            float64
gravity_cscba_1vd        float64
mag_uc_1_2km             float64
mag_uc_2_4km             float64
mag_uc_4_8km             float64
mag_uc_8_12km            float64
mag_uc_12_16km        

In [6]:
# Step 3: Optional One-Hot Encoding for 'source'
# Only needed when 'source' is fed to a model that cannot consume
# categorical columns directly.

# Set to True to one-hot encode 'source'; leave False to keep it categorical.
encode_source = False

if not encode_source:
    print("'source' kept as categorical, not encoded.")
else:
    # drop_first=True omits one dummy column so the rest are not redundant.
    df = pd.get_dummies(df, columns=['source'], drop_first=True)
    print("✅ Applied one-hot encoding to 'source'")


'source' kept as categorical, not encoded.


In [7]:
# Step 4: Save Cleaned Dataset
# index=False keeps the DataFrame's row index out of the file.
# NOTE: CSV is plain text, so the 'category' dtype set on 'source' is not
# stored in the file; re-apply it after reading the file back if needed.
output_csv = "../../data/processed/train_dataset_formatted.csv"
df.to_csv(output_csv, index=False)
print("Saved cleaned dataset to 'train_dataset_formatted.csv'")


Saved cleaned dataset to 'train_dataset_formatted.csv'
