In [1]:
# STEP 1: CLEAN THE UPLINK DATASET
import pandas as pd
import numpy as np

# Load the raw UL data
df = pd.read_excel("../data/raw/5G_UL.xlsx", sheet_name="Series Formatted Data")

# Drop completely useless or unnamed columns
drop_cols = [
    'Message', 'Time', 'Technology_Mode',
    'Unnamed: 27', 'Unnamed: 48', 'Unnamed: 55',
    'NR_UE_RRCReEst_EndResult', 'NR_RRC_MsgType',
    'NAS_5GS_MM_MessageType', 'NAS_5GS_SM_MessageType'
]
df = df.drop(columns=drop_cols, errors="ignore")

# Drop extremely sparse and noisy columns (>99.9% missing)
too_sparse_cols = [
    'NR_UE_Nbr_PCI_3', 'NR_UE_Nbr_PCI_4',
    'NR_UE_Nbr_RSRP_3', 'NR_UE_Nbr_RSRP_4',
    'NR_UE_Nbr_RSRQ_3', 'NR_UE_Nbr_RSRQ_4',
    'NR_UE_Power_Tx_PRACH_0', 'NR_UE_RACH_Procedure_Count'
]
df = df.drop(columns=[col for col in too_sparse_cols if col in df.columns])
# Drop residual DL-related columns that are >96% missing and not relevant for UL
dl_only_cols = [
    'NR_UE_NACK_Rate_DL_0', 'NR_UE_Ack_As_Nack_DL_0',
    'NR_UE_RB_Num_DL_0', 'NR_UE_BLER_DL_0',
    'NR_UE_Pathloss_DL_0', 'NR_UE_Throughput_PDCP_DL'
]
df = df.drop(columns=[col for col in dl_only_cols if col in df.columns])

# Apply domain-specific fills for signal metrics
domain_fills = {
    "NR_UE_RSRP_0": -200,
    "NR_UE_RSRQ_0": -30,
    "NR_UE_SINR_0": -10,
    "NR_UE_PCI_0": -1,
    "NR_UE_Timing_Advance": 0
}
for col, val in domain_fills.items():
    if col in df.columns:
        df[col] = df[col].fillna(val)

# Fill binary indicators with 0
binary_cols = [
    'NR_UE_RACH_Attempt', 'NR_UE_RACH_OK', 'NR_UE_RACH_Fail',
    'NR_UE_RRCReEstAttempt', 'NR_UE_RRCReEstFail',
    'NR_UE_RRCConnectionAttempt', 'NR_UE_RRCConnectionSetupOk',
    'NR_UE_RRCConnectionComplete', 'NR_UE_RRCConnectionDrop',
    'NR_UE_RRCHOAttempt', 'NR_UE_RRCHOOK'
]
for col in binary_cols:
    if col in df.columns:
        df[col] = df[col].fillna(0)

# Fill continuous UL stats with median
cont_cols = [
    'NR_UE_Pathloss_UL_0', 'NR_UE_Power_Tx_PUSCH_0',
    'NR_UE_NACK_Rate_UL_0', 'App_Throughput_UL',
    'NR_UE_RB_Num_UL_0', 'NR_UE_Throughput_PDCP_UL',
    'NR_UE_Throughput_RLC_UL'
]
for col in cont_cols:
    if col in df.columns:
        df[col] = df[col].fillna(df[col].median())

# Neighbor PCI/RSRP/RSRQ imputation for indices 0‚Äì2
for i in range(3):
    pci = f"NR_UE_Nbr_PCI_{i}"
    rsrp = f"NR_UE_Nbr_RSRP_{i}"
    rsrq = f"NR_UE_Nbr_RSRQ_{i}"
    if pci in df.columns:
        df[pci] = df[pci].fillna(-1)
    if rsrp in df.columns:
        df[rsrp] = df[rsrp].fillna(-200)
    if rsrq in df.columns:
        df[rsrq] = df[rsrq].fillna(-30)

# Drop object/string columns if any remain
df = df.drop(columns=df.select_dtypes(include="object").columns)

# Drop rows with missing target labels
df = df.dropna(subset=["Latitude", "Longitude"])

# ‚úÖ Final clean dataset shape
print("‚úÖ Cleaned UL data shape:", df.shape)


‚úÖ Cleaned UL data shape: (59806, 31)


In [4]:
ul_data = pd.read_excel("../data/raw/5G_UL.xlsx", sheet_name="Series Formatted Data")


In [4]:
ul_data.shape

(59808, 58)

In [None]:
corr = df.corr(numeric_only=True)
corr_targets = corr[["Latitude", "Longitude"]].abs().sort_values(by="Latitude", ascending=False)
print(corr_targets.head(20))


                            Latitude  Longitude
Latitude                    1.000000   0.169186
NR_UE_Nbr_PCI_3             0.691817   0.073955
NR_UE_Nbr_RSRP_3            0.515529   0.354618
NR_UE_RSRP_0                0.468417   0.525664
NR_UE_Nbr_RSRP_0            0.462965   0.589375
NR_UE_RACH_Procedure_Count  0.446930   0.273460
NR_UE_Pathloss_DL_0         0.427640   0.584995
NR_UE_Timing_Advance        0.422446   0.514119
Message                     0.418771   0.376307
NR_UE_Power_Tx_PUSCH_0      0.407813   0.463658
App_Throughput_UL           0.402850   0.401237
NR_UE_Power_Tx_PRACH_0      0.373070   0.388609
NR_UE_PCI_0                 0.357701   0.150954
NR_UE_Throughput_PDCP_DL    0.337426   0.373663
NR_UE_Throughput_RLC_UL     0.336048   0.427333
NR_UE_NACK_Rate_UL_0        0.294881   0.169668
NR_UE_Nbr_RSRP_1            0.283800   0.749661
NR_UE_RSRQ_0                0.244936   0.357285
NR_UE_SINR_0                0.237582   0.331359
NR_UE_Nbr_RSRQ_2            0.220082   0

In [2]:
# Show summary of missing values
missing = df.isna().sum()
missing = missing[missing > 0].sort_values(ascending=False)

if not missing.empty:
    print("\nüîç Columns with Missing Values:\n")
    print(missing.to_frame("Missing Count").assign(Missing_Percent=lambda x: 100 * x["Missing Count"] / len(df)))
else:
    print("‚úÖ No missing values remain. Data is clean.")


‚úÖ No missing values remain. Data is clean.


In [8]:
ul_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59808 entries, 0 to 59807
Data columns (total 58 columns):
 #   Column                       Non-Null Count  Dtype         
---  ------                       --------------  -----         
 0   Message                      59808 non-null  int64         
 1   Time                         59808 non-null  datetime64[ns]
 2   Longitude                    59806 non-null  float64       
 3   Latitude                     59806 non-null  float64       
 4   Technology_Mode              59808 non-null  object        
 5   NR_UE_PCI_0                  2415 non-null   float64       
 6   NR_UE_RSRP_0                 1834 non-null   float64       
 7   NR_UE_RSRQ_0                 1834 non-null   float64       
 8   NR_UE_SINR_0                 1796 non-null   float64       
 9   NR_UE_Nbr_PCI_0              1330 non-null   float64       
 10  NR_UE_Nbr_PCI_1              444 non-null    float64       
 11  NR_UE_Nbr_PCI_2              103 non-null

In [3]:
# STEP 2: FEATURE IMPORTANCE + PCA for UPLINK
from sklearn.ensemble import RandomForestRegressor
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import pandas as pd

# 1. Separate features and targets
X_ul = df.drop(columns=["Latitude", "Longitude"], errors="ignore")
y_lat_ul = df["Latitude"]
y_lon_ul = df["Longitude"]

# 2. Feature Importance - Latitude
rf_lat_ul = RandomForestRegressor(n_estimators=100, random_state=42)
rf_lat_ul.fit(X_ul, y_lat_ul)
importances_lat_ul = pd.Series(rf_lat_ul.feature_importances_, index=X_ul.columns)

# 3. Feature Importance - Longitude
rf_lon_ul = RandomForestRegressor(n_estimators=100, random_state=42)
rf_lon_ul.fit(X_ul, y_lon_ul)
importances_lon_ul = pd.Series(rf_lon_ul.feature_importances_, index=X_ul.columns)

# 4. Print sorted importances
print("\nüåç UL Feature Importances (Latitude):")
print(importances_lat_ul.sort_values(ascending=False))

print("\nüß≠ UL Feature Importances (Longitude):")
print(importances_lon_ul.sort_values(ascending=False))

# 5. PCA
scaler_ul = StandardScaler()
X_ul_scaled = scaler_ul.fit_transform(X_ul)

pca_ul = PCA(n_components=10)
pca_ul.fit(X_ul_scaled)
print("\nüìä UL PCA Explained Variance Ratios:")
print(pca_ul.explained_variance_ratio_)



üåç UL Feature Importances (Latitude):
NR_UE_PCI_0                    0.287185
NR_UE_Throughput_RLC_UL        0.243809
NR_UE_Power_Tx_PUSCH_0         0.129276
NR_UE_NACK_Rate_UL_0           0.078114
App_Throughput_UL              0.065445
NR_UE_RSRP_0                   0.056439
NR_UE_RSRQ_0                   0.054020
NR_UE_Nbr_PCI_0                0.024319
NR_UE_SINR_0                   0.012160
NR_UE_Nbr_RSRP_0               0.009433
NR_UE_Nbr_PCI_1                0.007511
NR_UE_Nbr_RSRQ_1               0.006127
NR_UE_Timing_Advance           0.005683
NR_UE_Nbr_RSRP_1               0.005371
NR_UE_Nbr_RSRQ_0               0.004265
NR_UE_Nbr_RSRP_2               0.003409
NR_UE_RRCHOOK                  0.001621
NR_UE_RRCConnectionDrop        0.001471
NR_UE_RRCReEstFail             0.001064
NR_UE_RRCHOAttempt             0.000761
NR_UE_Nbr_RSRQ_2               0.000740
NR_UE_RRCConnectionSetupOk     0.000611
NR_UE_RRCConnectionAttempt     0.000607
NR_UE_Nbr_PCI_2                0.000342

In [5]:
ul2 = pd.read_excel("../data/Sample_Data_2/5G_UL.xlsx", sheet_name="Series Formatted Data")

In [8]:
# Get sets of column names
columns_dl2 = set(ul2.columns)
columns_dl_data = set(ul_data.columns)
columns_X_selected = set(X_ul_selected.columns)

# Find common columns between dl2 and dl_data
common_columns = columns_dl2 & columns_dl_data

# Compare X_selected columns to common columns
covered_columns = columns_X_selected & common_columns
missing_in_common = columns_X_selected - common_columns
extra_in_common = common_columns - columns_X_selected

# Print results
print("‚úÖ Columns in X_selected that are covered by both dl2 and dl_data:\n", covered_columns)
print("\n‚ùå Columns in X_selected that are NOT in both dl2 and dl_data:\n", missing_in_common)
print("\n‚ÑπÔ∏è Extra common columns NOT used in X_selected (might be useful features):\n", extra_in_common)


‚úÖ Columns in X_selected that are covered by both dl2 and dl_data:
 {'NR_UE_Nbr_RSRQ_0', 'NR_UE_Power_Tx_PUSCH_0', 'NR_UE_Nbr_PCI_0', 'NR_UE_Nbr_PCI_1', 'NR_UE_SINR_0', 'NR_UE_RSRP_0', 'NR_UE_PCI_0', 'NR_UE_NACK_Rate_UL_0', 'NR_UE_Nbr_RSRP_0', 'NR_UE_Throughput_RLC_UL', 'NR_UE_Timing_Advance', 'NR_UE_Nbr_RSRP_1', 'NR_UE_Nbr_RSRQ_1', 'NR_UE_RSRQ_0', 'App_Throughput_UL'}

‚ùå Columns in X_selected that are NOT in both dl2 and dl_data:
 set()

‚ÑπÔ∏è Extra common columns NOT used in X_selected (might be useful features):
 {'NR_UE_RACH_Attempt', 'NR_UE_RRCReEstFail', 'NR_UE_BLER_DL_0', 'NR_RRC_MsgType', 'NR_UE_Throughput_PDCP_DL', 'NR_UE_CCE_AggregationLev_0', 'Longitude', 'NR_UE_Modulation_Avg_DL_0', 'NR_UE_RACH_Fail', 'Message', 'Technology_Mode', 'NR_UE_Nbr_PCI_3', 'NAS_5GS_MM_MessageType', 'NR_UE_Nbr_RSRP_2', 'NR_UE_RI_DL_0', 'Latitude', 'NR_UE_RRCConnectionAttempt', 'NR_UE_Nbr_RSRQ_3', 'NR_UE_RRCHOAttempt', 'NR_UE_Ack_As_Nack_DL_0', 'NR_UE_MCS_DL_0', 'NR_UE_RACH_Procedure_Count', 'NR

In [11]:
X_ul_selected.columns

Index(['NR_UE_Throughput_RLC_UL', 'NR_UE_PCI_0', 'NR_UE_Power_Tx_PUSCH_0',
       'App_Throughput_UL', 'NR_UE_NACK_Rate_UL_0', 'NR_UE_RSRP_0',
       'NR_UE_Nbr_PCI_0', 'NR_UE_RSRQ_0', 'NR_UE_Nbr_RSRP_0',
       'NR_UE_Nbr_RSRQ_0', 'NR_UE_Nbr_RSRP_1', 'NR_UE_SINR_0',
       'NR_UE_Nbr_PCI_1', 'NR_UE_Nbr_RSRQ_1', 'NR_UE_Timing_Advance'],
      dtype='object')

In [9]:
# Step 1: Get the sets of column names
columns_dl2 = set(ul2.columns)
columns_dl_data = set(ul_data.columns)

# Step 2: Find common columns
common_columns = columns_dl2 & columns_dl_data

# Step 3: Find columns only in each DataFrame
only_in_dl2 = columns_dl2 - columns_dl_data
only_in_dl_data = columns_dl_data - columns_dl2

# Print the results
print("‚úÖ Common columns:\n", common_columns)
print("\nüî¥ Columns only in dl2:\n", only_in_dl2)
print("\nüîµ Columns only in dl_data:\n", only_in_dl_data)


‚úÖ Common columns:
 {'NR_UE_RACH_Attempt', 'NR_UE_Power_Tx_PUSCH_0', 'NR_UE_Nbr_PCI_1', 'NR_UE_Nbr_RSRP_1', 'NR_UE_RRCReEstFail', 'NR_UE_BLER_DL_0', 'NR_RRC_MsgType', 'NR_UE_Throughput_PDCP_DL', 'NR_UE_CCE_AggregationLev_0', 'Longitude', 'NR_UE_RACH_Fail', 'NR_UE_Modulation_Avg_DL_0', 'Technology_Mode', 'Message', 'NR_UE_Nbr_PCI_3', 'NAS_5GS_MM_MessageType', 'NR_UE_SINR_0', 'NR_UE_Nbr_RSRP_2', 'NR_UE_RI_DL_0', 'NR_UE_Nbr_RSRP_0', 'NR_UE_NACK_Rate_UL_0', 'Latitude', 'NR_UE_Nbr_RSRQ_1', 'NR_UE_RRCConnectionAttempt', 'NR_UE_Nbr_RSRQ_3', 'NR_UE_RRCHOAttempt', 'NR_UE_Ack_As_Nack_DL_0', 'NR_UE_MCS_DL_0', 'NR_UE_PCI_0', 'NR_UE_RACH_Procedure_Count', 'NR_UE_RACH_OK', 'NR_UE_RRCConnectionDrop', 'NR_UE_RRCReEstAttempt', 'NR_UE_Timing_Advance', 'NR_UE_RSRQ_0', 'NR_UE_Nbr_RSRP_3', 'NR_UE_Nbr_PCI_2', 'NR_UE_RRCHOOK', 'NR_UE_RRCConnectionSetupOk', 'App_Throughput_UL', 'NR_UE_NACK_Rate_DL_0', 'NR_UE_RSRP_0', 'NR_UE_Pathloss_DL_0', 'NR_UE_RB_Num_DL_0', 'NR_UE_RRCReEst_EndResult', 'NR_UE_Power_Tx_PRAC

In [7]:
# STEP 3: Uplink Feature Selection based on average importance
import pandas as pd

# Average the importances across both targets
avg_importance = (importances_lat_ul + importances_lon_ul) / 2
avg_importance = avg_importance.sort_values(ascending=False)

# Optionally: print cumulative contribution
cumulative = avg_importance.cumsum() / avg_importance.sum()
print("üìà Cumulative Contribution:\n", cumulative)

# Keep features up to ~99% cumulative importance
selected_features = cumulative[cumulative < 0.99].index.tolist()
print(f"‚úÖ Selected {len(selected_features)} important features.")

# Subset your X matrix
X_ul_selected = X_ul[selected_features]


üìà Cumulative Contribution:
 NR_UE_Throughput_RLC_UL        0.273920
NR_UE_PCI_0                    0.480053
NR_UE_Power_Tx_PUSCH_0         0.616119
App_Throughput_UL              0.697335
NR_UE_NACK_Rate_UL_0           0.776899
NR_UE_RSRP_0                   0.852376
NR_UE_Nbr_PCI_0                0.883538
NR_UE_RSRQ_0                   0.914626
NR_UE_Nbr_RSRP_0               0.937338
NR_UE_Nbr_RSRQ_0               0.950182
NR_UE_Nbr_RSRP_1               0.959609
NR_UE_SINR_0                   0.967950
NR_UE_Nbr_PCI_1                0.976283
NR_UE_Nbr_RSRQ_1               0.983424
NR_UE_Timing_Advance           0.988842
NR_UE_Nbr_RSRQ_2               0.992178
NR_UE_Nbr_RSRP_2               0.994028
NR_UE_RRCHOOK                  0.995244
NR_UE_Nbr_PCI_2                0.996210
NR_UE_RRCConnectionDrop        0.997082
NR_UE_RRCHOAttempt             0.997871
NR_UE_RRCReEstFail             0.998486
NR_UE_RRCConnectionAttempt     0.998996
NR_UE_RRCConnectionSetupOk     0.999486
NR_UE_RAC

In [24]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
import numpy as np

# ---- 1. Data Preparation ----
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_ul_selected)
y = df[["Latitude", "Longitude"]].values  # Assuming this is your uplink dataframe

X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42
)

X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32)

# ---- 2. Model Definition ----
class StructuredMLP(nn.Module):
    def __init__(self, input_dim, output_uncertainty=False):
        super().__init__()
        self.output_uncertainty = output_uncertainty

        self.net = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU()
        )

        self.coord_head = nn.Linear(64, 2)
        self.uncertainty_head = nn.Linear(64, 1) if output_uncertainty else None

    def forward(self, x):
        x = self.net(x)
        coords = self.coord_head(x)
        log_var = self.uncertainty_head(x) if self.output_uncertainty else torch.zeros(len(x), 1)
        return coords, log_var

# ---- 3. Training ----
model = StructuredMLP(input_dim=X_train.shape[1], output_uncertainty=True)
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.MSELoss()

epochs = 100
for epoch in range(epochs):
    model.train()
    optimizer.zero_grad()
    preds, _ = model(X_train_tensor)
    loss = criterion(preds, y_train_tensor)
    loss.backward()
    optimizer.step()

    if epoch % 10 == 0:
        print(f"Epoch {epoch}, Train Loss: {loss.item():.4f}")

torch.save(model.state_dict(), "ul_model.pth")
print("‚úÖ Model saved as scanner_model.pth")
# ---- 4. Evaluation ----
model.eval()
with torch.no_grad():
    preds_test, _ = model(X_test_tensor)
    mse = mean_squared_error(y_test_tensor.numpy(), preds_test.numpy())
    print(f"\nüìç Uplink Model Test MSE: {mse:.4f}")

    # Euclidean distance error
    error_dist = np.linalg.norm(preds_test.numpy() - y_test_tensor.numpy(), axis=1)
    print(f"üìè Mean Localization Error: {error_dist.mean():.2f} meters")
    print(f"üìè Median Localization Error: {np.median(error_dist):.2f} meters")


Epoch 0, Train Loss: 1268.2469
Epoch 10, Train Loss: 1247.4113
Epoch 20, Train Loss: 1214.1821
Epoch 30, Train Loss: 1162.5857
Epoch 40, Train Loss: 1089.3528
Epoch 50, Train Loss: 992.8121
Epoch 60, Train Loss: 868.6617
Epoch 70, Train Loss: 710.9476
Epoch 80, Train Loss: 527.1829
Epoch 90, Train Loss: 342.7671
‚úÖ Model saved as scanner_model.pth

üìç Uplink Model Test MSE: 199.5878
üìè Mean Localization Error: 18.94 meters
üìè Median Localization Error: 18.75 meters


df_scanner = pd.read_excel("../data/raw/5G_Scanner.xlsx", sheet_name="Series Formatted Data")


In [19]:
df_scanner = pd.read_excel("../data/raw/5G_Scanner.xlsx", sheet_name="Series Formatted Data")


In [21]:
# Show summary of missing values
missing = df_scanner.isna().sum()
missing = missing[missing > 0].sort_values(ascending=False)

if not missing.empty:
    print("\nüîç Columns with Missing Values:\n")
    print(missing.to_frame("Missing Count").assign(Missing_Percent=lambda x: 100 * x["Missing Count"] / len(df_scanner)))
else:
    print("‚úÖ No missing values remain. Data is clean.")



üîç Columns with Missing Values:

                                  Missing Count  Missing_Percent
Unnamed: 28                               22392       100.000000
Unnamed: 20                               22392       100.000000
Unnamed: 12                               22392       100.000000
NR_Scan_SSB_RSRQ_SortedBy_RSRP_6          22366        99.883887
NR_Scan_SSB_RSRP_SortedBy_RSRP_6          22333        99.736513
NR_Scan_PCI_SortedBy_RSRP_6               22333        99.736513
NR_Scan_SSB_SINR_SortedBy_RSRP_6          22333        99.736513
NR_Scan_SSB_RSRQ_SortedBy_RSRP_5          22204        99.160414
NR_Scan_SSB_SINR_SortedBy_RSRP_5          22070        98.561986
NR_Scan_SSB_RSRP_SortedBy_RSRP_5          22068        98.553055
NR_Scan_PCI_SortedBy_RSRP_5               22068        98.553055
NR_Scan_SSB_RSRQ_SortedBy_RSRP_4          21867        97.655413
NR_Scan_SSB_SINR_SortedBy_RSRP_4          21743        97.101643
NR_Scan_PCI_SortedBy_RSRP_4               21738       

In [22]:
corr = df_scanner.corr(numeric_only=True)
corr_targets = corr[["Latitude", "Longitude"]].abs().sort_values(by="Latitude", ascending=False)
print(corr_targets.head(20))


                                  Latitude  Longitude
Latitude                          1.000000   0.016652
NR_Scan_SSB_RSRQ_SortedBy_RSRP_6  0.944191   0.933413
NR_Scan_SSB_RSRP_SortedBy_RSRP_6  0.901231   0.876537
NR_Scan_SSB_SINR_SortedBy_RSRP_6  0.831979   0.832912
NR_Scan_SSB_RSRP_SortedBy_RSRP_5  0.737973   0.077975
Message                           0.600320   0.121260
NR_Scan_SSB_RSRP_SortedBy_RSRP_4  0.550297   0.242400
NR_Scan_PCI_SortedBy_RSRP_5       0.470664   0.153609
NR_Scan_SSB_RSRP_SortedBy_RSRP_2  0.446118   0.488851
NR_Scan_SSB_RSRP_SortedBy_RSRP_1  0.433516   0.347266
NR_Scan_SSB_RSRQ_SortedBy_RSRP_5  0.381361   0.798560
NR_Scan_SSB_RSRP_SortedBy_RSRP_0  0.277685   0.410579
NR_Scan_PCI_SortedBy_RSRP_3       0.214166   0.071214
NR_Scan_PCI_SortedBy_RSRP_1       0.209826   0.128385
NR_Scan_SSB_SINR_SortedBy_RSRP_1  0.207834   0.008592
NR_Scan_PCI_SortedBy_RSRP_2       0.203746   0.030234
NR_Scan_SSB_SINR_SortedBy_RSRP_4  0.172340   0.677668
NR_Scan_SSB_SINR_SortedBy_RS

In [None]:
import pandas as pd
import numpy as np


# --- 1. Drop completely useless or empty columns ---
drop_cols = [
    'Unnamed: 12', 'Unnamed: 20', 'Unnamed: 28',  # 100% missing
    'Message', 'Time'  # non-predictive
]
df_scanner = df_scanner.drop(columns=[col for col in drop_cols if col in df_scanner.columns], errors="ignore")

# --- 2. Drop very sparse features (>95% missing) ---
too_sparse_cols = [
    'NR_Scan_SSB_RSRQ_SortedBy_RSRP_6',
    'NR_Scan_SSB_RSRP_SortedBy_RSRP_6',
    'NR_Scan_PCI_SortedBy_RSRP_6',
    'NR_Scan_SSB_SINR_SortedBy_RSRP_6',
    'NR_Scan_SSB_RSRQ_SortedBy_RSRP_5',
    'NR_Scan_SSB_SINR_SortedBy_RSRP_5',
    'NR_Scan_SSB_RSRP_SortedBy_RSRP_5',
    'NR_Scan_PCI_SortedBy_RSRP_5',
    'NR_Scan_SSB_RSRQ_SortedBy_RSRP_4',
    'NR_Scan_SSB_SINR_SortedBy_RSRP_4',
    'NR_Scan_PCI_SortedBy_RSRP_4',
    'NR_Scan_SSB_RSRP_SortedBy_RSRP_4'
]
df_scanner = df_scanner.drop(columns=[col for col in too_sparse_cols if col in df_scanner.columns], errors="ignore")

# --- 3. Impute missing values for valid columns ---

# Fill PCI columns with -1 (non-detect)
pci_cols = [col for col in df.columns if "PCI" in col]
df[pci_cols] = df[pci_cols].fillna(-1)

# Fill RSRP/RSRQ/SINR with domain-informed defaults
rsrp_cols = [col for col in df.columns if "RSRP" in col]
rsrq_cols = [col for col in df.columns if "RSRQ" in col]
sinr_cols = [col for col in df.columns if "SINR" in col]

df[rsrp_cols] = df[rsrp_cols].fillna(-200)  # dBm
df[rsrq_cols] = df[rsrq_cols].fillna(-30)   # dB
df[sinr_cols] = df[sinr_cols].fillna(-10)   # dB

# Fill NR_ARFCN (frequency) with median
if "NR_Scan_NR_ARFCN" in df.columns:
    df["NR_Scan_NR_ARFCN"] = df["NR_Scan_NR_ARFCN"].fillna(df["NR_Scan_NR_ARFCN"].median())

# --- 4. Drop object-type columns if any remain ---
df = df.drop(columns=df.select_dtypes(include='object').columns)

# --- 5. Drop rows with missing target (lat/lon) ---
df = df.dropna(subset=["Latitude", "Longitude"])

print("‚úÖ Cleaned Scanner data shape:", df.shape)
