In [1]:
import pandas as pd
import numpy as np

# Load raw Scanner data
df = pd.read_excel("../data/raw/5G_Scanner.xlsx", sheet_name="Series Formatted Data")

# --- 1. Drop completely useless or empty columns ---
drop_cols = [
    'Unnamed: 12', 'Unnamed: 20', 'Unnamed: 28',  # 100% missing
    'Message', 'Time'  # non-predictive
]
df = df.drop(columns=[col for col in drop_cols if col in df.columns], errors="ignore")

# --- 2. Drop very sparse features (>95% missing) ---
too_sparse_cols = [
    'NR_Scan_SSB_RSRQ_SortedBy_RSRP_6',
    'NR_Scan_SSB_RSRP_SortedBy_RSRP_6',
    'NR_Scan_PCI_SortedBy_RSRP_6',
    'NR_Scan_SSB_SINR_SortedBy_RSRP_6',
    'NR_Scan_SSB_RSRQ_SortedBy_RSRP_5',
    'NR_Scan_SSB_SINR_SortedBy_RSRP_5',
    'NR_Scan_SSB_RSRP_SortedBy_RSRP_5',
    'NR_Scan_PCI_SortedBy_RSRP_5',
    'NR_Scan_SSB_RSRQ_SortedBy_RSRP_4',
    'NR_Scan_SSB_SINR_SortedBy_RSRP_4',
    'NR_Scan_PCI_SortedBy_RSRP_4',
    'NR_Scan_SSB_RSRP_SortedBy_RSRP_4'
]
df = df.drop(columns=[col for col in too_sparse_cols if col in df.columns], errors="ignore")

# --- 3. Impute missing values for valid columns ---

# Fill PCI columns with -1 (non-detect)
pci_cols = [col for col in df.columns if "PCI" in col]
df[pci_cols] = df[pci_cols].fillna(-1)

# Fill RSRP/RSRQ/SINR with domain-informed defaults
rsrp_cols = [col for col in df.columns if "RSRP" in col]
rsrq_cols = [col for col in df.columns if "RSRQ" in col]
sinr_cols = [col for col in df.columns if "SINR" in col]

df[rsrp_cols] = df[rsrp_cols].fillna(-200)  # dBm
df[rsrq_cols] = df[rsrq_cols].fillna(-30)   # dB
df[sinr_cols] = df[sinr_cols].fillna(-10)   # dB

# Fill NR_ARFCN (frequency) with median
if "NR_Scan_NR_ARFCN" in df.columns:
    df["NR_Scan_NR_ARFCN"] = df["NR_Scan_NR_ARFCN"].fillna(df["NR_Scan_NR_ARFCN"].median())

# --- 4. Drop object-type columns if any remain ---
df = df.drop(columns=df.select_dtypes(include='object').columns)

# --- 5. Drop rows with missing target (lat/lon) ---
df = df.dropna(subset=["Latitude", "Longitude"])

print("‚úÖ Cleaned Scanner data shape:", df.shape)


‚úÖ Cleaned Scanner data shape: (22390, 19)


In [2]:
# Show summary of missing values
missing = df.isna().sum()
missing = missing[missing > 0].sort_values(ascending=False)

if not missing.empty:
    print("\nüîç Columns with Missing Values:\n")
    print(missing.to_frame("Missing Count").assign(Missing_Percent=lambda x: 100 * x["Missing Count"] / len(df)))
else:
    print("‚úÖ No missing values remain. Data is clean.")


‚úÖ No missing values remain. Data is clean.


In [3]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import pandas as pd

# ---- 1. Split features and target ----
X = df.drop(columns=["Latitude", "Longitude"], errors="ignore")
y_lat = df["Latitude"]
y_lon = df["Longitude"]

# ---- 2. Train Random Forests ----
rf_lat = RandomForestRegressor(n_estimators=100, random_state=42)
rf_lat.fit(X, y_lat)
importances_lat = pd.Series(rf_lat.feature_importances_, index=X.columns)

rf_lon = RandomForestRegressor(n_estimators=100, random_state=42)
rf_lon.fit(X, y_lon)
importances_lon = pd.Series(rf_lon.feature_importances_, index=X.columns)

# ---- 3. Print Feature Importances ----
print("\nüìç Scanner Feature Importances (Latitude):")
print(importances_lat.sort_values(ascending=False))

print("\nüß≠ Scanner Feature Importances (Longitude):")
print(importances_lon.sort_values(ascending=False))

# ---- 4. PCA Analysis ----
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

pca = PCA(n_components=10)
pca.fit(X_scaled)

print("\nüî¢ PCA Explained Variance Ratios:")
print(pca.explained_variance_ratio_)



üìç Scanner Feature Importances (Latitude):
NR_Scan_PCI_SortedBy_RSRP_0         0.348242
NR_Scan_SSB_RSRP_SortedBy_RSRP_0    0.277253
NR_Scan_PCI_SortedBy_RSRP_1         0.159040
NR_Scan_NR_ARFCN                    0.059013
NR_Scan_SSB_RSRQ_SortedBy_RSRP_2    0.035488
NR_Scan_SSB_SINR_SortedBy_RSRP_2    0.032136
NR_Scan_SSB_RSRQ_SortedBy_RSRP_0    0.020707
NR_Scan_PCI_SortedBy_RSRP_2         0.016793
NR_Scan_SSB_SINR_SortedBy_RSRP_0    0.012527
NR_Scan_SSB_SINR_SortedBy_RSRP_1    0.009139
NR_Scan_SSB_RSRP_SortedBy_RSRP_1    0.007996
NR_Scan_SSB_RSRQ_SortedBy_RSRP_1    0.006572
NR_Scan_PCI_SortedBy_RSRP_3         0.006163
NR_Scan_SSB_SINR_SortedBy_RSRP_3    0.002719
NR_Scan_SSB_RSRP_SortedBy_RSRP_2    0.002238
NR_Scan_SSB_RSRP_SortedBy_RSRP_3    0.002123
NR_Scan_SSB_RSRQ_SortedBy_RSRP_3    0.001852
dtype: float64

üß≠ Scanner Feature Importances (Longitude):
NR_Scan_PCI_SortedBy_RSRP_0         0.268340
NR_Scan_PCI_SortedBy_RSRP_1         0.247916
NR_Scan_SSB_RSRP_SortedBy_RSRP_2    0

In [7]:
# --- Average feature importances ---
avg_importance = (importances_lat + importances_lon) / 2
avg_importance = avg_importance.sort_values(ascending=False)

# --- Cumulative contribution ---
cumulative = avg_importance.cumsum() / avg_importance.sum()
print("\nüìà Cumulative Contribution:\n", cumulative)

# --- Select features covering ~99% contribution ---
selected_features = cumulative[cumulative < 0.99].index.tolist()
print(f"\n‚úÖ Selected {len(selected_features)} important features.")

# --- Final selected feature matrix ---
X_selected = X[selected_features]



üìà Cumulative Contribution:
 NR_Scan_PCI_SortedBy_RSRP_0         0.308291
NR_Scan_PCI_SortedBy_RSRP_1         0.511769
NR_Scan_SSB_RSRP_SortedBy_RSRP_0    0.683769
NR_Scan_SSB_RSRP_SortedBy_RSRP_2    0.750536
NR_Scan_SSB_RSRP_SortedBy_RSRP_1    0.810360
NR_Scan_NR_ARFCN                    0.859339
NR_Scan_SSB_RSRQ_SortedBy_RSRP_0    0.881544
NR_Scan_SSB_SINR_SortedBy_RSRP_0    0.903389
NR_Scan_SSB_RSRQ_SortedBy_RSRP_2    0.922119
NR_Scan_SSB_SINR_SortedBy_RSRP_2    0.939490
NR_Scan_PCI_SortedBy_RSRP_2         0.956807
NR_Scan_SSB_SINR_SortedBy_RSRP_1    0.973482
NR_Scan_SSB_RSRQ_SortedBy_RSRP_1    0.987515
NR_Scan_SSB_RSRP_SortedBy_RSRP_3    0.993422
NR_Scan_PCI_SortedBy_RSRP_3         0.996730
NR_Scan_SSB_SINR_SortedBy_RSRP_3    0.998608
NR_Scan_SSB_RSRQ_SortedBy_RSRP_3    1.000000
dtype: float64

‚úÖ Selected 13 important features.


In [4]:
scan = pd.read_excel("../data/raw/5G_Scanner.xlsx", sheet_name="Series Formatted Data")

In [5]:
scan2 = pd.read_excel("../data/Sample_Data_2/5G_Scanner.xlsx", sheet_name="Series Formatted Data")

In [8]:
# Get sets of column names
columns_dl2 = set(scan2.columns)
columns_dl_data = set(scan.columns)
columns_X_selected = set(X_selected.columns)

# Find common columns between dl2 and dl_data
common_columns = columns_dl2 & columns_dl_data

# Compare X_selected columns to common columns
covered_columns = columns_X_selected & common_columns
missing_in_common = columns_X_selected - common_columns
extra_in_common = common_columns - columns_X_selected

# Print results
print("‚úÖ Columns in X_selected that are covered by both dl2 and dl_data:\n", covered_columns)
print("\n‚ùå Columns in X_selected that are NOT in both dl2 and dl_data:\n", missing_in_common)
print("\n‚ÑπÔ∏è Extra common columns NOT used in X_selected (might be useful features):\n", extra_in_common)


‚úÖ Columns in X_selected that are covered by both dl2 and dl_data:
 {'NR_Scan_SSB_RSRP_SortedBy_RSRP_2', 'NR_Scan_PCI_SortedBy_RSRP_1', 'NR_Scan_SSB_RSRP_SortedBy_RSRP_1', 'NR_Scan_NR_ARFCN', 'NR_Scan_SSB_RSRQ_SortedBy_RSRP_1', 'NR_Scan_SSB_RSRQ_SortedBy_RSRP_2', 'NR_Scan_PCI_SortedBy_RSRP_0', 'NR_Scan_SSB_SINR_SortedBy_RSRP_0', 'NR_Scan_SSB_SINR_SortedBy_RSRP_1', 'NR_Scan_SSB_RSRP_SortedBy_RSRP_0', 'NR_Scan_SSB_RSRQ_SortedBy_RSRP_0', 'NR_Scan_SSB_SINR_SortedBy_RSRP_2', 'NR_Scan_PCI_SortedBy_RSRP_2'}

‚ùå Columns in X_selected that are NOT in both dl2 and dl_data:
 set()

‚ÑπÔ∏è Extra common columns NOT used in X_selected (might be useful features):
 {'NR_Scan_SSB_SINR_SortedBy_RSRP_4', 'NR_Scan_SSB_SINR_SortedBy_RSRP_6', 'Time', 'NR_Scan_PCI_SortedBy_RSRP_6', 'NR_Scan_SSB_RSRP_SortedBy_RSRP_4', 'NR_Scan_SSB_SINR_SortedBy_RSRP_5', 'NR_Scan_SSB_RSRP_SortedBy_RSRP_6', 'NR_Scan_SSB_RSRP_SortedBy_RSRP_3', 'NR_Scan_PCI_SortedBy_RSRP_4', 'NR_Scan_SSB_RSRQ_SortedBy_RSRP_6', 'NR_Scan_PCI_Sort

In [10]:
X_selected.columns

Index(['NR_Scan_PCI_SortedBy_RSRP_0', 'NR_Scan_PCI_SortedBy_RSRP_1',
       'NR_Scan_SSB_RSRP_SortedBy_RSRP_0', 'NR_Scan_SSB_RSRP_SortedBy_RSRP_2',
       'NR_Scan_SSB_RSRP_SortedBy_RSRP_1', 'NR_Scan_NR_ARFCN',
       'NR_Scan_SSB_RSRQ_SortedBy_RSRP_0', 'NR_Scan_SSB_SINR_SortedBy_RSRP_0',
       'NR_Scan_SSB_RSRQ_SortedBy_RSRP_2', 'NR_Scan_SSB_SINR_SortedBy_RSRP_2',
       'NR_Scan_PCI_SortedBy_RSRP_2', 'NR_Scan_SSB_SINR_SortedBy_RSRP_1',
       'NR_Scan_SSB_RSRQ_SortedBy_RSRP_1'],
      dtype='object')

In [6]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
import numpy as np

# ---- 1. Data Preparation ----
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_selected)
y = df[["Latitude", "Longitude"]].values

X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42
)

X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32)

# ---- 2. Model Definition ----
class StructuredMLP(nn.Module):
    def __init__(self, input_dim, output_uncertainty=False):
        super().__init__()
        self.output_uncertainty = output_uncertainty

        self.net = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU()
        )

        self.coord_head = nn.Linear(64, 2)
        self.uncertainty_head = nn.Linear(64, 1) if output_uncertainty else None

    def forward(self, x):
        x = self.net(x)
        coords = self.coord_head(x)
        log_var = self.uncertainty_head(x) if self.output_uncertainty else torch.zeros(len(x), 1)
        return coords, log_var

# ---- 3. Training ----
model = StructuredMLP(input_dim=X_train.shape[1], output_uncertainty=True)
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.MSELoss()

epochs = 100
for epoch in range(epochs):
    model.train()
    optimizer.zero_grad()
    preds, _ = model(X_train_tensor)
    loss = criterion(preds, y_train_tensor)
    loss.backward()
    optimizer.step()
    
    if epoch % 10 == 0:
        print(f"Epoch {epoch}, Train Loss: {loss.item():.4f}")


torch.save(model.state_dict(), "scanner_model.pth")
print("‚úÖ Model saved as scanner_model.pth")
# ---- 4. Evaluation ----
model.eval()
with torch.no_grad():
    preds_test, _ = model(X_test_tensor)
    mse = mean_squared_error(y_test_tensor.numpy(), preds_test.numpy())
    print(f"\nüìç Scanner Model Test MSE: {mse:.4f}")

    # Euclidean localization error
    error_dist = np.linalg.norm(preds_test.numpy() - y_test_tensor.numpy(), axis=1)
    print(f"üìè Mean Localization Error: {error_dist.mean():.2f} meters")
    print(f"üìè Median Localization Error: {np.median(error_dist):.2f} meters")


Epoch 0, Train Loss: 1261.3229
Epoch 10, Train Loss: 1223.7283
Epoch 20, Train Loss: 1163.9115
Epoch 30, Train Loss: 1060.7528
Epoch 40, Train Loss: 899.8802
Epoch 50, Train Loss: 680.5199
Epoch 60, Train Loss: 440.2289
Epoch 70, Train Loss: 245.7580
Epoch 80, Train Loss: 117.0056
Epoch 90, Train Loss: 54.4688
‚úÖ Model saved as scanner_model.pth

üìç Scanner Model Test MSE: 35.1528
üìè Mean Localization Error: 6.29 meters
üìè Median Localization Error: 4.23 meters


In [None]:
# Load input dimensions (e.g., 15 for uplink, 16 for scanner, etc.)
input_dim = X_test_tensor.shape[1]

# Initialize model and load weights
model = StructuredMLP(input_dim=input_dim, output_uncertainty=True)
model.load_state_dict(torch.load("scanner_model.pth"))  # <- Change filename if needed
model.eval()

# Run predictions
with torch.no_grad():
    preds, log_var = model(X_test_tensor)
    predictions = preds.numpy()
    uncertainty = torch.exp(log_var).numpy() if model.output_uncertainty else None


In [None]:
# Downlink
model_dl = StructuredMLP(input_dim=X_dl_test_tensor.shape[1])
model_dl.load_state_dict(torch.load("downlink_model.pth"))
model_dl.eval()

# Uplink
model_ul = StructuredMLP(input_dim=X_ul_test_tensor.shape[1])
model_ul.load_state_dict(torch.load("uplink_model.pth"))
model_ul.eval()

# Scanner
model_sc = StructuredMLP(input_dim=X_sc_test_tensor.shape[1])
model_sc.load_state_dict(torch.load("scanner_model.pth"))
model_sc.eval()


In [None]:
from sklearn.metrics import r2_score

# For Latitude
r2_lat = r2_score(y_test[:, 0], y_pred[:, 0])

# For Longitude
r2_lon = r2_score(y_test[:, 1], y_pred[:, 1])

print(f"R¬≤ Score - Latitude: {r2_lat:.4f}")
print(f"R¬≤ Score - Longitude: {r2_lon:.4f}")
