In [4]:
import heapq
import datetime

# Check platform: stop the notebook right away on any CPU architecture other
# than x86_64 or aarch64 (SystemExit is raised with the message below).
import platform
if platform.machine() not in ['x86_64', 'aarch64']:
    raise SystemExit("Unsupported platform!")

import math
import os
# Set CUDA's device-visibility variable to device "0". This runs before
# `import torch` further down in this cell.
# NOTE(review): presumably the ordering is deliberate so that torch only ever sees GPU 0 — confirm.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# ClickHouse client library; the connection itself is opened further down with host='localhost'.
import clickhouse_connect

# Import the channel estimator and some utilities for converting
# the DMRS fields in the right format from the SCF FAPI format that the dataset follows.
from aerial.phy5g.algorithms import ChannelEstimator
from aerial.util.fapi import dmrs_fapi_to_bit_array

!pip3 install torch
!pip3 install torchinfo
# !pip install torch
import torch
import torch.nn as nn
import numpy as np
import matplotlib.pyplot as plt

# !pip install torchinfo
from torchinfo import summary

# Notebook-wide ClickHouse client, connected to the server on localhost; the
# query cells below call clickhouse_client.query_df(...) on it.
clickhouse_client = clickhouse_connect.get_client(host='localhost')

!pip install scikit-learn
from sklearn.metrics import r2_score  # Import R² score function

import pickle

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


In [5]:
def moving_average(arr, window_size):
    """Return the simple (unweighted) moving average of a 1-D sequence.

    Each output element is the mean of ``window_size`` consecutive input
    samples; only fully covered windows are produced, so the result has
    ``len(arr) - window + 1`` elements.

    If ``arr`` is shorter than ``window_size`` the window is clamped to
    ``len(arr)`` and a single value (the mean of all samples) is returned.
    Without the clamp ``np.convolve(..., mode='valid')`` silently swaps its
    operands and returns partial sums divided by the full window size, i.e.
    values that are too small.

    Args:
        arr: 1-D sequence of numbers (list, tuple or NumPy array).
        window_size: Number of samples per averaging window, >= 1.

    Returns:
        1-D NumPy array with the windowed means.

    Raises:
        ValueError: If ``arr`` is empty or ``window_size`` is smaller than 1.
    """
    samples = np.atleast_1d(np.asarray(arr))
    if samples.size == 0:
        raise ValueError("moving_average: arr must not be empty")
    if window_size < 1:
        raise ValueError("moving_average: window_size must be >= 1")

    # Never average over more samples than are available.
    window = min(int(window_size), samples.size)
    weights = np.ones(window) / window
    return np.convolve(samples, weights, mode='valid')

def get_throughput(arr, window_size=20):
    """Return the peak of the moving-average-smoothed throughput series.

    The samples are smoothed with ``moving_average`` and the largest smoothed
    value is returned, so a single outlier sample cannot define the result.

    Args:
        arr: 1-D sequence of throughput samples.
        window_size: Smoothing window in samples (default 20, the value that
            was previously hard-coded). If fewer samples are available the
            window shrinks to ``len(arr)``, so a short series yields the mean
            of all its samples instead of a diluted value.

    Returns:
        The maximum of the smoothed series (a NumPy float scalar).

    Raises:
        ValueError: If ``arr`` is empty.
    """
    n_samples = len(arr)
    if n_samples == 0:
        raise ValueError("get_throughput: no throughput samples")

    smoothed = moving_average(arr, min(window_size, n_samples))
    # The original heapq.nlargest(1, ...) / sum(...) / 1 is just the maximum.
    return max(smoothed)

In [6]:
# iperf3 test identifiers to turn into dataset records. The extraction cell
# below runs once per entry and filters the iperf3db table on
# `test_id = '<entry>'`. Commented-out IDs are excluded from the run.
TEST_IDs = [
    # '582c3021631e4b8890518c458a83b560',
    # '8501c8ec60bc4766a35c2e30eaf42b28',
    '81ce1181052e4a868dfe5a90243b4136',
    'c3b0ccf5e133483595578598d741af89',
    '7b6be542d6264ede9482eaac2f0891f5',
    # '26d5937541f34de4ad13acc1ea42b9d7',
    'd9d07b91e64e488c87a04936c478f45d',
    '35599c707523406b887334ca6d3801c3',
    'c18c4abb9757456b81a79adad377ea5a',
    'cb8bbf840bd74f51849d62efcaf6fcdc',
    '743fa1e3f6b142fe83a5237528fd5516',
    # '8476930f9a894802b477e26e5db5813f',
]

In [None]:
# ---------------------------------------------------------------------------
# Build one dataset record per iperf3 test.
#
# For every test ID the cell
#   1. reads the uplink iperf3 samples (direction = 0) and derives the label
#      `y` with get_throughput(),
#   2. fetches the MAC_KPIs and fapi rows recorded during the test,
#   3. fetches the front-haul rows (fh_tai) in 30 s chunks, blanks the
#      resources described by the matching fapi row and stores a truncated
#      2-D FFT version of each slot,
#   4. appends the record to `dataset` and checkpoints `dataset` to disk.
# ---------------------------------------------------------------------------

# Shape used to unpack one raw fhData record (interleaved float32 re/im pairs).
FH_N_STREAMS = 4        # leading raw axis; presumably RX antennas — TODO confirm
FH_N_SYMBOLS = 14       # axis indexed with StartSymbolIndex below
FH_N_PRB = 273          # resource blocks on the subcarrier axis
SC_PER_PRB = 12         # subcarriers per resource block (rbStart is scaled by this)

# Part of the grid that goes into the dataset: first 133 PRBs, first 6 symbols,
# stream 0 only.
KEPT_PRB = 133
KEPT_SYMBOLS = 6

# Length of one front-haul query window.
FH_CHUNK_SECONDS = 30

lower_edge = 15
upper_edge = 1582
n_sub = num_subcarriers = 1596
# Rows of the 2-D FFT that survive truncation: the first lower_edge - 1 = 14
# rows and the last n_sub - upper_edge - 1 = 13 rows, i.e. the FFT bins next to
# index 0 on both ends of the subcarrier axis.
keep_indices = np.concatenate((np.arange(0, lower_edge-1), np.arange(upper_edge+1, n_sub)))

# keep_indices addresses rows of the kept grid, so both sizes have to agree.
assert n_sub == KEPT_PRB * SC_PER_PRB


def _first_alloc_by_timestamp(fapis):
    """Map each fapi TsTaiNs to (rbStart, rbSize, StartSymbolIndex, NrOfSymbols).

    Only the first row per timestamp (in frame order) is kept, which is the row
    the previous per-slot ``fapis.loc[...].iloc[0]`` lookup selected. Building
    the mapping once avoids rescanning the whole frame for every slot.
    """
    if fapis.empty:
        return {}
    first_rows = fapis.drop_duplicates(subset='TsTaiNs', keep='first')
    fields = first_rows[['rbStart', 'rbSize', 'StartSymbolIndex', 'NrOfSymbols']]
    return dict(zip(first_rows['TsTaiNs'], fields.itertuples(index=False, name=None)))


def _compress_fh_slot(fh_data, alloc, keep_idx):
    """Blank the allocation described by ``alloc`` and return the truncated slot.

    Args:
        fh_data: Raw fhData record (flat float32 values, re/im interleaved).
        alloc: (rbStart, rbSize, StartSymbolIndex, NrOfSymbols) of the slot.
        keep_idx: Row indices of the 2-D FFT to keep.

    Returns:
        Complex array of shape (len(keep_idx), KEPT_SYMBOLS).
    """
    # Plain ints: avoids overflow if the DB columns are narrow unsigned types.
    rb_start, rb_size, sym_start, n_sym = (int(v) for v in alloc)

    samples = np.array(fh_data, dtype=np.float32)
    grid = np.swapaxes(
        samples.view(np.complex64).reshape(FH_N_STREAMS, FH_N_SYMBOLS, FH_N_PRB * SC_PER_PRB),
        2, 0,
    )  # -> (subcarrier, symbol, stream)

    # rbSize and NrOfSymbols are lengths in the SCF FAPI PUSCH PDU, so the
    # blanked region ends at start + length. (Using the length itself as the
    # end index is only correct when the allocation starts at 0.)
    grid[rb_start * SC_PER_PRB:(rb_start + rb_size) * SC_PER_PRB,
         sym_start:sym_start + n_sym,
         :] = 0

    rx = grid[:KEPT_PRB * SC_PER_PRB, :KEPT_SYMBOLS, 0]
    spectrum = np.fft.fft2(rx)
    return np.fft.ifft2(spectrum[keep_idx, :])


def _compress_fh_chunk(chunk, alloc_by_ts, keep_idx):
    """Replace fhData of every row by its compressed slot; drop rows without fapi."""
    if chunk.empty:
        return chunk

    compressed = np.empty(len(chunk), dtype=object)
    has_fapi = np.zeros(len(chunk), dtype=bool)
    for pos, (ts, raw) in enumerate(zip(chunk['TsTaiNs'], chunk['fhData'])):
        alloc = alloc_by_ts.get(ts)
        if alloc is None:
            print(f"Cannot find fapi at {ts}")
            continue
        compressed[pos] = _compress_fh_slot(raw, alloc, keep_idx)
        has_fapi[pos] = True

    return chunk.assign(fhData=compressed).loc[has_fapi]


dataset = []
for test_id in TEST_IDs:
    print(f" --- Test ID: {test_id}")
    query = f"""
            SELECT *
            FROM iperf3db
            WHERE test_id = '{test_id}' AND direction = 0
            ORDER BY timestamp
            """
    results = clickhouse_client.query_df(query)
    if results.empty:
        # Same best-effort policy as for slots without a fapi row: report and move on.
        print(f"No iperf3 samples found for test {test_id}; skipping")
        continue

    # Positional access: does not depend on the frame's index labels.
    start_timestamp = results['timestamp'].iloc[0]
    stop_timestamp = results['timestamp'].iloc[-1]
    print(start_timestamp)
    print(stop_timestamp)
    ul_throughputs = results["throughput_mbps"].to_numpy()
    max_ul_throughput = get_throughput(ul_throughputs)
    print(max_ul_throughput)

    query = f"""
    SELECT * FROM MAC_KPIs 
    WHERE TsTaiNs BETWEEN '{start_timestamp}' AND '{stop_timestamp}'
    ORDER BY TsTaiNs DESC
    """
    print(f'Query l2s from {start_timestamp} to {stop_timestamp}')
    l2s = clickhouse_client.query_df(query)

    query = f"""
    SELECT * FROM fapi 
    WHERE TsTaiNs BETWEEN '{start_timestamp}' AND '{stop_timestamp}'
    ORDER BY TsTaiNs DESC
    """
    print(f'Query fapis from {start_timestamp} to {stop_timestamp}')
    fapis = clickhouse_client.query_df(query)
    alloc_by_ts = _first_alloc_by_timestamp(fapis)

    # Front-haul rows are large, so they are fetched in fixed-size time chunks.
    fh_chunks = []
    chunk_span = datetime.timedelta(seconds=FH_CHUNK_SECONDS)
    current_start = start_timestamp
    while current_start < stop_timestamp:
        current_end = min(current_start + chunk_span, stop_timestamp)
        # Half-open [start, end) for every chunk except the last one, so a row
        # stamped exactly on a chunk boundary is not fetched (and stored) twice.
        upper_op = '<=' if current_end >= stop_timestamp else '<'
        query = f"""
        SELECT TsTaiNs,fhData FROM fh_tai
        WHERE TsTaiNs >= '{current_start}' AND TsTaiNs {upper_op} '{current_end}'
        ORDER BY TsTaiNs DESC
        """
        print(f'Query fhs from {current_start} to {current_end}')
        df = clickhouse_client.query_df(query)

        fh_chunks.append(_compress_fh_chunk(df, alloc_by_ts, keep_indices))
        current_start = current_end

    # Concatenate once instead of growing the frame inside the loop.
    fhs = pd.concat(fh_chunks, ignore_index=True) if fh_chunks else pd.DataFrame()

    dataset.append({
        "y" : max_ul_throughput,
        "xs" : {
            "l2": l2s,
            "fapi": fapis,
            "fh": fhs,
        },
    })
    # NOTE(review): this writes the whole accumulated `dataset` list (every test
    # processed so far), not only the current test, into a file named after the
    # current test ID. Kept unchanged because other cells may rely on it —
    # confirm that this cumulative checkpoint is intended.
    with open(f'dataset_{test_id}.pkl', 'wb') as file:
        pickle.dump(dataset, file)

 --- Test ID: 81ce1181052e4a868dfe5a90243b4136
2025-04-04 17:32:04
2025-04-04 17:38:16
17.918505502766358
Query l2s from 2025-04-04 17:32:04 to 2025-04-04 17:38:16
Query fapis from 2025-04-04 17:32:04 to 2025-04-04 17:38:16
Query fhs from 2025-04-04 17:32:04 to 2025-04-04 17:32:34
Query fhs from 2025-04-04 17:32:34 to 2025-04-04 17:33:04


In [None]:
# print(len(dataset))
# print(dataset[0]['xs']['fh']['fhData'][0])

In [11]:
# Load the pickled object stored in dataset.pkl into d1.
# pickle.load can execute code embedded in the file — only load files that were
# produced by this project.
# NOTE(review): the extraction cell in this notebook writes
# dataset_<test_id>.pkl; no visible code produces 'dataset.pkl' — confirm its origin.
with open('dataset.pkl', 'rb') as file:
    d1 = pickle.load(file)

FileNotFoundError: [Errno 2] No such file or directory: 'dataset_2.pk'

In [14]:
# Load the pickled object stored in dataset_2.pkl into d2.
# pickle.load can execute code embedded in the file — only load files that were
# produced by this project.
with open('dataset_2.pkl', 'rb') as file:
    d2 = pickle.load(file)

In [15]:
# Load the pickled object stored in dataset_3.pkl into d3.
# pickle.load can execute code embedded in the file — only load files that were
# produced by this project.
with open('dataset_3.pkl', 'rb') as file:
    d3 = pickle.load(file)

In [18]:
# Sanity check: number of entries in d3 (the saved output of this cell shows 1).
print(len(d3))

1


In [19]:
# Join the three loaded parts and write them to combined_dataset.pkl.
# NOTE(review): if d1..d3 are lists of record dicts (as the extraction cell
# writes them), np.concatenate turns them into a 1-D NumPy object array rather
# than a plain list — confirm that downstream code expects an array.
# NOTE(review): the saved output of this cell is
# "PicklingError: memo id too large for LONG_BINGET", i.e. the dump did not
# complete in the recorded run; the combined object presumably contains too
# many individually pickled objects for one pickle stream — investigate before
# relying on combined_dataset.pkl.
combined_dataset = np.concatenate((d1, d2, d3))
with open('combined_dataset.pkl', 'wb') as file:
    pickle.dump(combined_dataset, file)

PicklingError: memo id too large for LONG_BINGET