In [None]:
import gzip
import tarfile
from io import StringIO

import h3
import pandas as pd

### 1. Read in the day used

In [13]:
# Path to the monthly tar archive; each member is one gzip-compressed CSV per day
tar_path = "/mnt/common-hdd/raw-sources/tkom-data/2020-09.csv.gz.tar"

# Name of the archive member holding the day analysed in this notebook
target_day = "2020-9-21.csv.gz"

with tarfile.open(tar_path, "r:*") as tar:
    # extractfile() raises KeyError when the member is missing and returns None
    # when the member is not a regular file, so check before decompressing.
    file_member = tar.extractfile(target_day)
    if file_member is None:
        raise FileNotFoundError(
            f"{target_day!r} in {tar_path!r} is not a regular file"
        )
    # Hand the decompressing stream straight to pandas instead of first
    # materialising the whole file as bytes, then as str, then as StringIO.
    # pandas decodes binary handles as UTF-8 by default.
    with gzip.open(file_member, "rb") as f_in:
        df_sept21 = pd.read_csv(f_in, index_col=0)

# Constant helper column of ones (not referenced by the other cells shown here)
df_sept21["count"] = 1

# The first CSV column was read as the index; discard it and renumber rows 0..n-1
df_sept21 = df_sept21.reset_index(drop=True)

### 2. Spatial smoothing

In [None]:
# Columns that the spatial moving average sums over a cell's neighbourhood.
# Written as one concatenation of prefix groups; the resulting list and its
# order are what the output frame's column layout is built from.
columns_to_sum = (
    ["traffic", "remainers"]
    + ["loc_home", "loc_work", "loc_freq"]
    + ["sex_female", "sex_male", "sex_na"]
    + ["arpu_low", "arpu_mid", "arpu_high", "arpu_na"]
    + ["age_young", "age_mid", "age_old", "age_na"]
    + ["plan_priv", "plan_corp", "plan_roam"]
)

In [15]:
# Custom mean: sum the group's values and divide by a fixed number of cells
# rather than by the number of rows actually present, so that neighbours with
# no data (missing rows or NA values) contribute zero to the average.
def custom_mean(group, n_cells: int = 7):
    """Return the column-wise sum of ``group`` divided by ``n_cells``.

    Parameters
    ----------
    group
        Object with a pandas-style ``.sum()`` (e.g. the sub-frame passed by
        ``groupby(...).apply``).
    n_cells : int, default 7
        Fixed divisor. The default of 7 matches a k=1 neighbourhood (the cell
        plus its 6 neighbours); pass the actual neighbourhood size for other
        ring sizes, e.g. ``.apply(custom_mean, n_cells=19)``.

    Returns
    -------
    The summed values divided by ``n_cells``.

    Raises
    ------
    ValueError
        If ``n_cells`` is smaller than 1.
    """
    if n_cells < 1:
        raise ValueError(f"n_cells must be at least 1, got {n_cells}")
    # NA values are skipped by sum(), yet still count in the fixed divisor
    return group.sum() / n_cells

In [None]:
def spat_moving_average(data: pd.DataFrame, k: int = 1) -> pd.DataFrame:
    """Spatially smooth ``columns_to_sum`` over each cell's H3 k-ring.

    For every distinct ``raster_id`` in ``data`` the rows of all cells within
    ``k`` grid steps (the cell itself included) are grouped by
    year/month/day/hour, summed, and divided by the number of cells in that
    k-ring. Ring cells with no rows in ``data`` therefore count as zero.

    The divisor is the size of the ring returned by ``h3.k_ring`` rather than
    a hard-coded 7, so the result is a real mean for any ``k`` (7 cells for a
    hexagon at k=1, 19 at k=2, ...).

    Parameters
    ----------
    data : pd.DataFrame
        Must contain ``raster_id``, ``year``, ``month``, ``day``, ``hour`` and
        every column listed in ``columns_to_sum``.
    k : int, default 1
        Ring distance passed to ``h3.k_ring``.

    Returns
    -------
    pd.DataFrame
        One row per (raster_id, year, month, day, hour) with the averaged
        columns, in the column order
        ``raster_id, year, month, day, hour, *columns_to_sum``.
    """
    time_keys = ["year", "month", "day", "hour"]
    cell_ids = data["raster_id"]  # hoisted: reused for every membership test

    records = []
    for raster_id in cell_ids.unique():
        neighbors = h3.k_ring(raster_id, k)
        # Divide by the full ring size, not by the number of cells present
        n_cells = len(neighbors)

        in_ring = data[cell_ids.isin(neighbors)]
        moving_average = in_ring.groupby(time_keys)[columns_to_sum].sum() / n_cells

        records.extend(
            [raster_id] + row
            for row in moving_average.reset_index().values.tolist()
        )

    return pd.DataFrame.from_records(
        records, columns=["raster_id"] + time_keys + columns_to_sum
    )

In [None]:
# Spatially smooth the loaded day over each cell's k-ring of distance 1
df_sept21_smoothed = spat_moving_average(data=df_sept21, k=1)

### 3. Temporal smoothing (if necessary) or time selection

In [None]:
# Optional temporal smoothing, kept for reference only: everything in this cell
# is commented out. The sketch applies a 1-D Gaussian filter to the selected
# columns separately for each raster_id.
# from scipy.ndimage import gaussian_filter1d

# def apply_gaussian_filter(group, columns, sigma=1):
#    for col in columns:
#        group[col] = gaussian_filter1d(group[col], sigma=sigma)
#    return group

# cols_to_smooth = ['traffic', 'arpu_low', 'arpu_mid', 'arpu_high']
# or all : 'traffic', 'remainers',
#   'loc_home', 'loc_work', 'loc_freq', 'sex_female', 'sex_male', 'sex_na',
#   'arpu_low', 'arpu_mid', 'arpu_high', 'arpu_na', 'age_young', 'age_mid',
#   'age_old', 'age_na', 'plan_priv', 'plan_corp', 'plan_roam'

# df_temporalsmooth = df_sept21_smoothed.groupby("raster_id").apply(apply_gaussian_filter, columns=cols_to_smooth).reset_index(drop=True)

In [None]:
# Only a single hour is analysed, so temporal smoothing is skipped and the
# spatially smoothed rows for hour 15 are selected directly.
df_15_spatialsmooth = df_sept21_smoothed.loc[df_sept21_smoothed["hour"].eq(15)]

### 4. Drop unnecessary columns

In [None]:
# Remove the time keys and every smoothed column that is not needed downstream.
# Given the columns produced by spat_moving_average, what remains is raster_id,
# traffic and arpu_low / arpu_mid / arpu_high.
df_sept21_final = df_15_spatialsmooth.drop(
    columns=(
        ["year", "month", "day", "hour"]
        + ["remainers"]
        + ["loc_home", "loc_work", "loc_freq"]
        + ["sex_female", "sex_male", "sex_na"]
        + ["arpu_na"]
        + ["age_young", "age_mid", "age_old", "age_na"]
        + ["plan_priv", "plan_corp", "plan_roam"]
    )
)
# Renumber the rows 0..n-1 after the hour filter left gaps in the index
df_sept21_final = df_sept21_final.reset_index(drop=True)

### 5. Save the 21 September 3 PM traffic and ARPU values for later use

In [None]:
# Persist the final frame as a pickle file for use outside this notebook
df_sept21_final.to_pickle(
    path="/mnt/common-ssd/zadorzsofi/telekom/BKK/data/tkom_sept21.pkl"
)