# 01_preprocessing_and_symbolization

In [None]:
import pandas as pd
import numpy as np
import os
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns

# Input: the feature CSV consumed by the pipeline.
DATA_PATH = r"c:/Users/asus/OneDrive/Desktop/ITML Project/mpact_iq_features_full.csv"
# Outputs: the processed table and the generated plots.
OUTPUT_DIR = r"c:/Users/asus/OneDrive/Desktop/ITML Project/processed_data"
PLOTS_DIR = r"c:/Users/asus/OneDrive/Desktop/ITML Project/plots"

# Create both output folders up front; an already-existing folder is fine.
for _out_dir in (OUTPUT_DIR, PLOTS_DIR):
    os.makedirs(_out_dir, exist_ok=True)
del _out_dir  # keep the module namespace free of the loop variable

def load_data(path):
    """Read the CSV at ``path`` into a DataFrame and report its size.

    Args:
        path: Location of the CSV; handed directly to ``pd.read_csv``.

    Returns:
        The loaded ``pandas.DataFrame``.
    """
    print(f"Loading {path}...")
    frame = pd.read_csv(path)
    n_rows, n_cols = frame.shape
    print(f"Loaded {n_rows} rows, {n_cols} columns.")
    return frame

def preprocess(df):
    """Validate the feature table and put its rows in a consistent order.

    Args:
        df: DataFrame that must contain the columns ``device_class``,
            ``filename``, ``i_mean``, ``q_mean``, ``mag_mean`` and
            ``phase_mean``.

    Returns:
        A new DataFrame sorted by ``device_class`` then ``filename`` with a
        fresh ``0..n-1`` index. The input frame is left unmodified.

    Raises:
        ValueError: If any required column is absent. The message names the
            missing columns as well as the full required set.
    """
    required = ['device_class', 'filename', 'i_mean', 'q_mean', 'mag_mean', 'phase_mean']
    # Collect the absent columns so the error says exactly what to fix.
    missing = [col for col in required if col not in df.columns]
    if missing:
        raise ValueError(f"Missing columns: {missing}. Required: {required}")

    # Sort to maintain a consistent sequence
    return df.sort_values(by=['device_class', 'filename']).reset_index(drop=True)

def quantize(df, n_clusters=8):
    """Assign each row a discrete symbol by K-Means clustering of its I/Q means.

    The ``i_mean``/``q_mean`` columns are standardized and clustered; each
    row's cluster label is stored in a ``symbol`` column. A scatter plot of
    the clusters is saved as ``symbolization_clusters.png`` in ``PLOTS_DIR``.

    Args:
        df: DataFrame with ``i_mean`` and ``q_mean`` columns. It is modified
            in place: a ``symbol`` column is added (or overwritten).
        n_clusters: Number of K-Means clusters, i.e. distinct symbols.

    Returns:
        The same ``df`` object, now carrying the ``symbol`` column.
    """
    print(f"Quantizing into {n_clusters} symbols...")

    # Use I/Q means to represent the signal state
    X = df[['i_mean', 'q_mean']].values
    X_scaled = StandardScaler().fit_transform(X)

    # random_state is pinned so the clustering is reproducible across runs.
    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    df['symbol'] = kmeans.fit_predict(X_scaled)

    # Visualize the symbol clusters
    fig = plt.figure(figsize=(10, 8))
    try:
        sns.scatterplot(data=df, x='i_mean', y='q_mean', hue='symbol', palette='tab10', s=10, alpha=0.6)
        plt.title('Signal Symbolization (K-Means)')
        plt.xlabel('I Mean')
        plt.ylabel('Q Mean')
        plt.savefig(os.path.join(PLOTS_DIR, 'symbolization_clusters.png'))
    finally:
        # Release the figure even if drawing or saving fails, so a failed
        # run does not leave an open figure behind.
        plt.close(fig)

    return df

def main():
    """Run the pipeline: load, validate and sort, symbolize, then save.

    Reads the CSV at ``DATA_PATH`` and writes ``symbolized_data.csv`` into
    ``OUTPUT_DIR``.

    Raises:
        ValueError: Propagated from ``preprocess`` when a required column is
            missing from the input file.
    """
    raw = load_data(DATA_PATH)

    # Validate before touching any column, so a missing 'device_class'
    # surfaces as preprocess()'s ValueError rather than a bare KeyError
    # from the summary print below.
    df = preprocess(raw)
    print("Class distribution:\n", raw['device_class'].value_counts())

    df = quantize(df, n_clusters=8)

    output_path = os.path.join(OUTPUT_DIR, 'symbolized_data.csv')
    df.to_csv(output_path, index=False)
    print(f"Saved to {output_path}")


# Run the pipeline only when executed directly (script or notebook cell),
# not as a side effect of importing this module.
if __name__ == "__main__":
    main()