In [36]:
import numpy as np
import os

def analyze_npz_file(file_path):
    """
    Print a report describing every array stored in a ``.npz`` archive.

    For each key the report lists shape, dtype, element count and in-memory
    size, then min/max/mean/std (boolean and numeric dtypes only) and a small
    sample of the values. A grand total of the in-memory sizes is printed last.

    Parameters
    ----------
    file_path : str
        Path of the ``.npz`` file to inspect.

    Returns
    -------
    None
        The report goes to stdout. A missing file or a read error is reported
        as a message instead of being raised.
    """
    print(f"\n{'='*60}")
    print(f"PHÂN TÍCH FILE: {file_path}")
    print(f"{'='*60}")

    if not os.path.exists(file_path):
        print(f"❌ File không tồn tại: {file_path}")
        return

    # Size of the archive on disk
    file_size = os.path.getsize(file_path)
    print(f"📁 Kích thước file: {file_size:,} bytes ({file_size/1024/1024:.2f} MB)")

    try:
        # The context manager closes the underlying archive when the block exits
        with np.load(file_path) as data:
            print(f"🔍 Số lượng arrays trong file: {len(data.files)}")
            print(f"📝 Danh sách keys: {list(data.files)}")

            total_memory = 0

            for key in data.files:
                array = data[key]
                array_size = array.nbytes
                total_memory += array_size

                print(f"\n🔸 Key: '{key}'")
                print(f"   - Shape: {array.shape}")
                print(f"   - Dtype: {array.dtype}")
                print(f"   - Size: {array.size:,} elements")
                print(f"   - Memory: {array_size:,} bytes ({array_size/1024/1024:.2f} MB)")

                # Basic statistics. They are only formatted for bool / int /
                # uint / float / complex dtypes: on strings or datetimes the
                # reductions or the ':.6f' format raise, which used to abort
                # the report for every remaining key.
                if array.size > 0:
                    if array.dtype.kind in 'biufc':
                        print(f"   - Min: {array.min():.6f}")
                        print(f"   - Max: {array.max():.6f}")
                        print(f"   - Mean: {array.mean():.6f}")
                        print(f"   - Std: {array.std():.6f}")
                    else:
                        print("   - Stats: n/a (non-numeric dtype)")

                # Show a data sample (the whole array only when it is small)
                if array.size <= 100:
                    print(f"   - Sample data:\n{array}")
                elif len(array.shape) >= 2:
                    # Slices only the two leading axes; trailing axes are printed in full
                    print(f"   - Sample (first 3x3):\n{array[:3, :3] if array.shape[0] >= 3 and array.shape[1] >= 3 else array[:2, :2]}")
                else:
                    print(f"   - Sample (first 10): {array.flat[:10]}")

            print(f"\n💾 Tổng bộ nhớ các arrays: {total_memory:,} bytes ({total_memory/1024/1024:.2f} MB)")

    except Exception as e:
        # Deliberate best-effort: report the problem and keep the notebook running
        print(f"❌ Lỗi khi đọc file: {e}")

def analyze_npy_file(file_path):
    """
    Print a report describing the single array stored in a ``.npy`` file.

    The report lists shape, dtype, element count and in-memory size, then
    min/max/mean/std (boolean and numeric dtypes only) and a small sample.
    A non-empty, square, 2-D numeric array is additionally summarised as an
    adjacency matrix (non-zero count, density, symmetry, trace).

    Parameters
    ----------
    file_path : str
        Path of the ``.npy`` file to inspect.

    Returns
    -------
    None
        The report goes to stdout. A missing file or a read error is reported
        as a message instead of being raised.
    """
    print(f"\n{'='*60}")
    print(f"PHÂN TÍCH FILE: {file_path}")
    print(f"{'='*60}")

    if not os.path.exists(file_path):
        print(f"❌ File không tồn tại: {file_path}")
        return

    # Size of the file on disk
    file_size = os.path.getsize(file_path)
    print(f"📁 Kích thước file: {file_size:,} bytes ({file_size/1024/1024:.2f} MB)")

    try:
        array = np.load(file_path)

        print(f"🔸 Array info:")
        print(f"   - Shape: {array.shape}")
        print(f"   - Dtype: {array.dtype}")
        print(f"   - Size: {array.size:,} elements")
        print(f"   - Memory: {array.nbytes:,} bytes ({array.nbytes/1024/1024:.2f} MB)")

        # bool / int / uint / float / complex: the dtypes whose reductions can
        # be rendered with ':.6f'. Anything else (strings, datetimes, ...)
        # would raise and abort the rest of the report.
        is_numeric = array.dtype.kind in 'biufc'

        # Basic statistics
        if array.size > 0:
            if is_numeric:
                print(f"   - Min: {array.min():.6f}")
                print(f"   - Max: {array.max():.6f}")
                print(f"   - Mean: {array.mean():.6f}")
                print(f"   - Std: {array.std():.6f}")
            else:
                print("   - Stats: n/a (non-numeric dtype)")

        # Show a data sample (the whole array only when it is small)
        if array.size <= 100:
            print(f"   - Full data:\n{array}")
        elif len(array.shape) >= 2:
            print(f"   - Sample (first 5x5):")
            print(array[:5, :5] if array.shape[0] >= 5 and array.shape[1] >= 5 else array)
        else:
            print(f"   - Sample (first 20): {array.flat[:20]}")

        # Extra analysis when the array looks like an adjacency matrix.
        # array.size > 0 keeps the density computation from dividing by zero
        # on a 0x0 matrix.
        is_square = len(array.shape) == 2 and array.shape[0] == array.shape[1]
        if is_square and is_numeric and array.size > 0:
            edge_count = np.count_nonzero(array)  # counted once, reused below
            print(f"\n🔍 PHÂN TÍCH ADJACENCY MATRIX:")
            print(f"   - Là ma trận vuông: {array.shape[0]}x{array.shape[1]}")
            print(f"   - Số edges (non-zero): {edge_count:,}")
            print(f"   - Density: {edge_count / array.size * 100:.2f}%")
            print(f"   - Symmetric: {np.allclose(array, array.T)}")
            print(f"   - Diagonal sum: {np.trace(array):.6f}")

    except Exception as e:
        # Deliberate best-effort: report the problem and keep the notebook running
        print(f"❌ Lỗi khi đọc file: {e}")

def compare_files(npz_path="data/bike.npz", npy_path="data/bike_svd.npy"):
    """
    Look for a shape relationship between a ``.npz`` archive and a ``.npy`` array.

    Every array in the archive with at least two dimensions is listed, and it
    is flagged when its first or second dimension equals the first dimension
    of the ``.npy`` array.

    Parameters
    ----------
    npz_path : str, optional
        Archive to inspect. Defaults to the bike dataset used by the notebook.
    npy_path : str, optional
        Array file to compare against. Defaults to the bike SVD file.

    Returns
    -------
    None
        The findings go to stdout; missing files and read errors are reported
        as messages instead of being raised.
    """
    print(f"\n{'='*60}")
    print(f"SO SÁNH VÀ PHÂN TÍCH MỐI LIÊN HỆ")
    print(f"{'='*60}")

    if not (os.path.exists(npz_path) and os.path.exists(npy_path)):
        print("❌ Một hoặc cả 2 file không tồn tại")
        return

    try:
        npy_data = np.load(npy_path)

        # The context manager closes the archive once the loop is done
        with np.load(npz_path) as npz_data:
            print(f"🔍 Tìm mối liên hệ giữa 2 file:")

            # Compare dimensions
            for key in npz_data.files:
                array = npz_data[key]
                if len(array.shape) >= 2:
                    print(f"   - {key} shape: {array.shape}")
                    if array.shape[0] == npy_data.shape[0] or array.shape[1] == npy_data.shape[0]:
                        print(f"     ✅ Có liên hệ về số nodes với adjacency matrix")

        print(f"   - {os.path.basename(npy_path)} shape: {npy_data.shape}")
        print(f"   - Adjacency matrix cho {npy_data.shape[0]} nodes")

    except Exception as e:
        # Deliberate best-effort: report the problem and keep the notebook running
        print(f"❌ Lỗi khi so sánh: {e}")

# Run the analysis
print("🚀 BẮT ĐẦU PHÂN TÍCH DỮ LIỆU BIKE")

# Analyze the bike.npz archive
analyze_npz_file("data/bike.npz")

# Analyze the bike_svd.npy array
analyze_npy_file("data/bike_svd.npy")

# Compare the two files
compare_files()

print(f"\n{'='*60}")
print("✅ HOÀN THÀNH PHÂN TÍCH")
print(f"{'='*60}")

🚀 BẮT ĐẦU PHÂN TÍCH DỮ LIỆU BIKE

PHÂN TÍCH FILE: data/bike.npz
📁 Kích thước file: 35,174,604 bytes (33.55 MB)
🔍 Số lượng arrays trong file: 11
📝 Danh sách keys: ['train_x', 'train_target', 'train_timestamp', 'val_x', 'val_target', 'val_timestamp', 'test_x', 'test_target', 'test_timestamp', 'mean', 'std']

🔸 Key: 'train_x'
   - Shape: (3001, 250, 2, 12)
   - Dtype: float64
   - Size: 18,006,000 elements
   - Memory: 144,048,000 bytes (137.37 MB)
   - Min: -1.000000
   - Max: 1.000000
   - Mean: -0.937762
   - Std: 0.092912
   - Sample (first 3x3):
[[[[-0.97916667 -1.         -1.         -1.         -1.
    -1.         -1.         -1.         -1.         -1.
    -1.         -1.        ]
   [-1.         -1.         -1.         -1.         -1.
    -1.         -1.         -1.         -1.         -1.
    -1.         -1.        ]]

  [[-1.         -1.         -1.         -1.         -1.
    -1.         -1.         -1.         -1.         -1.
    -1.         -0.97916667]
   [-1.         -1.  

In [None]:
def create_bike_raw_npz(h5_path='data/nogrid/bike_data.h5', output_path='bike_raw.npz'):
    """
    Combine the pick-up and drop-off series of an HDF5 file into one ``.npz``.

    The datasets ``bike_pick`` and ``bike_drop`` are read in full and stacked
    along a new last axis, so feature 0 is pick-ups and feature 1 is drop-offs.
    The result is saved compressed under the key ``data``.

    Parameters
    ----------
    h5_path : str, optional
        HDF5 file containing the ``bike_pick`` and ``bike_drop`` datasets.
    output_path : str, optional
        Destination of the compressed ``.npz`` archive.

    Returns
    -------
    numpy.ndarray
        The stacked array; its shape is the common dataset shape plus a
        trailing axis of length 2.

    Raises
    ------
    ValueError
        If the two datasets do not have the same shape.
    """
    # Imported here so the function does not rely on another cell having
    # imported h5py first.
    import h5py

    with h5py.File(h5_path, 'r') as f:
        # Read pick-ups and drop-offs separately
        bike_pick = f['bike_pick'][:]
        bike_drop = f['bike_drop'][:]

    print(f"📊 bike_pick shape: {bike_pick.shape}")
    print(f"📊 bike_drop shape: {bike_drop.shape}")

    if bike_pick.shape != bike_drop.shape:
        raise ValueError(
            f"bike_pick {bike_pick.shape} and bike_drop {bike_drop.shape} must have the same shape"
        )

    # Stack into one tensor with a trailing feature axis: [pick_ups, drop_offs]
    raw_data = np.stack([bike_pick, bike_drop], axis=-1)

    print(f"✅ Combined data shape: {raw_data.shape}")
    print(f"   - Time steps: {raw_data.shape[0]}")
    print(f"   - Stations: {raw_data.shape[1]}")
    print(f"   - Features: {raw_data.shape[2]} (pick_up, drop_off)")

    # Save in npz format
    np.savez_compressed(output_path, data=raw_data)
    print(f"✅ Created bike_raw.npz with shape: {raw_data.shape}")

    return raw_data

# Call the function and keep the stacked array for the following cells
raw_data = create_bike_raw_npz()

📊 bike_pick shape: (4368, 266)
📊 bike_drop shape: (4368, 266)
✅ Combined data shape: (4368, 266, 2)
   - Time steps: 4368
   - Stations: 266
   - Features: 2 (pick_up, drop_off)
✅ Created bike_raw.npz with shape: (4368, 266, 2)


In [56]:
def quick_check(file1='taxi_raw_r1_d0_w0.npz', file2='data/taxi.npz'):
    """
    Quick sanity comparison of two ``.npz`` archives.

    Reports whether the key sets match, whether ``train_x`` has the same shape
    and (if so) the same values, and whether ``train_timestamp`` is equal.

    Note that "train_x data identical" is decided with ``np.allclose``, i.e.
    equality within floating-point tolerance, whereas timestamps are compared
    exactly with ``np.array_equal``.

    Parameters
    ----------
    file1, file2 : str, optional
        Paths of the archives to compare. Default to the generated taxi file
        and the reference taxi file used by the notebook.

    Returns
    -------
    None
        Results go to stdout; any error is reported as a message.
    """
    try:
        # Context managers make sure both archives are closed afterwards
        with np.load(file1) as data1, np.load(file2) as data2:
            print("🚀 QUICK CHECK:")
            print(f"   Keys match: {set(data1.files) == set(data2.files)}")

            if 'train_x' in data1.files and 'train_x' in data2.files:
                # Each archive lookup decompresses the array, so do it once
                train_x1 = data1['train_x']
                train_x2 = data2['train_x']

                shapes_match = train_x1.shape == train_x2.shape
                print(f"   train_x shapes match: {shapes_match}")

                if shapes_match:
                    data_identical = np.allclose(train_x1, train_x2)
                    print(f"   train_x data identical: {data_identical}")

            if 'train_timestamp' in data1.files and 'train_timestamp' in data2.files:
                ts_identical = np.array_equal(data1['train_timestamp'], data2['train_timestamp'])
                print(f"   Timestamps identical: {ts_identical}")

    except Exception as e:
        # Deliberate best-effort: report the problem and keep the notebook running
        print(f"❌ Error: {e}")

# Run the quick comparison with its built-in file paths
quick_check()

🚀 QUICK CHECK:
   Keys match: True
   train_x shapes match: True
   train_x data identical: True
   Timestamps identical: True


In [57]:
import numpy as np
import os

def _report_array_difference(file1, file2, arr1, arr2):
    """Print how two same-shaped, non-identical arrays differ."""
    both_numeric = (np.issubdtype(arr1.dtype, np.number)
                    and np.issubdtype(arr2.dtype, np.number))
    if not both_numeric:
        print(f"   📊 Data: ❌ DIFFERENT (non-numeric)")
        print(f"   📋 First few values:")
        print(f"      {file1}: {arr1.flat[:5]}")
        print(f"      {file2}: {arr2.flat[:5]}")
        return

    # Magnitude of the difference (computed once, reused for max and mean)
    abs_diff = np.abs(arr1 - arr2)
    max_diff = abs_diff.max()
    mean_diff = abs_diff.mean()

    print(f"   📊 Data: ❌ DIFFERENT")
    print(f"      Max difference: {max_diff}")
    print(f"      Mean difference: {mean_diff}")

    # Approximate equality
    is_close = np.allclose(arr1, arr2, rtol=1e-5, atol=1e-8)
    print(f"      Close (rtol=1e-5): {'✅' if is_close else '❌'}")

    # Sample values to help debugging; arrays with 5+ dimensions only get the header
    if arr1.ndim == 1:
        print(f"   📋 First 5 values:")
        print(f"      {file1}: {arr1[:5]}")
        print(f"      {file2}: {arr2[:5]}")
    elif arr1.ndim == 2:
        print(f"   📋 First 3x3 values:")
        print(f"      {file1}:")
        print(f"         {arr1[:3, :3]}")
        print(f"      {file2}:")
        print(f"         {arr2[:3, :3]}")
    elif arr1.ndim >= 3:
        print(f"   📋 Sample values (first element):")
        if arr1.ndim == 3:
            print(f"      {file1}: shape {arr1[0].shape}")
            print(f"         {arr1[0, :3, :3] if arr1.shape[1] >= 3 and arr1.shape[2] >= 3 else arr1[0]}")
            print(f"      {file2}: shape {arr2[0].shape}")
            print(f"         {arr2[0, :3, :3] if arr2.shape[1] >= 3 and arr2.shape[2] >= 3 else arr2[0]}")
        elif arr1.ndim == 4:
            print(f"      {file1}: {arr1[0, :2, :2, :3] if min(arr1.shape[1:3]) >= 2 else 'Shape too small'}")
            print(f"      {file2}: {arr2[0, :2, :2, :3] if min(arr2.shape[1:3]) >= 2 else 'Shape too small'}")


def _compare_common_key(file1, file2, key, arr1, arr2):
    """Print the comparison for one shared key; return (shape_match, data_identical)."""
    print(f"\n🔍 KEY: {key}")
    print("-" * 40)

    # Compare shapes
    shape_match = arr1.shape == arr2.shape
    print(f"   📐 Shapes:")
    print(f"      {file1}: {arr1.shape}")
    print(f"      {file2}: {arr2.shape}")
    print(f"      Match: {'✅' if shape_match else '❌'}")

    # Compare data types
    dtype_match = arr1.dtype == arr2.dtype
    print(f"   🔤 Data types:")
    print(f"      {file1}: {arr1.dtype}")
    print(f"      {file2}: {arr2.dtype}")
    print(f"      Match: {'✅' if dtype_match else '❌'}")

    if not shape_match:
        # Arrays of different shapes can never hold identical data
        print(f"   📊 Data: Cannot compare - different shapes")
        return False, False

    if arr1.size == 0:
        # Same shape and no elements: nothing to compare
        print(f"   📊 Data: Empty arrays")
        return True, True

    # Stays False when the comparison itself raises, so that an array which
    # could not be compared is never counted as identical.
    is_identical = False
    try:
        is_identical = bool(np.array_equal(arr1, arr2))

        if is_identical:
            print(f"   📊 Data: ✅ IDENTICAL")
        else:
            _report_array_difference(file1, file2, arr1, arr2)

        # Basic statistics for numeric data
        if np.issubdtype(arr1.dtype, np.number):
            print(f"   📈 Statistics:")
            print(f"      {file1}: min={arr1.min():.6f}, max={arr1.max():.6f}, mean={arr1.mean():.6f}")
            print(f"      {file2}: min={arr2.min():.6f}, max={arr2.max():.6f}, mean={arr2.mean():.6f}")

    except Exception as e:
        print(f"   ❌ Error comparing data: {e}")

    return True, is_identical


def check_and_compare_files_detailed(file1='taxi_raw_r1_d0_w0.npz', file2='data/taxi.npz'):
    """
    Compare two ``.npz`` archives key by key and print a detailed report.

    The report covers the key sets, then shape, dtype and data for every
    shared key (with difference magnitudes, samples and statistics), and ends
    with a summary and a verdict.

    The files are declared IDENTICAL only when the key sets are equal and
    every shared key has the same shape and exactly equal values
    (``np.array_equal``). A shape mismatch counts as a data difference.

    Parameters
    ----------
    file1, file2 : str, optional
        Paths of the archives to compare. Default to the generated taxi file
        and the reference taxi file used by the notebook.

    Returns
    -------
    None
        The report goes to stdout. A missing file is reported and ends the
        function early.
    """
    print("🔍 KIỂM TRA CHI TIẾT VÀ SO SÁNH FILES")
    print("="*60)

    # Check that both files exist
    for path in (file1, file2):
        if not os.path.exists(path):
            print(f"❌ File không tồn tại: {path}")
            return

    all_shapes_match = True
    all_data_identical = True

    # Context managers close both archives once the comparison is done
    print(f"📂 Loading {file1}...")
    with np.load(file1) as data1:
        print(f"📂 Loading {file2}...")
        with np.load(file2) as data2:
            # Compare keys
            print(f"\n📋 KEYS COMPARISON:")
            keys1 = set(data1.files)
            keys2 = set(data2.files)

            print(f"   {file1} keys: {sorted(keys1)}")
            print(f"   {file2} keys: {sorted(keys2)}")

            common_keys = keys1.intersection(keys2)
            only_in_1 = keys1 - keys2
            only_in_2 = keys2 - keys1

            if common_keys:
                print(f"   ✅ Common keys: {sorted(common_keys)}")
            if only_in_1:
                print(f"   ⚠️ Only in {file1}: {sorted(only_in_1)}")
            if only_in_2:
                print(f"   ⚠️ Only in {file2}: {sorted(only_in_2)}")

            # Compare shapes and data for ALL common keys
            print(f"\n📊 DETAILED COMPARISON FOR ALL KEYS:")
            print("="*60)

            for key in sorted(common_keys):
                # Each archive lookup decompresses the array; it is read once
                # here and the summary flags are updated in the same pass.
                shape_match, is_identical = _compare_common_key(
                    file1, file2, key, data1[key], data2[key])
                all_shapes_match = all_shapes_match and shape_match
                all_data_identical = all_data_identical and is_identical

    # Final summary
    print(f"\n" + "="*60)
    print(f"📝 FINAL SUMMARY:")

    print(f"   Keys match: {'✅' if keys1 == keys2 else '❌'}")
    print(f"   All shapes match: {'✅' if all_shapes_match else '❌'}")
    print(f"   All data identical: {'✅' if all_data_identical else '❌'}")

    # File sizes
    size1 = os.path.getsize(file1) / (1024*1024)
    size2 = os.path.getsize(file2) / (1024*1024)
    print(f"   File sizes: {file1} ({size1:.2f}MB) vs {file2} ({size2:.2f}MB)")

    if all_data_identical and keys1 == keys2:
        print(f"\n🎉 CONCLUSION: Files are IDENTICAL!")
    else:
        print(f"\n⚠️ CONCLUSION: Files are DIFFERENT!")

# Run the detailed check
check_and_compare_files_detailed()

🔍 KIỂM TRA CHI TIẾT VÀ SO SÁNH FILES
📂 Loading taxi_raw_r1_d0_w0.npz...
📂 Loading data/taxi.npz...

📋 KEYS COMPARISON:
   taxi_raw_r1_d0_w0.npz keys: ['mean', 'std', 'test_target', 'test_timestamp', 'test_x', 'train_target', 'train_timestamp', 'train_x', 'val_target', 'val_timestamp', 'val_x']
   data/taxi.npz keys: ['mean', 'std', 'test_target', 'test_timestamp', 'test_x', 'train_target', 'train_timestamp', 'train_x', 'val_target', 'val_timestamp', 'val_x']
   ✅ Common keys: ['mean', 'std', 'test_target', 'test_timestamp', 'test_x', 'train_target', 'train_timestamp', 'train_x', 'val_target', 'val_timestamp', 'val_x']

📊 DETAILED COMPARISON FOR ALL KEYS:

🔍 KEY: mean
----------------------------------------
   📐 Shapes:
      taxi_raw_r1_d0_w0.npz: (1, 1, 2, 1)
      data/taxi.npz: (1, 1, 2, 1)
      Match: ✅
   🔤 Data types:
      taxi_raw_r1_d0_w0.npz: int64
      data/taxi.npz: int64
      Match: ✅
   📊 Data: ✅ IDENTICAL
   📈 Statistics:
      taxi_raw_r1_d0_w0.npz: min=669.000000, 

In [6]:
def search_data(sequence_length, num_of_depend, label_start_idx,
                num_for_predict, units, points_per_hour):
    """
    Compute the index windows of the historical inputs for one label window.

    The i-th window (i = 1..num_of_depend) starts
    ``points_per_hour * units * i`` steps before ``label_start_idx`` and is
    ``num_for_predict`` steps long.

    Parameters
    ----------
    sequence_length : int
        Total number of time steps available.
    num_of_depend : int
        Number of historical windows to look back.
    label_start_idx : int
        First index of the label (prediction) window.
    num_for_predict : int
        Length of every window.
    units : int
        Spacing between windows, in hours.
    points_per_hour : int
        Number of samples per hour; must be positive.

    Returns
    -------
    list of (int, int) or None
        ``(start, end)`` pairs ordered from oldest to most recent, or ``None``
        when the label window runs past the sequence or any historical window
        would start before index 0.

    Raises
    ------
    ValueError
        If ``points_per_hour`` is not greater than 0.
    """
    # A value of 0 used to slip through the '< 0' check; every historical
    # window would then coincide with the label window itself.
    if points_per_hour <= 0:
        raise ValueError("points_per_hour should be greater than 0!")

    if label_start_idx + num_for_predict > sequence_length:
        return None

    x_idx = []
    for i in range(1, num_of_depend + 1):
        start_idx = label_start_idx - points_per_hour * units * i
        end_idx = start_idx + num_for_predict
        if start_idx >= 0:
            x_idx.append((start_idx, end_idx))
        else:
            # Not enough history before the label window
            return None

    # Only reachable with a mismatch when num_of_depend is negative
    if len(x_idx) != num_of_depend:
        return None

    # Collected from most recent to oldest; return in chronological order
    return x_idx[::-1]


def get_sample_indices(data_sequence, num_of_weeks, num_of_days, num_of_hours,
                       label_start_idx, num_for_predict, points_per_hour=12):
    """
    Slice the weekly, daily and hourly input windows plus the target window.

    Parameters
    ----------
    data_sequence : numpy.ndarray
        Series indexed by time along axis 0.
    num_of_weeks, num_of_days, num_of_hours : int
        How many windows to look back at weekly / daily / hourly spacing.
        A value that is not positive disables that component.
    label_start_idx : int
        First index of the target window.
    num_for_predict : int
        Length of the target window and of every input window.
    points_per_hour : int, optional
        Number of samples per hour (default 12).

    Returns
    -------
    tuple
        ``(week_sample, day_sample, hour_sample, target)``. A disabled
        component is ``None``. All four entries are ``None`` when the target
        window runs past the data or any enabled component lacks history.
    """
    total_steps = data_sequence.shape[0]
    label_end_idx = label_start_idx + num_for_predict

    if label_end_idx > total_steps:
        return None, None, None, None

    # (number of windows, spacing in hours), in week / day / hour order
    components = (
        (num_of_weeks, 7 * 24),
        (num_of_days, 24),
        (num_of_hours, 1),
    )

    collected = []
    for depend_count, spacing_hours in components:
        if not depend_count > 0:
            collected.append(None)
            continue

        windows = search_data(total_steps, depend_count,
                              label_start_idx, num_for_predict,
                              spacing_hours, points_per_hour)
        if not windows:
            return None, None, None, None

        pieces = [data_sequence[start:stop] for start, stop in windows]
        collected.append(np.concatenate(pieces, axis=0))

    week_sample, day_sample, hour_sample = collected
    target = data_sequence[label_start_idx:label_end_idx]

    return week_sample, day_sample, hour_sample, target


def MinMaxnormalization(train, val, test):
    """
    Scale three 4-D splits to [-1, 1] using the training split's min and max.

    The minimum and maximum are reduced over axes 0, 1 and 3 of ``train``, so
    one pair of statistics is kept per index of axis 2, and the same
    statistics are applied to ``val`` and ``test``.

    Parameters
    ----------
    train, val, test : numpy.ndarray
        Arrays whose shapes agree on every axis except the first.

    Returns
    -------
    tuple
        ``({'_max': ..., '_min': ...}, train_norm, val_norm, test_norm)``;
        the statistics keep their reduced axes with length 1.

    Raises
    ------
    AssertionError
        If the trailing shapes of the three splits differ.
    """
    assert train.shape[1:] == val.shape[1:] and val.shape[1:] == test.shape[1:]

    _max = train.max(axis=(0, 1, 3), keepdims=True)
    _min = train.min(axis=(0, 1, 3), keepdims=True)

    print('_max.shape:', _max.shape)
    print('_min.shape:', _min.shape)

    # A feature that is constant over the training split has zero range.
    # Dividing by it would fill the output with NaN/inf, so fall back to a
    # divisor of 1; such a feature then maps to -1 wherever it equals _min.
    value_range = _max - _min
    safe_range = np.where(value_range == 0, 1, value_range)

    def normalize(x):
        x = 1. * (x - _min) / safe_range  # -> [0, 1] on the training range
        x = 2. * x - 1.                   # -> [-1, 1]
        return x

    train_norm = normalize(train)
    val_norm = normalize(val)
    test_norm = normalize(test)

    return {'_max': _max, '_min': _min}, train_norm, val_norm, test_norm

# Marker so the cell output confirms the definitions above were executed
print("✅ Functions defined!")

✅ Functions defined!


In [38]:
def read_and_generate_dataset_encoder_decoder_correct(graph_signal_matrix_filename,
                                              num_of_weeks, num_of_days,
                                              num_of_hours, num_for_predict,
                                              points_per_hour=12, save=False):
    """
    Build train/val/test samples from a raw signal archive.

    The ``data`` array of the archive is turned into sliding-window samples
    with ``get_sample_indices``. Samples are split chronologically, the inputs
    are scaled with ``MinMaxnormalization`` (targets are left unscaled), and
    the result is optionally saved as a compressed ``.npz``.

    Parameters
    ----------
    graph_signal_matrix_filename : str
        ``.npz`` file holding the raw series under the key ``data``.
    num_of_weeks, num_of_days, num_of_hours : int
        Number of weekly / daily / hourly input windows per sample.
    num_for_predict : int
        Length of the target window.
    points_per_hour : int, optional
        Number of samples per hour (default 12).
    save : bool, optional
        When true, write ``<name>_r<hours>_d<days>_w<weeks>.npz`` next to the
        input file.

    Returns
    -------
    dict
        ``{'train'|'val'|'test': {'x', 'target', 'timestamp'},
        'stats': {'_max', '_min'}}``.

    Raises
    ------
    ValueError
        If no sample could be generated from the data.

    Notes
    -----
    The saved archive stores the max under the key ``mean`` and the min under
    the key ``std``. The key names are kept as they are because another cell
    of this notebook compares the output against an archive with those keys.
    """
    # Load inside a context manager so the archive is closed right away
    with np.load(graph_signal_matrix_filename) as raw_archive:
        data_seq = raw_archive['data']
    print(f"📊 Loaded data shape: {data_seq.shape}")

    all_samples = []
    for idx in range(data_seq.shape[0]):
        sample = get_sample_indices(data_seq, num_of_weeks, num_of_days,
                                    num_of_hours, idx, num_for_predict,
                                    points_per_hour)
        # No usable window for this start index
        if ((sample[0] is None) and (sample[1] is None) and (sample[2] is None)):
            continue

        week_sample, day_sample, hour_sample, target = sample

        sample = []

        # Every component gets a leading sample axis and has its time axis
        # moved to the end.
        if num_of_weeks > 0:
            week_sample = np.expand_dims(week_sample, axis=0).transpose((0, 2, 3, 1))
            sample.append(week_sample)

        if num_of_days > 0:
            day_sample = np.expand_dims(day_sample, axis=0).transpose((0, 2, 3, 1))
            sample.append(day_sample)

        if num_of_hours > 0:
            hour_sample = np.expand_dims(hour_sample, axis=0).transpose((0, 2, 3, 1))
            sample.append(hour_sample)

        # The target keeps all features
        target = np.expand_dims(target, axis=0).transpose((0, 2, 3, 1))
        sample.append(target)

        # The timestamp is the start index of the target window, stored as int64
        time_sample = np.expand_dims(np.array([idx], dtype=np.int64), axis=0)
        sample.append(time_sample)

        all_samples.append(sample)

    print(f"📊 Total samples created: {len(all_samples)}")

    total_samples = len(all_samples)
    if total_samples == 0:
        raise ValueError(
            "No samples could be generated: the series is too short for the "
            "requested windows, or all of num_of_weeks/num_of_days/num_of_hours are 0"
        )

    # Chronological split: the last 672 samples are the test set and the 672
    # before them the validation set. Strictly more samples than that are
    # required, otherwise the training split would be empty.
    val_len, test_len = 672, 672
    holdout = val_len + test_len

    if total_samples > holdout:
        test_samples = all_samples[-test_len:]
        val_samples = all_samples[-holdout:-test_len]
        train_samples = all_samples[:-holdout]

        print(f"📊 Chronological split:")
        print(f"   Train samples: {len(train_samples)} (earliest)")
        print(f"   Val samples: {len(val_samples)} (middle)")
        print(f"   Test samples: {len(test_samples)} (latest)")
    else:
        # Too few samples for the fixed hold-out sizes: fall back to 60/20/20
        split_line1 = int(total_samples * 0.6)
        split_line2 = int(total_samples * 0.8)
        train_samples = all_samples[:split_line1]
        val_samples = all_samples[split_line1:split_line2]
        test_samples = all_samples[split_line2:]

    # Concatenate each component across samples
    training_set = [np.concatenate(i, axis=0) for i in zip(*train_samples)]
    validation_set = [np.concatenate(i, axis=0) for i in zip(*val_samples)]
    testing_set = [np.concatenate(i, axis=0) for i in zip(*test_samples)]

    # The last two components are target and timestamp; everything before
    # them is input, joined along the time axis.
    train_x = np.concatenate(training_set[:-2], axis=-1)
    val_x = np.concatenate(validation_set[:-2], axis=-1)
    test_x = np.concatenate(testing_set[:-2], axis=-1)

    train_target = training_set[-2]
    val_target = validation_set[-2]
    test_target = testing_set[-2]

    train_timestamp = training_set[-1]
    val_timestamp = validation_set[-1]
    test_timestamp = testing_set[-1]

    print(f"🔍 Before normalization shapes:")
    print(f"   train_target: {train_target.shape}")
    print(f"   val_target: {val_target.shape}")
    print(f"   test_target: {test_target.shape}")

    # Check the timestamp dtypes
    print(f"🔍 Timestamp dtypes:")
    print(f"   train_timestamp: {train_timestamp.dtype}")
    print(f"   val_timestamp: {val_timestamp.dtype}")
    print(f"   test_timestamp: {test_timestamp.dtype}")

    # Only the inputs are normalized; targets stay in raw units
    (stats, train_x_norm, val_x_norm, test_x_norm) = MinMaxnormalization(train_x, val_x, test_x)

    all_data = {
        'train': {
            'x': train_x_norm,
            'target': train_target,
            'timestamp': train_timestamp,
        },
        'val': {
            'x': val_x_norm,
            'target': val_target,
            'timestamp': val_timestamp,
        },
        'test': {
            'x': test_x_norm,
            'target': test_target,
            'timestamp': test_timestamp,
        },
        'stats': {
            '_max': stats['_max'],
            '_min': stats['_min'],
        }
    }

    # Print results
    print('✅ CORRECTED Results:')
    print('train x:', all_data['train']['x'].shape, all_data['train']['x'].dtype)
    print('train target:', all_data['train']['target'].shape, all_data['train']['target'].dtype)
    print('train timestamp:', all_data['train']['timestamp'].shape, all_data['train']['timestamp'].dtype)
    print()
    print('val x:', all_data['val']['x'].shape, all_data['val']['x'].dtype)
    print('val target:', all_data['val']['target'].shape, all_data['val']['target'].dtype)
    print('val timestamp:', all_data['val']['timestamp'].shape, all_data['val']['timestamp'].dtype)
    print()
    print('test x:', all_data['test']['x'].shape, all_data['test']['x'].dtype)
    print('test target:', all_data['test']['target'].shape, all_data['test']['target'].dtype)
    print('test timestamp:', all_data['test']['timestamp'].shape, all_data['test']['timestamp'].dtype)

    if save:
        # Everything before the first dot of the input file name
        base_name = os.path.basename(graph_signal_matrix_filename).split('.')[0]
        dirpath = os.path.dirname(graph_signal_matrix_filename)
        filename = os.path.join(dirpath,
                                base_name + '_r' + str(num_of_hours) + '_d' + str(num_of_days) + '_w' + str(num_of_weeks))
        print(f'💾 Saving to: {filename}.npz')
        # NOTE: 'mean' and 'std' carry the max and the min respectively
        np.savez_compressed(filename,
                            train_x=all_data['train']['x'], train_target=all_data['train']['target'],
                            train_timestamp=all_data['train']['timestamp'],
                            val_x=all_data['val']['x'], val_target=all_data['val']['target'],
                            val_timestamp=all_data['val']['timestamp'],
                            test_x=all_data['test']['x'], test_target=all_data['test']['target'],
                            test_timestamp=all_data['test']['timestamp'],
                            mean=all_data['stats']['_max'], std=all_data['stats']['_min']
                            )
    return all_data

# Marker so the cell output confirms the definition above was executed
print("✅ Function with CORRECT target shape and INT64 timestamps defined!")

✅ Function with CORRECT target shape and INT64 timestamps defined!


In [54]:
# Read the config file
import configparser  # imported here so the cell also runs on a fresh kernel

config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it did read, so an empty result means the config was not loaded.
if not config.read('configurations/taxi.conf'):
    raise FileNotFoundError("Config file not found or unreadable: configurations/taxi.conf")

data_config = config['Data']
training_config = config['Training']

# Extract the parameters
graph_signal_matrix_filename = data_config['graph_signal_matrix_filename']
num_of_vertices = int(data_config['num_of_vertices'])
points_per_hour = int(data_config['points_per_hour'])
num_for_predict = int(data_config['num_for_predict'])
num_of_weeks = int(training_config['num_of_weeks'])
num_of_days = int(training_config['num_of_days'])
num_of_hours = int(training_config['num_of_hours'])

print(f"📋 Configuration:")
print(f"   Data file: {graph_signal_matrix_filename}")
print(f"   Vertices: {num_of_vertices}")
print(f"   Points per hour: {points_per_hour}")
print(f"   Predict steps: {num_for_predict}")
print(f"   Temporal: W={num_of_weeks}, D={num_of_days}, H={num_of_hours}")

# Check the shape of the raw data.
# NOTE(review): `data` stays bound to an open archive handle; left unchanged
# in case other cells read from it.
data = np.load(graph_signal_matrix_filename)
print(f"✅ Data shape: {data['data'].shape}")

📋 Configuration:
   Data file: taxi_raw.npz
   Vertices: 266
   Points per hour: 12
   Predict steps: 12
   Temporal: W=0, D=0, H=1
✅ Data shape: (4368, 266, 2)


In [55]:
# Generate the dataset with the parameters read from the config cell and
# save the resulting archive (save=True).
print("🚀 Starting data processing...")

all_data = read_and_generate_dataset_encoder_decoder_correct(
    graph_signal_matrix_filename=graph_signal_matrix_filename,
    num_of_weeks=num_of_weeks,
    num_of_days=num_of_days,
    num_of_hours=num_of_hours,
    num_for_predict=num_for_predict,
    points_per_hour=points_per_hour,
    save=True
)

print("\n🎉 Processing complete!")

🚀 Starting data processing...
📊 Loaded data shape: (4368, 266, 2)
📊 Total samples created: 4345
📊 Chronological split:
   Train samples: 3001 (earliest)
   Val samples: 672 (middle)
   Test samples: 672 (latest)
🔍 Before normalization shapes:
   train_target: (3001, 266, 2, 12)
   val_target: (672, 266, 2, 12)
   test_target: (672, 266, 2, 12)
🔍 Timestamp dtypes:
   train_timestamp: int64
   val_timestamp: int64
   test_timestamp: int64
_max.shape: (1, 1, 2, 1)
_min.shape: (1, 1, 2, 1)
✅ CORRECTED Results:
train x: (3001, 266, 2, 12) float64
train target: (3001, 266, 2, 12) int64
train timestamp: (3001, 1) int64

val x: (672, 266, 2, 12) float64
val target: (672, 266, 2, 12) int64
val timestamp: (672, 1) int64

test x: (672, 266, 2, 12) float64
test target: (672, 266, 2, 12) int64
test timestamp: (672, 1) int64
💾 Saving to: taxi_raw_r1_d0_w0.npz

🎉 Processing complete!


In [47]:
import h5py
import numpy as np
from scipy.spatial.distance import cdist
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import os

# ========================================================================================
# NORMALIZATION METHODS
# ========================================================================================

class normalization:
    """
    Namespace of scaler factories that are looked up by method name.

    Each static method returns a fresh, unfitted scaler object:
    ``MinMax01`` and ``MinMax11`` are min-max scalers to (0, 1) and (-1, 1),
    ``Standard`` is a standard scaler, and ``None_`` is a pass-through.
    """

    @staticmethod
    def MinMax01():
        """Return a min-max scaler with feature range (0, 1)."""
        return MinMaxScaler(feature_range=(0, 1))

    @staticmethod
    def MinMax11():
        """Return a min-max scaler with feature range (-1, 1)."""
        return MinMaxScaler(feature_range=(-1, 1))

    @staticmethod
    def Standard():
        """Return a standard scaler."""
        return StandardScaler()

    @staticmethod
    def None_():
        """Return a pass-through scaler that leaves the data untouched."""
        class NoNormalization:
            """Identity scaler exposing the usual scaler method names."""

            def fit(self, x, y=None):
                # Nothing to learn; return self like the sklearn scalers do
                return self

            def fit_transform(self, x, y=None):
                return x

            def transform(self, x):
                return x

            def inverse_transform(self, x):
                return x

        return NoNormalization()

# ========================================================================================
# GRAPH NORMALIZATION METHODS
# ========================================================================================

def random_walk_matrix(adj_matrix):
    """
    Random-walk normalization of an adjacency matrix: D^-1 A.

    Every row is divided by its row sum. Rows whose sum is zero are left as
    all zeros.

    Parameters
    ----------
    adj_matrix : array_like
        2-D adjacency matrix. Integer and boolean inputs are converted to
        float first.

    Returns
    -------
    numpy.ndarray
        The row-normalized matrix, same shape as the input.
    """
    adj = np.asarray(adj_matrix)
    # np.power refuses negative exponents on integer arrays, so make sure we
    # are working with a floating (or complex) dtype.
    if not np.issubdtype(adj.dtype, np.inexact):
        adj = adj.astype(float)

    d = np.sum(adj, axis=1)
    # Zero row sums give inf here; that is expected and fixed on the next line
    with np.errstate(divide='ignore'):
        d_inv = np.power(d, -1)
    d_inv[np.isinf(d_inv)] = 0.

    # Scaling each row by broadcasting is the same as multiplying by
    # diag(d_inv), without building the dense diagonal matrix.
    return d_inv[:, np.newaxis] * adj

def normalized_laplacian(adj_matrix):
    """
    Symmetric degree normalization of an adjacency matrix: D^-1/2 A D^-1/2.

    Despite the function name, the value returned is the symmetrically
    normalized adjacency matrix; it is not subtracted from the identity.
    Nodes whose row sum is zero get all-zero rows and columns.

    Parameters
    ----------
    adj_matrix : array_like
        Square 2-D adjacency matrix. Integer and boolean inputs are converted
        to float first.

    Returns
    -------
    numpy.ndarray
        The normalized matrix, same shape as the input.
    """
    adj = np.asarray(adj_matrix)
    # np.power refuses negative exponents on integer arrays, so make sure we
    # are working with a floating (or complex) dtype.
    if not np.issubdtype(adj.dtype, np.inexact):
        adj = adj.astype(float)

    d = np.sum(adj, axis=1)
    # Zero row sums give inf here; that is expected and fixed on the next line
    with np.errstate(divide='ignore'):
        d_inv_sqrt = np.power(d, -0.5)
    d_inv_sqrt[np.isinf(d_inv_sqrt)] = 0.

    # Row scaling followed by column scaling, i.e. diag(s) @ A @ diag(s),
    # without building the dense diagonal matrices.
    return d_inv_sqrt[:, np.newaxis] * adj * d_inv_sqrt[np.newaxis, :]

# ========================================================================================
# MAIN FUNCTION
# ========================================================================================

def create_svd_from_h5_advanced(dataset='bike', city='NYC',
                               Normal_Method='MinMax01',
                               data_category=['bike'],
                               hidden_size=50,
                               normalized_category='randomwalk',
                               output_path='data/bike_svd.npy',
                               _len=[672, 672]):
    """
    Build an SVD-based node embedding and a support (affinity) matrix from
    pick/drop HDF5 data.

    For every category the file ``data/{dataset}/{city}/{category}_data.h5``
    is read (datasets ``{category}_pick`` and ``{category}_drop``), the two
    series are stacked and normalized, and the trailing validation/test
    steps are dropped. The remaining data is factorised with SVD; the
    weight matrix ``w`` and a distance-based support matrix are saved to disk.

    NOTE(review): the scaler is fitted on the full series *before* the
    validation/test steps are removed, so held-out data influences the
    scaling. Left unchanged here; confirm whether that is intended.

    Parameters
    ----------
    dataset : str
        Dataset name (directory under ``data/``).
    city : str
        City name (sub-directory under the dataset).
    Normal_Method : str
        Name of a factory on ``normalization`` (e.g. ``'MinMax01'``).
    data_category : list of str
        Categories to load. Categories whose file is missing are skipped.
        The list defaults are never mutated by this function.
    hidden_size : int
        Number of singular components kept in the weight matrix.
    normalized_category : str
        ``'randomwalk'`` or ``'laplacian'``. Any other value leaves the
        support matrix unnormalized (a warning is printed).
    output_path : str
        Destination of the weight matrix. The support matrix is written next
        to it with ``_support`` inserted before the extension.
    _len : list of int
        ``[val_len, test_len]``: number of trailing time steps to drop.

    Returns
    -------
    (w, support) : tuple of numpy.ndarray
        ``w`` has shape (N, k) with ``k = min(hidden_size, rank limit)``;
        ``support`` has shape (N, N).

    Raises
    ------
    ValueError
        If no category file could be loaded, or if removing the
        validation/test steps leaves no training data.
    """

    print(f"🚀 Creating SVD with advanced method")
    print(f"   Dataset: {dataset}")
    print(f"   City: {city}")
    print(f"   Normalization: {Normal_Method}")
    print(f"   Categories: {data_category}")
    print(f"   Hidden size: {hidden_size}")
    print(f"   Graph normalization: {normalized_category}")

    # Resolve the scaler factory by name
    normal_method = getattr(normalization, Normal_Method)

    data = []

    for category in data_category:
        print(f"📂 Processing category: {category}")

        # A fresh scaler per category so each one is fitted independently
        normal = normal_method()

        h5_file = f"data/{dataset}/{city}/{category}_data.h5"
        print(f"   Reading: {h5_file}")

        if not os.path.exists(h5_file):
            print(f"   ❌ File not found: {h5_file}")
            continue

        # Read pick and drop data in a single pass over the file
        with h5py.File(h5_file, 'r') as hf:
            print(f"   📋 Keys in file: {list(hf.keys())}")
            data_pick = hf[f'{category}_pick'][:]
            print(f"   📊 Pick data shape: {data_pick.shape}")
            data_drop = hf[f'{category}_drop'][:]
            print(f"   📊 Drop data shape: {data_drop.shape}")

        # Stack and normalize: each of the two channels is one scaler feature
        stacked_data = np.stack([data_pick, data_drop], axis=2)  # (T, N, 2)
        normalized_data = normal.fit_transform(stacked_data.reshape(-1, stacked_data.shape[-1]))
        normalized_data = normalized_data.reshape(stacked_data.shape)

        print(f"   📊 Normalized data shape: {normalized_data.shape}")
        data.append(normalized_data)

    if not data:
        # np.concatenate([]) would fail with an unrelated-looking message
        raise ValueError(
            f"No input data loaded: none of the files for categories "
            f"{list(data_category)} exist under data/{dataset}/{city}/"
        )

    # Concatenate categories along the node axis
    data = np.concatenate(data, axis=1).transpose((0, 2, 1))  # (T, F, N)
    print(f"📊 Concatenated data shape: {data.shape}")

    # Remove validation and test data. Slicing with [:-0] would drop
    # everything, so a zero-length hold-out keeps the data untouched.
    holdout = _len[0] + _len[1]
    if holdout != 0:
        data = data[:-holdout]
    print(f"📊 Training data shape after removal: {data.shape}")

    if data.shape[0] == 0:
        raise ValueError(
            f"No training data left after removing {holdout} validation/test steps"
        )

    N = data.shape[-1]

    # Reshape for SVD
    inputs = data.reshape(-1, N)  # (T*F, N)
    print(f"📊 Reshaped inputs for SVD: {inputs.shape}")

    # Perform SVD
    print(f"🔧 Performing SVD...")
    u, s, v = np.linalg.svd(inputs, full_matrices=False)
    print(f"📊 SVD shapes - U: {u.shape}, S: {s.shape}, V: {v.shape}")

    # Weight matrix: leading right-singular vectors scaled by singular values
    w = np.diag(s[:hidden_size]).dot(v[:hidden_size, :]).T  # (N, hidden_size)
    print(f"📊 Weight matrix shape: {w.shape}")

    # Pairwise euclidean distance between node embeddings
    print(f"🔧 Creating graph...")
    graph = cdist(w, w, metric='euclidean')  # (N, N)
    print(f"📊 Graph shape: {graph.shape}")

    # Support matrix: exp(-distance / std(distance)^2)
    support = graph * -1 / (np.std(graph) ** 2)
    support = np.exp(support)
    print(f"📊 Support matrix stats: min={support.min():.6f}, max={support.max():.6f}")

    # Remove self-connections (the diagonal is exp(0) == 1 before this step)
    support = support - np.identity(support.shape[0])

    # Apply graph normalization
    if normalized_category == 'randomwalk':
        print(f"🔧 Applying random walk normalization...")
        support = random_walk_matrix(support)
    elif normalized_category == 'laplacian':
        print(f"🔧 Applying Laplacian normalization...")
        support = normalized_laplacian(support)
    else:
        print(f"⚠️ Unknown graph normalization '{normalized_category}'; support matrix left unnormalized")

    print(f"📊 Final support matrix shape: {support.shape}")
    print(f"📊 Final support matrix stats: min={support.min():.6f}, max={support.max():.6f}")

    # Save the weight matrix w (not the support matrix) as the SVD output.
    # os.makedirs('') raises, so only create a directory when one is given.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    np.save(output_path, w)  # weight matrix (N, hidden_size)
    print(f"✅ Saved SVD weight matrix to: {output_path}")

    # Save the support matrix alongside. Building the name from the split
    # extension keeps it distinct from output_path even when that path has
    # no '.npy' suffix (np.save appends one), so w is never overwritten.
    root, ext = os.path.splitext(output_path)
    support_path = f"{root}_support{ext if ext else '.npy'}"
    np.save(support_path, support)
    print(f"✅ Saved support matrix to: {support_path}")

    return w, support

# ========================================================================================
# WRAPPER FUNCTION CHO BIKE DATA
# ========================================================================================

def create_bike_svd_advanced():
    """
    Build the NYC bike SVD artefacts, locating the HDF5 input first.

    If ``data/bike/NYC/bike_data.h5`` is missing, a few fallback locations
    are tried and the first existing file is copied into place. The SVD is
    then computed with fixed settings (MinMax01 scaling, 250 components,
    random-walk normalization, output ``data/bike_svd1.npy``).

    Returns
    -------
    tuple or None
        ``(w, support)`` on success; ``None`` when no input file can be
        found or when the SVD step raises (the error is printed).
    """

    # Expected location of the input file
    base_path = "data/bike/NYC"
    h5_file = f"{base_path}/bike_data.h5"

    if not os.path.exists(h5_file):
        print(f"❌ File not found: {h5_file}")
        print("🔍 Trying alternative paths...")

        # Fallback locations, in priority order
        candidates = (
            "data/nogrid/bike_data.h5",
            "data/bike_data.h5",
            "bike_data.h5",
        )
        fallback = next((path for path in candidates if os.path.exists(path)), None)

        if fallback is None:
            print("❌ No H5 file found. Please check file paths.")
            return None

        print(f"✅ Found file at: {fallback}")
        # Copy the file to the location the SVD routine reads from
        os.makedirs(os.path.dirname(h5_file), exist_ok=True)
        import shutil
        shutil.copy2(fallback, h5_file)

    # Fixed configuration for the bike dataset
    svd_settings = dict(
        dataset='bike',
        city='NYC',
        Normal_Method='MinMax01',
        data_category=['bike'],
        hidden_size=250,
        normalized_category='randomwalk',
        output_path='data/bike_svd1.npy',
        _len=[672, 672],
    )

    try:
        w, support = create_svd_from_h5_advanced(**svd_settings)
        print("\n🎉 SVD creation completed!")
        return w, support
    except Exception as e:
        # Best effort: report the failure and signal it with None
        print(f"❌ Error: {e}")
        return None

# ========================================================================================
# CHẠY
# ========================================================================================

# Build the bike SVD artefacts and report what was produced
result = create_bike_svd_advanced()

if result is not None:
    w, support = result
    print(
        f"\n📊 Results:",
        f"   Weight matrix shape: {w.shape}",
        f"   Support matrix shape: {support.shape}",
        sep="\n",
    )

    # Round-trip check: reload the saved weight matrix from disk
    if os.path.exists('data/bike_svd1.npy'):
        loaded = np.load('data/bike_svd1.npy')
        print(f"   ✅ Saved SVD shape: {loaded.shape}")

🚀 Creating SVD with advanced method
   Dataset: bike
   City: NYC
   Normalization: MinMax01
   Categories: ['bike']
   Hidden size: 250
   Graph normalization: randomwalk
📂 Processing category: bike
   Reading: data/bike/NYC/bike_data.h5
   📋 Keys in file: ['bike_drop', 'bike_pick']
   📊 Pick data shape: (4368, 250)
   📊 Drop data shape: (4368, 250)
   📊 Normalized data shape: (4368, 250, 2)
📊 Concatenated data shape: (4368, 2, 250)
📊 Training data shape after removal: (3024, 2, 250)
📊 Reshaped inputs for SVD: (6048, 250)
🔧 Performing SVD...
📊 SVD shapes - U: (6048, 250), S: (250,), V: (250, 250)
📊 Weight matrix shape: (250, 250)
🔧 Creating graph...
📊 Graph shape: (250, 250)
📊 Support matrix stats: min=0.000582, max=1.000000
🔧 Applying random walk normalization...
📊 Final support matrix shape: (250, 250)
📊 Final support matrix stats: min=0.000000, max=0.020004
✅ Saved SVD weight matrix to: data/bike_svd1.npy
✅ Saved support matrix to: data/bike_svd1_support.npy

🎉 SVD creation completed!

In [50]:
def quick_check(files=None):
    """
    Print shape and basic statistics for saved ``.npy`` arrays.

    Parameters
    ----------
    files : iterable of str, optional
        Paths to inspect. Defaults to the two files
        ``data/bike_svd.npy`` and ``data/bike_svd_support.npy``. Pass e.g.
        ``['data/bike_svd1.npy', 'data/bike_svd1_support.npy']`` to inspect
        the output of a run that used a different ``output_path``.

    Returns
    -------
    None
        Results are printed; missing files are reported, not raised.
    """
    if files is None:
        files = ['data/bike_svd.npy', 'data/bike_svd_support.npy']

    for path in files:
        if not os.path.exists(path):
            print(f"❌ {path} not found")
            continue

        data = np.load(path)
        print(f"✅ {path}:")
        print(f"   Shape: {data.shape}")
        if data.size == 0:
            # min()/max() raise on zero-size arrays
            print("   (empty array - no statistics)")
        else:
            print(f"   Range: [{data.min():.6f}, {data.max():.6f}]")
            print(f"   Mean: {data.mean():.6f}")
            print(f"   Std: {data.std():.6f}")
        print()

quick_check()

✅ data/bike_svd.npy:
   Shape: (250, 250)
   Range: [0.000000, 0.004293]
   Mean: 0.004000
   Std: 0.000267

✅ data/bike_svd_support.npy:
   Shape: (250, 250)
   Range: [0.000000, 0.020004]
   Mean: 0.004000
   Std: 0.001453



In [None]:
# File: process_bike_data.py
import numpy as np
import h5py
import os

def main():
    """Run the bike-data pipeline end to end and print progress.

    Calls ``create_bike_raw_npz_flexible`` and then
    ``read_and_generate_dataset_encoder_decoder`` (both defined elsewhere in
    the notebook) on ``data/bike_raw.npz``. The validation step between them
    is currently disabled, although its progress line is still printed.
    """
    print("🚀 PROCESSING BIKE DATA")
    print("="*50)

    # Step 1: build the raw archive (presumably data/bike_raw.npz - verify
    # against create_bike_raw_npz_flexible)
    print("📝 Step 1: Creating bike_raw.npz...")
    raw_data = create_bike_raw_npz_flexible()

    # Step 2: validation is switched off; only the progress line is emitted
    print("\n📝 Step 2: Validating data...")
    # validate_bike_data()

    # Step 3: generate the encoder/decoder dataset from the raw archive
    print("\n📝 Step 3: Running preprocessing...")
    preprocessing_options = dict(
        graph_signal_matrix_filename='data/bike_raw.npz',
        num_of_weeks=1,
        num_of_days=1,
        num_of_hours=1,
        num_for_predict=12,
        points_per_hour=12,
        save=True,
    )
    all_data = read_and_generate_dataset_encoder_decoder(**preprocessing_options)

    # Summary of the files the pipeline is expected to have written
    summary_lines = (
        "\n✅ DONE! Files created:",
        "   - data/bike_raw.npz (raw data)",
        "   - data/bike_raw_r1_d1_w1.npz (processed data)",
    )
    for line in summary_lines:
        print(line)

if __name__ == "__main__":
    main()