In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import pandas as pd
import datetime
import random
import os
import rasterio
from tqdm import tqdm
import pickle
from rasterio.enums import Resampling

In [2]:
# Data preprocessing function
def preprocess_data(file_path):
    """Load the label CSV and build one row per pair of consecutive years.

    Processing steps, in order:

    1. Read the CSV and add a ``Time`` column (1 January of ``Year``).
    2. Replace the economic / geometry columns listed below with their
       natural logarithm.
    3. Sort by ``GeoFIPS`` then ``Time`` and, within each ``GeoFIPS`` group,
       attach the preceding row's image path (``previous_image_path``) and
       economic values (``PreviousYear<column>``).
    4. Rename ``ImagePath`` to ``current_image_path``.
    5. Keep only rows whose preceding row in the group is exactly one year
       earlier and has a non-null image path.
    6. Add ``<column>Difference`` columns (current minus previous, computed
       on the log-transformed values).

    Args:
        file_path: Location of the CSV, passed straight to ``pandas.read_csv``.

    Returns:
        pandas.DataFrame: The filtered rows with the added columns. The
        helper column ``IsContinuous`` is dropped; the original index labels
        of the sorted frame are kept (the index is not reset).
    """
    natural_gas = 'Natural Gas Delivered to Consumers in California (Including Vehicle Fuel)  Million Cubic Feet'

    df = pd.read_csv(file_path)

    # int(...) keeps this working when pandas parsed Year as a float column.
    df['Time'] = df['Year'].apply(lambda year: datetime.datetime(int(year), 1, 1))

    # Log transformation. np.log yields -inf for 0 and NaN for negative
    # inputs; such values are passed through unchanged.
    cols_to_log_transform = ['GDP', 'Population', 'Personalincome', 'Percapitapersonalincome', 'econs',
                             'aland', 'awater', 'shape_area', 'shape_leng', natural_gas]
    df[cols_to_log_transform] = df[cols_to_log_transform].apply(np.log)

    # Previous-row values per GeoFIPS, computed in a single grouped shift.
    df = df.sort_values(by=['GeoFIPS', 'Time'])
    lagged_cols = ['GDP', 'Population', 'Personalincome', 'Percapitapersonalincome', 'econs', natural_gas]
    previous = df.groupby('GeoFIPS')[['Year', 'ImagePath'] + lagged_cols].shift(1)

    df['previous_image_path'] = previous['ImagePath']
    for col in lagged_cols:
        df[f'PreviousYear{col}'] = previous[col]

    df = df.rename(columns={'ImagePath': 'current_image_path'})

    # A row is usable only when the preceding row of the same GeoFIPS is the
    # immediately preceding calendar year.
    df['IsContinuous'] = (df['Year'] - previous['Year']) == 1
    df_filtered = df[df['IsContinuous'] & df['previous_image_path'].notna()].copy()

    # Year-over-year differences of the log values.
    for col in ['GDP', 'Population', 'Personalincome', 'econs']:
        df_filtered[f'{col}Difference'] = df_filtered[col] - df_filtered[f'PreviousYear{col}']
    df_filtered[f'{natural_gas} Difference'] = (
        df_filtered[natural_gas] - df_filtered[f'PreviousYear{natural_gas}']
    )

    return df_filtered.drop(columns=['IsContinuous'])

In [None]:
class EconomicDataset(Dataset):
    """Dataset of (current raster, previous raster, economic features, label).

    Each row of ``dataframe`` supplies two image paths
    (``current_image_path`` and ``previous_image_path``, joined onto
    ``root_dir``), the previous-year economic columns and the
    ``GDPDifference`` label. Current-year features are taken, already
    scaled, from ``economic_features``; previous-year values are scaled on
    the fly with ``scaler``.

    Args:
        dataframe: Frame holding the columns read in ``__getitem__``.
        root_dir: Directory prefix for the image paths.
        economic_features: Tensor indexed by the same positional index as
            ``dataframe`` (row ``idx`` of both describe the same sample).
        img_size: Side length of the square image returned for each sample.
        img_augmented_size: Stored only; not used by any method of this class.
        scale_factor: Multiplier applied to the raster height/width when
            reading.
        transform: Stored only; not used by any method of this class.
        scaler: Object with a ``transform`` method applied to the
            previous-year values. Required by ``__getitem__`` even though
            the default is ``None``.
    """

    # Column names given to the frame that is passed to ``scaler.transform``.
    _SCALED_COLUMNS = [
        'econs', 'Population', 'Personalincome', 'Percapitapersonalincome', 'aland', 'awater',
        'shape_area', 'shape_leng',
        'Natural Gas Delivered to Consumers in California (Including Vehicle Fuel)  Million Cubic Feet',
        'intptlat', 'intptlon',
    ]
    # Dataframe columns that provide the previous-year value for each entry
    # of _SCALED_COLUMNS (same order). Columns without a "PreviousYear"
    # variant are read as-is.
    _PREVIOUS_SOURCE_COLUMNS = [
        'PreviousYearecons', 'PreviousYearPopulation', 'PreviousYearPersonalincome',
        'PreviousYearPercapitapersonalincome', 'aland', 'awater', 'shape_area', 'shape_leng',
        'PreviousYearNatural Gas Delivered to Consumers in California (Including Vehicle Fuel)  Million Cubic Feet',
        'intptlat', 'intptlon',
    ]

    def __init__(self, dataframe, root_dir, economic_features, img_size, img_augmented_size, scale_factor=0.8, transform=None, scaler=None):
        self.dataframe = dataframe
        self.root_dir = root_dir
        self.img_size = img_size
        self.img_augmented_size = img_augmented_size
        self.transform = transform
        self.economic_features = economic_features
        self.scale_factor = scale_factor
        self.scaler = scaler

    def read_and_resize_image(self, image_path):
        """Read all bands of a raster at a reduced resolution.

        Args:
            image_path: Path handed to ``rasterio.open``.

        Returns:
            numpy.ndarray: float32 array of shape (bands, height, width),
            resampled bilinearly by ``scale_factor``, with NaN replaced by 0.
        """
        with rasterio.open(image_path) as src:
            # Resample while reading; never request a zero-sized dimension.
            out_height = max(1, int(src.height * self.scale_factor))
            out_width = max(1, int(src.width * self.scale_factor))
            data = src.read(
                out_shape=(src.count, out_height, out_width),
                resampling=Resampling.bilinear
            )
            data = data.astype('float32')
            data[np.isnan(data)] = 0  # replace NaN values
            return data

    def random_transform(self, image, flip=None):
        """Optionally mirror, centre-crop to a square, resize and clamp an image.

        Args:
            image: Array of shape (channels, height, width).
            flip: ``True``/``False`` forces the horizontal flip on or off;
                ``None`` (default) draws it at random with probability 0.5.

        Returns:
            torch.Tensor: float tensor of shape
            (channels, ``img_size``, ``img_size``) with values clamped to [0, 1].
        """
        if flip is None:
            flip = random.random() > 0.5
        if flip:
            image = np.flip(image, axis=2)  # mirror along the width axis

        # Centre crop to the largest square that fits.
        _, height, width = image.shape
        side = min(height, width)
        top = (height - side) // 2
        left = (width - side) // 2
        # .copy() makes the array contiguous; np.flip returns a
        # negative-stride view that torch.from_numpy cannot wrap.
        image = image[:, top:top + side, left:left + side].copy()

        image = torch.from_numpy(image).float()

        # Resize to the target resolution.
        image = F.interpolate(image.unsqueeze(0), size=(self.img_size, self.img_size), mode='bilinear', align_corners=False).squeeze(0)

        # Clamp pixel values to the range [0, 1].
        image = torch.clamp(image, 0, 1)

        return image

    def __len__(self):
        """Return the number of rows in ``dataframe``."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Build one sample.

        Returns:
            tuple: ``(current_image, previous_image, current_econ_features,
            prev_econ_scaled, econ_diff, GDPDifference)`` where ``econ_diff``
            is ``current_econ_features - prev_econ_scaled``.
        """
        row = self.dataframe.iloc[idx]
        current_img_path = os.path.join(self.root_dir, row['current_image_path'])
        previous_img_path = os.path.join(self.root_dir, row['previous_image_path'])

        # Draw the flip once so both images of the pair get the same
        # geometric transform and stay spatially aligned.
        flip = random.random() > 0.5
        current_image = self.random_transform(self.read_and_resize_image(current_img_path), flip=flip)
        previous_image = self.random_transform(self.read_and_resize_image(previous_img_path), flip=flip)

        # Current-year features: row idx of the pre-scaled tensor.
        current_econ_features = self.economic_features[idx].float()

        # Previous-year features: read from the row, then scale. The values
        # are wrapped in a DataFrame carrying the scaler's column names.
        prev_econ_values = np.array(
            [row[column] for column in self._PREVIOUS_SOURCE_COLUMNS]
        ).astype(np.float32).reshape(1, -1)
        prev_econ_df = pd.DataFrame(prev_econ_values, columns=self._SCALED_COLUMNS)
        prev_econ_scaled = torch.tensor(self.scaler.transform(prev_econ_df), dtype=torch.float32).squeeze(0)

        econ_diff = current_econ_features - prev_econ_scaled

        # Label
        GDPDifference = row['GDPDifference']

        return current_image, previous_image, current_econ_features, prev_econ_scaled, econ_diff, GDPDifference


In [4]:
from sklearn.preprocessing import StandardScaler
# Revised dataset-splitting function
def create_datasets(df_cleaned, img_size, img_augmented_size):
    """Split the rows per GeoFIPS into train/val/test and wrap them as datasets.

    Every ``GeoFIPS`` group is split 70 % / 15 % / 15 % with a fixed random
    seed. A ``MinMaxScaler`` is fitted on the training rows only and applied
    to all three splits; the same scaler instance is handed to each dataset.

    Args:
        df_cleaned: Frame containing ``GeoFIPS``, the columns listed in
            ``columns_to_scale`` and whatever ``EconomicDataset`` reads.
        img_size: Forwarded to ``EconomicDataset``.
        img_augmented_size: Forwarded to ``EconomicDataset``.

    Returns:
        tuple: ``(train_dataset, val_dataset, test_dataset)``.

    Raises:
        ValueError: If no group has enough rows to produce validation and
            test rows.
    """
    columns_to_scale = ['econs','Population', 'Personalincome', 'Percapitapersonalincome', 'aland', 'awater','shape_area','shape_leng','Natural Gas Delivered to Consumers in California (Including Vehicle Fuel)  Million Cubic Feet','intptlat','intptlon']

    # With test_size=0.3 followed by a 50/50 split of the held-out part, a
    # group needs at least 4 rows for all three parts to be non-empty;
    # train_test_split raises ValueError otherwise. Smaller groups are kept
    # entirely in the training split instead of aborting the whole run.
    min_rows_for_three_way_split = 4

    train_df_list = []
    val_df_list = []
    test_df_list = []

    # Split each GeoFIPS group separately.
    for _, group in df_cleaned.groupby('GeoFIPS'):
        if len(group) < min_rows_for_three_way_split:
            train_df_list.append(group)
            continue
        train, temp = train_test_split(group, test_size=0.3, random_state=42)
        val, test = train_test_split(temp, test_size=0.5, random_state=42)
        train_df_list.append(train)
        val_df_list.append(val)
        test_df_list.append(test)

    if not val_df_list:
        raise ValueError(
            "create_datasets: no GeoFIPS group has at least "
            f"{min_rows_for_three_way_split} rows, so no validation/test split can be built"
        )

    train_df = pd.concat(train_df_list)
    val_df = pd.concat(val_df_list)
    test_df = pd.concat(test_df_list)

    # Scale the economic features; fit on the training rows only.
    scaler = MinMaxScaler()
    scaler.fit(train_df[columns_to_scale])

    def _scaled_feature_tensor(split_df):
        # Scaled feature matrix of one split as a float32 tensor.
        return torch.tensor(scaler.transform(split_df[columns_to_scale]), dtype=torch.float32)

    train_features_tensor = _scaled_feature_tensor(train_df)
    test_features_tensor = _scaled_feature_tensor(test_df)
    val_features_tensor = _scaled_feature_tensor(val_df)

    # Instantiate the datasets.
    train_dataset = EconomicDataset(train_df, "<train_image_path>", train_features_tensor, img_size, img_augmented_size, scaler=scaler)
    val_dataset = EconomicDataset(val_df, "<validation_image_path>", val_features_tensor, img_size, img_augmented_size, scaler=scaler)
    test_dataset = EconomicDataset(test_df, "<test_image_path>", test_features_tensor, img_size, img_augmented_size, scaler=scaler)

    return train_dataset, val_dataset, test_dataset


In [5]:
def process_and_save_dataset(dataset, save_path):
    """Materialise every sample of ``dataset`` and pickle the list to disk.

    Each sample is unpacked as ``(cur_img, prev_img, cur_econ, prev_econ,
    econ_diff, label)``. NaN entries of the three economic tensors are
    replaced by 0; images and label are stored as returned by the dataset.

    Args:
        dataset: Indexable object supporting ``len()`` and ``dataset[i]``
            that yields the 6-tuple described above.
        save_path: Destination file for the pickled list of samples.
    """
    def _zero_nans(tensor):
        # Returns a new tensor, so tensors owned by the dataset (for example
        # a row view of its feature matrix) are never modified in place.
        return torch.where(torch.isnan(tensor), torch.zeros_like(tensor), tensor)

    processed_data = []
    for i in tqdm(range(len(dataset))):
        cur_img, prev_img, cur_econ, prev_econ, econ_diff, label = dataset[i]

        processed_data.append((
            cur_img,
            prev_img,
            _zero_nans(cur_econ),
            _zero_nans(prev_econ),
            _zero_nans(econ_diff),
            label,
        ))

    with open(save_path, 'wb') as f:
        pickle.dump(processed_data, f)

    print(f"Saved processed data to {save_path}")


In [6]:
# Build the table of consecutive-year rows from the label CSV.
df_cleaned = preprocess_data(
    file_path="E://FYP//fypcode//Data//Label//ccc.csv",
)

In [None]:
# Create the train / validation / test datasets (image size and augmented size are both 512).
train_dataset, val_dataset, test_dataset = create_datasets(
    df_cleaned,
    img_size=512,
    img_augmented_size=512,
)

# Process each split and pickle it; the relative paths resolve against the
# current working directory.
process_and_save_dataset(dataset=train_dataset, save_path="5f_train_dataset.pkl")
process_and_save_dataset(dataset=val_dataset, save_path="5f_val_dataset.pkl")
process_and_save_dataset(dataset=test_dataset, save_path="5f_test_dataset.pkl")


 18%|█▊        | 127/709 [07:29<4:35:02, 28.36s/it]