In [1]:

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch import optim
from torchvision import models
from torchvision.models import resnet50, ResNet50_Weights
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import learning_curve
from sklearn.metrics import mean_squared_error
import numpy as np
import pandas as pd
import datetime
import random
import os
import rasterio
from tqdm import tqdm
import matplotlib.pyplot as plt
import pickle
from rasterio.enums import Resampling

In [2]:
# data preprocessing
def preprocess_data(file_path, expected_rows_per_geofips=20):
    """Load the label CSV and prepare it for dataset construction.

    Steps, in order:
      1. Read ``file_path`` with ``pd.read_csv``.
      2. Add a ``Time`` column holding January 1st of each row's ``Year``.
      3. Replace a fixed list of columns (including ``GDP``) with their
         natural logarithm.
      4. Sort the rows by ``GeoFIPS`` and then ``Time``.
      5. Keep only the ``GeoFIPS`` groups that have exactly
         ``expected_rows_per_geofips`` rows; every other group is dropped
         entirely.

    Args:
        file_path: Path of the CSV file to read. It must contain ``Year``,
            ``GeoFIPS`` and every column listed in ``cols_to_log_transform``.
        expected_rows_per_geofips: Number of rows a ``GeoFIPS`` group must
            have to be kept. Defaults to 20.

    Returns:
        A new ``pandas.DataFrame`` with the transformed, sorted and filtered
        rows. The original row index labels are preserved (not reset).

    Raises:
        KeyError: If a required column is missing from the CSV.
    """
    df = pd.read_csv(file_path)

    # One timestamp per row: January 1st of the row's year.
    df['Time'] = df['Year'].apply(lambda x: datetime.datetime(x, 1, 1))

    # Log transformation.
    # NOTE: np.log yields -inf for 0 and NaN for negative inputs; such values
    # are passed through unchanged here.
    cols_to_log_transform = ['GDP', 'Population', 'Personalincome', 'Percapitapersonalincome', 'econs', 'aland', 'awater','shape_area','shape_leng','Natural Gas Delivered to Consumers in California (Including Vehicle Fuel)  Million Cubic Feet']
    df[cols_to_log_transform] = df[cols_to_log_transform].apply(np.log)

    df = df.sort_values(by=['GeoFIPS', 'Time'])

    # Drop every GeoFIPS whose series does not have the expected length.
    complete_groups = df.groupby('GeoFIPS').filter(
        lambda group: len(group) == expected_rows_per_geofips
    )

    # Hand back an independent frame so callers can modify it freely.
    return complete_groups.copy()

In [3]:
class EconomicDataset(Dataset):
    """Dataset that pairs a raster image with one row of tabular features.

    Indexing returns the tuple
    ``(image, economic_features, label, year, geo_fips)`` where ``image`` is
    the raster resized to ``img_size x img_size`` and clamped to ``[0, 1]``,
    and ``label`` is the row's ``GDP`` value.

    Notes:
        * ``transform`` and ``img_augmented_size`` are stored but not used by
          any method of this class.
        * ``scale_factor`` is used only by ``read_and_resize_image``, which
          ``__getitem__`` does not call.
    """

    def __init__(self, dataframe, root_dir, economic_features, img_size, img_augmented_size, scale_factor=0.6, transform=None):
        """Store the table, the image directory and the per-row features.

        Args:
            dataframe: Table with at least ``Year``, ``GeoFIPS`` and ``GDP``
                columns. Column 0 is joined onto ``root_dir`` to build the
                image path (presumably the image file name - confirm
                against the CSV).
            root_dir: Directory that the column-0 values are relative to.
            economic_features: Indexable by row position; each item must
                provide ``.float()`` (e.g. a 2-D torch tensor).
            img_size: Side length of the square image returned per item.
            img_augmented_size: Stored only.
            scale_factor: Height/width multiplier for
                ``read_and_resize_image``.
            transform: Stored only.
        """
        # Tabular inputs.
        self.dataframe = dataframe
        self.economic_features = economic_features
        self.years = dataframe['Year']
        self.geo_fips = dataframe['GeoFIPS']

        # Image location and sizing options.
        self.root_dir = root_dir
        self.img_size = img_size
        self.img_augmented_size = img_augmented_size
        self.scale_factor = scale_factor
        self.transform = transform

    def read_and_resize_image(self, image_path):
        """Read a raster at ``scale_factor`` resolution as float32, NaN -> 0.

        The resize is done by rasterio while decoding (bilinear resampling);
        all bands are kept.
        """
        with rasterio.open(image_path) as src:
            target_shape = (
                src.count,
                int(src.height * self.scale_factor),
                int(src.width * self.scale_factor),
            )
            pixels = src.read(out_shape=target_shape, resampling=Resampling.bilinear)

        pixels = pixels.astype('float32')
        pixels[np.isnan(pixels)] = 0
        return pixels

    def random_transform(self, image):
        """Resize a ``(bands, H, W)`` array to ``img_size`` and clamp to [0, 1].

        Despite the name, nothing here is random: the image is bilinearly
        interpolated to ``(img_size, img_size)`` and its values clamped.
        """
        as_tensor = torch.from_numpy(image).float()
        # F.interpolate needs a leading batch axis; add it, resize, drop it.
        resized = F.interpolate(
            as_tensor.unsqueeze(0),
            size=(self.img_size, self.img_size),
            mode='bilinear',
            align_corners=False,
        )
        return torch.clamp(resized.squeeze(0), 0, 1)

    def __len__(self):
        """Number of samples, i.e. number of rows in the table."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Load sample ``idx`` (a row position, not an index label)."""
        file_name = self.dataframe.iloc[idx, 0]
        with rasterio.open(os.path.join(self.root_dir, file_name)) as src:
            # Full-resolution read of every band.
            pixels = src.read().astype('float32')
        pixels[np.isnan(pixels)] = 0

        return (
            self.random_transform(pixels),
            self.economic_features[idx].float(),
            self.dataframe.iloc[idx]['GDP'],
            self.years.iloc[idx],
            self.geo_fips.iloc[idx],
        )


In [4]:
# dataset creation
def create_datasets(df_cleaned, img_size, img_augmented_size,
                    train_fraction=0.9, val_fraction=0.05,
                    train_root="<train_image_path>",
                    val_root="<validation_image_path>",
                    test_root="<test_image_path>"):
    """Split the table chronologically per GeoFIPS and build three datasets.

    Each ``GeoFIPS`` group is sorted by ``Year`` and cut into consecutive
    train / validation / test slices: the first
    ``int(train_fraction * n)`` rows go to train, the next
    ``int(val_fraction * n)`` rows to validation and the remainder to test.
    With the defaults and 20 rows per group this gives 18 / 1 / 1.

    The tabular feature columns are min-max scaled with a scaler fitted on
    the training rows only, so validation and test rows do not influence
    the scaling. ``GDP`` (the label) is not among the scaled columns.

    Args:
        df_cleaned: Table containing ``GeoFIPS``, ``Year`` and every column
            in ``columns_to_scale``, plus whatever ``EconomicDataset`` reads.
        img_size: Forwarded to ``EconomicDataset``.
        img_augmented_size: Forwarded to ``EconomicDataset``.
        train_fraction: Share of each group's rows assigned to train.
        val_fraction: Share of each group's rows assigned to validation.
        train_root: Image directory for the training dataset.
        val_root: Image directory for the validation dataset.
        test_root: Image directory for the test dataset.

    Returns:
        ``(train_dataset, val_dataset, test_dataset)``.

    Raises:
        ValueError: If the fractions are negative or sum to more than 1, or
            if ``df_cleaned`` has no ``GeoFIPS`` groups.
    """
    # Small tolerance so float sums such as 0.7 + 0.3 are not rejected.
    if train_fraction < 0 or val_fraction < 0 or train_fraction + val_fraction > 1 + 1e-9:
        raise ValueError(
            "train_fraction and val_fraction must be non-negative and sum to at most 1"
        )

    # group by 'GeoFIPS' and split into train, validation, and test sets
    pieces = {'train': [], 'val': [], 'test': []}
    for _, group in df_cleaned.groupby('GeoFIPS'):
        group = group.sort_values(by='Year')
        total_years = len(group)

        train_end = int(train_fraction * total_years)
        val_end = train_end + int(val_fraction * total_years)

        pieces['train'].append(group.iloc[:train_end])
        pieces['val'].append(group.iloc[train_end:val_end])
        pieces['test'].append(group.iloc[val_end:])

    if not pieces['train']:
        # pd.concat([]) would raise a less helpful ValueError.
        raise ValueError("df_cleaned contains no GeoFIPS groups to split")

    train_df = pd.concat(pieces['train'])
    val_df = pd.concat(pieces['val'])
    test_df = pd.concat(pieces['test'])

    columns_to_scale = ['econs','Population', 'Personalincome', 'Percapitapersonalincome', 'aland', 'awater','shape_area','shape_leng','Natural Gas Delivered to Consumers in California (Including Vehicle Fuel)  Million Cubic Feet','intptlat','intptlon']

    # Fit on the training rows only to avoid leaking val/test statistics.
    scaler = MinMaxScaler()
    scaler.fit(train_df[columns_to_scale])

    def _scaled_features(frame):
        # Scale one split's feature columns and wrap them as a float32 tensor.
        return torch.tensor(scaler.transform(frame[columns_to_scale]), dtype=torch.float32)

    train_dataset = EconomicDataset(train_df, train_root, _scaled_features(train_df), img_size, img_augmented_size)
    val_dataset = EconomicDataset(val_df, val_root, _scaled_features(val_df), img_size, img_augmented_size)
    test_dataset = EconomicDataset(test_df, test_root, _scaled_features(test_df), img_size, img_augmented_size)

    return train_dataset, val_dataset, test_dataset

In [5]:
def process_and_save_dataset(dataset, save_path):
    """Materialise every sample of ``dataset`` and pickle the list to disk.

    Each sample is expected to be the 5-tuple
    ``(image, econ_features, label, year, geo_fips)``. NaN entries in
    ``econ_features`` are replaced by 0 before saving; all other fields are
    stored as returned by the dataset.

    The whole list is held in memory before it is written, so memory use
    grows with the number and size of the samples.

    Args:
        dataset: Object supporting ``len()`` and integer indexing.
        save_path: Destination file; overwritten if it exists.

    Returns:
        None. The result is the pickle file written to ``save_path``.
    """
    processed_data = []
    for i in tqdm(range(len(dataset))):
        image, econ_features, label, year, geo_fips = dataset[i]

        # masked_fill returns a new tensor. Assigning in place instead would
        # also overwrite the dataset's own feature tensor whenever the sample
        # is a view of it.
        econ_features = econ_features.masked_fill(torch.isnan(econ_features), 0)

        processed_data.append((image, econ_features, label, year, geo_fips))

    # NOTE: pickle files must only ever be loaded from trusted sources.
    with open(save_path, 'wb') as f:
        pickle.dump(processed_data, f)

    print(f"Saved processed data to {save_path}")


In [6]:
# Location of the label CSV on the machine this notebook was run on.
label_csv_path = "E://Nowcasting Code//Data//Label//2001_2020.csv"
df_cleaned = preprocess_data(label_csv_path)


In [7]:

# Both size arguments of create_datasets use the same value.
image_side = 512
train_dataset, val_dataset, test_dataset = create_datasets(df_cleaned, image_side, image_side)

# Materialise and pickle each split, in train / val / test order.
for split_dataset, pickle_name in (
    (train_dataset, "GDP01_18_train_dataset.pkl"),
    (val_dataset, "GDP18_19_val_dataset.pkl"),
    (test_dataset, "GDP19_20_test_dataset.pkl"),
):
    process_and_save_dataset(split_dataset, pickle_name)


100%|██████████| 846/846 [1:10:53<00:00,  5.03s/it]    


Saved processed data to GDP01_18_train_dataset.pkl


100%|██████████| 47/47 [01:42<00:00,  2.19s/it]


Saved processed data to GDP18_19_val_dataset.pkl


100%|██████████| 47/47 [01:26<00:00,  1.84s/it]


Saved processed data to GDP19_20_test_dataset.pkl
