In [4]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch_geometric.data import Data
from scipy.sparse import coo_matrix
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import torch.nn.functional as F
from torch_geometric.nn import NNConv, GCNConv, global_mean_pool

# Visualization
import matplotlib.pyplot as plt
import os

In [5]:
output_folder_path = f"/home/schivilkar/dev/final_video_processing/DataSummary/Path1/"
output_file_path = os.path.join(output_folder_path, "Path1_All_Sessions.csv")
combined_data = pd.read_csv(output_file_path)
print(combined_data.columns)

Index(['timestamp', 'recorder1_9m', 'recorder2_9m', 'A_to_B', 'B_to_A',
       'Total_A_to_B', 'Total_B_to_A', 'recorder1_9m_b', 'recorder2_9m_b',
       'recorder3_9m_b', 'recorder1_9m_c', 'recorder2_9m_c'],
      dtype='object')


In [6]:
# Filter out rows where time is not between 7 AM and 7 PM
# Ensure 'timestamp' is a datetime object
combined_data['timestamp'] = pd.to_datetime(combined_data['timestamp'])

# Filtering out rows between 7 PM (19:00) to 7 AM (07:00)
combined_data = combined_data[
    (combined_data['timestamp'].dt.hour >= 7) & (combined_data['timestamp'].dt.hour < 19)
]

print(combined_data.shape)

(284340, 12)


In [7]:
# Ensure the timestamp column is in datetime format
combined_data['timestamp'] = pd.to_datetime(combined_data['timestamp'])
combined_data['hour'] = combined_data['timestamp'].dt.hour
combined_data['day_of_week'] = combined_data['timestamp'].dt.dayofweek

combined_data.head()

Unnamed: 0,timestamp,recorder1_9m,recorder2_9m,A_to_B,B_to_A,Total_A_to_B,Total_B_to_A,recorder1_9m_b,recorder2_9m_b,recorder3_9m_b,recorder1_9m_c,recorder2_9m_c,hour,day_of_week
0,2024-02-15 12:30:23,0,0,0,0,0,0,4.0,7.0,6.0,3.0,5.0,12,3
1,2024-02-15 12:30:24,0,1,0,0,0,0,3.0,7.0,6.0,3.0,5.0,12,3
2,2024-02-15 12:30:25,0,0,0,0,0,0,2.0,8.0,6.0,3.0,4.0,12,3
3,2024-02-15 12:30:26,0,0,0,0,0,0,2.0,10.0,10.0,3.0,4.0,12,3
4,2024-02-15 12:30:27,0,0,0,0,0,0,2.0,7.0,5.0,3.0,2.0,12,3


In [8]:
# Sample DataFrame creation
# Assuming 'timestamp' is a column with datetime data on second intervals
combined_data['timestamp'] = pd.date_range(start='2024-02-15 12:30:23', periods=len(combined_data), freq='S')
combined_data.set_index('timestamp', inplace=True)

# Define custom aggregation functions for each column
aggregations = {
    'recorder1_9m': 'mean',  # Replace 'location1' to 'location6' with your actual column names
    'recorder2_9m': 'mean',
    'recorder1_9m_b': 'mean',
    'recorder2_9m_b': 'mean',
    'recorder3_9m_b': 'mean',
    'recorder1_9m_c': 'mean',
    'recorder2_9m_c': 'mean',
    'hour': 'first',
    'day_of_week': 'first',
    'A_to_B': 'sum',
    'B_to_A': 'sum',
    'Total_A_to_B': 'mean',
    'Total_B_to_A': 'mean'
}

# Resample and compute the mean for each 5-second interval
aggregated_data_2s = combined_data.resample('2S').agg(aggregations)
aggregated_data_3s = combined_data.resample('3S').agg(aggregations)
aggregated_data_5s = combined_data.resample('5S').agg(aggregations)
aggregated_data_10s = combined_data.resample('10S').agg(aggregations)

# Reset the index if you want 'timestamp' back as a column
aggregated_data_2s.reset_index(inplace=True)
aggregated_data_3s.reset_index(inplace=True)
aggregated_data_5s.reset_index(inplace=True)
aggregated_data_10s.reset_index(inplace=True)

# Now, aggregated_data holds the averaged values for each 5-second chunk
aggregated_data_2s.head()

Unnamed: 0,timestamp,recorder1_9m,recorder2_9m,recorder1_9m_b,recorder2_9m_b,recorder3_9m_b,recorder1_9m_c,recorder2_9m_c,hour,day_of_week,A_to_B,B_to_A,Total_A_to_B,Total_B_to_A
0,2024-02-15 12:30:22,0.0,0.0,4.0,7.0,6.0,3.0,5.0,12,3,0,0,0.0,0.0
1,2024-02-15 12:30:24,0.0,0.5,2.5,7.5,6.0,3.0,4.5,12,3,0,0,0.0,0.0
2,2024-02-15 12:30:26,0.0,0.0,2.0,8.5,7.5,3.0,3.0,12,3,0,0,0.0,0.0
3,2024-02-15 12:30:28,3.5,0.0,3.5,5.5,7.5,3.5,3.5,12,3,0,0,0.0,0.0
4,2024-02-15 12:30:30,0.0,0.0,4.5,7.0,6.5,3.5,3.0,12,3,0,0,0.0,0.0


In [9]:
# 'timestamp' is the first column and the rest are recorder counts
timestamps_2s = aggregated_data_2s['timestamp']
timestamps_3s = aggregated_data_3s['timestamp']
timestamps_5s = aggregated_data_5s['timestamp']
timestamps_10s = aggregated_data_10s['timestamp']

# Selecting features for the model
date_columns = ['hour'] 

simple = 4
if simple == 1:
    recorder_columns = ['recorder1_9m', 'recorder2_9m']
elif simple == 2:
    recorder_columns = ['recorder1_9m_b', 'recorder2_9m_b',
       'recorder3_9m_b']
elif simple == 3:
    recorder_columns = ['recorder1_9m_c', 'recorder2_9m_c']
else:
    recorder_columns = ['recorder1_9m', 'recorder2_9m','recorder1_9m_b', 'recorder2_9m_b',
       'recorder3_9m_b', 'recorder1_9m_c', 'recorder2_9m_c']
    
video_columns = ['A_to_B', 'B_to_A']
target_columns = ['A_to_B', 'B_to_A']

In [10]:
data = combined_data

In [11]:
data.columns

Index(['recorder1_9m', 'recorder2_9m', 'A_to_B', 'B_to_A', 'Total_A_to_B',
       'Total_B_to_A', 'recorder1_9m_b', 'recorder2_9m_b', 'recorder3_9m_b',
       'recorder1_9m_c', 'recorder2_9m_c', 'hour', 'day_of_week'],
      dtype='object')

In [15]:
# Create the feature (X) and target (Y) tensor
all_features = data[recorder_columns + date_columns + video_columns].fillna(0)
target_features = data[target_columns].fillna(0)

all_features_tensor = torch.tensor(all_features.values, dtype=torch.float)
target_tensor = torch.tensor(target_features.values, dtype=torch.float)

In [16]:
# Min-max scaling of features (pedestrian counts)
all_features_np = all_features_tensor.numpy()
target_np = target_tensor.numpy()

def min_max_scale(data):
    data_min = np.min(data, axis=0)
    data_max = np.max(data, axis=0)
    return (data - data_min) / (data_max - data_min), data_min, data_max

def reverse_min_max_scale(scaled_data, data_min, data_max):
    """Reverses the min-max scaling applied to the data."""
    return (scaled_data * (data_max - data_min)) + data_min

scaled_features, features_min, features_max = min_max_scale(all_features_np)
scaled_targets, targets_min, targets_max = min_max_scale(target_np)

In [17]:
print("Min of features:", features_min)
print("Max of features:", features_max)
print("Min of targets:", targets_min)
print("Max of features:", targets_max)

Min of features: [0. 0. 0. 0. 0. 0. 0. 7. 0. 0.]
Max of features: [16. 24. 16. 22. 16. 17. 23. 18.  4.  3.]
Min of targets: [0. 0.]
Max of features: [4. 3.]


In [18]:
scaled_features[0]

array([0.        , 0.        , 0.25      , 0.3181818 , 0.375     ,
       0.1764706 , 0.2173913 , 0.45454547, 0.        , 0.        ],
      dtype=float32)

In [20]:
# Define sequence length
sequence_length = 10  # 10 time steps per sequence
date_id = 7

def create_feature_target_sequences(data, target_data, seq_length):
    xs = []
    ys = []
    indices = []
    for i in range(len(data) - seq_length):
        # Check if the time of day is continuous
        time_of_day_seq = reverse_min_max_scale(data[i:(i + seq_length), date_id-1], features_min[date_id-1], features_max[date_id-1])
        
        # Compare consecutive time steps, check if they are 1 unit apart
        if np.sum(np.diff(time_of_day_seq) == 1) or np.all(np.diff(time_of_day_seq) == 0):
            # Add feature sequence
            xs.append(data[i:(i + seq_length)])
            
            # Add the corresponding target sequence
            summed_sequence = np.sum(target_data[i:(i + seq_length)], axis=0)
            ys.append(summed_sequence)
            
            indices.append(i)
        
    return np.array(xs), np.array(ys), np.array(indices)

# Assuming scaled_features contain the full set of features and scaled_targets is the target data
sequential_features, sequential_targets, indices = create_feature_target_sequences(scaled_features, scaled_targets, sequence_length)

print(len(sequential_features))
print(len(sequential_targets))
print(indices)

# Remove the last sequence from the feature tensor if its corresponding target does not exist
if len(sequential_features) > len(sequential_targets):
    sequential_features = sequential_features[:-1]

227646
227646
[     0      1      2 ... 284327 284328 284329]


In [21]:
# Calculate the number of sequences in the dataset, same for audio and video
num_sequences = len(sequential_features)  

# Calculate the split index
split_index = int(num_sequences * 0.9)  # 60% of the data for training
print(split_index)

# Split the features 90%
trainvalid_features = sequential_features[:split_index]
test_features = sequential_features[split_index:]

# Split by audio and video
audio_trainvalid_features = trainvalid_features[:,:,:7]
video_trainvalid_features = trainvalid_features[:,:,6:]

audio_test_features = test_features[:,:,:7]
video_test_features = test_features[:,:,6:]

# Split the targets 10% 
trainvalid_targets = sequential_targets[:split_index]
test_targets = sequential_targets[split_index:]

print(f"Audio Training features shape: {audio_trainvalid_features.shape}")
print(f"Video Training features shape: {video_trainvalid_features.shape}")
print(f"Audio + Video Training features shape: {trainvalid_features.shape}")
print(f"Training targets shape: {trainvalid_targets.shape}")

print(f"Audio Testing features shape: {audio_test_features.shape}")
print(f"Video Testing features shape: {video_test_features.shape}")
print(f"Audio + Video Testing shape: {test_features.shape}")
print(f"Testing targets shape: {test_targets.shape}")

204881
Audio Training features shape: (204881, 10, 7)
Video Training features shape: (204881, 10, 4)
Audio + Video Training features shape: (204881, 10, 10)
Training targets shape: (204881, 2)
Audio Testing features shape: (22765, 10, 7)
Video Testing features shape: (22765, 10, 4)
Audio + Video Testing shape: (22765, 10, 10)
Testing targets shape: (22765, 2)


In [22]:
print("audio feature example: ")
print(audio_trainvalid_features[80])
print("video feature example: ")
print(video_trainvalid_features[80])
print("target example: ")
print(trainvalid_targets[80])

audio feature example: 
[[0.125      0.04166667 0.125      0.27272728 0.5        0.4117647
  0.39130434]
 [0.125      0.08333334 0.125      0.22727273 0.625      0.3529412
  0.4347826 ]
 [0.125      0.08333334 0.125      0.3181818  0.6875     0.29411766
  0.39130434]
 [0.125      0.08333334 0.1875     0.36363637 0.5        0.29411766
  0.47826087]
 [0.0625     0.125      0.25       0.3181818  0.5625     0.29411766
  0.3478261 ]
 [0.1875     0.125      0.25       0.36363637 0.5        0.23529412
  0.39130434]
 [0.125      0.08333334 0.25       0.22727273 0.625      0.29411766
  0.4347826 ]
 [0.125      0.08333334 0.125      0.22727273 0.375      0.29411766
  0.26086956]
 [0.125      0.08333334 0.125      0.27272728 0.4375     0.29411766
  0.3043478 ]
 [0.125      0.08333334 0.25       0.18181819 0.4375     0.29411766
  0.2173913 ]]
video feature example: 
[[0.39130434 0.45454547 0.         0.        ]
 [0.4347826  0.45454547 0.         0.        ]
 [0.39130434 0.45454547 0.         0.  

In [23]:
print(video_trainvalid_features[200][:, 1:])
np.sum(video_trainvalid_features[200][:, 1:])

[[0.45454547 0.         0.        ]
 [0.45454547 0.         0.        ]
 [0.45454547 0.         0.        ]
 [0.45454547 0.         0.        ]
 [0.45454547 0.         0.        ]
 [0.45454547 0.         0.        ]
 [0.45454547 0.         0.        ]
 [0.45454547 0.         0.        ]
 [0.45454547 0.         0.        ]
 [0.45454547 0.         0.        ]]


4.5454545

In [24]:
# Filter out sequences where all recorders have zero pedestrians across all 10 frames 
valid_indices = [i for i, seq in enumerate(video_trainvalid_features) if not np.sum(seq[:, 1:]) == 0]
zero_indices = [i for i, seq in enumerate(video_trainvalid_features) if np.sum(seq[:, 1:]) == 0]

# Use valid_indices to filter both features and targets
nonzero_audio_features = audio_trainvalid_features[valid_indices]
nonzero_video_features = video_trainvalid_features[valid_indices]
nonzero_features = trainvalid_features[valid_indices]
nonzero_targets = trainvalid_targets[valid_indices]

# Use valid_indices to filter both features and targets
allzero_audio_features = audio_trainvalid_features[zero_indices]
allzero_video_features = video_trainvalid_features[zero_indices]
allzero_features = trainvalid_features[zero_indices]
allzero_targets = trainvalid_targets[zero_indices]

In [25]:
# Determine the number of samples to take from each group
# num_samples = min(len(nonzero_features), len(allzero_features))
num_samples = len(nonzero_audio_features)
print("number of non-zero samples: ", num_samples)

print("length of Non-zero sequences: ", len(nonzero_audio_features))
print("length of All-zero sequences: ", len(allzero_audio_features))

number of non-zero samples:  192216
length of Non-zero sequences:  192216
length of All-zero sequences:  12665


In [None]:
# Generate random indices for sampling without replacement
zero_indices = np.random.choice(len(allzero_audio_features), int(num_samples/2), replace=False)

# non_zero_indices = np.random.choice(len(nonzero_audio_features), num_samples, replace=False)

# Sample features and targets based on the indices
allzero_audio_features_sampled = allzero_audio_features[zero_indices]
allzero_video_features_sampled = allzero_video_features[zero_indices]
allzero_features_sampled = allzero_features[zero_indices]

# nonzero_features_sampled = nonzero_features[non_zero_indices]

allzero_targets_sampled = allzero_targets[zero_indices]
# nonzero_targets_sampled = nonzero_targets[non_zero_indices]

print("length of Non-zero samples: ", len(nonzero_targets))
print("length of All-zero samples: ", len(allzero_targets_sampled))

# Combine the samples to form new feature and target sets
balanced_audio_features = np.concatenate([allzero_audio_features_sampled, nonzero_audio_features], axis=0)
balanced_video_features = np.concatenate([allzero_video_features_sampled, nonzero_video_features], axis=0)
balanced_features = np.concatenate([allzero_features_sampled, nonzero_features], axis=0)
balanced_targets = np.concatenate([allzero_targets_sampled, nonzero_targets], axis=0)

# Shuffle the combined dataset to randomize the order of sequences
# indices = np.arange(balanced_features.shape[0])
# np.random.shuffle(indices)
# balanced_features = balanced_features[indices]
# balanced_targets = balanced_targets[indices]

# Debugging and validation
print("Balanced audio features shape:", balanced_audio_features.shape)
print("Balanced video features shape:", balanced_video_features.shape)
print("Balanced audio + video features shape:", balanced_features.shape)
print("Balanced targets shape:", balanced_targets.shape)

ValueError: Cannot take a larger sample than population when 'replace=False'