<a href="https://colab.research.google.com/github/AvantiShri/gcp_analysis/blob/main/alldata/ComputeLifetimeDeviceStats.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#mount Google drive to download the data some

from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
%cd /content/drive/MyDrive/GCP_data/all_data
#data at https://drive.google.com/drive/u/0/folders/1MUS-xbwoBiodWLz4gese19ti_ftlKsVu

/content/drive/MyDrive/GCP_data/all_data


In [3]:
#install some utilities code at https://github.com/AvantiShri/gcpdatautils
!pip uninstall -y gcpdatautils #uninstall the pre-existing version if want to fetch latest
!pip install git+https://github.com/AvantiShri/gcpdatautils.git #install latest from github

#Note: the code that was used to consolidate the downloaded csv.gz files into hdf5 files
# is at https://github.com/AvantiShri/gcp_analysis/blob/main/alldata/Consolidate_GCP_as_HDF5.ipynb

Found existing installation: gcpdatautils 0.1.1.0
Uninstalling gcpdatautils-0.1.1.0:
  Successfully uninstalled gcpdatautils-0.1.1.0
Collecting git+https://github.com/AvantiShri/gcpdatautils.git
  Cloning https://github.com/AvantiShri/gcpdatautils.git to /tmp/pip-req-build-3jnfv98k
  Running command git clone --filter=blob:none --quiet https://github.com/AvantiShri/gcpdatautils.git /tmp/pip-req-build-3jnfv98k
  Resolved https://github.com/AvantiShri/gcpdatautils.git to commit d101c1b7b435d3e0878e5f635943d4479fa76267
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: gcpdatautils
  Building wheel for gcpdatautils (setup.py) ... [?25l[?25hdone
  Created wheel for gcpdatautils: filename=gcpdatautils-0.1.1.0-py3-none-any.whl size=6390 sha256=00458be080bd9450a6495f581bb983f25d6c887634230023b293a4072c956389
  Stored in directory: /tmp/pip-ephem-wheel-cache-qyvmjk33/wheels/fe/f9/b7/d5afa50c9e111dcf3a66edb57f0c2273e54d108591664596b8
Successfully buil

In [4]:
from gcpdatautils import GCPHdf5DataReader

gcp_data_reader = GCPHdf5DataReader()

Parsing the bad data file: /usr/local/lib/python3.10/dist-packages/gcpdatautils/resources/rotteneggs.txt
manually correcting 47,2000-01-01 00:00:00,2001-06-31 23:59:59,2222
manually correcting 47,2008-04-25 00:00:00,2008-04-55 23:59:59,1092


In [5]:
from datetime import datetime
from collections import defaultdict
import numpy as np

device_to_numtrials = defaultdict(lambda: 0)
device_to_trialsums = defaultdict(lambda: 0)
device_to_trialsquaredsums = defaultdict(lambda: 0)

for year in range(1998, 2024):
  days = gcp_data_reader.get_available_days_in_year(year)
  for day in days:
    print(day)
    year_fh = gcp_data_reader.get_fh_for_year(year) #get the file handle for the year
    #check the metadata for the start and end timestamps of the day (some days have incomplete data)
    start_timestamp = year_fh[day].attrs["start_time"]
    end_timestamp = year_fh[day].attrs["end_time"]
    day_data, day_devices = gcp_data_reader.fetch_data_within_day(
          starttime=datetime.fromtimestamp(start_timestamp),
          endtime=datetime.fromtimestamp(end_timestamp),
          bail_if_missing_seconds=False,
          mask_bad_data=False) #if we do not mask bad data, some devices (e.g. 223) end up with v. low variance
    for i, device in enumerate(day_devices):
      device_to_numtrials[device] += np.sum(np.isnan(day_data[:,i])==False)
      device_to_trialsums[device] += np.nansum(day_data[:,i])
      device_to_trialsquaredsums[device] += np.nansum(np.square(day_data[:,i]))

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
2010-04-16
2010-04-17
2010-04-18
2010-04-19
2010-04-20
2010-04-21
2010-04-22
2010-04-23
2010-04-24
2010-04-25
2010-04-26
2010-04-27
2010-04-28
2010-04-29
2010-04-30
2010-05-02
2010-05-03
2010-05-04
2010-05-05
2010-05-06
2010-05-07
2010-05-08
2010-05-09
2010-05-10
2010-05-11
2010-05-12
2010-05-13
2010-05-14
2010-05-15
2010-05-16
2010-05-17
2010-05-18
2010-05-19
2010-05-20
2010-05-21
2010-05-22
2010-05-23
2010-05-24
2010-05-25
2010-05-26
2010-05-27
2010-05-28
2010-05-29
2010-05-30
2010-05-31
2010-06-01
2010-06-02
2010-06-03
2010-06-04
2010-06-05
2010-06-06
2010-06-07
2010-06-08
2010-06-09
2010-06-10
2010-06-11
2010-06-12
2010-06-13
2010-06-14
2010-06-15
2010-06-16
2010-06-17
2010-06-18
2010-06-19
2010-06-20
2010-06-21
2010-06-22
2010-06-23
2010-06-24
2010-06-25
2010-06-26
2010-06-27
2010-06-28
2010-06-29
2010-06-30
2010-07-01
2010-07-02
2010-07-03
2010-07-04
2010-07-05
2010-07-06
2010-07-07
2010-07-08
2010-07-09
2010-07-10


In [10]:
device_to_empirical_means = {}
device_to_empirical_variances = {}

for device in sorted(device_to_numtrials.keys()):
  device_to_empirical_means[int(device)] = device_to_trialsums[device]/device_to_numtrials[device]
  device_to_empirical_variances[int(device)] = device_to_trialsquaredsums[device]/device_to_numtrials[device] - np.square(device_to_empirical_means[device])
  print(device, device_to_empirical_means[device], device_to_empirical_variances[device], device_to_numtrials[device])

1 100.00003387637281 50.0394973794173 757046811
28 100.00102260927562 49.929318470223734 339925530
33 99.99905513535508 49.98364548077734 73111001
34 99.9991490545327 50.010904383181696 16070360
37 99.99951231229896 49.97856402159414 764667633
100 100.00064465560546 50.035016702666326 186193991
101 99.99902730977756 50.026016811461886 232433713
102 99.99973835200012 50.044627336321355 299589525
103 100.00037224849079 49.999872650772886 579102415
104 100.11563462196374 50.08789445698858 2923
105 99.99986369053155 50.035604185643024 350745994
106 99.99946800412046 50.03989745086983 258548995
107 99.99975286590929 50.0261066558287 80539273
108 100.00020136937034 50.039432455276255 443528228
109 99.99979810292754 50.05367296222721 81313710
110 99.96129707520899 53.73453631106531 514129077
111 100.0000309797357 50.02283268463907 525472527
112 99.99991723651705 49.98023645664034 728038476
114 99.99970929893038 50.02288781737661 166359897
115 100.00028868847906 50.05818525186805 266761598
116

  device_to_empirical_means[int(device)] = device_to_trialsums[device]/device_to_numtrials[device]
  device_to_empirical_variances[int(device)] = device_to_trialsquaredsums[device]/device_to_numtrials[device] - np.square(device_to_empirical_means[device])


In [11]:
import json

with open("empirical_device_variances.json", 'w') as f:
  f.write(json.dumps(device_to_empirical_variances))

with open("empirical_device_means.json", 'w') as f:
  f.write(json.dumps(device_to_empirical_means))