In [6]:
import pandas as pd
import re


In [7]:
csv_file = 'covis_db_metadata_report_20190403.csv'

data = pd.read_csv(csv_file)
data['datetime'] = pd.to_datetime(data['datetime'])

In [22]:
raw_size = data['raw_size'].sum()

date_min = data['datetime'].min()
date_max = data['datetime'].max()
date_span = date_max - date_min


print("Total of %d data files spanning %d days from %s to %s" % (len(data.index), date_span.days, \
                                                                date_min.strftime('%Y-%m-%d'), \
                                                                 date_max.strftime('%Y-%m-%d')) )
print("Raw size % 9.2f GB  (%15d bytes)" % (raw_size / 1024**3, raw_size))

raw_null_files = data[ pd.isnull(data['raw_size']) ]
zip_null_files = data[ pd.isnull(data['7z_size']) ]
print("")
print("%d files with no reported raw size" % len(raw_null_files))
print("%d files with no reported 7z size" % len(zip_null_files))

# Count number with non-zero gz size
gz_files = data[ data['gz_size'] > 0 ]

gz_raw_size = gz_files['raw_size'].sum()
gz_size  = gz_files['gz_size'].sum()

print("")
print("%d files with reported .gz size" % len(gz_files))
print("gz size  % 9.2f GB  (%15d bytes) mean compression %.2f pct" % ( gz_size / 1024**3,  gz_size, 100*gz_size/gz_raw_size))


zip_files = data[ data['7z_size'] > 0]
zip_size = zip_files['7z_size'].sum()
zip_raw_size = zip_files['raw_size'].sum()

print("")
print("%d files with reported .7z size" % len(zip_files))
print("7z size  % 9.2f GB  (%15d bytes) mean compression %.2f pct" % (zip_size / 1024**3, zip_size, 100*zip_size/zip_raw_size))


print(zip_null_files)

Total of 54808 data files spanning 3107 days from 2010-09-30 to 2019-04-03
Raw size  119600.50 GB  (128420055112694 bytes)

466 files with no reported raw size
118 files with no reported 7z size

7390 files with reported .gz size
gz size    1649.32 GB  (  1770944963473 bytes) mean compression 29.58 pct

54690 files with reported .7z size
7z size   20899.83 GB  ( 22441017461052 bytes) mean compression 17.62 pct
                                              # basename  \
4405   APLUWCOVISMBSONAR001_20111002T001157.593Z-DOPPLER   
4406   APLUWCOVISMBSONAR001_20111003T001156.606Z-DOPPLER   
4407   APLUWCOVISMBSONAR001_20111003T061155.528Z-DOPPLER   
4408   APLUWCOVISMBSONAR001_20111004T003551.745Z-DOPPLER   
4409   APLUWCOVISMBSONAR001_20111004T063549.504Z-DOPPLER   
4410   APLUWCOVISMBSONAR001_20111005T001156.667Z-DOPPLER   
4411   APLUWCOVISMBSONAR001_20111005T061159.035Z-DOPPLER   
4486   APLUWCOVISMBSONAR001_20111008T093536.721Z-DOPPLER   
4756   APLUWCOVISMBSONAR001_20111014T031029.34

In [5]:
squash = { 'diffuse': re.compile('diffuse*'),
            'doppler': re.compile('doppler*'),
             'imaging': re.compile('imag*'),
             'bathy':  re.compile('bathy*'),
             'survey':  re.compile('survey*'),
             'pano': re.compile('pano*')}

dates = {}
types = {}
for index, entry in data.iterrows():
    dates[ entry['datetime'].date() ] = True
    mode = entry['mode']
    
    for key,regex in squash.items():
        if regex.match(mode):
            mode = key
            break
        
    if mode not in types:
        types[mode] = 0
        
    types[mode] += 1
    
print("COVIS ran on %d days, averaging %.2f GB per day (as gz)" % (len(dates), (gz_size / (1024**3*len(dates)))))
    
for mode,count in types.items():
    print("%10s : %d" % (mode,count))
    

COVIS ran on 604 days, averaging 0.42 GB per day (as gz)
    survey : 3
   imaging : 420
   diffuse : 2543
    target : 18
   doppler : 55
     bathy : 3
   IMAGING : 5033
   DOPPLER : 4714
   DIFFUSE : 4452


In [62]:
types = {}
for index, entry in data.iterrows():
    mode = entry['mode']
    if mode not in types:
        types[mode] = 0
        
    types[mode] += 1
    
for mode,count in types.items():
    print("%10s : %d" % (mode,count))

   survey1 : 1
   survey2 : 1
   imaging : 1
   diffuse : 9
   survey3 : 1
  diffuse3 : 479
  diffuse2 : 515
  diffuse1 : 480
    target : 18
  imaging1 : 177
  imaging2 : 180
  imaging3 : 184
  doppler1 : 1
  doppler2 : 97
  doppler3 : 1
diffuse2up : 1
diffuse2down : 3
diffuse3down : 1
diffuse1down : 1
    bathy2 : 1
    bathy1 : 1
    bathy3 : 1
diffuse3deep : 213
diffuse1deep : 206
diffuse2deep : 217
diffuse2shallow : 219
diffuse1shallow : 214
diffuse3shallow : 216
  panoleft : 8
panocenter : 7
 panoright : 2
diffuseright : 709
diffuseleft : 689
 panoleft2 : 1
panoright2 : 1
 imageleft : 503
imagecenter : 503
