---
# Figures notebook

Supplementary material for the paper ***The Visual Story of Data Storage: From Storage Properties to User Interfaces***, *Aleksandar Anžel, Dominik Heider, and Georges Hattab*

Please cite the paper when using this notebook or data.

---

In [1]:
import pandas as pd
import numpy as np
import altair as alt
import os
from datetime import timedelta
from altair_saver import save

In [2]:
# Notebook metadata: authorship, credits, license, version and contact.
__author__ = 'Aleksandar Anžel'
__copyright__ = ''
__credits__ = ['Aleksandar Anžel', 'Georges Hattab']
__license__ = 'GNU General Public License v3.0'
__version__ = '1.0'
__maintainer__ = 'Aleksandar Anžel'
__email__ = 'aleksandar.anzel@uni-marburg.de'
__status__ = 'Dev'

In [3]:
def cm_mono():
    """Build an Altair theme that sets one font family for all chart text.

    Returns:
        dict: Theme specification whose ``config`` entry assigns the
        "CM Mono" font to the chart title and to the label and title fonts
        of axes, headers and legends.
    """
    font = "CM Mono"

    # Axes, headers and legends take the same pair of font settings; each
    # component gets its own dict so the entries stay independent.
    font_keys = ("labelFont", "titleFont")
    config = {"title": {"font": font}}
    for component in ("axis", "header", "legend"):
        config[component] = {key: font for key in font_keys}

    return {"config": config}

# Register the theme function under the name 'cm_mono' and make it the active
# theme, so that rendered charts use the font settings it returns.
alt.themes.register('cm_mono', cm_mono)
alt.themes.enable('cm_mono')


ThemeRegistry.enable('cm_mono')

---
## Data import

In [4]:
# Semicolon-separated source file; the first column is used as the row index
# and blanks that follow a delimiter are skipped.
table_name = 'ori.dat'
table = pd.read_csv(
    table_name,
    sep=';',
    index_col=0,
    skipinitialspace=True,
)
table.head()

Unnamed: 0_level_0,year,name,type,usage,capacity,lifespan,addressability,mutability,accessability,Unnamed: 10
number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,1932,Drum memory,magnetic,1960s,62500,,0.0,2,0,
2,1946,Williams-Kilburn Tube,cathode ray tube,1955,1000,,0.0,2,1,
3,1949,Magnetic-core memory,magnetic,1970s,100000,,1.0,2,1,
4,1952,Magnetic Band (Tape),magnetic,today,330000000000000,,0.0,2,0,
5,1956,Hard Disk Drive (HDD),magnetic,today,2000000000000,,1.0,2,1,


---
## Data cleaning

In [5]:
def fix_year(year_string):
    """Return the text of ``year_string`` that precedes the first '.'.

    The argument is converted with ``str`` first, so a value such as
    ``1932.0`` yields ``'1932'``; text without a '.' is returned whole.
    Used for correcting the year representation.
    """
    whole_part, _, _ = str(year_string).partition('.')
    return whole_part

In [6]:
# Strip any fractional part from the raw year values and parse the result as
# dates in one assignment.
table['year'] = pd.to_datetime(table['year'].apply(fix_year))
table.head()

Unnamed: 0_level_0,year,name,type,usage,capacity,lifespan,addressability,mutability,accessability,Unnamed: 10
number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,1932-01-01,Drum memory,magnetic,1960s,62500,,0.0,2,0,
2,1946-01-01,Williams-Kilburn Tube,cathode ray tube,1955,1000,,0.0,2,1,
3,1949-01-01,Magnetic-core memory,magnetic,1970s,100000,,1.0,2,1,
4,1952-01-01,Magnetic Band (Tape),magnetic,today,330000000000000,,0.0,2,0,
5,1956-01-01,Hard Disk Drive (HDD),magnetic,today,2000000000000,,1.0,2,1,


In [7]:
# Count how many rows share each year. The counts are used in the next cell
# to find the years whose rows have to be moved apart so that they don't
# overlap in the charts.
value_count = table['year'].value_counts()
# Keep the index as datetimes: it is looked up with the datetime values of the
# 'year' column in the next cell.
value_count.index = pd.to_datetime(value_count.index)
value_count.head()

1994-01-01    2
2010-01-01    2
1978-01-01    2
1995-01-01    2
2000-01-01    1
Name: year, dtype: int64

In [8]:
# Spread rows that share a year: within a run of consecutive rows with the
# same year, the n-th row (counting from 0) is moved forward by n years.
# Years that occur only once are left untouched.
# NOTE(review): the counter restarts whenever the year changes from one row to
# the next, so this presumably relies on the table being ordered by year -
# confirm. A shifted row can also land on a later year that has rows of its
# own (both 1994 and 1995 occur twice in the counts shown above).
counter = 0
timestamp_temp = None

for row_label, record in table.iterrows():
    current_year = record['year']

    # A different year starts a new run, so the offset begins at zero again.
    if timestamp_temp != current_year:
        timestamp_temp = current_year
        counter = 0

    # Unique years need no shifting.
    if value_count[current_year] == 1:
        continue

    table.at[row_label, 'year'] = current_year + pd.DateOffset(years=counter)
    counter += 1


In [9]:
table.head()

Unnamed: 0_level_0,year,name,type,usage,capacity,lifespan,addressability,mutability,accessability,Unnamed: 10
number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,1932-01-01,Drum memory,magnetic,1960s,62500,,0.0,2,0,
2,1946-01-01,Williams-Kilburn Tube,cathode ray tube,1955,1000,,0.0,2,1,
3,1949-01-01,Magnetic-core memory,magnetic,1970s,100000,,1.0,2,1,
4,1952-01-01,Magnetic Band (Tape),magnetic,today,330000000000000,,0.0,2,0,
5,1956-01-01,Hard Disk Drive (HDD),magnetic,today,2000000000000,,1.0,2,1,


Collecting the unique values of column *type* as an array of strings

In [10]:
# Distinct values of the 'type' column, converted to an array of strings.
storage_types = table['type'].unique().astype(str)
storage_types

array(['magnetic', 'cathode ray tube', 'optical', 'electronic',
       'electro-mechanical', 'magneto-optical', 'molecular', 'atomic'],
      dtype='<U18')

---
## Creating figures
### Defining global variables

In [11]:
# Colour palette (hex codes; each trailing comment gives the same colour as RGB)

pink_custom = '#CD7DA9' # rgb(205, 125, 169)
blue_custom = '#2476B6' # rgb(36, 118, 182)
orange_custom = '#E6A02E' # rgb(230, 160, 46)
green_custom = '#479F77' # rgb(71, 159, 119)
red_custom = '#D73F47' # rgb(215, 63, 71)
gray_custom = '#BCBCBC' # rgb(188, 188, 188)

# Shared styling constants
# strokeWidth_var: stroke width of the outlined bars in the usage chart
# chartWidth_var: width applied to each combined (layered) chart
strokeWidth_var = 3
chartWidth_var = 600


### 1. Creating chart: x = year, y = access

In [12]:
# Base layer: one filled circle per row, placed by year (x) and accessability
# (y). The colour scale lists only five storage types; the remaining types
# are drawn by separate overlay layers.
first_chart_access = (
    alt.Chart(table)
    .mark_circle(size=100, opacity=1)
    .encode(
        alt.X('year:T'),
        alt.Y('accessability:N'),
        alt.Color(
            'type:N',
            scale=alt.Scale(
                domain=['magnetic', 'cathode ray tube', 'optical', 'electronic', 'electro-mechanical'],
                range=[blue_custom, pink_custom, gray_custom, orange_custom, red_custom],
            ),
            legend=None,
        ),
    )
)

In [13]:
first_chart_access

In [14]:
# Overlay layer for the "magneto-optical" storage type: circles with a blue
# outline and a gray fill. This type is not part of the base layer's colour
# scale, so it gets its own fixed styling.
second_chart_access = (
    alt.Chart(table)
    .mark_circle(size=60, opacity=1, stroke=blue_custom, fill=gray_custom)
    .encode(
        alt.X('year:T'),
        alt.Y('accessability:N'),
        # NOTE(review): presumably the source of the "Channel opacity should
        # not be used with an unsorted discrete field" warning printed when
        # saving; kept unchanged so the rendered figure stays the same.
        alt.Opacity('type:N', legend=None),
    )
    .properties(
        # Shared width constant instead of a repeated literal 600, so this
        # layer cannot drift from the width of the combined chart.
        width=chartWidth_var
    )
    .transform_filter('datum.type == "magneto-optical"')
)

In [15]:
second_chart_access

In [16]:
# Overlay layer for the "atomic" storage type: white circles with a gray
# outline.
# NOTE(review): the stroke here is the named colour 'gray', while the atomic
# layers of the capacity and usage charts use gray_custom - confirm whether
# the difference is intended.
third_chart_access = (
    alt.Chart(table)
    .mark_circle(size=60, opacity=1, stroke='gray', fill='white')
    .encode(
        alt.X('year:T'),
        alt.Y('accessability:N'),
        # NOTE(review): presumably the source of the "Channel opacity should
        # not be used with an unsorted discrete field" warning printed when
        # saving; kept unchanged so the rendered figure stays the same.
        alt.Opacity('type:N', legend=None),
    )
    .properties(
        # Shared width constant instead of a repeated literal 600, so this
        # layer cannot drift from the width of the combined chart.
        width=chartWidth_var
    )
    .transform_filter('datum.type == "atomic"')
)

In [17]:
third_chart_access

In [18]:
# Overlay layer for the "molecular" storage type: white circles with a black
# outline. This layer also switches off both axis titles and asks for a
# "nice" x scale.
fourth_chart_access = (
    alt.Chart(table)
    .mark_circle(size=60, opacity=1, stroke='black', fill='white')
    .encode(
        alt.X('year:T', scale=alt.Scale(nice=True), axis=alt.Axis(title=None)),
        alt.Y('accessability:N', axis=alt.Axis(title=None)),
        # NOTE(review): presumably the source of the "Channel opacity should
        # not be used with an unsorted discrete field" warning printed when
        # saving; kept unchanged so the rendered figure stays the same.
        alt.Opacity('type:N', legend=None),
    )
    .properties(
        # Shared width constant instead of a repeated literal 600, so this
        # layer cannot drift from the width of the combined chart.
        width=chartWidth_var
    )
    .transform_filter('datum.type == "molecular"')
)

In [19]:
fourth_chart_access

In [20]:
# Overlay the base layer and the three single-type layers into one layered chart.
final_chart_access = first_chart_access + second_chart_access + third_chart_access + fourth_chart_access

In [21]:
# Apply the shared chart width to the layered chart, then display it.
final_chart_access = final_chart_access.properties(width = chartWidth_var)
final_chart_access

### 2. Creating chart: x = name, y = capacity

In [22]:
# Float copy of the "capacity" column; the capacity charts plot this column
# as a quantitative field on a log scale.
table['capacity_float'] = table['capacity'].astype(float)
table.head()

Unnamed: 0_level_0,year,name,type,usage,capacity,lifespan,addressability,mutability,accessability,Unnamed: 10,capacity_float
number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,1932-01-01,Drum memory,magnetic,1960s,62500,,0.0,2,0,,62500.0
2,1946-01-01,Williams-Kilburn Tube,cathode ray tube,1955,1000,,0.0,2,1,,1000.0
3,1949-01-01,Magnetic-core memory,magnetic,1970s,100000,,1.0,2,1,,100000.0
4,1952-01-01,Magnetic Band (Tape),magnetic,today,330000000000000,,0.0,2,0,,330000000000000.0
5,1956-01-01,Hard Disk Drive (HDD),magnetic,today,2000000000000,,1.0,2,1,,2000000000000.0


In [23]:
# Base layer: one filled circle per row, with the name on x (kept in table
# order, labels rotated by -45 degrees) and the capacity on a log-scaled y
# axis. The colour scale lists only five storage types; the remaining types
# are drawn by separate overlay layers.
first_chart_capacity = (
    alt.Chart(table)
    .mark_circle(size=100, opacity=1)
    .encode(
        alt.X(
            'name:N',
            sort=None,
            axis=alt.Axis(title=None, grid=True, labelAngle=-45),
        ),
        alt.Y(
            'capacity_float:Q',
            scale=alt.Scale(type='log', nice=True),
            axis=alt.Axis(title=None, format=".1e"),
        ),
        alt.Color(
            'type:N',
            scale=alt.Scale(
                domain=['magnetic', 'cathode ray tube', 'optical', 'electronic', 'electro-mechanical'],
                range=[blue_custom, pink_custom, gray_custom, orange_custom, red_custom],
            ),
            legend=None,
        ),
    )
)


In [24]:
first_chart_capacity

In [25]:
# Overlay layer for the "magneto-optical" storage type: circles with a blue
# outline and a gray fill.
# NOTE(review): unlike the other capacity layers, the y scale here does not
# set nice=True - confirm whether that is intended.
second_chart_capacity = (
    alt.Chart(table)
    .mark_circle(size=60, opacity=1, stroke=blue_custom, fill=gray_custom)
    .encode(
        alt.X('name:N', sort=None, axis=alt.Axis(grid=True, labelAngle=-45)),
        alt.Y('capacity_float:Q', scale=alt.Scale(type='log')),
    )
    .transform_filter('datum.type == "magneto-optical"')
)

In [26]:
second_chart_capacity

In [27]:
# Overlay layer for the "atomic" storage type: white circles outlined in the
# custom gray.
third_chart_capacity = (
    alt.Chart(table)
    .mark_circle(size=60, opacity=1, stroke=gray_custom, fill='white')
    .encode(
        alt.X('name:N', sort=None, axis=alt.Axis(grid=True, labelAngle=-45)),
        alt.Y('capacity_float:Q', scale=alt.Scale(type='log', nice=True)),
    )
    .transform_filter('datum.type == "atomic"')
)

In [28]:
third_chart_capacity

In [29]:
# Overlay layer for the "molecular" storage type: white circles with a black
# outline.
fourth_chart_capacity = (
    alt.Chart(table)
    .mark_circle(size=60, opacity=1, stroke='black', fill='white')
    .encode(
        alt.X('name:N', sort=None, axis=alt.Axis(grid=True, labelAngle=-45)),
        alt.Y('capacity_float:Q', scale=alt.Scale(type='log', nice=True)),
    )
    .transform_filter('datum.type == "molecular"')
)

In [30]:
fourth_chart_capacity

In [31]:
# Overlay the base layer and the three single-type layers into one layered chart.
final_chart_capacity = first_chart_capacity + second_chart_capacity + third_chart_capacity + fourth_chart_capacity

In [32]:
# Apply the shared chart width to the layered chart, then display it.
final_chart_capacity = final_chart_capacity.properties(width = chartWidth_var)
final_chart_capacity

### 3. Creating chart: x = year/usage, y = name

In [33]:
table.head()

Unnamed: 0_level_0,year,name,type,usage,capacity,lifespan,addressability,mutability,accessability,Unnamed: 10,capacity_float
number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,1932-01-01,Drum memory,magnetic,1960s,62500,,0.0,2,0,,62500.0
2,1946-01-01,Williams-Kilburn Tube,cathode ray tube,1955,1000,,0.0,2,1,,1000.0
3,1949-01-01,Magnetic-core memory,magnetic,1970s,100000,,1.0,2,1,,100000.0
4,1952-01-01,Magnetic Band (Tape),magnetic,today,330000000000000,,0.0,2,0,,330000000000000.0
5,1956-01-01,Hard Disk Drive (HDD),magnetic,today,2000000000000,,1.0,2,1,,2000000000000.0


In [34]:
def fix_usage(year_string, today_year='2021'):
    """Turn an end-of-usage label from the "usage" column into a year string.

    Args:
        year_string (str): Label such as ``'today'``, a decade like
            ``'1960s'`` or a plain year like ``'1955'``.
        today_year (str): Year substituted for the label ``'today'``.
            Defaults to ``'2021'``.

    Returns:
        str: ``today_year`` for ``'today'``. For a label ending in ``'s'``
        the last two characters are replaced by ``'5'``, so ``'1960s'``
        becomes ``'1965'``. Any other label is returned unchanged.
    """
    if year_string == 'today':
        return today_year
    # endswith() also copes with an empty label, where indexing [-1] would
    # raise an IndexError.
    if year_string.endswith('s'):
        return year_string[:-2] + '5'
    return year_string

# Normalise the "usage" column: cast to text, map the labels to plain years
# and parse them as dates.
table['usage'] = pd.to_datetime(table['usage'].astype(str).apply(fix_usage))
table.head()


Unnamed: 0_level_0,year,name,type,usage,capacity,lifespan,addressability,mutability,accessability,Unnamed: 10,capacity_float
number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,1932-01-01,Drum memory,magnetic,1965-01-01,62500,,0.0,2,0,,62500.0
2,1946-01-01,Williams-Kilburn Tube,cathode ray tube,1955-01-01,1000,,0.0,2,1,,1000.0
3,1949-01-01,Magnetic-core memory,magnetic,1975-01-01,100000,,1.0,2,1,,100000.0
4,1952-01-01,Magnetic Band (Tape),magnetic,2021-01-01,330000000000000,,0.0,2,0,,330000000000000.0
5,1956-01-01,Hard Disk Drive (HDD),magnetic,2021-01-01,2000000000000,,1.0,2,1,,2000000000000.0


In [35]:
# Base layer: one horizontal bar per name, spanning from "year" (x) to
# "usage" (x2), with the rows sorted by x. The colour scale lists only five
# storage types; the remaining types are drawn by separate overlay layers.
first_chart_usage = (
    alt.Chart(table)
    .mark_bar()
    .encode(
        alt.X('year:T', axis=alt.Axis(title=None, tickCount=50)),
        alt.X2('usage:T'),
        alt.Y('name:N', sort='x', axis=alt.Axis(title=None)),
        alt.Color(
            'type:N',
            scale=alt.Scale(
                domain=['magnetic', 'cathode ray tube', 'optical', 'electronic', 'electro-mechanical'],
                range=[blue_custom, pink_custom, gray_custom, orange_custom, red_custom],
            ),
            legend=None,
        ),
    )
)


In [36]:
first_chart_usage

In [37]:
# Overlay layer for the "magneto-optical" storage type: gray bars with a blue
# outline of the shared stroke width.
second_chart_usage = (
    alt.Chart(table)
    .mark_bar(stroke=blue_custom, fill=gray_custom, strokeWidth=strokeWidth_var)
    .encode(
        alt.X('year:T'),
        alt.X2('usage:T'),
        alt.Y('name:N', sort='x'),
    )
    .transform_filter('datum.type == "magneto-optical"')
)

In [38]:
second_chart_usage

In [39]:
# Overlay layer for the "atomic" storage type: white bars outlined in the
# custom gray with the shared stroke width.
third_chart_usage = (
    alt.Chart(table)
    .mark_bar(stroke=gray_custom, fill='white', strokeWidth=strokeWidth_var)
    .encode(
        alt.X('year:T'),
        alt.X2('usage:T'),
        alt.Y('name:N', sort='x'),
    )
    .transform_filter('datum.type == "atomic"')
)

In [40]:
third_chart_usage

In [41]:
# Overlay layer for the "molecular" storage type: white bars with a black
# outline of the shared stroke width.
fourth_chart_usage = (
    alt.Chart(table)
    .mark_bar(stroke='black', fill='white', strokeWidth=strokeWidth_var)
    .encode(
        alt.X('year:T'),
        alt.X2('usage:T'),
        alt.Y('name:N', sort='x'),
    )
    .transform_filter('datum.type == "molecular"')
)

In [42]:
fourth_chart_usage

In [43]:
# Overlay the base layer and the three single-type layers into one layered chart.
final_chart_usage = first_chart_usage + second_chart_usage + third_chart_usage + fourth_chart_usage

In [44]:
# Apply the shared chart width to the layered chart, then display it.
final_chart_usage = final_chart_usage.properties(width = chartWidth_var)
final_chart_usage

## Saving figures

Important: due to a bug documented at https://github.com/altair-viz/altair/issues/1954, font changes are not preserved when saving as PDF, so the figures are saved as SVG below.
The warnings printed while saving can be ignored.

In [45]:
# Directory that receives the exported figures.
root_save_path = 'Output'
# Create the directory up front so that saving also works when it does not
# exist yet (for example on a fresh checkout).
os.makedirs(root_save_path, exist_ok=True)

# Output format, used as the file extension. 'pdf' can be used as well, but
# the font changes are not kept in PDF output (see the note above this cell).
figure_format = 'svg'

# File stem -> chart, saved in this order.
figures_to_save = (
    ('Year_access', final_chart_access),
    ('Capacity_name', final_chart_capacity),
    ('Usage_name', final_chart_usage),
)

for file_stem, chart in figures_to_save:
    save(chart, os.path.join(root_save_path, file_stem + '.' + figure_format))


WARN Channel opacity should not be used with an unsorted discrete field.
WARN Channel opacity should not be used with an unsorted discrete field.
WARN Channel opacity should not be used with an unsorted discrete field.
