# Asymmetric distribution of data products from WALLABY, an SKA precursor neutral hydrogen survey

This notebook computes the statistics of the results for the poster presented at ADASS XXXI.

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
from datetime import datetime

In [None]:
# Load the measurements; header=0 combined with `names` replaces the header
# row found in the file with the column names used throughout this notebook.
df = pd.read_csv(
    'results.csv',
    names=['inserts', 'deletes', 'started', 'ended', 'comments', 'size'],
    header=0,
)

In [None]:
# Parse the start/end timestamps and compute the replication delay in seconds

df['started'] = pd.to_datetime(df['started'], format='%Y-%m-%d %H:%M:%S.%f')
df['ended'] = pd.to_datetime(df['ended'], format='%Y-%m-%d %H:%M:%S.%f')

# Vectorised parsing keeps this cell re-runnable: columns that are already
# datetimes pass through unchanged instead of being re-parsed from strings.
df['delay'] = (df['ended'] - df['started']).dt.total_seconds()


In [None]:
# Preview the first five rows of the parsed results

df.head(n=5)

# 10MBytes, 5 rows, same time (50 MBytes transaction)

Table of results for 10 MBytes data products with 5 rows inserted at the same time (batch insertion instead of one-by-one), i.e. a single 50 MBytes transaction.


In [None]:
# Rows with size 10 and 5 inserts per batch
df.loc[df['size'].eq(10.0) & df['inserts'].eq(5.0)]

# 5MBytes, 5 rows, same time (25 MBytes transaction)

Table of results for 5 MBytes data products with 5 rows inserted at the same time (batch insertion instead of one-by-one), i.e. a single 25 MBytes transaction.

In [None]:
# Rows with size 5 and 5 inserts per batch
df.loc[df['size'].eq(5.0) & df['inserts'].eq(5.0)]

# 20MBytes, 5 rows, same time (100 MBytes transaction)

Table of results for 20 MBytes data products with 5 rows inserted at the same time (batch insertion instead of one-by-one), i.e. a single 100 MBytes transaction.

In [None]:
# Rows with size 20 and 5 inserts per batch
df.loc[df['size'].eq(20.0) & df['inserts'].eq(5.0)]

# 2MBytes, 5 rows, same time (10 MBytes transaction)

Table of results for 2 MBytes data products with 5 rows inserted at the same time (batch insertion instead of one-by-one), i.e. a single 10 MBytes transaction.

In [None]:
# Rows with size 2 and 5 inserts per batch
df.loc[df['size'].eq(2.0) & df['inserts'].eq(5.0)]

# Summary of average data product replication delay for 5-item insertion batches and different data product sizes

In [None]:
# Mean replication delay (with standard deviation as error bars) of 5-row
# insertion batches, one point per data product size.
product_sizes = [2, 5, 10, 20]
five_row_batches = df[df['inserts'] == 5.0]
delays_by_size = [
    five_row_batches.loc[five_row_batches['size'] == size, 'delay']
    for size in product_sizes
]

# fmt='-o' draws circle markers joined by a solid line; passing marker= and
# linestyle= alongside fmt makes matplotlib warn about redundant definitions.
plt.errorbar(
    x=product_sizes,
    y=[delays.mean() for delays in delays_by_size],
    yerr=[delays.std() for delays in delays_by_size],
    fmt='-o', color='black', ecolor='lightblue', elinewidth=3,
    label='5-row batches (mean ± std)',  # gives plt.legend() an entry to show
)

# The x axis is the data product size, not the number of rows (fixed at 5 here).
plt.xlabel("Data product size (MBytes)")
plt.ylabel("Average time for consolidated operation (s)")
plt.legend(loc='upper left')
plt.title('Average delay in replication of data product insertion batches.');

In [None]:
# Rows with size 5 and a non-zero number of inserts
df.loc[df['size'].eq(5.0) & df['inserts'].ne(0)]

# Average replication time for incremental insertion/deletion with 5MBytes data products

In [None]:
# Mean replication delay for 5 MBytes data products, one point per batch size.
batch_sizes = [2, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 60, 70, 80, 90, 100]
products_5mb = df[df['size'] == 5.0]

# Every point must be a scalar: reduce each batch-size selection with .mean()
# (passing the raw Series for some points yields a ragged, unplottable y list).
mean_delays = [
    products_5mb.loc[products_5mb['inserts'] == batch_size, 'delay'].mean()
    for batch_size in batch_sizes
]

# fmt='-o' draws circle markers joined by a solid line; passing marker= and
# linestyle= alongside fmt makes matplotlib warn about redundant definitions.
plt.errorbar(
    x=batch_sizes,
    y=mean_delays,
    fmt='-o', color='black', ecolor='lightblue', elinewidth=3,
    label='5 MBytes data products',  # gives plt.legend() an entry to show
)

plt.xlabel("Number of rows inserted at the same time")
plt.ylabel("Average time for consolidated operation (s)")
plt.legend(loc='upper left')
plt.title('Average delay in replication of data product insertion batches (5 MBytes).');

# Average replication time for incremental insertion/deletion with 2MBytes data products

In [None]:
# Mean replication delay for 2 MBytes data products, one point per batch size.
batch_sizes = [2, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 60, 70, 80, 90, 100]
products_2mb = df[df['size'] == 2.0]

# One scalar per batch size; a batch size with no rows gives NaN and is
# simply not drawn.
mean_delays = [
    products_2mb.loc[products_2mb['inserts'] == batch_size, 'delay'].mean()
    for batch_size in batch_sizes
]

# fmt='-o' draws circle markers joined by a solid line; passing marker= and
# linestyle= alongside fmt makes matplotlib warn about redundant definitions.
plt.errorbar(
    x=batch_sizes,
    y=mean_delays,
    fmt='-o', color='black', ecolor='lightblue', elinewidth=3,
    label='2 MBytes data products',  # gives plt.legend() an entry to show
)

plt.xlabel("Number of rows inserted at the same time")
plt.ylabel("Average time for consolidated operation (s)")
plt.legend(loc='upper left')
plt.title('Average delay in replication of data product insertion batches (2 MBytes).');