In [1]:
import pandas as pd
import psutil
import resource
import requests as r
from io import StringIO

## Memory management
When you start a program, it is allocated a finite amount of Random Access Memory (RAM) to work with. Generally, your program will behave well as long as it stays comfortably below that memory limit.

In [2]:
# Work out a memory cap for this process: 80% of the memory psutil currently
# reports as available.
virtual_memory = psutil.virtual_memory()
available_memory = virtual_memory.available
memory_limit = int(available_memory * 0.8)
print("available memory: {}, setting to 80% of that: {}".format(available_memory, memory_limit))

# Apply the cap to the address-space limit (RLIMIT_AS) as a *soft* limit only.
# The existing hard limit is left untouched: an unprivileged process cannot
# raise its hard limit again once it has been lowered, which would make the
# cap irreversible for this kernel and could make later setrlimit calls that
# ask for a larger hard limit fail with ValueError.
_, hard_limit = resource.getrlimit(resource.RLIMIT_AS)
soft_limit = memory_limit
if hard_limit != resource.RLIM_INFINITY:
    # The soft limit may never exceed the hard limit.
    soft_limit = min(soft_limit, hard_limit)
resource.setrlimit(resource.RLIMIT_AS, (soft_limit, hard_limit))

available memory: 501522817024, setting to 80% of that: 401218253619


### Read in file from Panasas (parallel storage)

In [3]:
%%time
df = pd.read_csv("/home/gems_learning/shared/hpc4ag/3k-core-v7-chr1/chr1.vcf", sep="\t", skiprows=6)

CPU times: user 10.5 s, sys: 1.27 s, total: 11.8 s
Wall time: 11.9 s


### Basic dataframe manipulations

In [4]:
# Preview the first five rows of the freshly loaded table.
df.head(n=5)

Unnamed: 0,#CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO,FORMAT,B001_B001,...,IRIS_313-15901_IRIS_313-15901,IRIS_313-15902_IRIS_313-15902,IRIS_313-15903_IRIS_313-15903,IRIS_313-15904_IRIS_313-15904,IRIS_313-15905_IRIS_313-15905,IRIS_313-15906_IRIS_313-15906,IRIS_313-15907_IRIS_313-15907,IRIS_313-15908_IRIS_313-15908,IRIS_313-15909_IRIS_313-15909,IRIS_313-15910_IRIS_313-15910
0,1,1178,1178,G,T,.,.,PR,GT,0/0,...,./.,./.,0/0,0/0,0/0,0/0,./.,1/1,1/1,0/0
1,1,1203,1203,T,C,.,.,PR,GT,0/0,...,./.,0/0,1/1,./.,1/1,0/0,0/0,1/1,1/1,0/0
2,1,1248,1248,G,A,.,.,PR,GT,0/0,...,0/0,0/0,0/0,0/0,1/1,0/0,0/0,0/0,0/0,0/0
3,1,1282,1282,G,A,.,.,PR,GT,0/0,...,0/0,0/0,./.,0/0,0/0,0/0,0/0,0/0,0/0,0/0
4,1,1299,1299,T,C,.,.,PR,GT,0/0,...,0/0,0/0,./.,0/0,0/0,0/0,0/0,1/1,1/1,0/0


In [5]:
%%time
sum(df['POS'].unique())

CPU times: user 3.73 ms, sys: 0 ns, total: 3.73 ms
Wall time: 3.77 ms


920164560916

#### Now let's limit memory and see what happens

In [6]:
# Tighten the address-space limit (RLIMIT_AS) for this process.
# Values are in bytes: the soft limit is just under 5e9, the hard limit is 16e9.
soft_limit_bytes = 4999900000
hard_limit_bytes = 16000000000
resource.setrlimit(resource.RLIMIT_AS, (soft_limit_bytes, hard_limit_bytes))

In [7]:
%%time
df = pd.read_csv("/home/gems_learning/shared/hpc4ag/3k-core-v7-chr1/chr1.vcf", sep="\t", skiprows=6)

CPU times: user 9.82 s, sys: 544 ms, total: 10.4 s
Wall time: 10.4 s


#### Rerun the same operation

In [8]:
%%time
sum(df['POS'].unique())

CPU times: user 4.07 ms, sys: 0 ns, total: 4.07 ms
Wall time: 4.11 ms


920164560916