## Interacting with files

In [1]:
from pubweb import PubWeb
from pubweb.auth import UsernameAndPasswordAuth
from pubweb.config import load_config

# Create the API client. The values returned by load_config() are unpacked
# positionally into the username/password auth object.
client = PubWeb(
    auth_info=UsernameAndPasswordAuth(*load_config()),
)

In [2]:
# Identifiers of the project and of the dataset inside it that we want to browse.
project_id = '9a31492a-e679-43ce-9f06-d84213c8f7f7'
dataset_id = '0394e754-f3aa-4fbe-8ba8-3bf18809ca5c'

# Ask the dataset service for the files belonging to that dataset.
files = client.dataset.get_dataset_files(
    project_id=project_id,
    dataset_id=dataset_id,
)

In [11]:
from pubweb.file_utils import filter_files_by_pattern

# Option 1: take the first listed file whose name is exactly 'counts.txt'.
counts_file = next(f for f in files if f.name == 'counts.txt')

# Option 2: use the pattern-based filter helper and keep the first match.
matches = filter_files_by_pattern(files, '**/counts.txt')
counts_file = matches[0]

# Display the selected file entry as the cell output.
counts_file

If you don't already have direct access to the location where the file is stored, you must use the file service to get the file contents as a string.

In [13]:
from pubweb.models.file import FileAccessContext

# Build the access context for downloading from this dataset.
access_context = FileAccessContext.download_dataset(
    dataset_id=dataset_id,
    project_id=project_id,
)

# Fetch the contents of the counts file through the file service,
# addressing it by the relative path stored on the file entry.
counts = client.file.get_file(
    access_context=access_context,
    file_path=counts_file.relative_path,
)

From here, you can load the file contents into a pandas DataFrame by wrapping the string in `StringIO`.

In [17]:
import pandas as pd
from io import StringIO

# Wrap the downloaded text in an in-memory text buffer so pandas can read it
# like a file, then parse it as a tab-separated table.
counts_buffer = StringIO(counts)
df = pd.read_csv(counts_buffer, sep='\t')

# Preview the first rows.
df.head()

Unnamed: 0,sgRNA,Gene,MO_Brunello_gDNA_2,MO_Brunello_1,MO_Brunello_2,MO_Brunello_gDNA_1
0,A1BG_0,A1BG,0,0,0,0
1,A1BG_1,A1BG,0,0,0,2
2,A1BG_2,A1BG,0,0,0,0
3,A1BG_3,A1BG,0,0,2,0
4,A1CF_36946,A1CF,0,0,0,0


If you already have IAM access to the file location, you can just feed the absolute path into `read_csv` directly.

Note that you must also have the package `s3fs` installed so that pandas can read from the remote path.

In [18]:
# Let pandas read the tab-separated file straight from its absolute path
# (presumably a remote storage URL, which is why `s3fs` is needed - confirm).
df = pd.read_csv(
    counts_file.absolute_path,
    sep='\t',
)

# Preview the first rows.
df.head()

Unnamed: 0,sgRNA,Gene,MO_Brunello_gDNA_2,MO_Brunello_1,MO_Brunello_2,MO_Brunello_gDNA_1
0,A1BG_0,A1BG,0,0,0,0
1,A1BG_1,A1BG,0,0,0,2
2,A1BG_2,A1BG,0,0,0,0
3,A1BG_3,A1BG,0,0,2,0
4,A1CF_36946,A1CF,0,0,0,0
