In [None]:
# default_exp ipfsspec

In [56]:
#hide
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# IPFSSpec

> Read and write FSSpec implementation for IPFS

In [81]:
#hide
from nbdev.showdoc import *

In [82]:
#export
from os.path import isdir
from typing import Union, List
import json
import random
import time
import requests
import pandas as pd
from io import StringIO
from requests.exceptions import HTTPError
from fsspec.spec import AbstractFileSystem
from ipfsspec.core import IPFSBufferedFile
from ipfshttpclient.multipart import stream_files, stream_directory

In [83]:
#export 
from ipfspy.utils import GATEWAYS_API_READ, GATEWAYS_API_WRITE, parse_error_message, parse_response, get_coreurl

## IPFSGateway

In [84]:
#export
class IPFSGateway:
    'Thin client for the IPFS HTTP API that posts requests through a pooled `requests` session'

    def __init__(self, 
        local:bool=True, # Use local IPFS daemon or not
        coreurl:str=None, # Core URL of an alternative gateway to use 
    ):
        'Starts a IPFS Gateway either using local node or infura. If given `coreurl`, will use that for the gateway'
        self.coreurl = coreurl
        if coreurl is None:
            self.url = self.get_gateway(local=local)
        else:
            # An explicit core URL takes precedence; `local` is not forwarded in this case
            self.url = self.get_gateway(coreurl=coreurl)

        # One shared session so that connections are pooled and reused across calls
        self.session = requests.Session()
        adapter = requests.adapters.HTTPAdapter(pool_connections=100, pool_maxsize=100)
        self.session.mount('http://', adapter)
        self.session.mount('https://', adapter)

    def get_gateway(self,
        local:bool=True, # If local uses local node, else uses Infura.io gateway
        coreurl:str=None, # URL of other gateways
    ):
        'Set the core url for convenience'
        return get_coreurl(local=local, coreurl=coreurl)

    def _parsed_or_raise(self,
        res, # Response returned by `self.session.post`
    ):
        'Return `(res, parse_response(res))` for a 200 response, otherwise raise `HTTPError`'
        if res.status_code == 200:
            return res, parse_response(res)
        raise HTTPError(parse_error_message(res))

    def get(self, 
        cid:str, # Path to the IPFS object
        **kwargs
    ):
        'Get a file/directory from IPFS'
        # Extra keyword arguments become query parameters (and may override `arg`)
        params = {'arg': cid, **kwargs}
        res = self.session.post(f'{self.url}/get', params=params)
        return self._parsed_or_raise(res)

    def cat(self, 
        cid:str, # Path to the IPFS object
        **kwargs
    ):
        '''Read a file from IPFS

        Returns `(response, response.text)` on status 200. Raises `TypeError` on status 500
        and `HTTPError` on any other status.'''
        params = {'arg': cid, **kwargs}
        res = self.session.post(f'{self.url}/cat', params=params)

        if res.status_code == 200:
            return res, res.text

        if res.status_code == 500:
            # NOTE(review): every 500 is reported as "is a directory"; confirm the API
            # cannot answer 500 for other failures of `cat`.
            raise TypeError (f"dag node {cid} is a directory; Provide a file CID")

        raise HTTPError (parse_error_message(res))

    def apipost(self, 
        call:str, # The call type to post e.g. 'add', 'ls', 'pin/add', 'pin/ls'
        filepath:Union[str, List[str]]=None, # Path to files or directory or IPFS Path
        directory:bool=False, # Is filepath a directory 
        chunk_size=200000, # Chunk size to use
        **kwargs): 
        '''Makes `post` call to the HTTP API

        For `call == 'add'` the local `filepath` is streamed as the request body and `kwargs`
        are sent as query parameters. For any other call `filepath` is sent as the `arg`
        query parameter together with `kwargs`. Returns `(response, parsed_response)` on
        status 200 and raises `HTTPError` otherwise. Raises `TypeError` when a single
        directory path is given to `add` without `directory=True`.'''
        if call != 'add':
            params = {'arg': filepath, **kwargs}
            res = self.session.post(f'{self.url}/{call}', params=params)
            return self._parsed_or_raise(res)

        # `isdir` only accepts a single path; a list/tuple of paths is passed straight on
        # to `stream_files` instead of failing inside `os.stat`.
        single_path = not isinstance(filepath, (list, tuple))
        if single_path and isdir(filepath) and not directory:
            raise TypeError (f"{filepath} is a directory. Set arg directory as True")

        streamer = stream_directory if directory else stream_files
        data, headers = streamer(filepath, chunk_size=chunk_size)

        res = self.session.post(f'{self.url}/add',
                                params=dict(kwargs),
                                data=data,
                                headers=headers)
        return self._parsed_or_raise(res)

    def head(self, 
        cid:str, # Path to the IPFS object
        headers=None, # Accepted for interface compatibility; currently not sent with the request
        **kwargs
    ):     
        'Return the response headers of a `get` call for `cid`'
        # NOTE: this issues a full `get`; `headers` and `kwargs` are not forwarded.
        res, _ = self.get(cid)
        return res.headers

### Using IPFSGateway

Using the local node supports all functions. The Infura gateway supports both read and write, but not the complete set of calls offered by the local node. We also have lists of read and read/write gateways. You can access them as follows:

In [85]:
#ignoretest
GATEWAYS_API_READ

['https://ipfs.io/api/v0',
 'https://gateway.pinata.cloud/api/v0',
 'https://cloudflare-ipfs.com/api/v0',
 'https://dweb.link/api/v0',
 'https://ipfs.eth.aragon.network/api/v0',
 'https://permaweb.eu.org/api/v0',
 'https://nftstorage.link/api/v0',
 'https://ipfs.lain.la/api/v0',
 'https://ipfs.mihir.ch/api/v0',
 'https://ipfs.telos.miami/api/v0',
 'https://jorropo.net/api/v0',
 'https://cf-ipfs.com/api/v0',
 'https://cloudflare-ipfs.com/api/v0',
 'https://gateway.ipfs.io/api/v0',
 'https://infura-ipfs.io/api/v0',
 'https://via0.com/api/v0',
 'https://ipfs.azurewebsites.net/api/v0']

In [86]:
#ignoretest
GATEWAYS_API_WRITE

['https://ipfs.io/api/v0',
 'https://gateway.pinata.cloud/api/v0',
 'https://cloudflare-ipfs.com/api/v0',
 'https://dweb.link/api/v0']

In this doc, we will be using the local-node unless specified. 

Extending a client session using the local node.

In [87]:
#ignoretest
gw = IPFSGateway(local=True); gw.url

'http://127.0.0.1:5001/api/v0'

Extending a client session using infura.

In [88]:
#ignoretest
gw = IPFSGateway(local=False); gw.url

'https://ipfs.infura.io:5001/api/v0'

Extending a client session using a public gateway.

In [89]:
#ignoretest
gw = IPFSGateway(coreurl=random.choice(GATEWAYS_API_READ)); gw.url

'https://jorropo.net/api/v0'

In [90]:
#ignoretest
show_doc(IPFSGateway.get)

<h4 id="IPFSGateway.get" class="doc_header"><code>IPFSGateway.get</code><a href="__main__.py#L29" class="source_link" style="float:right">[source]</a></h4>

> <code>IPFSGateway.get</code>(**`cid`**:`str`, **\*\*`kwargs`**)

Get a file/directory from IPFS

||Type|Default|Details|
|---|---|---|---|
|**`cid`**|`str`||Path to the IPFS object|
|**`kwargs`**|||*No Content*|


Let's use a local node to get an IPFS file.

In [91]:
#ignoretest
gw = IPFSGateway(local=True)

In [92]:
#ignoretest
res, cont = gw.get('QmUfwG4P6EA5xbD3De5bS7XKcBion8ReQj7m9ZjxaPvq3B'); cont[500:700]

'\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00# Welcome to Immerse by Algovera\r\n> A python library by Algovera to interact with IPFS and IPFS ecosystem such as the common pinning services\r\n\r\n\r\n## What is Immerse?\r\n\r\nImmerse is a pytho'

Use the `cat` function, which returns the file without the headers and other extra content that the `get` method returns.

In [93]:
#ignoretest
show_doc(IPFSGateway.cat)

<h4 id="IPFSGateway.cat" class="doc_header"><code>IPFSGateway.cat</code><a href="__main__.py#L47" class="source_link" style="float:right">[source]</a></h4>

> <code>IPFSGateway.cat</code>(**`cid`**:`str`, **\*\*`kwargs`**)

Read a file from IPFS

||Type|Default|Details|
|---|---|---|---|
|**`cid`**|`str`||Path to the IPFS object|
|**`kwargs`**|||*No Content*|


In [94]:
#ignoretest
res, cont = gw.cat('QmUfwG4P6EA5xbD3De5bS7XKcBion8ReQj7m9ZjxaPvq3B'); cont

'# Welcome to Immerse by Algovera\r\n> A python library by Algovera to interact with IPFS and IPFS ecosystem such as the common pinning services\r\n\r\n\r\n## What is Immerse?\r\n\r\nImmerse is a python library by Algovera to interact with IPFS and IPFS ecosystem such as the common pinning services. It is designed by data scientists for data scientists to interact with the IPFS ecosystem without leaving the comfort of python and jupyter notebook.\r\n\r\nYou can learn more about IPFS [here](https://ipfs.io/#why)\r\n\r\nIPFS is built using the go-lang and javascript. With Immerse, you can interact with IPFS using the exposed [HTTP RPC API](https://docs.ipfs.io/reference/http/api/#getting-started). \r\n\r\nYou will need a local IPFS Node running to use the HTTP API (even when using Immerse). As an alternative, you can connect via the [Infura](https://infura.io/product/ipfs)\'s dedicated IPFS gateway. Immerse provide both ways to interact with IPFS.\r\n\r\n## Installing\r\n\r\nto do: instr

As previously mentioned, Infura supports read and write, but not the complete set of functions that comes with the local node. Let's see an example. Here, the `ls` call lists the content of a directory. Afterwards, let's try the same thing with Infura.

In [95]:
#ignoretest
r, res = gw.apipost(call='ls', filepath='QmebrNeK8XYZ6P3oFgftSR6FzBqLmy3rdMzLvT1476bSWA'); res

[{'Objects': [{'Hash': 'QmebrNeK8XYZ6P3oFgftSR6FzBqLmy3rdMzLvT1476bSWA',
    'Links': [{'Name': '.ipynb_checkpoints',
      'Hash': 'QmUNLLsPACCz1vLxQVkXqqLX5R1X345qqfHbsf67hvA3Nn',
      'Size': 0,
      'Type': 1,
      'Target': ''},
     {'Name': 'adult_data.csv',
      'Hash': 'QmZnxARhJWsCbTxiAzoRhnxHgMtoEkNJNS8DGLCBEMvm4V',
      'Size': 3974475,
      'Type': 2,
      'Target': ''},
     {'Name': 'fol1',
      'Hash': 'QmUNLLsPACCz1vLxQVkXqqLX5R1X345qqfHbsf67hvA3Nn',
      'Size': 0,
      'Type': 1,
      'Target': ''},
     {'Name': 'test.txt',
      'Hash': 'QmTT8vwdbnP9Ls8bSY1LMyW4a8bEwTYZa5izEoJMBtTPfb',
      'Size': 24,
      'Type': 2,
      'Target': ''},
     {'Name': 'test2.txt',
      'Hash': 'QmWa7aPQWCkLAmV2pt1Wjj3uE7V3CG3KYMKBKsVAvWG649',
      'Size': 23,
      'Type': 2,
      'Target': ''},
     {'Name': 'test3.txt',
      'Hash': 'Qmb6W6nVPYd5CJKFpC1zGGuoD7TYQLE5PGG1RHkHm2W3m9',
      'Size': 32,
      'Type': 2,
      'Target': ''},
     {'Name': 'test4.txt'

In [96]:
#ignoretest
gw = IPFSGateway(local=False)
r, res = gw.apipost(call='ls', filepath='QmebrNeK8XYZ6P3oFgftSR6FzBqLmy3rdMzLvT1476bSWA'); res

HTTPError: Response Status Code: 403; Error Message: ipfs method not supported


As can be inferred from the error message, the `ls` call is not supported by Infura at this point.

## IPFSFileSystem (fsspec-like)

In [97]:
#export
class IPFSFileSystem(AbstractFileSystem):
    protocol = "ipfs"
    def __init__(self, 
        local=True, # Use local IPFS daemon or not
        coreurl:str=None, # Core URL of an alternative gateway to use 
        **kwargs):
        'fsspec like read/write IPFS filesystem'
        
        super().__init__(local, **kwargs)
        
        # Pass `coreurl` by keyword: the first positional parameter of `IPFSGateway` is
        # `local`, so a positional URL would never be used as the gateway URL.
        self.gw = IPFSGateway(local=local, coreurl=coreurl)
    
    def ls(self, 
        cid:str, # Path of the IPFS object
        detail=True, # Return dicts with name/size/type instead of plain names
        **kwargs):
        'List the links of a IPFS file/directory'
        
        _, res = self.gw.apipost("ls", filepath=cid)
        
        links = res[0]["Objects"][0]["Links"]
        
        if not detail:
            return [cid + "/" + link["Name"] for link in links]
        
        # Link type codes as used by the `ls` response handled here
        types = {1: "directory", 2: "file"}
        return [{"name": cid + "/" + link["Name"],
                 "size": link["Size"],
                 "type": types[link["Type"]]}
                for link in links]
        
    def cat_file(self, 
        cid:str, # Path of the IPFS object
        start:int=None, # First byte to return; `None` means the beginning of the file
        end:int=None, # Byte to stop before; `None` means the end of the file
        **kwargs
    ):        
        '''Return the content of the file at `cid` as bytes

        The whole file is fetched through the gateway and then sliced, so `start` and `end`
        follow the usual Python slice rules (negative values count from the end).'''
        
        r, _ = self.gw.cat(cid)      
        
        content = r.content
        if start is None and end is None:
            return content
        return content[start:end]
                
    def _open(
        self,
        cid, # Path of the IPFS object
        mode="rb",
        block_size=None,
        autocommit=True,
        cache_options=None,
        **kwargs
    ):
        'Return raw bytes-mode file-like from the file-system'
        
        return IPFSBufferedFile(
            self,
            cid,
            mode,
            block_size,
            autocommit,
            cache_options=cache_options,
            **kwargs
        )
    
    def info(self, 
        cid, # Path of the IPFS object
        **kwargs):
        '''Describe the IPFS object at `cid` using the response headers of `gw.head`

        Always returns `name`; adds `size` when an `X-Content-Length` or `X-Content-Range`
        header is present, and `ETag`, `type` and `CID` when an `ETag` header is present.'''
        
        # Intended to ensure a correct file size.
        # NOTE(review): `IPFSGateway.head` as defined in this notebook accepts these
        # headers but does not send them; confirm whether they should be forwarded.
        headers = {"Accept-Encoding": "identity"}
        response_headers = self.gw.head(cid, headers)

        info = {"name": cid}
        if "X-Content-Length" in response_headers:
            info["size"] = int(response_headers["X-Content-Length"])
        elif "X-Content-Range" in response_headers:
            # Take the part after the "/" of the header value
            info["size"] = int(response_headers["X-Content-Range"].split("/")[1])

        if "ETag" in response_headers:
            etag = response_headers["ETag"].strip("\"")
            info["ETag"] = etag
            if etag.startswith("DirIndex"):
                # For a "DirIndex..." ETag the CID is taken from the last "-"-separated part
                info["type"] = "directory"
                info["CID"] = etag.split("-")[-1]
            else:
                info["type"] = "file"
                info["CID"] = etag
        return info
    
    def write(self,
        filepath, # Path to file/files/directories to write to IPFS
        directory=False, # Is filepath a directory
        chunk_size=200000, # Chunk size to use
        **kwargs
    ):
        'Write the given file/files/directories to the IPFS network'
        
        return self.gw.apipost('add', filepath=filepath, directory=directory, chunk_size=chunk_size, **kwargs)
    
    def read_csv(self, 
        cid:str, # Path of the IPFS object
        delimeter:str=',' # Column delimiter passed to `pd.read_csv`
    ):
        'Read the CSV file at `cid` into a `pandas` DataFrame'
        _, text = self.gw.cat(cid)      
        
        return pd.read_csv(StringIO(text), delimiter=delimeter)
    
    def read_json(self, 
        cid:str, # Path of the IPFS object
    ):
        'Read the JSON file at `cid` into a `pandas` object'
        _, text = self.gw.cat(cid)      
        
        return pd.read_json(StringIO(text))

### Using IPFSSpec

In [98]:
#ignoretest
fs = IPFSFileSystem(local=True)

Call the `ls` method to list the links of an IPFS file or directory.

In [99]:
#ignoretest
res = fs.ls('QmZnxARhJWsCbTxiAzoRhnxHgMtoEkNJNS8DGLCBEMvm4V'); res

[{'name': 'QmZnxARhJWsCbTxiAzoRhnxHgMtoEkNJNS8DGLCBEMvm4V/',
  'size': 262144,
  'type': 'file'},
 {'name': 'QmZnxARhJWsCbTxiAzoRhnxHgMtoEkNJNS8DGLCBEMvm4V/',
  'size': 262144,
  'type': 'file'},
 {'name': 'QmZnxARhJWsCbTxiAzoRhnxHgMtoEkNJNS8DGLCBEMvm4V/',
  'size': 262144,
  'type': 'file'},
 {'name': 'QmZnxARhJWsCbTxiAzoRhnxHgMtoEkNJNS8DGLCBEMvm4V/',
  'size': 262144,
  'type': 'file'},
 {'name': 'QmZnxARhJWsCbTxiAzoRhnxHgMtoEkNJNS8DGLCBEMvm4V/',
  'size': 262144,
  'type': 'file'},
 {'name': 'QmZnxARhJWsCbTxiAzoRhnxHgMtoEkNJNS8DGLCBEMvm4V/',
  'size': 262144,
  'type': 'file'},
 {'name': 'QmZnxARhJWsCbTxiAzoRhnxHgMtoEkNJNS8DGLCBEMvm4V/',
  'size': 262144,
  'type': 'file'},
 {'name': 'QmZnxARhJWsCbTxiAzoRhnxHgMtoEkNJNS8DGLCBEMvm4V/',
  'size': 262144,
  'type': 'file'},
 {'name': 'QmZnxARhJWsCbTxiAzoRhnxHgMtoEkNJNS8DGLCBEMvm4V/',
  'size': 262144,
  'type': 'file'},
 {'name': 'QmZnxARhJWsCbTxiAzoRhnxHgMtoEkNJNS8DGLCBEMvm4V/',
  'size': 262144,
  'type': 'file'},
 {'name': 'QmZnxARhJ

Use `open` and `read` on an IPFS file.

In [100]:
#ignoretest
fs.open('QmUfwG4P6EA5xbD3De5bS7XKcBion8ReQj7m9ZjxaPvq3B').read()

b'# Welcome to Immerse by Algovera\r\n> A python library by Algovera to interact with IPFS and IPFS ecosystem such as the common pinning services\r\n\r\n\r\n## What is Immerse?\r\n\r\nImmerse is a python library by Algovera to interact with IPFS and IPFS ecosystem such as the common pinning services. It is designed by data scientists for data scientists to interact with the IPFS ecosystem without leaving the comfort of python and jupyter notebook.\r\n\r\nYou can learn more about IPFS [here](https://ipfs.io/#why)\r\n\r\nIPFS is built using the go-lang and javascript. With Immerse, you can interact with IPFS using the exposed [HTTP RPC API](https://docs.ipfs.io/reference/http/api/#getting-started). \r\n\r\nYou will need a local IPFS Node running to use the HTTP API (even when using Immerse). As an alternative, you can connect via the [Infura](https://infura.io/product/ipfs)\'s dedicated IPFS gateway. Immerse provide both ways to interact with IPFS.\r\n\r\n## Installing\r\n\r\nto do: inst

`Write` a file, files or a directory to IPFS

In [101]:
#ignoretest
fs.write('output', directory=True)

(<Response [200]>,
 [{'Name': 'output/adult_data.csv',
   'Hash': 'QmZnxARhJWsCbTxiAzoRhnxHgMtoEkNJNS8DGLCBEMvm4V',
   'Size': '3975476'},
  {'Name': 'output/test.txt',
   'Hash': 'QmTT8vwdbnP9Ls8bSY1LMyW4a8bEwTYZa5izEoJMBtTPfb',
   'Size': '32'},
  {'Name': 'output/test2.txt',
   'Hash': 'QmWa7aPQWCkLAmV2pt1Wjj3uE7V3CG3KYMKBKsVAvWG649',
   'Size': '31'},
  {'Name': 'output/test3.txt',
   'Hash': 'Qmb6W6nVPYd5CJKFpC1zGGuoD7TYQLE5PGG1RHkHm2W3m9',
   'Size': '40'},
  {'Name': 'output/fol1',
   'Hash': 'QmUNLLsPACCz1vLxQVkXqqLX5R1X345qqfHbsf67hvA3Nn',
   'Size': '4'},
  {'Name': 'output',
   'Hash': 'QmeyTiqrD6oo4eQXbAt7hZn3ASzpcP3Kvb1rc9qznEowEw',
   'Size': '3975844'}])

Read a csv straight from IPFS into a `pandas` object

In [102]:
#ignoretest
df = fs.read_csv('QmZnxARhJWsCbTxiAzoRhnxHgMtoEkNJNS8DGLCBEMvm4V'); df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


Similarly, read a json straight from IPFS into a `pandas` object

In [103]:
#ignoretest
df = fs.read_json('QmaQ3MEK664wo8DQUu8okvGF3EaQivE8e2a7cfS3Lpqr8e'); df.head()

Unnamed: 0,sepalLength,sepalWidth,petalLength,petalWidth,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
