### PyHDFS

https://github.com/jingw/pyhdfs

- The methods and return values generally map directly to WebHDFS endpoints.

In [8]:
import pyhdfs
# Build the client used by the rest of this section: a comma-separated list of
# host:port entries plus the user name to act as.
fs = pyhdfs.HdfsClient(
    hosts='nn1.example.com:50070,nn2.example.com:50070',
    user_name='someone',
)
# Last expression of the cell, so its result is what the notebook displays.
fs.list_status('/')

In [None]:
# List the HDFS root; presumably returns entry names like os.listdir — confirm against the pyhdfs docs.
fs.listdir('/')

In [9]:
import os 

def recurse_copy_to_remote(fs: "pyhdfs.HdfsClient", local_path, remote_path):
    """Recursively upload the contents of a local directory to HDFS.

    Every entry of ``local_path`` is mirrored under ``remote_path``:
    sub-directories are created with ``fs.mkdirs`` and then descended into,
    anything else is uploaded with ``fs.copy_from_local``.

    Args:
        fs: Client used for the remote calls (``mkdirs`` and ``copy_from_local``).
        local_path: Local directory to read, as ``str`` or ``os.PathLike``.
        remote_path: Destination directory on HDFS, as ``str`` or a ``pathlib`` path.

    Notes:
        A ``pyhdfs.HdfsException`` raised while handling an entry is printed
        and that entry is skipped, so one failure does not abort the rest of
        the copy. If creating a remote directory fails, its contents are not
        uploaded.
    """
    import posixpath  # HDFS paths always use '/', whatever the local OS is

    local_root = os.fspath(local_path)
    # Hand the client plain strings: pathlib objects are converted here so the
    # remote path never picks up OS-specific separators.
    if hasattr(remote_path, "as_posix"):
        remote_root = remote_path.as_posix()
    else:
        remote_root = str(remote_path)

    for name in os.listdir(local_root):
        local_child = os.path.join(local_root, name)
        remote_child = posixpath.join(remote_root, name)
        try:
            if os.path.isdir(local_child):
                # create remote directory, then mirror its contents
                fs.mkdirs(remote_child)
                recurse_copy_to_remote(fs, local_child, remote_child)
            else:
                fs.copy_from_local(local_child, remote_child)
        except pyhdfs.HdfsException as e:
            # Subclasses noted by the original author:
            # HdfsFileAlreadyExistsException
            # HdfsFileNotFoundException
            # HdfsHttpException
            # HdfsIOException
            print(e)

In [None]:
from pathlib import Path
# Mirror the local ./data directory under /mushrooms on HDFS.
recurse_copy_to_remote(
    fs,
    local_path=Path('data'),
    remote_path=Path('/mushrooms'),
)

In [None]:
# Create the nested directory; mkdirs presumably creates missing parents (like `mkdir -p`) — confirm against the pyhdfs docs.
fs.mkdirs('/fruit/x/y') 

In [None]:
# Create a file, append to it, then show the resulting tree.
# overwrite=True keeps the cell re-runnable once '/fruit/apple' already exists.
fs.create('/fruit/apple', 'delicious', overwrite=True)
fs.append('/fruit/apple', ' food')
# Printed explicitly: only the last expression of a cell is displayed automatically.
print(list(fs.walk('/fruit')))
# The original passed the Python builtin `dir` instead of a path. Remove the
# scratch sub-tree only, so '/fruit/apple' is still there for the read cell below.
fs.delete('/fruit/x', recursive=True)

In [None]:
# HdfsClient.create() takes the file contents as its second argument; it does
# not hand back a writable handle, so there is nothing to use in a `with` block.
file_path = '/fruit/hello.txt'  # example destination; previously undefined
fs.create(file_path, b'Hello, world!\n', overwrite=True)

In [None]:
import contextlib

# closing() guarantees f.close() is called when the block exits.
with contextlib.closing(fs.open('/fruit/apple')) as f:
    content = f.read()
# Bare last expression so the notebook displays what was read.
content

## Native RPC client interfaces

- better throughput than WebHDFS

### libhdfs
- C wrapper for the HDFS Java client
- supported by major Hadoop vendors, part of the Apache Hadoop project

https://arrow.apache.org/docs/python/generated/pyarrow.fs.HadoopFileSystem.html

https://wesmckinney.com/blog/python-hdfs-interfaces/

In [None]:
# local.create_dir('/tmp/new_folder')
# local.copy_file('/tmp/local_fs.dat', '/tmp/new_folder/local_fs.dat')
# local.get_file_info('/tmp/new_folder/local_fs.dat')
# local.delete_dir_contents('/tmp/new_folder')
# local.get_file_info('/tmp/new_folder')
# local.get_file_info('/tmp/new_folder/local_fs.dat')

In [None]:
def upload_file(hdfs_filesystem, filepath):
    """Upload a local file into ``/tmp`` on the given filesystem.

    Args:
        hdfs_filesystem: Destination filesystem. Only
            ``open_output_stream(path)`` is used; it must return a writable
            binary stream that works as a context manager (presumably a
            ``pyarrow.fs.HadoopFileSystem`` — confirm with callers).
        filepath: Path of the local file to upload.

    Returns:
        The destination path, ``'/tmp/'`` followed by the base name of
        ``filepath``.
    """
    import shutil  # local import: keeps this cell self-contained

    local_file = filepath
    hdfs_file = '/tmp/' + os.path.basename(filepath)
    # Stream the bytes across in chunks instead of loading the whole file.
    with open(local_file, 'rb') as source, \
            hdfs_filesystem.open_output_stream(hdfs_file) as sink:
        shutil.copyfileobj(source, sink)
    print('Uploaded file to HDFS: ' + hdfs_file)
    return hdfs_file


In [7]:
from pyarrow import fs

# Connection settings; the 'A' values are placeholders to replace.
host = 'A'
port = 8020  # must be an int; 8020 is the default port in the pyarrow docs
user = 'A'
# Path to a Kerberos ticket cache, or None when Kerberos is not used.
ticket_cache_path = None

# Only host and port may be passed positionally; everything else is by keyword.
hdfs = fs.HadoopFileSystem(host, port, user=user, kerb_ticket=ticket_cache_path)

# Use the HadoopFileSystem instance to interact with HDFS.
# pyarrow.fs filesystems have no open(path, mode); writing and reading go
# through separate stream methods, see
# https://arrow.apache.org/docs/python/generated/pyarrow.fs.LocalFileSystem.html#pyarrow.fs.LocalFileSystem
with hdfs.open_output_stream('/path/to/file') as f:
    f.write(b'data')

with hdfs.open_input_stream('/path/to/file') as f:
    data = f.read()

# No explicit close: pyarrow.fs filesystem objects do not expose a close() method.


TypeError: __init__() takes at most 2 positional arguments (3 given)

In [None]:
# with hdfs.open('/path/to/file') as f: