In [1]:
import subprocess
import os

In [2]:
# Suppress native-hadoop warning
!sed -i '$a\# Add the line for suppressing the NativeCodeLoader warning \nlog4j.logger.org.apache.hadoop.util.NativeCodeLoader=ERROR,console' /$HADOOP_HOME/etc/hadoop/log4j.properties

In [3]:
# Local layout: <current working directory>/work holds the project files,
# and <current working directory>/work/data holds the datasets to upload.
_cwd = os.getcwd()
base_dir = os.path.join(_cwd, 'work')
data_dir = os.path.join(_cwd, 'work', 'data')

# Function to load a specified dataset to HDFS
def load_to_hdfs(dataset, hdfs_root='/data', local_dir=None):
    """Upload one local dataset (a file or a directory) to HDFS.

    The dataset keeps its relative path: ``<local_dir>/<dataset>`` is copied to
    ``<hdfs_root>/<dataset>``. The parent HDFS directory is created first.

    Args:
        dataset: Path of the dataset relative to the local data directory,
            e.g. ``'raw/train'`` or ``'song-attributes.txt'``.
        hdfs_root: Absolute HDFS directory that receives the datasets.
            Defaults to ``'/data'``.
        local_dir: Local directory containing ``dataset``. When ``None``
            (the default) the module-level ``data_dir`` is used.

    Returns:
        None. A failing ``hadoop fs`` command is reported on stdout (command,
        return code, captured stdout/stderr) instead of being raised.
    """
    source_root = data_dir if local_dir is None else local_dir
    path = os.path.join(source_root, dataset)
    # Strip trailing slashes so hdfs_root='/data/' and '/data' behave the same.
    hdfs_path = f"{hdfs_root.rstrip('/')}/{dataset}"
    hdfs_dir = os.path.dirname(hdfs_path)

    try:
        subprocess.run(['hadoop', 'fs', '-mkdir', '-p', hdfs_dir], check=True, capture_output=True)

        # A directory is put *into* its HDFS parent (recursive copy); a single
        # file is put at its explicit target path.
        target = hdfs_dir if os.path.isdir(path) else hdfs_path
        subprocess.run(['hadoop', 'fs', '-put', '-f', path, target], check=True, capture_output=True)
    except subprocess.CalledProcessError as e:
        print(f"Error uploading {dataset} to HDFS")
        print("Command:", e.cmd)
        print("Return code:", e.returncode)
        # Decode leniently: undecodable bytes in the tool output must not raise
        # a UnicodeDecodeError that would hide the actual Hadoop failure.
        print("Output:", (e.output or b'').decode(errors='replace'))
        print("Error:", (e.stderr or b'').decode(errors='replace'))
    

In [4]:
# Datasets to upload, given relative to the local data directory.
# Entries may be directories (copied recursively) or single files.
datasets = [
    'raw/train',
    'raw/test',
    'processed/user_rating_downsampled.txt',
    'processed/user_and_song_rating_downsampled.txt',
    'song-attributes.txt',
    'genre-hierarchy.txt',
]

# Clean up the existing data.
# -f keeps the removal quiet and successful when /data does not exist yet
# (e.g. the first run against a fresh cluster), so the cell is safe to re-run.
subprocess.run(['hdfs', 'dfs', '-rm', '-r', '-f', '/data'])
subprocess.run(['hdfs', 'dfs', '-mkdir', '-p', '/data'])

# Upload each dataset; load_to_hdfs reports failures itself and does not raise
# on a failing hadoop command, so one bad dataset does not stop the rest.
for dataset in datasets:
    load_to_hdfs(dataset)

Deleted /data


In [5]:
# Show the resulting HDFS layout so readers can verify the upload.
print('\nListing directories in HDFS')
# Recursive listing starting at the HDFS root; the command writes its listing
# straight to the cell output, and the CompletedProcess is the cell's result.
ls_command = ['hdfs', 'dfs', '-ls', '-R', '/']
subprocess.run(ls_command)


Listing directories in HDFS
drwxr-xr-x   - root supergroup          0 2024-06-04 20:49 /data
-rw-r--r--   1 root supergroup       4362 2024-06-04 20:49 /data/genre-hierarchy.txt
drwxr-xr-x   - root supergroup          0 2024-06-04 20:49 /data/processed
-rw-r--r--   1 root supergroup  409379887 2024-06-04 20:49 /data/processed/user_and_song_rating_downsampled.txt
-rw-r--r--   1 root supergroup  763640390 2024-06-04 20:49 /data/processed/user_rating_downsampled.txt
drwxr-xr-x   - root supergroup          0 2024-06-04 20:49 /data/raw
drwxr-xr-x   - root supergroup          0 2024-06-04 20:49 /data/raw/test
-rw-r--r--   1 root supergroup   29250779 2024-06-04 20:49 /data/raw/test/test_0.txt
-rw-r--r--   1 root supergroup   30361653 2024-06-04 20:49 /data/raw/test/test_1.txt
-rw-r--r--   1 root supergroup   30360260 2024-06-04 20:49 /data/raw/test/test_2.txt
-rw-r--r--   1 root supergroup   30360544 2024-06-04 20:49 /data/raw/test/test_3.txt
-rw-r--r--   1 root supergroup   30360882 2024-0

CompletedProcess(args=['hdfs', 'dfs', '-ls', '-R', '/'], returncode=0)