In [37]:
import os
import requests
import docker
import opendatasets as od

In [38]:
client = docker.from_env()
container = client.containers.get('master')

def hdfs_mkdir(path):
    container.exec_run(f"hdfs dfs -mkdir -p /{path}/")

def hdfs_upload(path):
    directory = "/".join(path.split("/")[:-1])
    hdfs_mkdir(directory)
    cmd = f"hdfs dfs -put /data/master_volume/{path} /{directory}"
    print(cmd)
    code, output = container.exec_run(cmd)
    print(f"exit code {code}")
    print(output)

def hdfs_set_replication_level(number):
    container.exec_run(f"hdfs dfs -setrep -R {number} /")

Ensure `kaggle.json` is located in the root of the repository or at `~/.kaggle/kaggle.json`.

In [40]:
output_dir = "vagrant/master_volume/datasets"

In [41]:
if not os.path.isdir(f"{output_dir}/steam-dataset"):
    od.download("https://www.kaggle.com/datasets/souyama/steam-dataset", f"{output_dir}")
else:
    print("Dataset steam-dataset already exists, skipping download")

if not os.path.isdir(f"{output_dir}/youtube-trending-video-dataset"):
    od.download("https://www.kaggle.com/datasets/rsrishav/youtube-trending-video-dataset", f"{output_dir}")
else:
    print("Dataset youtube-trending-video-dataset already exists, skipping download")

Dataset steam-dataset already exists, skipping download
Dataset youtube-trending-video-dataset already exists, skipping download


In [43]:
path = f"{output_dir}/covid-dataset.csv"

if not os.path.isfile(path):
    print(f"Downloading covid-dataset to {path}")
    r = requests.get("https://covid.ourworldindata.org/data/owid-covid-data.csv", allow_redirects=True)
    with open(path, 'wb') as file:
        file.write(r.content)
else:
    print("Dataset covid-dataset already exists, skipping download")

print(f"covid-dataset.csv: {os.stat(path).st_size / (1024 * 1024):.02f}MB")

Dataset covid-dataset already exists, skipping download
covid-dataset.csv: 77.76MB
Uploading to HDFS
hdfs dfs -put /data/master_volume/datasets/covid-dataset.csv /datasets
exit code 1
b"put: `/datasets/covid-dataset.csv': File exists\n"
Done


In [55]:
print("Uploading to HDFS")
for root, directories, files in os.walk(f"{output_dir}"):
        for filename in files:
            path = os.path\
                .join(root,filename)\
                .removeprefix("vagrant/master_volume/")\
                .replace("\\", "/")

            hdfs_upload(path)
print("Done")

Uploading to HDFS
hdfs dfs -put /data/master_volume/datasets/covid-dataset.csv /datasets
exit code 1
b"put: `/datasets/covid-dataset.csv': File exists\n"
hdfs dfs -put /data/master_volume/datasets/steam-dataset/steam_dataset/appinfo/dlc_data/missing.json /datasets/steam-dataset/steam_dataset/appinfo/dlc_data
exit code 0
b''
hdfs dfs -put /data/master_volume/datasets/steam-dataset/steam_dataset/appinfo/dlc_data/steam_dlc_data.json /datasets/steam-dataset/steam_dataset/appinfo/dlc_data
exit code 0
b''
hdfs dfs -put /data/master_volume/datasets/steam-dataset/steam_dataset/appinfo/dlc_data/timestamp.txt /datasets/steam-dataset/steam_dataset/appinfo/dlc_data
exit code 0
b''
hdfs dfs -put /data/master_volume/datasets/steam-dataset/steam_dataset/appinfo/store_data/steam_store_data.json /datasets/steam-dataset/steam_dataset/appinfo/store_data
exit code 0
b''
hdfs dfs -put /data/master_volume/datasets/steam-dataset/steam_dataset/appinfo/store_data/timestamp.txt /datasets/steam-dataset/steam_dat

In [56]:
hdfs_set_replication_level(3)