In [76]:
from dask_cloudprovider.aws import EC2Cluster 
from dask.distributed import Client
import configparser
import os
import contextlib
import re
import dask
from platform import python_version

In [77]:
import subprocess
import sys

def pin_versions():
    """
    Install the exact botocore / aiobotocore pair this notebook is known to work with.

    Runs ``pip install --upgrade`` through the current interpreter
    (``sys.executable -m pip``) with both packages pinned via ``==``, so pip
    installs, upgrades, or downgrades them as needed.

    Raises:
        subprocess.CalledProcessError: if the pip command exits with a non-zero status.
    """
    # botocore tends to get pulled forward to 1.38; the pin restores the working version.
    pinned = ("botocore==1.36.3", "aiobotocore==2.19.0")
    pip_cmd = [sys.executable, "-m", "pip", "install", "--upgrade", *pinned]
    subprocess.check_call(pip_cmd)

# Pin the versions before importing anything that depends on them
# NOTE(review): the first cell of this notebook already imports dask_cloudprovider
# before this call runs; if that import loads botocore/aiobotocore, a kernel restart
# is presumably needed for a changed version to take effect — confirm.
pin_versions()

```bash
! pip install botocore==1.36.3
```

In [21]:
# Sanity check: show the installed aiobotocore and botocore versions.
# NOTE(review): this cell's execution count is out of sequence with its neighbours;
# re-run top to bottom to confirm the printed versions reflect the pin.
import aiobotocore, botocore
print(aiobotocore.__version__, botocore.__version__)


2.19.0 1.36.3


In [78]:
def get_aws_credentials():
    """Read the ``[default]`` AWS profile and convert it to environment variables.

    Settings are collected from ``~/.aws/config`` and then ``~/.aws/credentials``;
    when both files define the same option, the value from the credentials file
    wins. Option names are upper-cased, and a ``REGION`` entry is renamed to
    ``AWS_REGION``.

    Either file may be absent or lack a ``[default]`` section, as long as the
    other one provides it.

    Returns:
        dict: upper-cased option name -> value, as read from the files.

    Raises:
        configparser.NoSectionError: if neither file contains a ``[default]`` section.
    """
    all_credentials = {}
    found_default = False

    for aws_file in ('~/.aws/config', '~/.aws/credentials'):
        # A fresh parser per file keeps the two sources independent;
        # parser.read() silently skips a file that does not exist.
        parser = configparser.RawConfigParser()
        parser.read(os.path.expanduser(aws_file))
        if parser.has_section('default'):
            found_default = True
            all_credentials.update(
                (key.upper(), value) for key, value in parser.items('default')
            )

    if not found_default:
        # Same exception the lookup raised before, now only when nothing is usable.
        raise configparser.NoSectionError('default')

    # No region configured is fine: leave the mapping without AWS_REGION.
    with contextlib.suppress(KeyError):
        all_credentials["AWS_REGION"] = all_credentials.pop("REGION")

    return all_credentials

# Pass in AWS Credentials + any extra packages you would like to install on cluster via `pip`
env_vars = get_aws_credentials()
env_vars["EXTRA_PIP_PACKAGES"] = "s3fs"

# Select software installed on scheduler + worker instances based on client Python + Dask versions
# versions need to match across client, scheduler, worker -- slight mismatches are OK, though
# The regex keeps only "major.minor" of the client Python version (the dot is escaped
# so it matches a literal "." rather than any character).
py_v = '-py' + re.findall(r'\d+\.\d+', python_version())[0]
dask_docker_tag = f"daskdev/dask:{dask.__version__ + py_v}" # e.g. daskdev/dask:2024.7.1-py3.9
print('Docker Image: ', dask_docker_tag)

# launch a cluster of 5 c7a.large instances:
# 1 scheduler
# 4 workers (2 threads + ~4 GB RAM each)
# NOTE(review): security=False presumably disables encrypted/authenticated Dask
# communication while the scheduler is reachable on a public IP — confirm this is
# acceptable for the data being processed.
cluster = EC2Cluster(instance_type='c7a.large',
                     n_workers=4,
                     security=False,
                     docker_image=dask_docker_tag,
                     env_vars=env_vars
)


Docker Image:  daskdev/dask:2025.4.1-py3.10
Creating scheduler instance
Created instance i-08ea352f137a22e02 as dask-32f8a1b3-scheduler
Waiting for scheduler to run at 13.220.85.47:8786
Scheduler is running


  next(self.gen)


Creating worker instance
Creating worker instance
Creating worker instance
Creating worker instance
Created instance i-09a7cd67e178e0eb5 as dask-32f8a1b3-worker-9f0584d8
Created instance i-0fc9b84739bea5dfe as dask-32f8a1b3-worker-08ddd983
Created instance i-0cd9e98583b2a9bd6 as dask-32f8a1b3-worker-50509409
Created instance i-0f1491570dc3041d3 as dask-32f8a1b3-worker-3b9f524b


In [79]:
# Connect a Dask client in this notebook to the EC2 cluster created above;
# the bare `client` expression displays its summary.
client = Client(cluster)
client # note that slight mismatches between client, scheduler, and worker software are fine


+-------------+-----------------+-----------------+---------+
| Package     | Client          | Scheduler       | Workers |
+-------------+-----------------+-----------------+---------+
| cloudpickle | 3.0.0           | 3.1.1           | None    |
| lz4         | 4.3.2           | 4.3.3           | None    |
| msgpack     | 1.0.3           | 1.1.0           | None    |
| python      | 3.10.16.final.0 | 3.10.12.final.0 | None    |
| toolz       | 1.0.0           | 0.12.0          | None    |
| tornado     | 6.5             | 6.4.2           | None    |
+-------------+-----------------+-----------------+---------+


0,1
Connection method: Cluster object,Cluster type: dask_cloudprovider.EC2Cluster
Dashboard: http://13.220.85.47:8787/status,

0,1
Dashboard: http://13.220.85.47:8787/status,Workers: 0
Total threads: 0,Total memory: 0 B

0,1
Comm: tcp://172.31.31.30:8786,Workers: 0
Dashboard: http://172.31.31.30:8787/status,Total threads: 0
Started: Just now,Total memory: 0 B


In [80]:
# Wait for the cluster to initialize: block until all 4 workers requested when the
# cluster was created have registered with the scheduler, so a top-to-bottom run
# does not race ahead of worker start-up. Raises a timeout error after 10 minutes
# instead of hanging indefinitely.
client.wait_for_workers(n_workers=4, timeout=600)
client

0,1
Connection method: Cluster object,Cluster type: dask_cloudprovider.EC2Cluster
Dashboard: http://13.220.85.47:8787/status,

0,1
Dashboard: http://13.220.85.47:8787/status,Workers: 4
Total threads: 8,Total memory: 14.82 GiB

0,1
Comm: tcp://172.31.31.30:8786,Workers: 0
Dashboard: http://172.31.31.30:8787/status,Total threads: 0
Started: 1 minute ago,Total memory: 0 B

0,1
Comm: tcp://172.31.30.44:33951,Total threads: 2
Dashboard: http://172.31.30.44:33861/status,Memory: 3.71 GiB
Nanny: tcp://172.31.30.44:45829,
Local directory: /tmp/dask-scratch-space/worker-1fhl0v4s,Local directory: /tmp/dask-scratch-space/worker-1fhl0v4s

0,1
Comm: tcp://172.31.21.98:34425,Total threads: 2
Dashboard: http://172.31.21.98:38779/status,Memory: 3.71 GiB
Nanny: tcp://172.31.21.98:43085,
Local directory: /tmp/dask-scratch-space/worker-xfc1mdmd,Local directory: /tmp/dask-scratch-space/worker-xfc1mdmd

0,1
Comm: tcp://172.31.17.220:37749,Total threads: 2
Dashboard: http://172.31.17.220:41757/status,Memory: 3.71 GiB
Nanny: tcp://172.31.17.220:40313,
Local directory: /tmp/dask-scratch-space/worker-i_93xi6i,Local directory: /tmp/dask-scratch-space/worker-i_93xi6i

0,1
Comm: tcp://172.31.20.118:45575,Total threads: 2
Dashboard: http://172.31.20.118:39291/status,Memory: 3.71 GiB
Nanny: tcp://172.31.20.118:36695,
Local directory: /tmp/dask-scratch-space/worker-knjnz080,Local directory: /tmp/dask-scratch-space/worker-knjnz080


In [None]:
import numpy as np
import pandas as pd
import os
from matplotlib import pyplot as plt
from settings import config
from pathlib import Path
import regressions
from pandas_datareader.famafrench import get_available_datasets
import pandas_datareader.data as web

# Load environment variables
# Each value is looked up by name through the project's `settings.config` helper;
# directory settings are wrapped in Path objects.
DATA_DIR = Path(config("DATA_DIR"))
DATA_MANUAL = Path(config("LOCAL_MANUAL_DATA_DIR"))
OUTPUT_DIR = Path(config("OUTPUT_DIR"))
WRDS_USERNAME = config("WRDS_USERNAME")
START_DATE = config("START_DATE")
END_DATE = config("END_DATE")

# Put the "src" directory (resolved against the current working directory) at the
# front of the module search path.
# NOTE(review): this runs after `from settings import config` and `import regressions`
# in this same cell, so it cannot help those imports resolve — confirm the intended order.
sys.path.insert(0, os.path.abspath("src"))

In [81]:
sched_addr = cluster.scheduler_address
print("Scheduler at:", sched_addr)

# Record the scheduler address in the .env file one directory above the working
# directory (presumably for code that later reads DASK_SCHEDULER_ADDRESS).
env_path = Path("../.") / ".env"
if not env_path.exists():
    env_path.write_text("")  # make sure file exists

# Read existing lines, dropping any old DASK_SCHEDULER_ADDRESS entry.
# Every kept line is normalised to end with exactly one newline: if the file's last
# line had no trailing newline, the new entry would otherwise be glued onto it.
with env_path.open("r") as f:
    lines = [
        line.rstrip("\n") + "\n"
        for line in f
        if not line.startswith("DASK_SCHEDULER_ADDRESS=")
    ]

# append the new setting
lines.append(f"DASK_SCHEDULER_ADDRESS={sched_addr}\n")

# write back
with env_path.open("w") as f:
    f.writelines(lines)


Scheduler at: tcp://13.220.85.47:8786


In [82]:

# write a scheduler file for other processes to pick up
# The file name is resolved against the current working directory, so the location
# depends on where the notebook kernel was started.
sched_file = os.path.abspath("dask-scheduler.json")
client.write_scheduler_file(sched_file)
print("Wrote scheduler file →", sched_file)

Wrote scheduler file → c:\Users\baile\Box Sync\sp25\MACS 30123\final-project-baileymeche\src\dask-scheduler.json


In [None]:
from dask.distributed import Client
import dask.dataframe as dd

import math
from numba import njit

# Initialize Dask client from environment variable
# Fails fast with a RuntimeError when DASK_SCHEDULER_ADDRESS is not set in the
# process environment.
# NOTE(review): an earlier cell writes this address to a .env file, not to the
# process environment — confirm something loads that file before this cell runs.
scheduler = os.getenv("DASK_SCHEDULER_ADDRESS")
if scheduler is None:
    raise RuntimeError("Please export DASK_SCHEDULER_ADDRESS before running")
# The actual connection is currently disabled; the notebook keeps using the client
# created directly from the cluster object.
#client = Client(scheduler)


# Teardown

In [90]:
# Close this notebook's client connection first, before shutting the cluster down.
client.close()

In [91]:
# Shut down the cluster; the output below shows the worker and scheduler EC2
# instances being terminated.
cluster.close()

Terminated dask-32f8a1b3-worker-08ddd983 (i-0fc9b84739bea5dfe)
Terminated dask-32f8a1b3-worker-3b9f524b (i-0f1491570dc3041d3)
Terminated dask-32f8a1b3-worker-9f0584d8 (i-09a7cd67e178e0eb5)
Terminated dask-32f8a1b3-worker-50509409 (i-0cd9e98583b2a9bd6)
Terminated dask-32f8a1b3-scheduler (i-08ea352f137a22e02)
