In [72]:
pip install --upgrade pip

Collecting pip
  Downloading pip-25.1.1-py3-none-any.whl (1.8 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.8/1.8 MB 7.6 MB/s eta 0:00:00

Installing collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 23.0.1
    Uninstalling pip-23.0.1:
      Successfully uninstalled pip-23.0.1
Successfully installed pip-25.1.1


In [38]:
# %%bash
set -e
sudo apt-get update -y
sudo apt-get install -y git curl gnupg software-properties-common unzip python3-pip
echo "✅  Base packages installed"

# %%bash
set -e
if ! command -v terraform &>/dev/null; then
  curl -fsSL https://apt.releases.hashicorp.com/gpg | \
       sudo gpg --dearmor -o /usr/share/keyrings/hashicorp.gpg
  echo "deb [signed-by=/usr/share/keyrings/hashicorp.gpg] https://apt.releases.hashicorp.com $(lsb_release -cs) main" | \
       sudo tee /etc/apt/sources.list.d/hashicorp.list > /dev/null
  sudo apt-get update -y
  sudo apt-get install -y terraform
fi
terraform -version

# %%bash
# ─── EDIT THESE FOUR LINES ────────────────────────────────────────────────────
export GIT_REPO="https://github.com/Akkey01/MLops-Final-Project.git"   # ◀︎ EDIT
export CLONE_DIR="/work/MLops-Final-Project"                           # ◀︎ EDIT
export OPENRC_FILE="/home/ng3230_nyu_edu/work/MLops-Final-Project/infrastructure/tf/kvm/openrc.sh"                             # ◀︎ EDIT
export TF_SUFFIX="team39"                                              # ◀︎ EDIT
export TF_KEYPAIR="id_rsa_chameleon"
# ──────────────────────────────────────────────────────────────────────────────
echo "Variables set:"
printf " GIT_REPO     = %s\n CLONE_DIR   = %s\n OPENRC_FILE = %s\n TF_SUFFIX   = %s\n TF_KEYPAIR  = %s\n" \
       "$GIT_REPO" "$CLONE_DIR" "$OPENRC_FILE" "$TF_SUFFIX" "$TF_KEYPAIR"

sudo apt-get install -y git python3-pip
pip install --user --upgrade python-openstackclient

# %%bash
set -e
pip install --user --upgrade --no-cache-dir python-openstackclient
if ! grep -q '.local/bin' ~/.bashrc; then
  echo 'export PATH=$HOME/.local/bin:$PATH' >> ~/.bashrc
fi
export PATH=$HOME/.local/bin:$PATH
openstack --version

set -e
if [ ! -d "$CLONE_DIR/.git" ]; then
  git clone --recurse-submodules "$GIT_REPO" "$CLONE_DIR"
else
  echo "Repo exists; stashing and pulling ..."
  git -C "$CLONE_DIR" stash push -m "Auto-stash before pull"
  git -C "$CLONE_DIR" pull --ff-only
fi


# %%bash
set -e
if [ ! -f "$OPENRC_FILE" ]; then
  echo "ERROR: $OPENRC_FILE not found" >&2
  exit 1
fi
unset $(env | grep -oE '^OS_[A-Z_]+')    # clean old vars
source "$OPENRC_FILE"
openstack token issue >/dev/null && echo "✅  Auth OK"

pip install --user kubernetes openshift
export PATH="$HOME/.local/bin:$PATH"

# from the same shell where ansible-playbook runs:
ansible-galaxy collection install kubernetes.core

Hit:1 https://apt.releases.hashicorp.com jammy InRelease
Hit:2 http://archive.ubuntu.com/ubuntu jammy InRelease                       
Hit:3 http://archive.ubuntu.com/ubuntu jammy-updates InRelease               
Hit:4 http://archive.ubuntu.com/ubuntu jammy-backports InRelease             
Hit:5 http://security.ubuntu.com/ubuntu jammy-security InRelease
                        Reading package lists... 0%Reading package lists... 0%Reading package lists... 0%Reading package lists... 4%Reading package lists... 4%Reading package lists... 4%Reading package lists... 4%Reading package lists... 14%Reading package lists... 35%Reading package lists... 45%Reading package lists... 45%Reading package lists... 46%Reading package lists... 46%Reading package lists... 52%Reading package lists... 55%Reading package lists... 55%Reading package lists... 68%Reading package lists... 68%Reading package lists... 73%Reading package lists... 73%Reading package lists... 73%Reading package lists... 73%Reading pac

In [34]:
# Delete Instance

# ls
# cd /work/MLops-Final-Project/infrastructure/tf/kvm
# terraform destroy -auto-approve
# terraform state list
# terraform state rm \
#   'openstack_compute_instance_v2.nodes["node1"]' \
#   'openstack_compute_instance_v2.nodes["node2"]' \
#   'openstack_compute_instance_v2.nodes["node3"]'
# |



In [39]:
# %%bash
set -e
cd "$CLONE_DIR/infrastructure/tf/kvm"

# generate a local tfvars file so commands stay short
cat > local.auto.tfvars <<EOF
suffix = "${TF_SUFFIX}"
key    = "${TF_KEYPAIR}"
EOF

terraform init -input=false
terraform plan -out apply.plan
terraform apply -auto-approve apply.plan

# 3) capture just the one Floating IP
floating_ip=$(terraform output -raw floating_ip_out)

# 4) print it (and also save to a file if you want)
echo "🎉 Terraform finished. Floating IP: $floating_ip"
echo "$floating_ip" > last_floating_ip.txt


# %%bash
CONTROL_IP=$floating_ip           # ← leave as is
SSH_KEY=~/.ssh/id_rsa

cat > /work/MLops-Final-Project/infrastructure/ansible/inventory.ini <<EOF
[control]
node1 ansible_host=${CONTROL_IP}

[workers]
node2 ansible_host=192.168.1.12
node3 ansible_host=192.168.1.13

[all:vars]
ansible_user=cc
ansible_ssh_private_key_file=${SSH_KEY}
ansible_ssh_common_args='-o StrictHostKeyChecking=no'

[workers:vars]
ansible_ssh_common_args='-o StrictHostKeyChecking=no -o ProxyCommand="ssh -i ${SSH_KEY} -o StrictHostKeyChecking=no -W %h:%p cc@${CONTROL_IP}"'
EOF

cat /work/MLops-Final-Project/infrastructure/ansible/inventory.ini

# %%bash
ANSIBLE_DIR=/work/MLops-Final-Project/infrastructure/ansible
SSH_KEY=~/.ssh/id_rsa

mkdir -p "$ANSIBLE_DIR"

cat > "$ANSIBLE_DIR/ansible.cfg" <<EOF
[defaults]
inventory            = ./inventory.ini
remote_user          = cc
private_key_file     = ~/.ssh/id_rsa
host_key_checking    = False
retry_files_enabled  = False
stdout_callback      = yaml
any_errors_fatal     = False

[ssh_connection]
ssh_args = -o StrictHostKeyChecking=off \\
           -o UserKnownHostsFile=/dev/null \\
           -o ForwardAgent=yes \\
           -o ProxyCommand="ssh -o StrictHostKeyChecking=no \\
                             -o UserKnownHostsFile=/dev/null \\
                             -W %h:%p cc@${floating_ip}"
pipelining = True
EOF

echo "Written $ANSIBLE_DIR/ansible.cfg:"
cat "$ANSIBLE_DIR/ansible.cfg"


echo "Installing python3-six, python3-setuptools on all nodes …"
ansible -i $ANSIBLE_DIR/inventory.ini all \
  -m raw \
  -a 'sudo DEBIAN_FRONTEND=noninteractive apt-get update -y &&
      sudo DEBIAN_FRONTEND=noninteractive apt-get install -y \
      python3 python3-apt python3-pip python3-six python3-setuptools\
      python3-netaddr '\
  --become -vv

# 1) Remove the apt copy (keeps only python libs on the targets)
sudo apt-get remove -y ansible-core

# 2) Install 2.15.x in your user env (control plane only)
pip install --user --upgrade "ansible-core<2.16"   # installs 2.15.5 today
export PATH=$HOME/.local/bin:$PATH
ansible --version    # should say 2.15.x

export ANSIBLE_CONFIG=/work/MLops-Final-Project/infrastructure/ansible/ansible.cfg
cd /work/MLops-Final-Project/infrastructure/ansible

# fast check
ansible all -i inventory.ini -m ping

# full run
ansible-playbook -i inventory.yml general/hello_host.yml

[0m[1mInitializing the backend...[0m
[0m[1mInitializing provider plugins...[0m
- Reusing previous version of terraform-provider-openstack/openstack from the dependency lock file
- Using previously-installed terraform-provider-openstack/openstack v1.51.1

[0m[1m[32mTerraform has been successfully initialized![0m[32m[0m
[0m[32m
You may now begin working with Terraform. Try running "terraform plan" to see
any changes that are required for your infrastructure. All Terraform commands
should now work.

If you ever set or change modules or backend configuration for Terraform,
rerun this command to reinitialize your working directory. If you forget, other
commands will detect it and remind you to do so if necessary.[0m
[0m[1mdata.openstack_networking_subnet_v2.sharednet2_subnet: Reading...[0m[0m
[0m[1mdata.openstack_networking_network_v2.sharednet2: Reading...[0m[0m
[0m[1mdata.openstack_networking_secgroup_v2.allow_ssh: Reading...[0m[0m
[0m[1mdata.openstack_network

In [43]:
cd /work/MLops-Final-Project/infrastructure/ansible
ansible-playbook -i inventory.yml pre_k8s/pre_k8s_configure.yml

has been superseded by the the option `result_format=yaml` in callback plugin 
ansible.builtin.default from ansible-core 2.13 onwards. This feature will be 

PLAY [Disable and Mask firewalld] **********************************************

TASK [Gathering Facts] *********************************************************
ok: [node2]
ok: [node3]
ok: [node1]

TASK [Stop firewalld service] **************************************************
ok: [node1]
ok: [node3]
ok: [node2]

TASK [Mask firewalld service] **************************************************
changed: [node1]
changed: [node2]
changed: [node3]

PLAY [Set up insecure registry for Docker] *************************************

TASK [Gathering Facts] *********************************************************
ok: [node2]
ok: [node1]
ok: [node3]

TASK [Ensure /etc/docker directory exists] *************************************
ok: [node2]
ok: [node1]
ok: [node3]

TASK [Create /etc/docker/daemon.json if not exists] *********************

In [12]:
# cd /work/MLops-Final-Project
# git submodule add \
#   https://github.com/kubernetes-sigs/kubespray.git \
#   infrastructure/ansible/k8s/inventory/mycluster/kubespray



In [13]:
# git mv \
#   infrastructure/ansible/k8s/inventory/mycluster/kubespray \
#   infrastructure/ansible/k8s/

In [41]:
python3 -m venv ~/.venvs/kubespray
source ~/.venvs/kubespray/bin/activate
pip install --upgrade "ansible-core>=2.16.4,<2.17.0"

ansible-galaxy collection install kubernetes.core community.general
# from your local shell (not inside a playbook cell)
ansible-galaxy collection install ansible.posix
ansible-galaxy collection install ansible.utils
pip install netaddr ipaddress
ansible-galaxy collection install community.crypto
pip install jmespath






[notice] A new release of pip available: 22.3.1 -> 25.1.1
[notice] To update, run: pip install --upgrade pip
Starting galaxy collection install process
Nothing to do. All requested collections are already installed. If you want to reinstall them, consider using `--force`.
Starting galaxy collection install process
Nothing to do. All requested collections are already installed. If you want to reinstall them, consider using `--force`.
Starting galaxy collection install process
Nothing to do. All requested collections are already installed. If you want to reinstall them, consider using `--force`.

[notice] A new release of pip available: 22.3.1 -> 25.1.1
[notice] To update, run: pip install --upgrade pip
Starting galaxy collection install process
Nothing to do. All requested collections are already installed. If you want to reinstall them, consider using `--force`.

[notice] A new release of pip available: 22.3.1 -> 25.1.1
[notice] To update, run: pip install --upgrade pip


In [111]:
# cd /work/MLops-Final-Project/infrastructure/ansible
# cat > k8s/pre_kubespray_cleanup.yml <<EOF
# ---
# - name: Remove old Docker APT list and keyring
#   hosts: all
#   become: yes
#   tasks:
#     - name: Delete any old docker.list
#       file:
#         path: /etc/apt/sources.list.d/docker.list
#         state: absent

#     - name: Delete any old Docker keyring
#       file:
#         path: /usr/share/keyrings/docker-archive-keyring.gpg
#         state: absent
# EOF


In [127]:
# cd /work/MLops-Final-Project/infrastructure/ansible
# ansible all -i k8s/inventory/mycluster \
#   -m raw -a 'sudo apt-get remove -y containerd.io docker-ce docker-ce-cli' \
#   --become

node2 | CHANGED | rc=0 >>
Reading package lists... 0%Reading package lists... 100%Reading package lists... Done
Building dependency tree... 0%Building dependency tree... 0%Building dependency tree... 50%Building dependency tree... 50%Building dependency tree... Done
Reading state information... 0% Reading state information... 0%Reading state information... Done
The following packages will be REMOVED:
  containerd.io docker-ce docker-ce-cli
0 upgraded, 0 newly installed, 3 to remove and 136 not upgraded.
After this operation, 249 MB disk space will be freed.
(Reading database ... 128740 files and directories currently installed.)
Removing docker-ce (5:28.0.2-1~ubuntu.24.04~noble) ...
Removing containerd.io (1.6.32-1) ...
Removing docker-ce-cli (5:28.0.2-1~ubuntu.24.04~noble) ...
Processing triggers for man-db (2.12.0-4build2) ...
Connection to 192.168.1.12 closed.

node3 | CHANGED | rc=0 >>
Reading package lists... 0%Reading package lists... 100%Reading package lists... Done
Building de

In [144]:
# pwd
# cd /work/MLops-Final-Project/infrastructure/ansible


# ansible-playbook \
#   -i k8s/inventory/mycluster \
#   --become k8s/pre_kubespray_cleanup.yml

/work/MLops-Final-Project/infrastructure/ansible/k8s/kubespray
has been superseded by the the option `result_format=yaml` in callback plugin 
ansible.builtin.default from ansible-core 2.13 onwards. This feature will be 

PLAY [Remove old Docker APT list and keyring] **********************************

TASK [Gathering Facts] *********************************************************
ok: [node3]
ok: [node2]
ok: [node1]

TASK [Delete any old docker.list] **********************************************
ok: [node3]
ok: [node1]
ok: [node2]

TASK [Delete any old Docker keyring] *******************************************
ok: [node1]
ok: [node2]
ok: [node3]

PLAY RECAP *********************************************************************
node1                      : ok=3    changed=0    unreachable=0    failed=0    skipped=0    rescued=0    ignored=0   
node2                      : ok=3    changed=0    unreachable=0    failed=0    skipped=0    rescued=0    ignored=0   
node3                     

In [42]:


export ANSIBLE_CONFIG=/work/MLops-Final-Project/infrastructure/ansible/ansible.cfg
export ANSIBLE_ROLES_PATH=roles
cd /work/MLops-Final-Project/infrastructure/ansible/k8s/kubespray
ansible-playbook -i ../inventory/mycluster --become --become-user=root ./cluster.yml

Project/infrastructure/ansible/k8s/kubespray/roles/bootstrap-os/tasks/main.yml,
line 29, column 7, found a duplicate dict key (paths). Using last defined value
only.
has been superseded by the the option `result_format=yaml` in callback plugin 
ansible.builtin.default from ansible-core 2.13 onwards. This feature will be 

PLAY [Check Ansible version] ***************************************************

TASK [Check 2.16.4 <= Ansible version < 2.17.0] ********************************
ok: [node1] => changed=false 
  msg: All assertions passed

TASK [Check that python netaddr is installed] **********************************
ok: [node1] => changed=false 
  msg: All assertions passed

TASK [Check that jinja is not too old (install via pip)] ***********************
ok: [node1] => changed=false 
  msg: All assertions passed

PLAY [Define groups for legacy less structured inventories] ********************

TASK [Match needed groups by their old names or definition] ********************
changed:

In [150]:
# deactivate

In [44]:
cd /work/MLops-Final-Project/infrastructure/ansible
ansible-playbook -i inventory.yml post_k8s/post_k8s_configure.yml

has been superseded by the the option `result_format=yaml` in callback plugin 
ansible.builtin.default from ansible-core 2.13 onwards. This feature will be 

PLAY [Post-Install kubectl Setup] **********************************************

TASK [Gathering Facts] *********************************************************
ok: [node2]
ok: [node1]

TASK [Ensure .kube directory exists] *******************************************
ok: [node2]
ok: [node1]

TASK [Copy admin.conf to user's kubeconfig] ************************************
ok: [node1]
ok: [node2]

TASK [Run kubectl get nodes as cc] *********************************************
changed: [node1]
changed: [node2]

TASK [Show kubectl get nodes output] *******************************************
ok: [node1] => 
  msg:
  - NAME    STATUS   ROLES           AGE   VERSION
  - node1   Ready    control-plane   24h   v1.32.4
  - node2   Ready    control-plane   24h   v1.32.4
  - node3   Ready    <none>          21h   v1.32.4
ok: [node2] => 
  

Configure ArgoCD

In [None]:
# source ~/.venvs/kubespray/bin/activate
# which ansible-playbook
# ansible-playbook --version

In [45]:
export PATH=/work/.local/bin:$PATH
export PYTHONUSERBASE=/work/.local
export ANSIBLE_CONFIG=/work/MLops-Final-Project/infrastructure/ansible/ansible.cfg
export ANSIBLE_ROLES_PATH=roles

In [46]:
cd /work/MLops-Final-Project/infrastructure/ansible

ansible-playbook -i inventory.yml argocd/argocd_add_platform.yml


has been superseded by the the option `result_format=yaml` in callback plugin 
ansible.builtin.default from ansible-core 2.13 onwards. This feature will be 

PLAY [Deploy MLflow platform via ArgoCD & Helm with MinIO secret handling] *****

TASK [Gathering Facts] *********************************************************
ok: [node1]

TASK [Get ArgoCD admin password from Kubernetes secret] ************************
changed: [node1]

TASK [Decode ArgoCD admin password] ********************************************
changed: [node1]

TASK [Log in to ArgoCD] ********************************************************
ok: [node1]

TASK [Add repository to ArgoCD] ************************************************
changed: [node1]

TASK [Detect external IP starting with 10.56] **********************************
ok: [node1]

TASK [Ensure imps-platform namespace exists] ***********************************
ok: [node1]

TASK [Create imps-platform namespace if missing] *******************************
skippi

Once the platform is deployed, we can open:

MinIO object store on http://A.B.C.D:9001 (substitute your own floating IP) - log in with the access key and secret printed by the playbook above. Our model artifacts will be stored here once we start generating them.
MLFlow model registry on http://A.B.C.D:8000 (substitute your own floating IP), and click on the “Models” tab.


In [52]:

cd /work/MLops-Final-Project/infrastructure/ansible
ansible-playbook -i inventory.yml argocd/workflow_build_init.yml

has been superseded by the the option `result_format=yaml` in callback plugin 
ansible.builtin.default from ansible-core 2.13 onwards. This feature will be 

PLAY [Run Argo Workflow from GitHub Repo] **************************************

TASK [Gathering Facts] *********************************************************
ok: [node1]

TASK [Clone or update IMPS repo] ***********************************************
changed: [node1]

PLAY [Run Argo Workflow from GitHub Repo] **************************************

TASK [Gathering Facts] *********************************************************
ok: [node1]

TASK [Clone or update IMPS repo] ***********************************************
ok: [node1]

TASK [Submit Argo Workflow] ****************************************************
changed: [node1]

TASK [Extract Workflow Name] ***************************************************
ok: [node1]

TASK [Wait for workflow to complete (success or fail)] *************************
changed: [node1]

TASK 

: 2

In [215]:

cd /work/MLops-Final-Project/infrastructure/ansible
ansible-playbook -i inventory.yml argocd/argocd_add_staging.yml

has been superseded by the the option `result_format=yaml` in callback plugin 
ansible.builtin.default from ansible-core 2.13 onwards. This feature will be 

PLAY [Deploy imps Staging via ArgoCD & Helm] ***********************************

TASK [Gathering Facts] *********************************************************
ok: [node1]

TASK [Get ArgoCD admin password] ***********************************************
changed: [node1]

TASK [Decode ArgoCD password] **************************************************
changed: [node1]

TASK [Login to ArgoCD] *********************************************************
ok: [node1]

TASK [Detect external IP starting with 10.56] **********************************
ok: [node1]

TASK [Create imps-staging namespace if missing] ********************************
changed: [node1]

TASK [Check if ArgoCD app exists] **********************************************
ok: [node1]

TASK [Create ArgoCD Helm app if not exists] ************************************
change

In [216]:




# runs in Chameleon Jupyter environment
cd /work/MLops-Final-Project/infrastructure/ansible
ansible-playbook -i inventory.yml argocd/argocd_add_canary.yml

# runs in Chameleon Jupyter environment
cd /work/MLops-Final-Project/infrastructure/ansible
ansible-playbook -i inventory.yml argocd/argocd_add_prod.yml



has been superseded by the the option `result_format=yaml` in callback plugin 
ansible.builtin.default from ansible-core 2.13 onwards. This feature will be 

PLAY [Deploy imps Canary via ArgoCD & Helm] ************************************

TASK [Gathering Facts] *********************************************************
ok: [node1]

TASK [Get ArgoCD admin password] ***********************************************
changed: [node1]

TASK [Decode ArgoCD password] **************************************************
changed: [node1]

TASK [Login to ArgoCD] *********************************************************
ok: [node1]

TASK [Detect external IP starting with 10.56] **********************************
ok: [node1]

TASK [Create imps-canary namespace if missing] *********************************
changed: [node1]

TASK [Check if ArgoCD app exists] **********************************************
ok: [node1]

TASK [Create ArgoCD Helm app if not exists] ************************************
change


Test your staging, canary, and production deployments - we have put them on different ports. For now, they are all running exactly the same model!

Visit http://A.B.C.D:8081 (substituting the value of your floating IP) to test the staging service
Visit http://A.B.C.D:8080 (substituting the value of your floating IP) to test the canary service
Visit http://A.B.C.D (substituting the value of your floating IP) to test the production service
At this point, you can also revisit the dashboards you opened earlier:

In the Kubernetes dashboard, you can switch between namespaces to see the different applications that we have deployed.
On the ArgoCD dashboard, you can see the four applications that ArgoCD is managing, and their sync status.

In [50]:
# runs in Chameleon Jupyter environment
cd /work/MLops-Final-Project/infrastructure/ansible
ansible-playbook -i inventory.yml argocd/workflow_templates_apply.yml

has been superseded by the the option `result_format=yaml` in callback plugin 
ansible.builtin.default from ansible-core 2.13 onwards. This feature will be 

PLAY [Clone repo and apply specific Argo WorkflowTemplates] ********************

TASK [Gathering Facts] *********************************************************
ok: [node1]

TASK [Clone or update IMPS repo] ***********************************************
ok: [node1]

TASK [Apply selected WorkflowTemplates to Argo namespace] **********************
failed: [node1] (item=build-container-image.yaml) => changed=true 
  ansible_loop_var: item
  cmd:
  - kubectl
  - apply
  - -n
  - argo
  - -f
  - /tmp/app/workflows/build-container-image.yaml
  delta: '0:00:00.088792'
  end: '2025-05-11 22:34:15.318814'
  item: build-container-image.yaml
  msg: non-zero return code
  rc: 1
  start: '2025-05-11 22:34:15.230022'
  stderr: 'error: the path "/tmp/app/workflows/build-container-image.yaml" does not exist'
  stderr_lines: <omitted>
  stdout:

: 2

LifeCycle Part1