In [None]:
"""
This script downloads the processed datasets, trained models, and layer
archives needed for the lab.
"""

from pathlib import Path

# Project root is the parent of the current working directory (this assumes
# the notebook is launched from inside the notebooks folder).
PROJECT_ROOT = Path.cwd().parent

# One target directory per artifact family, all living under <root>/data.
local_data_dir = PROJECT_ROOT.joinpath("data", "processed")
local_model_dir = PROJECT_ROOT.joinpath("data", "models")
local_layer_dir = PROJECT_ROOT.joinpath("data", "layers")

# Create any missing directory (and its parents); existing ones are left alone.
for _target_dir in (local_data_dir, local_model_dir, local_layer_dir):
    _target_dir.mkdir(parents=True, exist_ok=True)

# Each mapping below pairs a local file name with the ID of a publicly shared
# Google Drive file (the IDs must match the shared links). Entry order is the
# order in which the files are processed.

# CSV datasets.
data_files = {
    "feature_engineered_eval.csv": "1fH2TXIyuK_ZlMgT4tjHGGHTdZxv3z6rv",
    "feature_engineered_holdout.csv": "12H0hyDcIXOyrNNTctWxFge72WqWInji9",
    "feature_engineered_train.csv": "10RsZsdGmFzNbFRByyEwnxP1kGxBvRUm6",
    "cleaning_holdout.csv": "1wtnm3vTOlUPOTzAqALANecJdJ0NqzY6e",
}

# Pickled models and encoders.
model_files = {
    "lgbm_model.pkl": "1YzOpqBNc4nDwS04GlUfZDgO1T41wLauT",
    "lgbm_best_model.pkl": "1uuT8Mjoj5QKhm3yV6zByODVWwnlGb3Fi",
    "freq_encoder.pkl": "1qgPiDFul9xXhNliVDoftTsKv6baAjaLL",
    "target_encoder.pkl": "12brw0d1A3zmGwdxPPFukJUKXoxbZaYSx",
}

# Zip archives stored in the layers directory.
layer_files = {
    "core-layer.zip": "14JBDCVMfJOr8vydbnEazBh2P4_ZY0s7X",
    "ml-layer.zip": "1l39v2VcZseQMrwGXONdAywh2e2nINgI7",
    "src.zip": "1ttd5YTVSsdvdFaA27toQd5jny2IewYUt",
}

# Simple downloader for public Drive files using gdown
import sys, subprocess
import gdown

def _download_missing(files, target_dir):
    """Download every file in *files* that is not already in *target_dir*.

    Args:
        files: Mapping of local file name to public Google Drive file ID.
        target_dir: ``pathlib.Path`` of the directory the files are saved in.

    Returns:
        List of file names whose ``gdown.download`` call returned ``None``.
    """
    failed = []
    for filename, file_id in files.items():
        destination = target_dir / filename
        # A file that is already on disk is never downloaded again.
        if destination.exists():
            print(f"⚠️ Skipping {filename}, already exists at {destination}")
            continue
        url = f"https://drive.google.com/uc?id={file_id}&export=download"
        print(f"⬇️ Downloading {filename} to {destination}")
        # Some gdown releases report a failed download by returning None
        # instead of raising; record those so the failure is not silent.
        if gdown.download(url, str(destination), quiet=False) is None:
            failed.append(filename)
    return failed


# Every artifact group is processed exactly once, in this order.
failed_downloads = []
for _files, _target_dir in (
    (data_files, local_data_dir),
    (model_files, local_model_dir),
    (layer_files, local_layer_dir),
):
    failed_downloads.extend(_download_missing(_files, _target_dir))

if failed_downloads:
    print(f"❌ Download step incomplete. Failed: {', '.join(failed_downloads)}")
else:
    print('✅ Download step complete. Check', local_data_dir)


⬇️ Downloading feature_engineered_eval.csv to c:\Users\MATIAS\Desktop\git\regression-end-to-end-ml-project\phase-1\data\processed\feature_engineered_eval.csv


Downloading...
From: https://drive.google.com/uc?id=1fH2TXIyuK_ZlMgT4tjHGGHTdZxv3z6rv&export=download
To: c:\Users\MATIAS\Desktop\git\regression-end-to-end-ml-project\phase-1\data\processed\feature_engineered_eval.csv
100%|██████████| 46.7M/46.7M [00:15<00:00, 2.94MB/s]


⬇️ Downloading feature_engineered_holdout.csv to c:\Users\MATIAS\Desktop\git\regression-end-to-end-ml-project\phase-1\data\processed\feature_engineered_holdout.csv


Downloading...
From: https://drive.google.com/uc?id=12H0hyDcIXOyrNNTctWxFge72WqWInji9&export=download
To: c:\Users\MATIAS\Desktop\git\regression-end-to-end-ml-project\phase-1\data\processed\feature_engineered_holdout.csv
100%|██████████| 46.8M/46.8M [00:18<00:00, 2.55MB/s]


⬇️ Downloading feature_engineered_train.csv to c:\Users\MATIAS\Desktop\git\regression-end-to-end-ml-project\phase-1\data\processed\feature_engineered_train.csv


Downloading...
From (original): https://drive.google.com/uc?id=10RsZsdGmFzNbFRByyEwnxP1kGxBvRUm6&export=download
From (redirected): https://drive.google.com/uc?id=10RsZsdGmFzNbFRByyEwnxP1kGxBvRUm6&export=download&confirm=t&uuid=40c71af5-ab78-463d-b84a-faa72277e543
To: c:\Users\MATIAS\Desktop\git\regression-end-to-end-ml-project\phase-1\data\processed\feature_engineered_train.csv
100%|██████████| 179M/179M [01:06<00:00, 2.69MB/s] 


⬇️ Downloading cleaning_holdout.csv to c:\Users\MATIAS\Desktop\git\regression-end-to-end-ml-project\phase-1\data\processed\cleaning_holdout.csv


Downloading...
From: https://drive.google.com/uc?id=1wtnm3vTOlUPOTzAqALANecJdJ0NqzY6e&export=download
To: c:\Users\MATIAS\Desktop\git\regression-end-to-end-ml-project\phase-1\data\processed\cleaning_holdout.csv
100%|██████████| 52.3M/52.3M [00:16<00:00, 3.11MB/s]


⬇️ Downloading lgbm_model.pkl to c:\Users\MATIAS\Desktop\git\regression-end-to-end-ml-project\phase-1\data\models\lgbm_model.pkl


Downloading...
From: https://drive.google.com/uc?id=1YzOpqBNc4nDwS04GlUfZDgO1T41wLauT&export=download
To: c:\Users\MATIAS\Desktop\git\regression-end-to-end-ml-project\phase-1\data\models\lgbm_model.pkl
100%|██████████| 3.49M/3.49M [00:01<00:00, 3.06MB/s]


⬇️ Downloading lgbm_best_model.pkl to c:\Users\MATIAS\Desktop\git\regression-end-to-end-ml-project\phase-1\data\models\lgbm_best_model.pkl


Downloading...
From: https://drive.google.com/uc?id=1uuT8Mjoj5QKhm3yV6zByODVWwnlGb3Fi&export=download
To: c:\Users\MATIAS\Desktop\git\regression-end-to-end-ml-project\phase-1\data\models\lgbm_best_model.pkl
100%|██████████| 7.57M/7.57M [00:02<00:00, 3.18MB/s]


⬇️ Downloading freq_encoder.pkl to c:\Users\MATIAS\Desktop\git\regression-end-to-end-ml-project\phase-1\data\models\freq_encoder.pkl


Downloading...
From: https://drive.google.com/uc?id=1qgPiDFul9xXhNliVDoftTsKv6baAjaLL&export=download
To: c:\Users\MATIAS\Desktop\git\regression-end-to-end-ml-project\phase-1\data\models\freq_encoder.pkl
100%|██████████| 879/879 [00:00<00:00, 878kB/s]


⬇️ Downloading target_encoder.pkl to c:\Users\MATIAS\Desktop\git\regression-end-to-end-ml-project\phase-1\data\models\target_encoder.pkl


Downloading...
From: https://drive.google.com/uc?id=12brw0d1A3zmGwdxPPFukJUKXoxbZaYSx&export=download
To: c:\Users\MATIAS\Desktop\git\regression-end-to-end-ml-project\phase-1\data\models\target_encoder.pkl
100%|██████████| 2.70k/2.70k [00:00<00:00, 337kB/s]


⚠️ Skipping lgbm_model.pkl, already exists at c:\Users\MATIAS\Desktop\git\regression-end-to-end-ml-project\phase-1\data\models\lgbm_model.pkl
⚠️ Skipping lgbm_best_model.pkl, already exists at c:\Users\MATIAS\Desktop\git\regression-end-to-end-ml-project\phase-1\data\models\lgbm_best_model.pkl
⚠️ Skipping freq_encoder.pkl, already exists at c:\Users\MATIAS\Desktop\git\regression-end-to-end-ml-project\phase-1\data\models\freq_encoder.pkl
⚠️ Skipping target_encoder.pkl, already exists at c:\Users\MATIAS\Desktop\git\regression-end-to-end-ml-project\phase-1\data\models\target_encoder.pkl
⬇️ Downloading core-layer.zip to c:\Users\MATIAS\Desktop\git\regression-end-to-end-ml-project\phase-1\data\layers\core-layer.zip


Downloading...
From (original): https://drive.google.com/uc?id=14JBDCVMfJOr8vydbnEazBh2P4_ZY0s7X&export=download
From (redirected): https://drive.google.com/uc?id=14JBDCVMfJOr8vydbnEazBh2P4_ZY0s7X&export=download&confirm=t&uuid=e259a157-4b63-4ea7-bc89-c2f496c85511
To: c:\Users\MATIAS\Desktop\git\regression-end-to-end-ml-project\phase-1\data\layers\core-layer.zip
100%|██████████| 28.2M/28.2M [00:09<00:00, 3.10MB/s]


⬇️ Downloading ml-layer.zip to c:\Users\MATIAS\Desktop\git\regression-end-to-end-ml-project\phase-1\data\layers\ml-layer.zip


Downloading...
From (original): https://drive.google.com/uc?id=1l39v2VcZseQMrwGXONdAywh2e2nINgI7&export=download
From (redirected): https://drive.google.com/uc?id=1l39v2VcZseQMrwGXONdAywh2e2nINgI7&export=download&confirm=t&uuid=ea1024ff-1f09-4cc6-91f0-c21768e2f0d2
To: c:\Users\MATIAS\Desktop\git\regression-end-to-end-ml-project\phase-1\data\layers\ml-layer.zip
100%|██████████| 40.9M/40.9M [00:12<00:00, 3.36MB/s]


⬇️ Downloading src.zip to c:\Users\MATIAS\Desktop\git\regression-end-to-end-ml-project\phase-1\data\layers\src.zip


Downloading...
From: https://drive.google.com/uc?id=1ttd5YTVSsdvdFaA27toQd5jny2IewYUt&export=download
To: c:\Users\MATIAS\Desktop\git\regression-end-to-end-ml-project\phase-1\data\layers\src.zip
100%|██████████| 41.9k/41.9k [00:00<00:00, 912kB/s]

✅ Download step complete. Check c:\Users\MATIAS\Desktop\git\regression-end-to-end-ml-project\phase-1\data\processed



