# Sourcing Dataset

In [3]:
import pandas as pd
import requests
import zipfile
import os
from pathlib import Path
import yaml

In [4]:
# Load the project configuration and read the dataset's download URL from it.
# The path is relative, so it resolves against the notebook's working directory.
link = Path("../config/config.yaml")

# Decode explicitly as UTF-8 so that parsing the YAML does not depend on the
# platform's default text encoding (which is not UTF-8 on every system).
with open(link, 'r', encoding="utf-8") as file:
    # safe_load builds only plain Python objects (dicts, lists, scalars).
    config = yaml.safe_load(file)

# URL of the ZIP archive, stored in the config under data -> source_url.
dataset_link = config['data']['source_url']
print(dataset_link)

https://ars.els-cdn.com/content/image/1-s2.0-S2352340918315191-mmc2.zip


In [5]:
# Download the ZIP file.
# The (connect, read) timeout stops the cell from hanging forever when the
# server stalls; the read value bounds the wait between received bytes,
# not the total download time.
response = requests.get(dataset_link, timeout=(10, 60))

# Fail immediately on an HTTP error (4xx/5xx) instead of writing the error
# page to disk as if it were the archive.
response.raise_for_status()

zip_path = Path("../data/raw/dataset.zip")

# Create directory if it doesn't exist
zip_path.parent.mkdir(parents=True, exist_ok=True)

# Save the ZIP file
with open(zip_path, 'wb') as f:
    f.write(response.content)

print(f"Downloaded ZIP file to {zip_path}")

Downloaded ZIP file to ..\data\raw\dataset.zip


In [6]:
# Pull the two CSV members (H1.csv, H2.csv) out of the downloaded archive.
wanted_members = ('H1.csv', 'H2.csv')

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    # Show everything the archive contains before extracting
    file_list = zip_ref.namelist()
    print("Files in ZIP:", file_list)

    for file_name in wanted_members:
        # A missing member is only reported; the loop carries on
        if file_name not in file_list:
            print(f"Warning: {file_name} not found in ZIP")
            continue

        zip_ref.extract(file_name, path="../data/raw/")
        print(f"Extracted {file_name}")

print("\nExtraction complete!")

Files in ZIP: ['H1.csv', '__MACOSX/', '__MACOSX/._H1.csv', 'H2.csv', '__MACOSX/._H2.csv']
Extracted H1.csv
Extracted H2.csv

Extraction complete!


In [7]:
# Read both extracted CSVs, stack them into one frame, and write all three
# frames back out as zstd-compressed parquet files.
h1_path = Path("../data/raw/H1.csv")
h2_path = Path("../data/raw/H2.csv")

# Read in the same order as the paths: H1 first, then H2
h1_df, h2_df = (pd.read_csv(csv_path) for csv_path in (h1_path, h2_path))

# Row-wise concatenation with a fresh 0..n-1 index
combined_df = pd.concat([h1_df, h2_df], ignore_index=True)

# Output file -> frame to store there
parquet_targets = {
    "../data/raw/H1.parquet": h1_df,
    "../data/raw/H2.parquet": h2_df,
    "../data/raw/combined.parquet": combined_df,
}
for target, frame in parquet_targets.items():
    frame.to_parquet(Path(target), compression="zstd", index=False)

print("Saved H1, H2, and combined datasets as parquet files.")

Saved H1, H2, and combined datasets as parquet files.


In [8]:
# Remove the downloaded archive and the extracted CSVs now that the parquet
# copies exist. Like os.remove, unlink raises if a file is already gone.
for leftover in (zip_path, h1_path, h2_path):
    leftover.unlink()

print("Deleted ZIP file and original CSV files to save space.")

Deleted ZIP file and original CSV files to save space.
