# Lokal laden

In [None]:
import pandas as pd
import requests
import io
import os
import time
import gdown

# Maps each dataset name to the file ID that make_drive_url turns into a
# Google Drive download URL. The loading loop saves dataset <name> as
# "<name>.csv" and registers its DataFrame as "df_<name>".
file_ids = {
    "holiday_events": "1RMjSuqHXHTwAw_PGD5XVjhA3agaAGHDH",
    "items": "1ogMRixVhNY6XOJtIRtkRllyOyzw1nqya",
    "oil": "1Q59vk2v4WQ-Rpc9t2nqHcsZM3QWGFje_",
    "stores": "1Ei0MUXmNhmOcmrlPad8oklnFEDM95cDi",
    "transactions": "1PW5LnAEAiL43fI5CRDn_h6pgDG5rtBW_",
    "train": "1oEX8NEJPY7wPmSJ0n7lO1JUFYyZjFBRv"
}

# Path to the locally stored large file.
# NOTE(review): not referenced by the code visible here; the loader builds its
# paths from DOWNLOAD_DIR instead.
LOCAL_TRAIN_FILE_PATH = "Sempel_data/train.csv"
# Row cap applied when reading the "train" dataset.
N_ROWS_TO_LOAD = 2_000_000
# Directory that downloads are written to and looked up in.
DOWNLOAD_DIR = "Sempel_data"

# Build the direct download URL from a file ID
def make_drive_url(file_id):
    """Return the direct-download URL for the given Google Drive file ID."""
    url_template = "https://drive.google.com/uc?id={}"
    return url_template.format(file_id)
def download_file_if_missing(file_name, file_id):
    """Return the local path of ``file_name``, downloading it with gdown if absent.

    Args:
        file_name: Name of the file inside ``DOWNLOAD_DIR``.
        file_id: ID handed to ``make_drive_url`` to build the download URL.

    Returns:
        The local path when the file already exists or was downloaded, or
        ``None`` when the download failed (the error is printed, not raised).
    """
    local_path = os.path.join(DOWNLOAD_DIR, file_name)

    if os.path.exists(local_path):
        print(f"Datei '{file_name}' existiert bereits lokal. Überspringe Download.")
        return local_path

    print(f"Lade '{file_name}' von Google Drive (ID: {file_id})...")
    url = make_drive_url(file_id)

    try:
        # exist_ok avoids the check-then-create race of a separate exists() test.
        os.makedirs(DOWNLOAD_DIR, exist_ok=True)
        downloaded = gdown.download(url, local_path, quiet=False)
    except Exception as e:
        # Deliberate best-effort: report the problem and let the caller skip
        # this file instead of aborting the whole run.
        print(f"SCHWERER FEHLER beim Download von '{file_name}' mit gdown: {e}")
        return None

    # Some gdown versions report a failed download by returning None rather
    # than raising, so confirm a file really arrived before claiming success.
    if downloaded is None or not os.path.exists(local_path):
        print(f"SCHWERER FEHLER beim Download von '{file_name}' mit gdown: keine Datei heruntergeladen.")
        return None

    print(f"Download von '{file_name}' abgeschlossen.")
    return local_path

# Helper function to load a CSV from a local path
def load_csv_from_local(local_path, name, nrows=None):
    """Read a local CSV file into a DataFrame, reporting progress on stdout.

    Args:
        local_path: Location of the CSV file on disk.
        name: Label used in the progress and error messages.
        nrows: Optional cap on the number of rows to read; ``None`` reads all.

    Returns:
        The loaded DataFrame, or ``None`` if reading failed (the error is
        printed rather than raised).
    """
    print(f"Lade '{name}' (lokal)...")
    started = time.time()
    try:
        frame = pd.read_csv(local_path, nrows=nrows, low_memory=False)
        # Whole seconds are enough for the progress message.
        elapsed_seconds = int(time.time() - started)
        row_count = len(frame)
        print(f"'{name}' erfolgreich geladen. Zeilen: {row_count:,}. Zeit: {elapsed_seconds}s.")
    except Exception as err:
        print(f"FEHLER beim Laden der lokalen CSV-Datei '{name}': {err}")
        frame = None
    return frame

if __name__ == '__main__':

    # gdown is required for every download below; stop early with a clear hint
    # and a non-zero exit status when it is not installed.
    try:
        import gdown
    except ImportError:
        print("FEHLER: Das 'gdown'-Paket ist nicht installiert.")
        print("Bitte führen Sie im Terminal aus: pip install gdown requests")
        raise SystemExit(1)

    dataframes = {}
    failed = []  # dataset names that could not be downloaded or parsed

    # 1. Download whatever is missing, then load every dataset
    print("--- 1. Download fehlender und Laden aller DataFrames ---")

    for name, file_id in file_ids.items():
        local_path = download_file_if_missing(f"{name}.csv", file_id)
        if local_path is None:
            failed.append(name)
            continue

        # Only the "train" dataset is row-capped (see N_ROWS_TO_LOAD); every
        # other file is read in full.
        row_limit = N_ROWS_TO_LOAD if name == "train" else None
        df = load_csv_from_local(local_path, name, nrows=row_limit)
        if df is None:
            failed.append(name)
            continue

        dataframes[f"df_{name}"] = df

    # 2. Summary and assignment to individual variables
    if dataframes:
        # Only claim that everything loaded when nothing was skipped.
        if failed:
            print(f"{len(dataframes)} von {len(file_ids)} DataFrames geladen (fehlgeschlagen: {', '.join(failed)}):")
        else:
            print("Alle DataFrames erfolgreich geladen:")
        # Expose each frame as a module-level variable named df_<dataset>.
        globals().update(dataframes)
        for name, df in dataframes.items():
            print(f"- {name}: {df.shape[0]:,} Zeilen, {df.shape[1]} Spalten.")
    else:
        print("Keine DataFrames konnten geladen werden. Bitte prüfen Sie die Fehlermeldungen oben.")


# Laden von Google Drive, um mit Colab zu arbeiten

In [None]:
import pandas as pd
import requests
import io

# Build the direct download URL from a file ID
def make_drive_url(file_id):
    """Return the direct-download URL for the given Google Drive file ID."""
    url_template = "https://drive.google.com/uc?id={}"
    return url_template.format(file_id)

# Helper function to load a CSV from a direct URL
def load_csv_from_url(url, timeout=60):
    """Download a CSV over HTTP and parse it into a DataFrame.

    Args:
        url: Address to fetch the CSV from.
        timeout: Seconds handed to ``requests.get`` so a stalled connection
            fails instead of blocking the notebook indefinitely.

    Returns:
        The parsed DataFrame.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: For other request failures, such as a
            timeout or connection error.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()  # Raises an error if the request fails
    # NOTE(review): response.text is decoded with the charset requests infers
    # for the response; confirm that matches the files' actual encoding.
    return pd.read_csv(io.StringIO(response.text))

# Maps each dataset name to the file ID that make_drive_url turns into a
# Google Drive download URL.
file_ids = {
    "holiday_events": "1RMjSuqHXHTwAw_PGD5XVjhA3agaAGHDH",
    "items": "1ogMRixVhNY6XOJtIRtkRllyOyzw1nqya",
    "oil": "1Q59vk2v4WQ-Rpc9t2nqHcsZM3QWGFje_",
    "stores": "1Ei0MUXmNhmOcmrlPad8oklnFEDM95cDi",
    "train": "1oEX8NEJPY7wPmSJ0n7lO1JUFYyZjFBRv",
    "transactions": "1PW5LnAEAiL43fI5CRDn_h6pgDG5rtBW_"
}

# Fetch each of the smaller datasets over HTTP and bind it to its own
# df_<name> variable.
df_holiday_events = load_csv_from_url(make_drive_url(file_ids["holiday_events"]))
df_items          = load_csv_from_url(make_drive_url(file_ids["items"]))
df_oil            = load_csv_from_url(make_drive_url(file_ids["oil"]))
df_stores         = load_csv_from_url(make_drive_url(file_ids["stores"]))
# df_train is intentionally not loaded here: the file is too big to be read
# through load_csv_from_url.
df_transactions   = load_csv_from_url(make_drive_url(file_ids["transactions"]))

# ---------------------

# Download the file using gdown
gdown.download(make_drive_url(file_ids["train"]), "train.csv", quiet=False)
!pip install -q "dask[dataframe]"

import gdown

# Use our existing function to build the download URL
train_url = make_drive_url(file_ids["train"])

# Download the file using gdown
gdown.download(train_url, "train.csv", quiet=False)