In [None]:
import pandas as pd

# Read the three raw CSV inputs from the relative "data" folder.
train = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")
destinations = pd.read_csv("data/destinations.csv")

In [None]:
# (rows, columns) of the training frame.
train.shape


In [None]:
# (rows, columns) of the test frame.
test.shape


In [None]:
# Preview the first five rows of the training frame.
train.head(5)


In [None]:
# Descriptive statistics of the training frame (pandas' default column selection).
train.describe()

In [None]:
# Descriptive statistics of the test frame (pandas' default column selection).
test.describe()

In [None]:
# EDA_and_Cleaning.ipynb (or .py)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# --- Adjust paths ---
DATA_DIR = Path("data")  # change if needed

# Every input file lives directly inside DATA_DIR.
train_path, test_path, dest_path, sample_path = (
    DATA_DIR / file_name
    for file_name in (
        "train.csv",
        "test.csv",
        "destinations.csv",
        "sample_submission.csv",
    )
)

In [None]:
# --- Load ---
# train and test are read with low_memory=False (dtype inference over the
# whole file instead of chunk by chunk); the other two files use the defaults.
dest = pd.read_csv(dest_path)
sample_sub = pd.read_csv(sample_path)
train, test = (
    pd.read_csv(csv_path, low_memory=False)
    for csv_path in (train_path, test_path)
)

In [None]:
# --- Basic overview ---
print(train.shape, test.shape, dest.shape)

# DataFrame.info() writes its report itself and returns None, so it must not
# be wrapped in print() (that would append a stray "None" line).
train.info()

# Ten columns with the highest share of missing values in train.
print(train.isna().mean().sort_values(ascending=False).head(10))

In [None]:
# --- Types + date columns ---
# Unparseable values become NaT instead of raising (errors="coerce").
date_columns = ("date_time", "srch_ci", "srch_co")
for frame in (train, test):
    for column in date_columns:
        frame[column] = pd.to_datetime(frame[column], errors='coerce')

In [None]:
# --- New features ---
# Every derived column is built on BOTH frames so that train and test expose
# the same feature set. Requires the three date columns to be datetime64
# already; the .dt accessor fails on plain strings.
for _frame in (train, test):
    # Whole days between check-in and check-out; NaN when either date is NaT.
    _frame["stay_nights"] = (_frame["srch_co"] - _frame["srch_ci"]).dt.days
    # Calendar parts of the search timestamp.
    _frame["search_month"] = _frame["date_time"].dt.month
    _frame["search_wday"] = _frame["date_time"].dt.dayofweek  # Monday=0 ... Sunday=6
    _frame["search_hour"] = _frame["date_time"].dt.hour

In [None]:
# --- Missing flags ---
# 1 where the origin-destination distance is absent, 0 otherwise.
for frame in (train, test):
    distance_is_missing = frame["orig_destination_distance"].isna()
    frame["missing_orig_dist"] = distance_is_missing.astype(int)

In [None]:
# --- Join destinations ---
import pandas as pd

# Load the destination features (adjust the path if needed).
# `train` and `test` are intentionally NOT re-read from disk here: reloading
# the raw CSVs would drop the parsed dates, `stay_nights`, the search_*
# columns and `missing_orig_dist` that were added to them beforehand.
dest = pd.read_csv("data/destinations.csv")


def _sample_and_join(frame, dest_features, frac=0.1, seed=42):
    """Sample `frame`, left-join the destination features and fill gaps.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame with a "srch_destination_id" column.
    dest_features : pandas.DataFrame
        Destination table keyed by "srch_destination_id"; must contain "d1".
    frac : float
        Share of rows to keep (default 0.1, i.e. 10 %).
    seed : int
        random_state passed to DataFrame.sample for reproducibility.

    Returns
    -------
    pandas.DataFrame
        The sampled, joined frame with a "dest_missing" flag and zero-filled
        numeric columns. A frame that already carries "dest_missing" is
        returned unchanged, so re-running the cell neither shrinks the sample
        again nor joins the destination columns a second time.
    """
    if "dest_missing" in frame.columns:
        return frame

    joined = frame.sample(frac=frac, random_state=seed).merge(
        dest_features, how="left", on="srch_destination_id"
    )

    # Flag unmatched destinations BEFORE filling, while they are still NaN.
    joined["dest_missing"] = joined["d1"].isna().astype(int)

    # Zero-fill numeric columns only: an integer 0 is not a meaningful fill
    # value for datetime (or other non-numeric) columns.
    numeric_cols = joined.select_dtypes(include="number").columns
    joined[numeric_cols] = joined[numeric_cols].fillna(0)
    return joined


# --- 10 % sample, join destinations, missing flag, fill missing values ---
train = _sample_and_join(train, dest)
test = _sample_and_join(test, dest)

In [None]:
# Show every column name of the destinations table as a plain list.
print(list(dest.columns))


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# --- Use only 10 % ---
def _small_with_dest(frame):
    """Return a 10 % sample of `frame` joined with `dest`, flagged and filled.

    A frame that already has a "dest_missing" column has been sampled and
    joined before and is returned as is. Sampling it again would leave only
    about 1 % of the data, and merging `dest` a second time would make pandas
    suffix the overlapping columns (d1_x / d1_y), so that `["d1"]` raises
    KeyError.
    """
    if "dest_missing" in frame.columns:
        return frame

    small = frame.sample(frac=0.1, random_state=42)
    small = small.merge(dest, how="left", on="srch_destination_id")

    # Flag unmatched destinations before the fill hides them.
    small["dest_missing"] = small["d1"].isna().astype(int)

    # Zero-fill numeric columns only; 0 is not a sensible value for dates.
    numeric_cols = small.select_dtypes(include="number").columns
    small[numeric_cols] = small[numeric_cols].fillna(0)
    return small


train_small = _small_with_dest(train)
test_small = _small_with_dest(test)

# Fail with an actionable message instead of a bare KeyError from the plot.
if "stay_nights" not in train_small.columns:
    raise KeyError(
        "stay_nights is missing from train - run the date parsing and "
        "feature cells on the frame that is sampled here"
    )

# --- Quick plausibility plots ---
fig, ax = plt.subplots(figsize=(6, 4))
sns.histplot(train_small["stay_nights"], bins=30, kde=False, ax=ax)
ax.set_title("Stay length distribution (10%)")
plt.show()

# Relative frequency of the ten most common target classes.
fig, ax = plt.subplots(figsize=(6, 3))
train_small["hotel_cluster"].value_counts(normalize=True).head(10).plot(kind="bar", ax=ax)
ax.set_title("Top 10 Hotel Clusters (10%)")
plt.show()

In [None]:
# EDA_and_Cleaning.ipynb (or .py)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# --- Adjust paths ---
# Relative to the notebook, like the other cells in this file; an absolute
# path such as "/mnt/data" is machine-specific.
DATA_DIR = Path("data")  # change if needed
train_path = DATA_DIR / "train.csv"
test_path = DATA_DIR / "test.csv"
dest_path = DATA_DIR / "destinations.csv"
sample_path = DATA_DIR / "sample_submission.csv"

# --- Load ---
train = pd.read_csv(train_path, low_memory=False)
test = pd.read_csv(test_path, low_memory=False)
dest = pd.read_csv(dest_path)
sample_sub = pd.read_csv(sample_path)

# --- Basic overview ---
print(train.shape, test.shape, dest.shape)
# info() prints its own report and returns None, so it is not wrapped in print().
train.info()
print(train.isna().mean().sort_values(ascending=False).head(10))

# --- Types, date columns, new features, missing flags ---
# Applied identically to train and test so both frames share one feature set.
for _frame in (train, test):
    # Unparseable dates become NaT instead of raising.
    for _col in ("date_time", "srch_ci", "srch_co"):
        _frame[_col] = pd.to_datetime(_frame[_col], errors='coerce')

    # Whole days between check-in and check-out; NaN when either date is NaT.
    _frame["stay_nights"] = (_frame["srch_co"] - _frame["srch_ci"]).dt.days
    _frame["search_month"] = _frame["date_time"].dt.month
    _frame["search_wday"] = _frame["date_time"].dt.dayofweek  # Monday=0 ... Sunday=6
    _frame["search_hour"] = _frame["date_time"].dt.hour

    # 1 where the origin-destination distance is absent, 0 otherwise.
    _frame["missing_orig_dist"] = _frame["orig_destination_distance"].isna().astype(int)

# --- Join destinations ---
train = train.merge(dest, how="left", on="srch_destination_id")
test = test.merge(dest, how="left", on="srch_destination_id")

# Flag unmatched destinations BEFORE filling, while they are still NaN.
train["dest_missing"] = train["d1"].isna().astype(int)
test["dest_missing"] = test["d1"].isna().astype(int)

# Zero-fill numeric columns only: an integer 0 is not a meaningful fill value
# for the datetime columns, which keep NaT for missing dates.
for _frame in (train, test):
    _numeric_cols = _frame.select_dtypes(include="number").columns
    _frame[_numeric_cols] = _frame[_numeric_cols].fillna(0)

# --- Quick plausibility plots ---
fig, ax = plt.subplots(figsize=(6, 4))
sns.histplot(train["stay_nights"], bins=30, kde=False, ax=ax)
ax.set_title("Stay length distribution")
plt.show()

# Target distribution: relative frequency of the ten most common clusters.
fig, ax = plt.subplots(figsize=(6, 3))
train["hotel_cluster"].value_counts(normalize=True).head(10).plot(kind="bar", ax=ax)
ax.set_title("Top 10 Hotel Clusters")
plt.show()