# Initialization

In [7]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [8]:
# initialization
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from main import LogisticRegressionPipeline ,DataLoader, Plotter,BasicInfoStep,MissingValuesStep, NumericSummaryStep, CategoricalAnalysisStep, EdaPipeline, FeatureEngineering

sns.set_theme(style="whitegrid")
plt.rcParams['figure.figsize'] = (10, 6) 

In [9]:
#loading the dataset
loader = DataLoader()
df = loader.load()

Loading data from training_set_VU_DM.csv
Loaded: (4958347, 54)


# Exploratory Data Analysis

In [21]:
# EDA
plotter = Plotter(save=False)
steps = [
    # BasicInfoStep(),
    MissingValuesStep(),
    # NumericSummaryStep(plotter, plot=True, corr=True, corr_sample_frac=0.3, max_distributions=5),
    # CategoricalAnalysisStep(plotter, plot=True, threshold=50, max_plots=10),
]
pipeline = EdaPipeline(steps)
pipeline.run(df)
print(df)


=== Starting EDA Pipeline ===

--- Missing Values ---
shape: (1, 3)
┌───────────────────┬─────────┬──────┐
│ column            ┆ missing ┆ pct  │
│ ---               ┆ ---     ┆ ---  │
│ str               ┆ u32     ┆ f64  │
╞═══════════════════╪═════════╪══════╡
│ prop_review_score ┆ 7364    ┆ 0.15 │
└───────────────────┴─────────┴──────┘
=== EDA Pipeline Complete ===
shape: (4_958_347, 54)
┌─────────┬────────────┬─────────┬────────────┬───┬────────────┬───────────┬───────────┬───────────┐
│ srch_id ┆ date_time  ┆ site_id ┆ visitor_lo ┆ … ┆ comp8_rate ┆ click_boo ┆ gross_boo ┆ booking_b │
│ ---     ┆ ---        ┆ ---     ┆ cation_cou ┆   ┆ _percent_d ┆ l         ┆ kings_usd ┆ ool       │
│ i64     ┆ str        ┆ i64     ┆ ntry_id    ┆   ┆ iff        ┆ ---       ┆ ---       ┆ ---       │
│         ┆            ┆         ┆ ---        ┆   ┆ ---        ┆ i64       ┆ str       ┆ i64       │
│         ┆            ┆         ┆ i64        ┆   ┆ str        ┆           ┆           ┆           │

In [25]:
import polars as pl

# Build a table of (column, missing, pct)
missing_table = (
    df
    .null_count()
    .melt(variable_name="column", value_name="missing")
    .with_columns(
        (pl.col("missing") / df.height * 100).round(2).alias("pct")
    )
)

print("Missing Values Report:")
for row in missing_table.to_dicts():
    print(f"{row['column']}: {row['pct']:.2f}%  ({row['missing']} nulls)")


Missing Values Report:
srch_id: 0.00%  (0 nulls)
date_time: 0.00%  (0 nulls)
site_id: 0.00%  (0 nulls)
visitor_location_country_id: 0.00%  (0 nulls)
visitor_hist_starrating: 0.00%  (0 nulls)
visitor_hist_adr_usd: 0.00%  (0 nulls)
prop_country_id: 0.00%  (0 nulls)
prop_id: 0.00%  (0 nulls)
prop_starrating: 0.00%  (0 nulls)
prop_review_score: 0.15%  (7364 nulls)
prop_brand_bool: 0.00%  (0 nulls)
prop_location_score1: 0.00%  (0 nulls)
prop_location_score2: 0.00%  (0 nulls)
prop_log_historical_price: 0.00%  (0 nulls)
position: 0.00%  (0 nulls)
price_usd: 0.00%  (0 nulls)
promotion_flag: 0.00%  (0 nulls)
srch_destination_id: 0.00%  (0 nulls)
srch_length_of_stay: 0.00%  (0 nulls)
srch_booking_window: 0.00%  (0 nulls)
srch_adults_count: 0.00%  (0 nulls)
srch_children_count: 0.00%  (0 nulls)
srch_room_count: 0.00%  (0 nulls)
srch_saturday_night_bool: 0.00%  (0 nulls)
srch_query_affinity_score: 0.00%  (0 nulls)
orig_destination_distance: 0.00%  (0 nulls)
random_bool: 0.00%  (0 nulls)
comp1_rate

  .melt(variable_name="column", value_name="missing")


# Feature Engineering

In [5]:
# feature Engineering
fe = FeatureEngineering(cat_threshold=50).fit(df)
df1 = fe.transform(df)

shape: (54, 3)
┌─────────────────────────────┬─────────┬──────┐
│ column                      ┆ missing ┆ pct  │
│ ---                         ┆ ---     ┆ ---  │
│ str                         ┆ u32     ┆ f64  │
╞═════════════════════════════╪═════════╪══════╡
│ prop_review_score           ┆ 7364    ┆ 0.15 │
│ srch_id                     ┆ 0       ┆ 0.0  │
│ date_time                   ┆ 0       ┆ 0.0  │
│ site_id                     ┆ 0       ┆ 0.0  │
│ visitor_location_country_id ┆ 0       ┆ 0.0  │
│ …                           ┆ …       ┆ …    │
│ comp8_inv                   ┆ 0       ┆ 0.0  │
│ comp8_rate_percent_diff     ┆ 0       ┆ 0.0  │
│ click_bool                  ┆ 0       ┆ 0.0  │
│ gross_bookings_usd          ┆ 0       ┆ 0.0  │
│ booking_bool                ┆ 0       ┆ 0.0  │
└─────────────────────────────┴─────────┴──────┘
=== FeatureEngineering.fit ===
Dropping 0 columns > 70.0% null:

Imputing 39 columns:
  • 'prop_review_score' → 4.0
  • 'srch_id' → 166507.0
  • 'site_i

# Modelling

In [None]:
# Prepare features/target
feature_cols = ["prop_starrating", "price_usd", "srch_adults_count"]
target_col = "booking_bool"
X = df.select(feature_cols)
y = df[target_col]
# Modeling
model_pipeline = LogisticRegressionPipeline()
model_pipeline.execute(X, y)

