# Initialization

In [1]:
# Enable IPython's autoreload extension so that edits to imported project
# modules are picked up without restarting the kernel.
# Mode 2 reloads all modules before each cell is executed.
%load_ext autoreload
%autoreload 2

In [None]:
# initialization
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from main import LogisticRegressionPipeline ,DataLoader, Plotter,BasicInfoStep,MissingValuesStep, NumericSummaryStep, CategoricalAnalysisStep, EdaPipeline, FeatureEngineering
from Dora_EDA import MyModel
# Plot defaults for the whole notebook: apply the seaborn "whitegrid" theme
# first, then set the default figure size (in inches) on top of it.
sns.set_theme(style="whitegrid")
plt.rcParams.update({"figure.figsize": (10, 6)})

# Exploratory Data Analysis

In [10]:


# Build the EDA helper and load the training CSV into it. Every call below
# operates on this one `model` object, and the drop steps change its
# `dataframe` in place, so the statements are order-dependent.
model = MyModel()
model.load_data(filepath="training_set_VU_DM.csv")

# Report every pair of columns whose correlation exceeds 0.7.
high_corrs = model.get_highly_correlated_columns(threshold=0.7)
for col1, col2, corr in high_corrs:
    print(f"High Correlation:{col1} vs {col2}: correlation = {corr:.3f}")

# Drop one column of the highly correlated pair reported above.
# NOTE(review): the column name is hard-coded rather than derived from
# `high_corrs`, and `booking_bool` is the target the modelling cell asks for.
# Dropping it here means it is missing from everything downstream -- confirm
# whether it should be kept as the label instead.
model.drop_columns(["booking_bool"])

# Find and remove the columns in which more than 80% of the values are missing.
high_missing_cols = model.get_columns_with_high_missing(threshold=0.8)
model.drop_columns(high_missing_cols)

# Show the dataset as it stands after the drops.
print("New dataframe after dropping high-missing columns:")
print(model.dataframe)

# Summarise the numerical columns (range, mean, ...) and list the
# non-numerical ones.
# NOTE(review): neither return value is captured or displayed here;
# presumably these helpers print their own report -- confirm.
model.summarize_numerical_columns()
model.get_non_numerical_columns()

# Columns that still contain missing values. As the last expression of the
# cell, this result is what the notebook displays.
model.get_columns_with_missing_values()

Loading data from: training_set_VU_DM.csv using Polars
Data loaded: (4958347, 54) (rows, cols) in 5.38 seconds.
High Correlation:click_bool vs booking_bool: correlation = 0.783
Dropping columns: ['booking_bool']
Remaining columns: ['srch_id', 'date_time', 'site_id', 'visitor_location_country_id', 'visitor_hist_starrating', 'visitor_hist_adr_usd', 'prop_country_id', 'prop_id', 'prop_starrating', 'prop_review_score', 'prop_brand_bool', 'prop_location_score1', 'prop_location_score2', 'prop_log_historical_price', 'position', 'price_usd', 'promotion_flag', 'srch_destination_id', 'srch_length_of_stay', 'srch_booking_window', 'srch_adults_count', 'srch_children_count', 'srch_room_count', 'srch_saturday_night_bool', 'srch_query_affinity_score', 'orig_destination_distance', 'random_bool', 'comp1_rate', 'comp1_inv', 'comp1_rate_percent_diff', 'comp2_rate', 'comp2_inv', 'comp2_rate_percent_diff', 'comp3_rate', 'comp3_inv', 'comp3_rate_percent_diff', 'comp4_rate', 'comp4_inv', 'comp4_rate_percent_

['prop_review_score',
 'prop_location_score2',
 'orig_destination_distance',
 'comp2_rate',
 'comp2_inv',
 'comp3_rate',
 'comp3_inv',
 'comp5_rate',
 'comp5_inv',
 'comp8_rate',
 'comp8_inv']

# Feature Engineering

In [16]:
# Configure the feature-engineering step, fit it on the cleaned EDA frame,
# then apply the fitted step to that same frame.
# `fe` is whatever `fit` returns and `df1` is the transformed frame.
feature_step = FeatureEngineering(cat_threshold=50)
fe = feature_step.fit(model.dataframe)
df1 = fe.transform(model.dataframe)

=== FeatureEngineering.fit ===
Imputing 32 columns:
  • 'comp3_rate' → 0.0
  • 'comp3_inv' → 0.0
  • 'comp8_rate' → 0.0
  • 'comp8_inv' → 0.0
  • 'comp2_rate' → 0.0
  • 'comp2_inv' → 0.0
  • 'comp5_rate' → 0.0
  • 'comp5_inv' → 0.0
  • 'orig_destination_distance' → 386.6
  • 'prop_location_score2' → 0.069
  • 'prop_review_score' → 4.0
  • 'srch_id' → 166507.0
  • 'site_id' → 5.0
  • 'visitor_location_country_id' → 219.0
  • 'prop_country_id' → 219.0
  • 'prop_id' → 69638.0
  • 'prop_starrating' → 3.0
  • 'prop_brand_bool' → 1.0
  • 'prop_location_score1' → 2.77
  • 'prop_log_historical_price' → 4.91
  • 'position' → 16.0
  • 'price_usd' → 122.0
  • 'promotion_flag' → 0.0
  • 'srch_destination_id' → 13541.0
  • 'srch_length_of_stay' → 2.0
  • 'srch_booking_window' → 17.0
  • 'srch_adults_count' → 2.0
  • 'srch_children_count' → 0.0
  • 'srch_room_count' → 1.0
  • 'srch_saturday_night_bool' → 1.0
  • 'random_bool' → 0.0
  • 'click_bool' → 0.0

=== FeatureEngineering.transform ===
Added n

# Modelling

In [None]:
# Prepare features/target.
# The engineered frame produced by the Feature Engineering cell is `df1`.
# This cell previously read from `df`, a name no cell in the notebook defines,
# so it only ran when a stale kernel variable happened to exist.
feature_cols = ["prop_starrating", "price_usd", "srch_adults_count"]
target_col = "booking_bool"

# Fail early and explicitly if a required column is absent from the engineered
# frame (for example because an earlier cleaning step dropped it), instead of
# surfacing an opaque lookup error from inside the pipeline.
missing_cols = [c for c in [*feature_cols, target_col] if c not in df1.columns]
if missing_cols:
    raise KeyError(
        f"Columns missing from the engineered frame: {missing_cols}. "
        "Make sure earlier cells keep them before modelling."
    )

X = df1.select(feature_cols)
y = df1[target_col]

# Modeling: fit the logistic-regression pipeline on the selected features.
model_pipeline = LogisticRegressionPipeline()
model_pipeline.execute(X, y)

