# Initialization

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# initialization
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from main import LogisticRegressionPipeline ,DataLoader, Plotter,BasicInfoStep,MissingValuesStep, NumericSummaryStep, CategoricalAnalysisStep, EdaPipeline, FeatureEngineering
from Dora_EDA import MyModel
sns.set_theme(style="whitegrid")
plt.rcParams['figure.figsize'] = (10, 6) 

# Exploratory Data Analysis

In [13]:
model = MyModel()
model.load_data(filepath="training_set_VU_DM.csv")
model2 = MyModel()
model2.load_data(filepath="test_set_VU_DM.csv")

#Finds columns that have a correlation value higher than 0.7
high_corrs = model.get_highly_correlated_columns(threshold=0.7)
for col1, col2, corr in high_corrs:
    print(f"High Correlation:{col1} vs {col2}: correlation = {corr:.3f}")

#Remove one of highly correlated columns
model.drop_columns(["booking_bool"])
model2.drop_columns(["booking_bool"])

#Finds and removes columns that have more than 80% missing values
high_missing_cols = model.get_columns_with_high_missing(threshold=0.8)
model.drop_columns(high_missing_cols)
model2.drop_columns(high_missing_cols)

#Prints the new dataset after dropping columns
print("New dataframe after dropping high-missing columns:")
print(model.dataframe)

#Prints out the summary of numericals columns such as range mean...
model.summarize_numerical_columns()
model.get_non_numerical_columns()

#Prints out columns that have missing values still
model.get_columns_with_missing_values()

#Mention why data is clean

Loading data from: training_set_VU_DM.csv using Polars
Data loaded: (4958347, 54) (rows, cols) in 4.89 seconds.
Loading data from: test_set_VU_DM.csv using Polars
Error: File not found at test_set_VU_DM.csv
High Correlation:click_bool vs booking_bool: correlation = 0.783
Dropping columns: ['booking_bool']
Remaining columns: ['srch_id', 'date_time', 'site_id', 'visitor_location_country_id', 'visitor_hist_starrating', 'visitor_hist_adr_usd', 'prop_country_id', 'prop_id', 'prop_starrating', 'prop_review_score', 'prop_brand_bool', 'prop_location_score1', 'prop_location_score2', 'prop_log_historical_price', 'position', 'price_usd', 'promotion_flag', 'srch_destination_id', 'srch_length_of_stay', 'srch_booking_window', 'srch_adults_count', 'srch_children_count', 'srch_room_count', 'srch_saturday_night_bool', 'srch_query_affinity_score', 'orig_destination_distance', 'random_bool', 'comp1_rate', 'comp1_inv', 'comp1_rate_percent_diff', 'comp2_rate', 'comp2_inv', 'comp2_rate_percent_diff', 'comp3

['prop_review_score',
 'prop_location_score2',
 'orig_destination_distance',
 'comp2_rate',
 'comp2_inv',
 'comp3_rate',
 'comp3_inv',
 'comp5_rate',
 'comp5_inv',
 'comp8_rate',
 'comp8_inv']

# Feature Engineering

In [14]:
from Dora_EDA import FeatureEngineer

# After your cleaning steps
feature_engineer = FeatureEngineer(model.dataframe)
df_imputed = feature_engineer.transform()

feature_engineer2 = FeatureEngineer(model2.dataframe)
df_imputed_test = feature_engineer2.transform()

# Set back to the model if needed
model.dataframe = df_imputed
model2.dataframe = df_imputed_test

# Final check
print("✅ Columns with missing values AFTER imputation:")
model.get_columns_with_missing_values()

print("✅ Columns with missing values AFTER imputation:")
model2.get_columns_with_missing_values()


=== FeatureEngineering (Auto-Imputation): Start ===
Found 11 columns with missing values: ['prop_review_score', 'prop_location_score2', 'orig_destination_distance', 'comp2_rate', 'comp2_inv', 'comp3_rate', 'comp3_inv', 'comp5_rate', 'comp5_inv', 'comp8_rate', 'comp8_inv']
Imputed prop_review_score (continuous numeric) with median: 4.0
Imputed prop_location_score2 (continuous numeric) with median: 0.069
Imputed orig_destination_distance (continuous numeric) with median: 386.6
Imputed comp2_rate (binary/categorical numeric) with mode: 0
Imputed comp2_inv (binary/categorical numeric) with mode: 0
Imputed comp3_rate (binary/categorical numeric) with mode: 0
Imputed comp3_inv (binary/categorical numeric) with mode: 0
Imputed comp5_rate (binary/categorical numeric) with mode: 0
Imputed comp5_inv (binary/categorical numeric) with mode: 0
Imputed comp8_rate (binary/categorical numeric) with mode: 0
Imputed comp8_inv (binary/categorical numeric) with mode: 0

=== FeatureEngineering: Complete =

AttributeError: 'NoneType' object has no attribute 'null_count'

In [12]:
for col in model.dataframe.columns:
    print(f"\n=== Column: {col} ===")
    print(model.dataframe[col].head(50).to_list())


=== Column: srch_id ===
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4]

=== Column: date_time ===
['2013-04-04 08:32:15', '2013-04-04 08:32:15', '2013-04-04 08:32:15', '2013-04-04 08:32:15', '2013-04-04 08:32:15', '2013-04-04 08:32:15', '2013-04-04 08:32:15', '2013-04-04 08:32:15', '2013-04-04 08:32:15', '2013-04-04 08:32:15', '2013-04-04 08:32:15', '2013-04-04 08:32:15', '2013-04-04 08:32:15', '2013-04-04 08:32:15', '2013-04-04 08:32:15', '2013-04-04 08:32:15', '2013-04-04 08:32:15', '2013-04-04 08:32:15', '2013-04-04 08:32:15', '2013-04-04 08:32:15', '2013-04-04 08:32:15', '2013-04-04 08:32:15', '2013-04-04 08:32:15', '2013-04-04 08:32:15', '2013-04-04 08:32:15', '2013-04-04 08:32:15', '2013-04-04 08:32:15', '2013-04-04 08:32:15', '2012-12-31 08:59:22', '2012-12-31 08:59:22', '2012-12-31 08:59:22', '2012-12-31 08:59:22', '2012-12-31 08:59:22', '2012-12-31 08:59:22', '2012-12-31 08

# Modelling

In [None]:
from Dora_EDA import HotelRanker

ranker = HotelRanker(df_imputed)
ranker.train()
