# Initialization

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# initialization
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from main import LogisticRegressionPipeline ,DataLoader, Plotter,BasicInfoStep,MissingValuesStep, NumericSummaryStep, CategoricalAnalysisStep, EdaPipeline, FeatureEngineering
from Dora_EDA import MyModel
sns.set_theme(style="whitegrid")
plt.rcParams['figure.figsize'] = (10, 6) 

# Exploratory Data Analysis

In [18]:
model = MyModel()
model.load_data(filepath="training_set_VU_DM.csv")
model2 = MyModel()
model2.load_data(filepath="test_set_VU_DM.csv")

#Finds columns that have a correlation value higher than 0.7
high_corrs = model.get_highly_correlated_columns(threshold=0.7)
for col1, col2, corr in high_corrs:
    print(f"High Correlation:{col1} vs {col2}: correlation = {corr:.3f}")

#Remove one of highly correlated columns
# model.drop_columns(["booking_bool"])
# model2.drop_columns(["booking_bool"])

#Finds and removes columns that have more than 80% missing values
high_missing_cols = model.get_columns_with_high_missing(threshold=0.8)
model.drop_columns(high_missing_cols)
model2.drop_columns(high_missing_cols)

#Prints the new dataset after dropping columns
print("New dataframe after dropping high-missing columns:")
print(model.dataframe)

#Prints out the summary of numericals columns such as range mean...
model.summarize_numerical_columns()
model.get_non_numerical_columns()

#Prints out columns that have missing values still
model.get_columns_with_missing_values()

#Mention why data is clean

Loading data from: training_set_VU_DM.csv using Polars
Data loaded: (4958347, 54) (rows, cols) in 6.98 seconds.
Loading data from: test_set_VU_DM.csv using Polars
Data loaded: (4959183, 50) (rows, cols) in 4.34 seconds.
High Correlation:click_bool vs booking_bool: correlation = 0.783
Columns with more than 80% missing values: ['visitor_hist_starrating', 'visitor_hist_adr_usd', 'srch_query_affinity_score', 'comp1_rate', 'comp1_inv', 'comp1_rate_percent_diff', 'comp2_rate_percent_diff', 'comp3_rate_percent_diff', 'comp4_rate', 'comp4_inv', 'comp4_rate_percent_diff', 'comp5_rate_percent_diff', 'comp6_rate', 'comp6_inv', 'comp6_rate_percent_diff', 'comp7_rate', 'comp7_inv', 'comp7_rate_percent_diff', 'comp8_rate_percent_diff', 'gross_bookings_usd']
Dropping columns: ['visitor_hist_starrating', 'visitor_hist_adr_usd', 'srch_query_affinity_score', 'comp1_rate', 'comp1_inv', 'comp1_rate_percent_diff', 'comp2_rate_percent_diff', 'comp3_rate_percent_diff', 'comp4_rate', 'comp4_inv', 'comp4_rate

['prop_review_score',
 'prop_location_score2',
 'orig_destination_distance',
 'comp2_rate',
 'comp2_inv',
 'comp3_rate',
 'comp3_inv',
 'comp5_rate',
 'comp5_inv',
 'comp8_rate',
 'comp8_inv']

# Feature Engineering

In [21]:
from Dora_EDA import FeatureEngineer

# After your cleaning steps
feature_engineer = FeatureEngineer(model.dataframe)
df_imputed = feature_engineer.transform()

feature_engineer2 = FeatureEngineer(model2.dataframe)
df_imputed_test = feature_engineer2.transform()

# Set back to the model if needed
model.dataframe = df_imputed
model2.dataframe = df_imputed_test

# Final check
print("✅ Columns with missing values AFTER imputation:")
model.get_columns_with_missing_values()

print("✅ Columns with missing values AFTER imputation:")
model2.get_columns_with_missing_values()


=== FeatureEngineering (Auto-Imputation): Start ===
Found 0 columns with missing values: []

=== FeatureEngineering: Complete ===
Binary/categorical numeric columns imputed: []
Continuous numeric columns imputed: []

=== FeatureEngineering (Auto-Imputation): Start ===
Found 0 columns with missing values: []

=== FeatureEngineering: Complete ===
Binary/categorical numeric columns imputed: []
Continuous numeric columns imputed: []
✅ Columns with missing values AFTER imputation:
No columns have missing values.
✅ Columns with missing values AFTER imputation:
No columns have missing values.


[]

In [12]:
for col in model.dataframe.columns:
    print(f"\n=== Column: {col} ===")
    print(model.dataframe[col].head(50).to_list())


=== Column: srch_id ===
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4]

=== Column: date_time ===
['2013-04-04 08:32:15', '2013-04-04 08:32:15', '2013-04-04 08:32:15', '2013-04-04 08:32:15', '2013-04-04 08:32:15', '2013-04-04 08:32:15', '2013-04-04 08:32:15', '2013-04-04 08:32:15', '2013-04-04 08:32:15', '2013-04-04 08:32:15', '2013-04-04 08:32:15', '2013-04-04 08:32:15', '2013-04-04 08:32:15', '2013-04-04 08:32:15', '2013-04-04 08:32:15', '2013-04-04 08:32:15', '2013-04-04 08:32:15', '2013-04-04 08:32:15', '2013-04-04 08:32:15', '2013-04-04 08:32:15', '2013-04-04 08:32:15', '2013-04-04 08:32:15', '2013-04-04 08:32:15', '2013-04-04 08:32:15', '2013-04-04 08:32:15', '2013-04-04 08:32:15', '2013-04-04 08:32:15', '2013-04-04 08:32:15', '2012-12-31 08:59:22', '2012-12-31 08:59:22', '2012-12-31 08:59:22', '2012-12-31 08:59:22', '2012-12-31 08:59:22', '2012-12-31 08:59:22', '2012-12-31 08

# Modelling

In [22]:
from Dora_EDA import HotelRanker

ranker = HotelRanker(model.dataframe)
ranker.train()
predictions = ranker.predict(model2.dataframe)
ranker.export_ranking(predictions, filename="submission.csv")


Preparing data...
Using 30 feature columns: ['site_id', 'visitor_location_country_id', 'prop_country_id', 'prop_starrating', 'prop_review_score']... (and more)
Training model...
Validation Accuracy: 0.9939
Predicting scores for ranking...


ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- date_time
Feature names seen at fit time, yet now missing:
- click_bool
- position
