# Initialization

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# initialization
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from Dora_EDA import MyModel
sns.set_theme(style="whitegrid")
plt.rcParams['figure.figsize'] = (10, 6) 

# Exploratory Data Analysis

In [38]:
model = MyModel()
model.load_data(filepath="training_set_VU_DM.csv")
model2 = MyModel()
model2.load_data(filepath="test_set_VU_DM.csv")

#Finds and removes columns that have more than 80% missing values
high_missing_cols = model.get_columns_with_high_missing(threshold=0.8)
model.drop_columns(high_missing_cols)
model2.drop_columns(high_missing_cols)
model.drop_columns(['date_time'])
model2.drop_columns(['date_time'])

#Prints the new dataset after dropping columns
print("New dataframe after dropping high-missing columns:")
print(model.dataframe)

#Prints out the summary of numericals columns such as range mean...
model.summarize_numerical_columns()
model.get_non_numerical_columns()

#Prints out columns that have missing values still
model.get_columns_with_missing_values()


#Mention why data is clean

Loading data from: training_set_VU_DM.csv using Polars
Data loaded: (4958347, 54) (rows, cols) in 3.64 seconds.
Loading data from: test_set_VU_DM.csv using Polars
Data loaded: (4959183, 50) (rows, cols) in 2.82 seconds.
Columns with more than 80% missing values: ['visitor_hist_starrating', 'visitor_hist_adr_usd', 'srch_query_affinity_score', 'comp1_rate', 'comp1_inv', 'comp1_rate_percent_diff', 'comp2_rate_percent_diff', 'comp3_rate_percent_diff', 'comp4_rate', 'comp4_inv', 'comp4_rate_percent_diff', 'comp5_rate_percent_diff', 'comp6_rate', 'comp6_inv', 'comp6_rate_percent_diff', 'comp7_rate', 'comp7_inv', 'comp7_rate_percent_diff', 'comp8_rate_percent_diff', 'gross_bookings_usd']
Dropping columns: ['visitor_hist_starrating', 'visitor_hist_adr_usd', 'srch_query_affinity_score', 'comp1_rate', 'comp1_inv', 'comp1_rate_percent_diff', 'comp2_rate_percent_diff', 'comp3_rate_percent_diff', 'comp4_rate', 'comp4_inv', 'comp4_rate_percent_diff', 'comp5_rate_percent_diff', 'comp6_rate', 'comp6_i

['prop_review_score',
 'prop_location_score2',
 'orig_destination_distance',
 'comp2_rate',
 'comp2_inv',
 'comp3_rate',
 'comp3_inv',
 'comp5_rate',
 'comp5_inv',
 'comp8_rate',
 'comp8_inv']

# Feature Engineering

In [31]:
from Dora_EDA import FeatureEngineer

# After your cleaning steps
feature_engineer = FeatureEngineer(model.dataframe)
df_imputed = feature_engineer.transform()
df_imputed = feature_engineer.add_interaction_target()


feature_engineer2 = FeatureEngineer(model2.dataframe)
df_imputed_test = feature_engineer2.transform()
# Set back to the model if needed
model.dataframe = df_imputed
model2.dataframe = df_imputed_test

# Final check
print("✅ Columns with missing values AFTER imputation:")
model.get_columns_with_missing_values()

print("✅ Columns with missing values AFTER imputation:")
model2.get_columns_with_missing_values()





=== FeatureEngineering (Auto-Imputation): Start ===
Found 11 columns with missing values: ['prop_review_score', 'prop_location_score2', 'orig_destination_distance', 'comp2_rate', 'comp2_inv', 'comp3_rate', 'comp3_inv', 'comp5_rate', 'comp5_inv', 'comp8_rate', 'comp8_inv']
Imputed prop_review_score (continuous numeric) with median: 4.0
Imputed prop_location_score2 (continuous numeric) with median: 0.069
Imputed orig_destination_distance (continuous numeric) with median: 386.6
Imputed comp2_rate (binary/categorical numeric) with mode: 0
Imputed comp2_inv (binary/categorical numeric) with mode: 0
Imputed comp3_rate (binary/categorical numeric) with mode: 0
Imputed comp3_inv (binary/categorical numeric) with mode: 0
Imputed comp5_rate (binary/categorical numeric) with mode: 0
Imputed comp5_inv (binary/categorical numeric) with mode: 0
Imputed comp8_rate (binary/categorical numeric) with mode: 0
Imputed comp8_inv (binary/categorical numeric) with mode: 0

=== FeatureEngineering: Complete =

In [35]:
train_cols = set(model.dataframe.columns)
test_cols = set(model2.dataframe.columns)

# 1️⃣ Extra columns in train (but not in test) -- keep target though
extra_in_train = train_cols - test_cols - {"interaction_target"}  # keep your target columns
print(f"Extra columns in train (dropping these): {extra_in_train}")
model.dataframe = model.dataframe.drop(list(extra_in_train))

Extra columns in train (dropping these): {'click_bool', 'position', 'booking_bool'}


# Modelling

In [37]:
from Dora_EDA import HotelRanker

ranker = HotelRanker(model.dataframe)
ranker.train()
predictions = ranker.predict(model2.dataframe)
ranker.export_ranking(predictions, filename="submission.csv")


Preparing data...
Using 28 feature columns: ['site_id', 'visitor_location_country_id', 'prop_country_id', 'prop_starrating', 'prop_review_score']... (and more)
Training model...
Validation Accuracy: 0.9550
Predicting scores for ranking...


ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- date_time
