In [1]:
# Import required libraries
import pandas as pd
import numpy as np
import warnings

# Configure visualizations
warnings.filterwarnings('ignore')

In [2]:
# Load the raw CSV and parse the 'date_time' column into datetimes.
df = pd.read_csv('../data/test_set_VU_DM.csv')
df['date_time'] = pd.to_datetime(df['date_time'])

# Keep an untouched snapshot of the loaded frame; later cells modify `df`.
df_original = df.copy()

# Preview the first rows. This has to be the last expression in the cell,
# otherwise the notebook discards the result and nothing is displayed.
df.head()

## Impute missing data

In [3]:
# Count nulls per column and report only the columns that have at least one.
null_counts = df.isnull().sum()
missing_data = null_counts[null_counts.gt(0)]
print(missing_data)

visitor_hist_starrating      4705752
visitor_hist_adr_usd         4704559
prop_review_score               7266
prop_location_score2         1088032
srch_query_affinity_score    4641025
orig_destination_distance    1608679
comp1_rate                   4843307
comp1_inv                    4834309
comp1_rate_percent_diff      4868715
comp2_rate                   2943222
comp2_inv                    2837914
comp2_rate_percent_diff      4405574
comp3_rate                   3434198
comp3_inv                    3317952
comp3_rate_percent_diff      4487973
comp4_rate                   4646462
comp4_inv                    4610375
comp4_rate_percent_diff      4826056
comp5_rate                   2737262
comp5_inv                    2598370
comp5_rate_percent_diff      4119276
comp6_rate                   4716853
comp6_inv                    4696014
comp6_rate_percent_diff      4862045
comp7_rate                   4643454
comp7_inv                    4602430
comp7_rate_percent_diff      4819860
c

In [4]:
# Imputation and feature normalisation, applied to the notebook-level frame `df`.
#
# NOTE: this cell cannot be re-run on its own. Its last step drops 'price_usd',
# which step 6 reads, so reload the data before executing it a second time.
# NOTE(review): every fill value below (median, mean, min) is computed from `df`
# itself. Confirm the training data is cleaned with the same rules so that the
# two files stay comparable.


def _ratio_to_group_mean(frame, value_col, group_col):
    """Return `value_col` divided by its mean within each `group_col` group.

    A group whose mean is 0 has no meaningful ratio: plain division would give
    inf (x / 0) or NaN (0 / 0). inf is not reported by `isnull()` and would be
    written to the output file unnoticed. The zero mean is therefore masked,
    and every undefined ratio is returned as 0.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame containing both columns.
    value_col : str
        Column to normalise.
    group_col : str
        Column whose values define the groups.

    Returns
    -------
    pandas.Series
        Ratio aligned to `frame.index`, containing no NaN or inf produced by a
        zero group mean.
    """
    group_mean = frame.groupby(group_col)[value_col].transform('mean')
    return (frame[value_col] / group_mean.replace(0, np.nan)).fillna(0)


# 1. Set all missing competitor fields to 0
comp_cols = [col for col in df.columns if col.startswith('comp')]
df[comp_cols] = df[comp_cols].fillna(0)

# 2. Impute 'orig_destination_distance' and 'prop_review_score' with their median
for col in ['orig_destination_distance', 'prop_review_score']:
    median_val = df[col].median()
    df[col] = df[col].fillna(median_val)

# 3. Impute 'srch_query_affinity_score' with the minimum value
df['srch_query_affinity_score'] = df['srch_query_affinity_score'].fillna(df['srch_query_affinity_score'].min())

# 4. Impute 'prop_location_score2' with the minimum per 'srch_destination_id'.
#    The built-in 'min' transform is computed for all groups at once instead of
#    calling a Python lambda once per destination.
destination_min = df.groupby('srch_destination_id')['prop_location_score2'].transform('min')
df['prop_location_score2'] = df['prop_location_score2'].fillna(destination_min)
#    Destinations with no observed score at all are still NaN here; fall back
#    to the overall median.
df['prop_location_score2'] = df['prop_location_score2'].fillna(df['prop_location_score2'].median())

# 5. Impute visitor historical features with mean
visitor_hist_cols = ['visitor_hist_starrating', 'visitor_hist_adr_usd']
for col in visitor_hist_cols:
    mean_val = df[col].mean()
    df[col] = df[col].fillna(mean_val)

# 6. Normalize 'price_usd' and 'prop_starrating' based on search and property groups

# Normalize 'price_usd' by 'srch_id'
df['price_usd_norm'] = _ratio_to_group_mean(df, 'price_usd', 'srch_id')

# Normalize 'prop_starrating' by 'prop_id'
df['prop_starrating_norm'] = _ratio_to_group_mean(df, 'prop_starrating', 'prop_id')

# 7. Drop the original 'price_usd' feature
df = df.drop(columns=['price_usd'])

In [5]:
# Re-run the null count after imputation; an empty Series means nothing is missing.
null_counts = df.isnull().sum()
missing_data = null_counts[null_counts.gt(0)]
print(missing_data)

Series([], dtype: int64)


In [7]:
# Write the cleaned frame to disk; the row index is left out of the file.
df.to_csv(path_or_buf='../data/clean_test.csv', index=False)