In [1]:
import pandas as pd
import numpy as np
from scipy import stats

#load dataset
df = pd.read_csv('training_set_VU_DM.csv')

In [2]:
# convert date to numerical feature 
df = df.sort_values(by="date_time")
df["date_time"] = (pd.to_datetime(df["date_time"]) - pd.Timestamp("1970-01-01")) // pd.Timedelta("1s")

In [3]:
# drop columns with lot of nans and uncorrelated columns
drop_columns_nans = ['visitor_hist_starrating', 'visitor_hist_adr_usd','srch_query_affinity_score','comp1_rate', 'comp1_inv',
       'comp1_rate_percent_diff', 'comp2_rate', 'comp2_inv',
       'comp2_rate_percent_diff', 'comp3_rate', 'comp3_inv',
       'comp3_rate_percent_diff', 'comp4_rate', 'comp4_inv',
       'comp4_rate_percent_diff', 'comp5_rate', 'comp5_inv',
       'comp5_rate_percent_diff', 'comp6_rate', 'comp6_inv',
       'comp6_rate_percent_diff', 'comp7_rate', 'comp7_inv',
       'comp7_rate_percent_diff', 'comp8_rate']
drop_columns_low_correlation = ['site_id','prop_log_historical_price', 'srch_destination_id']
df = df.drop(drop_columns_nans, axis=1)
df = df.drop(drop_columns_low_correlation, axis=1)

In [4]:
df_prop_93974 = df[df['prop_id'] == 93974]
len(df_prop_93974)

969

In [7]:
# Vectorized outlier removal
def remove_outliers(group):
    z_scores = np.abs(stats.zscore(group['price_usd']))
    return group[(z_scores <= 1)]

df_wo_outliers = df.groupby('prop_id').apply(remove_outliers).reset_index(drop=True)

hotel_means = df_wo_outliers.groupby('prop_id')['price_usd'].mean().reset_index()

df_merged = df.merge(hotel_means, on='prop_id', suffixes=('', '_mean'))

threshold = 0.3
def replace_with_mean(row):
    if row['price_usd'] < row['price_usd_mean'] * threshold:
        print(row['prop_id'])
        print(row['price_usd'], row['price_usd_mean'])
        return row['price_usd_mean']
    else:
        return row['price_usd']

df_merged['adjusted_price'] = df_merged.apply(replace_with_mean, axis=1)

df_merged = df_merged.drop('price_usd', axis=1).rename(columns={'adjusted_price': 'price_usd'})


Elapsed time for 100,000 rows: 6.678829908370972 seconds
Estimated time for 5 million rows: 3339.414954185486 seconds
done 1
284
done 2
done 3
done 4
93974.0
0.16 215.8966715328467
133689.0
36.23 234.8433674513818
119527.0
41.13 256.87955555555556
4503.0
46.16 319.13955486542443
5322.0
0.28 244.572633107454
5322.0
43.33 244.572633107454
6713.0
0.26 244.54781193490055
8864.0
47.19 236.55423758865248
14480.0
28.24 194.08427364864866
20430.0
0.16 187.58278620689654
34151.0
0.14 203.39539042821158
34151.0
27.59 203.39539042821158
35679.0
0.24 233.01969639468692
35679.0
39.97 233.01969639468692
24041.0
0.24 237.36482926829268
57170.0
0.19 185.66717348927875
79475.0
26.82 161.89411602209944
79398.0
31.2 235.40482283464567
78647.0
26.43 151.6594117647059
77074.0
34.81 233.53156435643564
75408.0
63.83 508.56500924214413
91975.0
0.18 216.69142131979694
67477.0
0.31 281.70660175267767
67477.0
38.94 281.70660175267767
60144.0
26.05 194.74704284221528
5138.0
4.74 122.75148148148148
7566.0
6.66 143

In [8]:
df.to_csv('cleaned.csv')