# Data Preprocessing

## Load Dataset

In [None]:
# Imports needed by the cells below. Nothing else in this file brings `pd`
# or `np` into scope, so a fresh kernel would raise NameError without them.
import numpy as np
import pandas as pd

# Read the combined listings file into the working DataFrame.
df = pd.read_csv('data_files/concatted_data.csv')

# 'Unnamed: 0' is presumably a leftover saved index column (TODO confirm);
# it is not used as a feature, so remove it.
df.drop(columns=['Unnamed: 0'], inplace=True)

# Give the price column the name the rest of the notebook uses for the target.
df = df.rename(columns={'realSum': 'ROOM_PRICE'})

In [None]:
# Store the guest capacity as integers rather than the dtype it was read with.
capacity_as_int = df['person_capacity'].astype(int)
df['person_capacity'] = capacity_as_int

## Group Features

In [None]:
# Column groups referenced by the later preprocessing cells.

# Columns treated as categories (including boolean-like flags).
categorical_features = [
    'room_type',
    'room_shared',
    'room_private',
    'host_is_superhost',
    'multi',
    'biz',
    'city',
    'is_weekend',
]

# Numeric columns treated as continuous measurements.
continuous_num_features = [
    'guest_satisfaction_overall',
    'dist',
    'metro_dist',
    'attr_index',
    'attr_index_norm',
    'rest_index',
    'rest_index_norm',
    'lng',
    'lat',
]

# Numeric columns treated as ordered, discrete levels.
# TODO: confirm that cleanliness_rating belongs in this group.
ordinal_num_features = [
    'person_capacity',
    'cleanliness_rating',
    'bedrooms',
]

## Handle Missing Values

In [None]:
# Count the nulls per column once and reuse the result for both statistics.
null_counts = df.isna().sum()

missing_summary = pd.DataFrame({
    'Feature': df.columns,
    'Missing Count': null_counts,
    'Missing Percentage (%)': null_counts / len(df) * 100,
})

# Optional: uncomment to keep only the columns that actually have gaps.
# missing_summary = missing_summary[missing_summary['Missing Count'] > 0]

# Most incomplete columns first, with a fresh 0..n-1 row index.
missing_summary = (
    missing_summary
    .sort_values(by='Missing Count', ascending=False)
    .reset_index(drop=True)
)

# Last expression of the cell, so the notebook renders the table.
missing_summary

## Detect Duplicates

In [None]:
# Boolean mask of rows that exactly repeat an earlier row. No column subset
# is given, so every column takes part in the comparison, and the first
# occurrence of each repeated row is not flagged.
duplicate_rows = df.duplicated(subset=None, keep='first')

# True counts as 1, so summing the mask gives the number of repeats.
num_duplicates = duplicate_rows.sum()
print(f"Number of duplicate rows: {num_duplicates}")

## Handle Outliers

In [None]:
from scipy.stats import skew

# All numeric columns (continuous and ordinal) are screened together.
num_features = continuous_num_features + ordinal_num_features
numeric_df = df[num_features]

# Per-column quartiles and interquartile range.
Q1 = numeric_df.quantile(0.25)
Q3 = numeric_df.quantile(0.75)
IQR = Q3 - Q1

# A value is flagged when it lies more than 1.5 * IQR outside the quartiles.
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
outliers_IQR = (numeric_df < lower_fence) | (numeric_df > upper_fence)

# Flag totals per column, also expressed as a share of all rows.
outliers_count_IQR = outliers_IQR.sum()
outliers_percentage = outliers_count_IQR / len(df) * 100

# Skewness per column, computed on the non-missing values only.
skewness = numeric_df.apply(lambda column: skew(column.dropna()))

# Summary table, one row per numeric feature. Further descriptive statistics
# (min, max, mean, median, std of df[num_features]) can be added as extra
# columns here if needed.
num_desc_stats = pd.DataFrame({
    'skewness': skewness,
    'outlier_count': outliers_count_IQR,
    'outliers (%)': outliers_percentage,
})

# Outlier group based on percentage
def classify_outlier_group(percentage):
    """Return a severity label for an outlier percentage.

    Exactly 0 maps to 'No outliers'; otherwise values below 5 are 'Low',
    values below 15 are 'Moderate', and everything else is 'High'.
    """
    if percentage == 0:
        return 'No outliers'

    # Upper bounds are exclusive and checked in ascending order.
    for upper_bound, label in ((5, 'Low'), (15, 'Moderate')):
        if percentage < upper_bound:
            return label

    return 'High'

# Label each feature by how large its share of flagged values is.
outlier_share = num_desc_stats['outliers (%)']
num_desc_stats['outlier_group'] = outlier_share.map(classify_outlier_group)

# Features with the most flagged values first. reset_index() without drop
# moves the feature names out of the index into a regular column.
num_desc_stats = (
    num_desc_stats
    .sort_values(by='outlier_count', ascending=False)
    .reset_index()
)

# Last expression of the cell, so the notebook renders the table.
num_desc_stats

## Feature Transformation

In [None]:
from sklearn.preprocessing import StandardScaler, RobustScaler
from feature_engine.outliers import Winsorizer

# Step 1: Winsorize features with high outliers

def winsorize_percentile(series, lower_percentile=5, upper_percentile=95):
    """Clip values to the given lower and upper percentiles of the data.

    Args:
        series: Numeric data to winsorize (e.g. a DataFrame column).
        lower_percentile: Percentile (0-100) used as the lower clip bound.
        upper_percentile: Percentile (0-100) used as the upper clip bound.

    Returns:
        The result of ``np.clip`` on ``series``: values below the lower
        bound are raised to it and values above the upper bound are
        lowered to it.
    """
    # Use the NaN-aware percentile. With np.percentile, one missing value
    # makes both bounds NaN, and clipping against NaN bounds no longer
    # winsorizes anything.
    lower = np.nanpercentile(series, lower_percentile)
    upper = np.nanpercentile(series, upper_percentile)
    return np.clip(series, lower, upper)

# Clip the two ordinal columns to their default 5th-95th percentile range.
for column in ('bedrooms', 'cleanliness_rating'):
    df[column] = winsorize_percentile(df[column])

# Step 2: compress the long right tails with log(1 + x)
log_features = ['metro_dist', 'attr_index', 'rest_index', 'attr_index_norm', 'dist']
for feature in log_features:
    transformed = np.log1p(df[feature])
    df[feature] = transformed

# NOTE(review): both scalers below are fit on the full DataFrame, while the
# train/test split at the end of the file happens afterwards. Confirm that
# this ordering is intended.

# Step 3: median/IQR-based scaling for the columns most affected by outliers
robust_features = ['bedrooms', 'metro_dist', 'dist']
robust_scaler = RobustScaler()
df[robust_features] = robust_scaler.fit_transform(df[robust_features])

# Step 4: zero-mean / unit-variance scaling for the remaining numeric columns
standard_features = ['rest_index_norm', 'lng', 'lat', 'person_capacity', 'guest_satisfaction_overall']
standard_scaler = StandardScaler()
df[standard_features] = standard_scaler.fit_transform(df[standard_features])

# Preview the first rows of the transformed frame.
df.head(5)

## Feature Encoding

In [None]:
# Encode the boolean flag columns as 0/1 integers, using one shared lookup.
bool_to_int = {False: 0, True: 1}
for flag_column in ('room_shared', 'room_private', 'host_is_superhost'):
    df[flag_column] = df[flag_column].map(bool_to_int)

In [None]:
# Nominal columns that are expanded into indicator columns.
one_hot_features = ['room_type', 'city']

# Build the indicators, dropping the first level of each column, and store
# them as 0/1 integers.
indicator_frame = pd.get_dummies(
    df[one_hot_features],
    prefix=one_hot_features,
    drop_first=True,
)
one_hot_df = indicator_frame.astype(int)

# Swap the original nominal columns for their indicator columns.
remaining_columns = df.drop(columns=one_hot_features)
df_encoded = pd.concat([remaining_columns, one_hot_df], axis=1)

## Train/Test Split

In [None]:
from sklearn.model_selection import train_test_split

# The room price is the prediction target.
y = df_encoded['ROOM_PRICE']

# Features: everything except the target and the two *_norm index columns
# (attr_index_norm and rest_index_norm), which are left out of the model
# inputs.
excluded_columns = ['ROOM_PRICE', 'attr_index_norm', 'rest_index_norm']
X = df_encoded.drop(columns=excluded_columns)

# Hold out 20% of the rows for testing; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)