In [None]:
import pandas as pd
import os

# Locate all_cities.csv relative to the working directory. "__file__" is quoted on
# purpose: abspath() of that literal resolves inside the current working directory,
# so dirname() yields the directory itself.
path = os.path.join(
    os.path.dirname(os.path.abspath("__file__")),
    "all_cities.csv",
)
df = pd.read_csv(path)

# Halve the cleanliness rating to move it from a 10-point to a 5-point scale
df['cleanliness_rating'] = df['cleanliness_rating'].div(2)


  df = pd.read_csv(path)


In [2]:
# Spell out the coordinate column names (lat -> latitude, lng -> longitude)
df = df.rename(columns={'lat': 'latitude', 'lng': 'longitude'})


In [None]:
import pandas as pd

# Locate scraped_data.csv the same way as the first dataset: the quoted "__file__"
# makes abspath()/dirname() resolve to the current working directory.
path2 = os.path.join(
    os.path.dirname(os.path.abspath("__file__")),
    "scraped_data.csv",
)

# Load the scraped listings and expose the price under the name realSum
df2 = pd.read_csv(path2).rename(columns={'price': 'realSum'})


In [4]:
# Boolean indicators for the two room types of interest; any other label
# (including a missing one) compares unequal and therefore becomes False.
df2['room_shared'] = df2['room_type'].eq('Shared room')
df2['room_private'] = df2['room_type'].eq('Private room')


In [5]:
# Expose the accommodates column under the name person_capacity
df2 = df2.rename(columns={'accommodates': 'person_capacity'})


In [6]:
# Normalise host_is_superhost to 0/1 integers.
# The lookup accepts the 't'/'f' spelling, real booleans and their string forms.
mapping = {'t': 1, 'f': 0, True: 1, False: 0, 'True': 1, 'False': 0}

# Rows without a superhost flag are discarded before translating the values
df2.dropna(subset=['host_is_superhost'], inplace=True)
df2['host_is_superhost'] = df2['host_is_superhost'].map(mapping)

# map() leaves NaN for anything outside the lookup; those rows must go as well,
# otherwise the integer cast below would fail
df2.dropna(subset=['host_is_superhost'], inplace=True)
df2['host_is_superhost'] = df2['host_is_superhost'].astype(int)


In [7]:
# Create multi and biz columns based on host listings count.
#
# The two flags were previously computed with the same `> 1` test, which made
# `biz` an exact copy of `multi`. The reference dataset these columns mirror
# defines them as disjoint host segments:
#   multi -> host has 2 to 4 listings
#   biz   -> host has more than 4 listings
# NOTE(review): thresholds follow the published variable description of the
# European-cities Airbnb dataset; confirm that all_cities.csv uses the same ones.
# A missing count compares False on both tests and therefore yields 0 / 0.
df2['multi'] = ((df2['host_listings_count'] > 1) & (df2['host_listings_count'] <= 4)).astype(int)
df2['biz'] = (df2['host_listings_count'] > 4).astype(int)


In [8]:
# Cleanliness rating: take it from review_scores_cleanliness, round to one
# decimal place and record a missing score as 0
df2 = df2.rename(columns={'review_scores_cleanliness': 'cleanliness_rating'})
df2['cleanliness_rating'] = df2['cleanliness_rating'].round(1).fillna(0)


In [9]:
# Guest satisfaction overall.
# NOTE(review): the values come from host_acceptance_rate, a different measure
# than a guest rating - confirm this substitution is intended.
df2 = df2.rename(columns={'host_acceptance_rate': 'guest_satisfaction_overall'})
df2.dropna(subset=['guest_satisfaction_overall'], inplace=True)

# Strip the literal percent sign and store the remainder as a float
df2['guest_satisfaction_overall'] = (
    df2['guest_satisfaction_overall']
    .str.replace('%', '', regex=False)
    .astype(float)
)


In [10]:
import numpy as np

# Add the distance and index features as all-NaN placeholder columns
new_columns = ['dist', 'metro_dist', 'attr_index', 'attr_index_norm', 'rest_index', 'rest_index_norm']
df2 = df2.assign(**dict.fromkeys(new_columns, np.nan))


In [11]:
# Keep only listings whose host_location names one of the supported cities
allowed_cities = [
    'amsterdam', 'athens', 'barcelona', 'berlin', 'budapest',
    'vienna', 'lisbon', 'london', 'paris', 'rome',
]

# Split host_location once on its first comma: the left part is used as the city,
# the right part (when any row has one) as the country
location_split = df2['host_location'].str.split(',', n=1, expand=True)
df2['city'] = location_split[0].str.strip().str.lower()
if len(location_split.columns) > 1:
    df2['country'] = location_split[1].str.strip()
else:
    df2['country'] = None

# NOTE(review): this filter runs before the country -> city fallback in the next
# cell, so every row reaching that cell already has an allowed city and the
# fallback can never fire. Confirm the intended order.
df2 = df2.loc[df2['city'].isin(allowed_cities)]
df2.dropna(subset=['city'], inplace=True)


In [12]:
import pandas as pd

# Resolve each listing's city, falling back to a per-country default city
allowed_cities = [
    'amsterdam', 'athens', 'barcelona', 'berlin', 'budapest',
    'vienna', 'lisbon', 'london', 'paris', 'rome',
]

# Default city used when only the country part of host_location is recognised
country_to_city = {
    'netherlands': 'amsterdam',
    'greece': 'athens',
    'spain': 'barcelona',
    'germany': 'berlin',
    'hungary': 'budapest',
    'austria': 'vienna',
    'portugal': 'lisbon',
    'united kingdom': 'london',
    'france': 'paris',
    'italy': 'rome'
}

# Re-split host_location; both parts are trimmed and lower-cased for the lookups
location_split = df2['host_location'].str.split(',', n=1, expand=True)
temp_city = location_split[0].str.strip().str.lower()
if len(location_split.columns) > 1:
    temp_country = location_split[1].str.strip().str.lower()
else:
    temp_country = pd.Series([None] * len(df2))


def finalize_location(row_idx):
    """Return the (city, country) pair for the row at position ``row_idx``.

    The row's own city wins when it is an allowed city; otherwise the country
    is looked up in ``country_to_city``. Rows matching neither give
    ``(None, None)``. Lookup is positional (``iloc``), not label based.
    """
    city = temp_city.iloc[row_idx]
    country = temp_country.iloc[row_idx]

    if city in allowed_cities:
        return city, country
    if country in country_to_city:
        return country_to_city[country], country
    return None, None


# One (city, country) tuple per row, in row order; unresolved rows are dropped
df2[['city', 'country']] = list(map(finalize_location, range(len(df2))))
df2.dropna(subset=['city'], inplace=True)


In [13]:
import pandas as pd

# Drop rows whose city or country is one of the unwanted tokens
unwanted = ['ga', 'in', 'canada', 'ny']

bad_city = df2['city'].str.lower().isin(unwanted)
# a missing country is compared as the empty string, so it is never "unwanted"
bad_country = df2['country'].str.lower().fillna('').isin(unwanted)
df2 = df2[~(bad_city | bad_country)]

print(f"Total remaining rows: {len(df2)}")


Total remaining rows: 196304


In [14]:
# Parse calendar_last_scraped, label each row weekday/weekend, then discard the date
df2['calendar_last_scraped'] = pd.to_datetime(df2['calendar_last_scraped'], format='mixed', dayfirst=False)

# dayofweek counts Monday as 0, so 5 and 6 are Saturday and Sunday
df2['day_type'] = (
    df2['calendar_last_scraped'].dt.dayofweek
    .ge(5)
    .map({True: 'weekend', False: 'weekday'})
)
df2['is_weekend'] = df2['day_type'].eq('weekend').astype('int64')

df2 = df2.drop(columns=['calendar_last_scraped'])


In [15]:
import pandas as pd

# Build one 0/1 column per amenity of interest from the free-text amenities field
target_amenities = ['wifi', 'kitchen', 'air_conditioning', 'parking', 'tv', 'heating']

for amenity in target_amenities:
    # column names use underscores, the text being searched uses spaces
    search_term = amenity.replace('_', ' ')
    # case-insensitive substring match; a missing amenities value counts as absent
    mentions_amenity = df2['amenities'].str.contains(search_term, case=False, na=False)
    df2[amenity] = mentions_amenity.astype(int)


In [16]:
import pandas as pd

# The raw amenities text is no longer needed once the flag columns exist
df2 = df2.drop(columns=['amenities'])


In [17]:
# Report df2's (rows, columns) after the preceding cleaning steps
print(f"df2 shape: {df2.shape}")


df2 shape: (196304, 32)


In [18]:
import pandas as pd

# Compare the column sets of the two frames before stacking them
df_columns, df2_columns = set(df.columns), set(df2.columns)

common_cols = df_columns & df2_columns
only_in_df = df_columns - df2_columns
only_in_df2 = df2_columns - df_columns

print(f"Common columns: {len(common_cols)}")
print(f"Only in df: {only_in_df}")
print(f"Only in df2: {only_in_df2}")


Common columns: 30
Only in df: set()
Only in df2: {'host_listings_count', 'host_location'}


In [19]:
import pandas as pd

# Stack df2 underneath df and renumber the rows from 0.
# Columns present in only one of the frames are NaN-filled for the other's rows.
final_df = pd.concat([df, df2], axis=0, ignore_index=True)
print(f"Total rows: {final_df.shape[0]}")


Total rows: 251011


In [20]:
import pandas as pd

# Report the merged frame's shape and its total number of null cells
# (dtypes are not inspected here; only shape and null count are printed)
print(f"Shape: {final_df.shape}")
print(f"Total null values: {final_df.isnull().sum().sum()}")


Shape: (251011, 32)
Total null values: 1597930


In [23]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

# Impute amenities features using KNN
#
# Rows missing any amenity flag are filled from their 3 nearest neighbours among
# a fixed sample of fully-populated rows, working in standardized feature space.
# Only the amenity columns are written back; the helper features merely steer
# the distance computation and are left exactly as they were.
amenities_cols = ['wifi', 'kitchen', 'air_conditioning', 'parking', 'tv', 'heating']

for col in amenities_cols:
    final_df[col] = pd.to_numeric(final_df[col], errors='coerce')

core_cols = ['realSum', 'person_capacity', 'bedrooms', 'longitude', 'latitude']
# dict.fromkeys de-duplicates like set() would, but keeps a stable column order
all_num_cols = list(dict.fromkeys(core_cols + amenities_cols))

df_with_nulls = final_df[final_df[amenities_cols].isnull().any(axis=1)].copy()
df_no_nulls = final_df[final_df[amenities_cols].notnull().all(axis=1)].copy()

if len(df_with_nulls) == 0:
    print("No nulls found in amenities columns")
else:
    scaler = StandardScaler()
    scaler.fit(df_no_nulls[all_num_cols])

    imputer = KNNImputer(n_neighbors=3, weights='distance')
    batch_size = 1000
    imputed_list = []

    # The seed is fixed, so every batch would draw the very same rows: sample once.
    # Capping n keeps sample() from raising when fewer complete rows are available.
    reference = df_no_nulls[all_num_cols].sample(n=min(1000, len(df_no_nulls)), random_state=42)

    print(f"Imputing {len(df_with_nulls)} rows...")

    for i in tqdm(range(0, len(df_with_nulls), batch_size), desc="Imputing"):
        current_batch = df_with_nulls.iloc[i : i + batch_size][all_num_cols]
        combined = pd.concat([current_batch, reference])

        combined_scaled = scaler.transform(combined)
        imputed_scaled = imputer.fit_transform(combined_scaled)
        imputed_unscaled = scaler.inverse_transform(imputed_scaled)
        # the batch rows come first in `combined`; strip the reference rows again
        imputed_list.append(imputed_unscaled[:len(current_batch)])

    imputed_final_array = np.vstack(imputed_list)

    # Write back the amenity columns only. Assigning every column would replace
    # the helper features with their scale/inverse-scale round trip and silently
    # fill any gaps in them (e.g. bedrooms) with unrounded KNN estimates.
    amenity_positions = [all_num_cols.index(c) for c in amenities_cols]
    df_with_nulls[amenities_cols] = imputed_final_array[:, amenity_positions]

    # Neighbour averages are fractional; threshold them back to 0/1 flags
    for c in amenities_cols:
        df_with_nulls[c] = (df_with_nulls[c] >= 0.5).astype(int)

    # Both parts kept their original row labels, so sorting restores the order
    final_df = pd.concat([df_with_nulls, df_no_nulls]).sort_index()

    print(f"Remaining nulls: {final_df[amenities_cols].isna().sum().sum()}")


Imputing 51707 rows...


Imputing: 100%|██████████| 52/52 [00:12<00:00,  4.10it/s]


Remaining nulls: 0


In [24]:
# Per-column null count for the amenity flags after imputation
print(final_df[amenities_cols].isnull().sum())


wifi                0
kitchen             0
air_conditioning    0
parking             0
tv                  0
heating             0
dtype: int64


In [25]:
# Row count after the amenity imputation, for comparison with the merged total
print(f"Total rows: {len(final_df)}")


Total rows: 251011


In [26]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

# Impute distance features using KNN
#
# Rows missing any distance/index feature are filled from their 3 nearest
# neighbours among a fixed sample of fully-populated rows, working in
# standardized feature space. Only the distance/index columns are written back;
# the helper features merely steer the distance computation.
dist_cols = ['dist', 'metro_dist', 'attr_index', 'attr_index_norm', 'rest_index', 'rest_index_norm']

for col in dist_cols:
    final_df[col] = pd.to_numeric(final_df[col], errors='coerce')

core_cols = ['realSum', 'longitude', 'latitude', 'person_capacity']
# dict.fromkeys de-duplicates like set() would, but keeps a stable column order
all_num_cols = list(dict.fromkeys(core_cols + dist_cols))

df_with_nulls = final_df[final_df[dist_cols].isnull().any(axis=1)].copy()
df_no_nulls = final_df[final_df[dist_cols].notnull().all(axis=1)].copy()

if len(df_with_nulls) == 0:
    print("No nulls in distance columns")
else:
    scaler = StandardScaler()
    scaler.fit(df_no_nulls[all_num_cols])

    imputer = KNNImputer(n_neighbors=3, weights='distance')
    batch_size = 2000
    imputed_list = []

    # The seed is fixed, so every batch would draw the very same rows: sample once.
    # Capping n keeps sample() from raising when fewer complete rows are available.
    reference = df_no_nulls[all_num_cols].sample(n=min(1500, len(df_no_nulls)), random_state=42)

    print(f"Imputing {len(df_with_nulls)} rows...")

    for i in tqdm(range(0, len(df_with_nulls), batch_size), desc="Imputing"):
        current_batch = df_with_nulls.iloc[i : i + batch_size][all_num_cols]
        combined = pd.concat([current_batch, reference])

        combined_scaled = scaler.transform(combined)
        imputed_scaled = imputer.fit_transform(combined_scaled)
        imputed_unscaled = scaler.inverse_transform(imputed_scaled)
        # the batch rows come first in `combined`; strip the reference rows again
        imputed_list.append(imputed_unscaled[:len(current_batch)])

    imputed_final_array = np.vstack(imputed_list)

    # Write back the distance/index columns only. Assigning every column would
    # replace realSum, the coordinates and person_capacity with their
    # scale/inverse-scale round trip and silently fill any gaps in them.
    dist_positions = [all_num_cols.index(c) for c in dist_cols]
    df_with_nulls[dist_cols] = imputed_final_array[:, dist_positions]

    # Both parts kept their original row labels, so sorting restores the order
    final_df = pd.concat([df_with_nulls, df_no_nulls]).sort_index()

    print(f"Remaining nulls: {final_df[dist_cols].isna().sum().sum()}")


Imputing 196304 rows...


Imputing: 100%|██████████| 99/99 [01:05<00:00,  1.51it/s]


Remaining nulls: 0


In [27]:
# Row count and total null cells after the distance-feature imputation
print(f"Total rows: {len(final_df)}")
print(f"Total nulls: {final_df.isnull().sum().sum()}")


Total rows: 251011
Total nulls: 109864


In [None]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

# Impute bedrooms and beds using KNN, cast the count/flag columns to int and
# write the final CSV.
#
# Rows missing bedrooms or beds are filled from their 3 nearest neighbours among
# a fixed sample of fully-populated rows, working in standardized feature space.
# Only the two target columns are written back; the helper features merely
# steer the distance computation.
target_cols = ['bedrooms', 'beds']
helper_cols = ['person_capacity', 'realSum', 'longitude', 'latitude']
all_cols = target_cols + helper_cols

df_with_nulls = final_df[final_df[target_cols].isnull().any(axis=1)].copy()
df_no_nulls = final_df[final_df[target_cols].notnull().all(axis=1)].copy()

if len(df_with_nulls) > 0:
    print(f"Imputing {len(df_with_nulls)} rows...")

    scaler = StandardScaler()
    scaler.fit(df_no_nulls[all_cols])

    imputer = KNNImputer(n_neighbors=3, weights='distance')
    imputed_list = []
    batch_size = 100

    # The seed is fixed, so every batch would draw the very same rows: sample once.
    # Capping n keeps sample() from raising when fewer complete rows are available.
    reference = df_no_nulls[all_cols].sample(n=min(2000, len(df_no_nulls)), random_state=42)

    for i in tqdm(range(0, len(df_with_nulls), batch_size), desc="Imputing"):
        batch = df_with_nulls.iloc[i : i + batch_size][all_cols]
        combined = pd.concat([batch, reference])

        combined_scaled = scaler.transform(combined)
        imputed_scaled = imputer.fit_transform(combined_scaled)
        imputed_unscaled = scaler.inverse_transform(imputed_scaled)
        # the batch rows come first in `combined`; strip the reference rows again
        imputed_list.append(imputed_unscaled[:len(batch)])

    # all_cols starts with target_cols, so the leading columns are the targets.
    # Writing back just those leaves the helper features untouched instead of
    # replacing them with their scale/inverse-scale round trip.
    imputed_targets = np.vstack(imputed_list)[:, :len(target_cols)]
    df_with_nulls[target_cols] = np.rint(imputed_targets).astype(int)

    # Both parts kept their original row labels, so sorting restores the order
    final_df = pd.concat([df_with_nulls, df_no_nulls]).sort_index()
else:
    print("No nulls in bedrooms/beds columns")

# Room counts and amenity flags are whole numbers; store them as integers
to_int_final = ['bedrooms', 'beds', 'wifi', 'kitchen', 'air_conditioning', 'parking', 'tv', 'heating']
final_df[to_int_final] = final_df[to_int_final].astype(int)

print(f"Total nulls remaining: {final_df.isnull().sum().sum()}")

# Save final output — outputs to the app data folder
# (the quoted "__file__" makes the path resolve relative to the working directory)
output_path = os.path.join(os.path.dirname(os.path.abspath("__file__")), "..", "app", "data", "airbnb_listings_clean.csv")
final_df.to_csv(output_path, index=False)
print("Data saved successfully")


No nulls in bedrooms/beds columns
Total nulls remaining: 109414
Data saved successfully


In [None]:
import pandas as pd

# Read the saved CSV back and report its shape and total null count
file_path = os.path.join(
    os.path.dirname(os.path.abspath("__file__")),
    "..", "app", "data", "airbnb_listings_clean.csv",
)
final_df = pd.read_csv(file_path)

print(f"Shape: {final_df.shape}")
print(f"Total nulls: {int(final_df.isna().to_numpy().sum())}")


Shape: (251011, 32)
Total nulls: 109414


  final_df = pd.read_csv(file_path)
