In [14]:
import pandas as pd
import os

# Path to the extracted dataset folder.
# NOTE(review): the '../ML-Exam/' prefix only resolves when the working
# directory is (or sits next to) a folder named 'ML-Exam' -- confirm before
# running from another location.
folder_path = '../ML-Exam/airbnb-prices-in-european-cities/'

# List all CSV files. os.listdir() returns entries in arbitrary order, so the
# names are sorted to keep the row order of the combined file reproducible.
csv_files = sorted(f for f in os.listdir(folder_path) if f.endswith('.csv'))

if not csv_files:
    # Fail early with a clear message instead of pd.concat's
    # "No objects to concatenate" further down.
    raise FileNotFoundError(f"No CSV files found in '{folder_path}'")

# Per-file DataFrames, collected here and concatenated once after the loop
combined_df = []

for file in csv_files:
    file_path = os.path.join(folder_path, file)

    # Load CSV
    df = pd.read_csv(file_path)

    # Extract city and weekend flag from the filename (extension removed)
    filename_no_ext = os.path.splitext(file)[0]

    # Split city and weekday/weekend info. Only the trailing suffix is cut off,
    # so a city name that happens to contain the same text stays intact.
    if filename_no_ext.endswith('_weekends'):
        city = filename_no_ext[:-len('_weekends')].capitalize()
        is_weekend = True
    elif filename_no_ext.endswith('_weekdays'):
        city = filename_no_ext[:-len('_weekdays')].capitalize()
        is_weekend = False
    else:
        # No recognised suffix: keep the whole name as the city and leave the
        # weekend flag missing.
        city = filename_no_ext.capitalize()
        is_weekend = pd.NA

    # Tag every row of this file with its origin
    df['City'] = city
    df['Is_weekend'] = is_weekend

    combined_df.append(df)

# Concatenate all city data into one frame with a fresh 0..n-1 index
final_df = pd.concat(combined_df, ignore_index=True)

# Save to a new CSV, creating the output folder first if it does not exist yet
output_path = os.path.join('data', 'combined_airbnb_data.csv')
os.makedirs(os.path.dirname(output_path), exist_ok=True)
final_df.to_csv(output_path, index=False)


In [16]:
# Load the combined dataset back from disk; the path is relative to the
# current working directory.
df = pd.read_csv('data/combined_airbnb_data.csv')

In [18]:
# Print a summary of the loaded frame: row count, columns, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51707 entries, 0 to 51706
Data columns (total 22 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Unnamed: 0                  51707 non-null  int64  
 1   realSum                     51707 non-null  float64
 2   room_type                   51707 non-null  object 
 3   room_shared                 51707 non-null  bool   
 4   room_private                51707 non-null  bool   
 5   person_capacity             51707 non-null  float64
 6   host_is_superhost           51707 non-null  bool   
 7   multi                       51707 non-null  int64  
 8   biz                         51707 non-null  int64  
 9   cleanliness_rating          51707 non-null  float64
 10  guest_satisfaction_overall  51707 non-null  float64
 11  bedrooms                    51707 non-null  int64  
 12  dist                        51707 non-null  float64
 13  metro_dist                  517

In [20]:
import os
# Show the kernel's working directory; the relative paths used in this
# notebook are resolved against it.
os.getcwd()

'C:\\Users\\Emili\\Documents\\Github\\ML-Exam'

In [22]:
def clean_and_encode_airbnb_data(df: pd.DataFrame) -> pd.DataFrame:
    """
    Return a cleaned, one-hot encoded copy of an Airbnb listings frame.

    The input frame is not modified. The function:
      1. drops the location/attraction/restaurant index columns listed below
         (columns that are not present are skipped),
      2. renames 'Unnamed: 0' to 'ID' when that column exists,
      3. replaces every bool-dtype column with a '<name>_False' / '<name>_True'
         pair of indicator columns.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to clean.

    Returns
    -------
    pd.DataFrame
        New frame with the untouched columns first, followed by the indicator
        columns generated for the boolean columns.
    """
    # Drop specified columns
    columns_to_drop = [
        'lng', 'lat', 'attr_index', 'attr_index_norm', 'rest_index', 'rest_index_norm'
    ]
    # errors='ignore' skips labels that are absent instead of raising KeyError,
    # so the function also works on frames where some of these were already removed.
    df_cleaned = df.drop(columns=columns_to_drop, errors='ignore')

    # Rename 'Unnamed: 0' to 'ID' (rename is a no-op when the column is missing)
    df_cleaned = df_cleaned.rename(columns={'Unnamed: 0': 'ID'})

    # Identify boolean columns
    bool_cols = df_cleaned.select_dtypes(include='bool').columns.tolist()

    # Apply one-hot encoding to boolean columns; drop_first=False keeps both
    # the _False and the _True indicator for each of them.
    df_encoded = pd.get_dummies(df_cleaned, columns=bool_cols, drop_first=False)

    return df_encoded

# Run the cleaning/encoding function on the combined frame loaded earlier
encoded_df = clean_and_encode_airbnb_data(df)

# Show the resulting column names (the bare last expression is the cell output)
encoded_df.columns


Index(['ID', 'realSum', 'room_type', 'person_capacity', 'multi', 'biz',
       'cleanliness_rating', 'guest_satisfaction_overall', 'bedrooms', 'dist',
       'metro_dist', 'City', 'room_shared_False', 'room_shared_True',
       'room_private_False', 'room_private_True', 'host_is_superhost_False',
       'host_is_superhost_True', 'Is_weekend_False', 'Is_weekend_True'],
      dtype='object')

In [24]:
import pandas as pd
import os

def clean_encode_and_save_new(input_filepath: str, output_filename: str):
    """
    Clean and one-hot encode an Airbnb CSV, saving the result beside the input.

    The file at `input_filepath` is read, the unwanted index/coordinate columns
    are removed (those that are absent are skipped), 'Unnamed: 0' is renamed to
    'ID' if present, and every boolean column is expanded into indicator
    columns. The result is written, without the DataFrame index, to
    `output_filename` inside the folder that contains `input_filepath`, and a
    confirmation line with the output path is printed.
    """
    unwanted = ('lng', 'lat', 'attr_index', 'attr_index_norm', 'rest_index', 'rest_index_norm')

    raw = pd.read_csv(input_filepath)

    # errors='ignore' skips labels that are not in the frame; rename leaves the
    # frame unchanged when 'Unnamed: 0' does not exist.
    trimmed = (
        raw
        .drop(columns=list(unwanted), errors='ignore')
        .rename(columns={'Unnamed: 0': 'ID'})
    )

    # Expand each bool-dtype column into its indicator columns
    flag_columns = list(trimmed.select_dtypes(include='bool').columns)
    encoded = pd.get_dummies(trimmed, columns=flag_columns, drop_first=False)

    # The output lives in the same directory as the input file
    target_dir, _ = os.path.split(input_filepath)
    output_filepath = os.path.join(target_dir, output_filename)
    encoded.to_csv(output_filepath, index=False)

    print(f"Cleaned and encoded data saved to '{output_filepath}'")

# Example usage: clean the combined CSV and write the result as
# 'cleaned_airbnb_data.csv' in the same folder as the input file.
clean_encode_and_save_new(
    '../ML-Exam/data/combined_airbnb_data.csv',
    'cleaned_airbnb_data.csv'
)


Cleaned and encoded data saved to '../ML-Exam/data\cleaned_airbnb_data.csv'
