In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, PowerTransformer

In [None]:
# Read the London listings CSV into the working dataframe
df = pd.read_csv('London_Listings.csv')

# Size of the raw data as (rows, columns), before any cleaning
print('Original Shape:', df.shape)

# Preview the first five rows
df.head(n=5)

In [None]:
# Inspect each column's dtype first: the categorical/numerical split that follows
# is made purely by dtype (object vs. int64/float64).
print(df.dtypes)

In [None]:
# Categorical and numerical column names, split by dtype.
# Only the names are kept (not DataFrame copies of the data), so the prints below
# list the variables instead of dumping every row. Iterating these lists yields
# the same column names that iterating the selected DataFrames would.
# NOTE(review): 'price' is presumably still text at this point (it is parsed in a
# later cell), so it lands in categorical_var rather than numerical_var.
categorical_var = df.select_dtypes(include=['object']).columns.tolist()
numerical_var = df.select_dtypes(include=['int64', 'float64']).columns.tolist()

print("Categorical Variables:", categorical_var)
print("Numerical Variables:", numerical_var)

In [None]:
# Price column: strip the currency symbol and thousands separators, then convert
# to a numeric dtype. Values that cannot be parsed become NaN (errors='coerce').
# Casting to str first keeps this cell safe to re-run: once 'price' is numeric the
# `.str` accessor alone would raise AttributeError.
price_text = (
    df['price']
    .astype(str)
    .str.replace('$', '', regex=False)
    .str.replace(',', '', regex=False)
)
df['price'] = pd.to_numeric(price_text, errors='coerce')

# Summary statistics as a sanity check on the conversion
df['price'].describe()


In [None]:
# Treat blank strings and empty-list literals in the categorical columns as missing (NaN).
for col in list(categorical_var):
    df[col] = df[col].replace(to_replace=['', '[]'], value=np.nan)

# Count nulls per column and report only the columns that have any.
missing_col_val = df.isna().sum()
print('Missing column values:\n', missing_col_val.loc[missing_col_val.gt(0)])

In [None]:
# Columns not needed for the analysis (this list can be revisited later).
drop_Columns = ['calendar_last_scraped', 'bathrooms_text', 'latitude', 'longitude']

# errors='ignore' skips any listed name that is not present in the frame.
df = df.drop(columns=drop_Columns, errors='ignore')

# Shape once the columns are gone
print("Shape after dropping unnecessary columns:", df.shape)

In [None]:
# Remove exact duplicate rows, then every row that still has a missing value
# in any column.
df = df.drop_duplicates().dropna()

# Shape after both filters
print('Shape after dropping duplicates and NaN rows:', df.shape)

In [None]:
# Remove outliers (ONLY RUN ONCE: each re-run filters the already-filtered dataframe again)

def remove_outliers(df, columns, factor=1.5):
    """Drop rows that fall outside the IQR fences of any of the given columns.

    Columns are processed in order, and each column's quartiles are computed on
    the rows that survived the previous columns, so the result can depend on
    the order of ``columns``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter. It is not modified; a filtered frame is returned.
    columns : iterable of str
        Column names to filter on. Names that are missing from ``df`` and
        columns that are not numeric are reported and skipped.
    factor : float, default 1.5
        Multiple of the IQR used for the fences:
        ``[Q1 - factor * IQR, Q3 + factor * IQR]`` (both ends inclusive).

    Returns
    -------
    pandas.DataFrame
        Rows whose value lies within the fences for every processed column.
        Rows with NaN in a processed column are dropped too, because NaN fails
        the range comparison.
    """
    for col in columns:
        # The caller's column list may be stale and name columns that have
        # since been dropped from the frame.
        if col not in df.columns:
            print(f"Column '{col}' does not exist. ")
            continue

        # Quantiles are only meaningful for numeric data; skip text/boolean
        # columns instead of letting quantile() raise.
        values = df[col]
        if not pd.api.types.is_numeric_dtype(values) or pd.api.types.is_bool_dtype(values):
            print(f"Column '{col}' is not numeric, skipped.")
            continue

        # Calculate IQR and bounds
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        iqr = q3 - q1
        # When iqr is 0 both fences equal the quartile value, so only rows
        # equal to that value are kept.
        l_bound = q1 - factor * iqr
        u_bound = q3 + factor * iqr

        # Keep only the rows inside the fences (inclusive on both ends)
        df = df[values.between(l_bound, u_bound)]

    return df

# Filter on the numeric columns the dataframe has *now*. The earlier
# `numerical_var` selection was taken before 'price' was converted from text and
# before columns were dropped, so it leaves 'price' unfiltered and still names
# columns that no longer exist.
outlier_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
df = remove_outliers(df, outlier_cols)

# Display the updated shape
print('Shape after removing outliers:', df.shape)


In [None]:
# Log-transformed price, log(1 + price), stored next to the raw price
df = df.assign(price_log=np.log1p(df['price']))

# Numerical features that will be standardised ('price_log' is left as-is)
numerical_var = [
    'accommodates',
    'bathrooms',
    'bedrooms',
    'beds',
    'price',
    'number_of_reviews',
    'review_scores_rating',
]

# Snapshot taken BEFORE scaling, so an unscaled version of the data stays available.
df_original = df.copy()

# Standardise the numerical features in the working dataframe
scaler = StandardScaler()
scaled_values = scaler.fit_transform(df[numerical_var])
df[numerical_var] = scaled_values

print(df.head())


In [None]:
# Prices Across Neighborhoods

# Figure and axes at the chosen size
fig, ax = plt.subplots(figsize=(12, 6))

# One box of log prices per neighbourhood, drawn from the unscaled snapshot
sns.boxplot(data=df_original, x='neighbourhood', y='price_log', ax=ax)

# Rotate the x tick labels by 90 degrees (vertical text)
ax.tick_params(axis='x', labelrotation=90)

# Title and axis labels
ax.set_title('Price Distribution Across Neighborhoods', fontsize=16)
ax.set_xlabel('Neighborhood', fontsize=12)
ax.set_ylabel('Log Price', fontsize=12)

# Fit everything inside the figure and render it
fig.tight_layout()
plt.show()

In [None]:
# Prices Across Number of Possible Tenants

# Figure and axes at the chosen size
fig, ax = plt.subplots(figsize=(12, 6))

# One box of log prices per 'accommodates' value, drawn from the unscaled snapshot
sns.boxplot(data=df_original, x='accommodates', y='price_log', ax=ax)

# Title and axis labels
ax.set_title('Price Distribution by Number of Tenants', fontsize=16)
ax.set_xlabel('Number of Tenants (Accommodates)', fontsize=12)
ax.set_ylabel('Log Price', fontsize=12)

# Fit everything inside the figure and render it
fig.tight_layout()
plt.show()