[Reference](https://drlee.io/pca-and-kmeans-in-data-mining-a-case-study-with-airbnb-listings-aacf8f76a668)

# Dataset Overview

In [7]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

# Location of the Airbnb listings CSV, held in a named constant so the data
# source is easy to find and swap.
AIRBNB_CSV_URL = 'https://raw.githubusercontent.com/fenago/datasets/main/airbnb.csv'

# Download the CSV and parse it into the working DataFrame.
df = pd.read_csv(AIRBNB_CSV_URL)

# Data Preprocessing Steps

In [8]:
# Convert categorical columns to integer codes so that every feature is numeric.
# NOTE: LabelEncoder assigns codes in sorted class order, which carries no
# real-world meaning for nominal columns such as 'neighbourhood'.
label_encoder = LabelEncoder()

# Columns to label encode
label_encode_columns = ['host_is_superhost', 'neighbourhood', 'property_type', 'instant_bookable']

# The same encoder instance is refit for each column, so once the loop ends it
# only holds the classes of the last column in the list.
for column in label_encode_columns:
    df[column] = label_encoder.fit_transform(df[column])

# One-hot encode 'city': the column is replaced by one indicator column per value
df = pd.get_dummies(df, columns=['city'])

# Impute the remaining missing values with each column's mean. This has to run
# after the encoding above, because a mean is only defined for numeric columns.
# numeric_only=True keeps the pandas < 2.0 behaviour of skipping any column that
# is still non-numeric; pandas >= 2.0 would otherwise raise TypeError.
df = df.fillna(df.mean(numeric_only=True))

# Standardize every column to zero mean and unit variance. The fitted scaler is
# kept so that results can be mapped back to the original units later.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(df)

# PCA Application


In [9]:
# Project the standardized features onto the first 3 principal components.
# random_state is fixed (the same seed the KMeans step uses) because
# svd_solver='auto' may select the randomized SVD solver for larger inputs,
# which would otherwise make the projection differ between runs. The argument
# has no effect when a deterministic solver is chosen.
pca = PCA(n_components=3, random_state=42)
pca_result = pca.fit_transform(scaled_features)

# KMeans Clustering

In [10]:
# Cluster the observations in the 3-dimensional PCA space into 4 groups.
# n_init is set explicitly: its default changed from 10 to 'auto' in
# scikit-learn 1.4 (with a FutureWarning in 1.2/1.3), so leaving it implicit
# makes the resulting clusters depend on the installed version. 10 restarts is
# the long-standing default and keeps the best of the runs by inertia.
kmeans_pca = KMeans(n_clusters=4, n_init=10, random_state=42)
kmeans_pca.fit(pca_result)



# Analysis of PCA Components

In [11]:
# Grab the fitted principal axes from the PCA model: one row per component and
# one column per input feature. They are referred to as "loadings" in the rest
# of this notebook; each entry weights a standardized feature in that component.
pca_components = pca.components_

# Inverse Transformation

In [13]:
# Map the cluster centers back to the original feature units in two steps:
# PCA space -> standardized feature space -> original scale.
original_space_centroids = scaler.inverse_transform(pca.inverse_transform(kmeans_pca.cluster_centers_))

# One row per cluster, labelled with the same columns the scaler was fitted on
centroids_df = pd.DataFrame(original_space_centroids, columns=df.columns)

# Per-feature mean of the (encoded, imputed) data, used as a baseline to
# compare each centroid against
original_means = df.mean(axis=0)

# Loadings table: one row per principal component, one column per feature.
# The row labels are derived from the number of components actually present so
# they stay correct if n_components is changed in the PCA step.
pca_loadings_df = pd.DataFrame(
    pca_components,
    columns=df.columns,
    index=[f'PC{i+1}' for i in range(pca_components.shape[0])],
)

# Centroid Analysis

In [15]:
# Add the overall feature means as a final row beneath the cluster centroids so
# each centroid can be read against the dataset average.
# DataFrame.append was removed in pandas 2.0, so the Series of means is turned
# into a one-row frame (columns aligned by feature name) and concatenated.
centroids_comparison_df = pd.concat(
    [centroids_df, original_means.to_frame().T],
    ignore_index=True,
)

# Store the PCA loadings and centroids comparison DataFrame for further analysis.
# NOTE: to_csv does not create missing directories, so /mnt/data must already exist.
pca_loadings_df.to_csv('/mnt/data/pca_loadings.csv', index=True)
centroids_comparison_df.to_csv('/mnt/data/centroids_comparison.csv', index=False)

pca_loadings_df, centroids_comparison_df.head()  # Displaying the PCA loadings and the first few rows of the centroids comparison DataFrame