# Data Cleaning (Notebook 1 of 4) 

In [3]:
# Importing libraries
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from mpl_toolkits.mplot3d import Axes3D

In [6]:
# Read the raw cadastre export into a DataFrame and preview its first rows
raw_csv_path = '../1_raw_data/cadastre.csv'
data = pd.read_csv(raw_csv_path)
data.head(n=5)

Unnamed: 0,osm_id,ncc_owned,osm_way_id,house_area,house_peri,house_vert,private,house_lati,NATIONALCA,land_area,land_perim,land_verti,land_longi,land_latit,COUNT,house_long,b2lpercent
0,35546036,1,35546036,65,35,7,yes,6971770.0,,65.0,35.0,7.0,-128246.3195,6971771.0,4,-128248.8,100.0
1,35546037,1,35546037,65,34,8,yes,6971743.0,,65.0,35.0,8.0,-128272.9071,6971744.0,4,-128275.85,100.0
2,35551461,1,35551461,65,35,7,yes,6971805.0,,62.0,34.0,8.0,-128283.5621,6971806.0,4,-128286.18,105.0
3,35551462,1,35551462,66,34,8,yes,6971754.0,29324493.0,207.0,62.0,12.0,-128367.6972,6971752.0,1,-128372.04,32.0
4,35551463,1,35551463,72,35,9,yes,6971806.0,,63.0,35.0,8.0,-128339.0663,6971806.0,4,-128340.8,114.0


In [None]:
# Drop every row that is missing either 'NATIONALCA' or 'osm_way_id', so that each
# remaining polygon carries both identifiers (i.e. is associated with a building and a piece of land)
has_national_ca = data['NATIONALCA'].notna()
has_osm_way_id = data['osm_way_id'].notna()
data = data[has_national_ca & has_osm_way_id]
data.head(n=5)

In [None]:
# Normalise 'private' to a yes/no flag: every value that is not exactly 'yes'
# (missing values included) is replaced with 'no'.
index = data['private'] == 'yes'
# `data` is the result of a boolean filter, so take an explicit copy before writing to it.
data = data.copy()
# A single .loc assignment replaces the chained form `data['private'][mask] = ...`, which
# raises SettingWithCopyWarning and silently does nothing once copy-on-write is active.
data.loc[~index, 'private'] = "no"
data.head(5)

In [None]:
# Continue on an independent copy named `df`; later changes to `df` do not propagate back to `data`
df = data.copy(deep=True)

In [None]:
# Visualising the data for inspection: render the frame with a "coolwarm" colour gradient
# NOTE(review): this styles the whole frame rather than a preview - consider head()/sample() if it is large
df.style.background_gradient(cmap="coolwarm")

In [None]:
# Box plot showing how the house_area values are spread
fig, ax = plt.subplots(figsize=(12, 6), dpi=300)
sns.boxplot(x='house_area', data=df, ax=ax)
plt.show()

In [None]:
# Violin plot of the house_area column, as a second view of its spread
sns.violinplot(x='house_area', data=df)

In [None]:
# Summary statistics per column, transposed so that each feature is one row
df.describe().T

In [None]:
# Per-column dtypes and non-null counts, together with the overall number of rows and columns
df.info()

In [None]:
# Column names used as features (x): land geometry first, then house geometry, then b2lpercent
x_index = [
    'land_area',
    'land_perim',
    'land_verti',
    'land_longi',
    'land_latit',
    'house_area',
    'house_peri',
    'house_vert',
    'house_long',
    'house_lati',
    'b2lpercent',
]
# Column name used as the target (y)
# NOTE(review): 'building' is not among the columns shown in the CSV preview and y_index is
# not used anywhere in the visible notebook - confirm where this column comes from
y_index = ['building']

In [None]:
# Pairwise correlation between the selected feature columns, shown as a colour-graded table
corr_data = df.loc[:, x_index].corr()
corr_data.style.background_gradient(cmap="coolwarm")

In [None]:
# The same feature correlations drawn as a heatmap
feature_corr = df[x_index].corr()
sns.heatmap(data=feature_corr)

In [None]:
# Returning pairs of highly correlated variables (later we found this is not useful)
# Correlation is only computed on the numeric columns: `df` still holds the string column
# 'private', and pandas >= 2.0 raises on non-numeric data instead of silently skipping it.
corr_matrix = df.select_dtypes(include='number').corr()

threshold = 0.8

# Walk the strict lower triangle so each pair is reported once. Pairs with |r| == 1 are
# skipped on purpose (same condition as before); NaN correlations fail both comparisons.
corr_columns = corr_matrix.columns
high_corr_var = [
    (corr_columns[i], corr_columns[j])
    for i in range(len(corr_columns))
    for j in range(i)
    if threshold < abs(corr_matrix.iloc[i, j]) < 1
]

print(high_corr_var)


In [None]:
# Collect (column, column, correlation) triples for every pair above the threshold,
# then keep the three strongest ones
threshold = 0.8


def _abs_correlation(triple):
    """Sort key: absolute value of the correlation stored in the third slot."""
    return abs(triple[2])


# Strict lower triangle of the correlation matrix; perfectly correlated pairs are excluded
n_columns = len(corr_matrix.columns)
corr_pairs = [
    (corr_matrix.columns[row], corr_matrix.columns[col], corr_matrix.iloc[row, col])
    for row in range(n_columns)
    for col in range(row)
    if threshold < abs(corr_matrix.iloc[row, col]) < 1
]

# Strongest absolute correlation first
sorted_corr_pairs = sorted(corr_pairs, key=_abs_correlation, reverse=True)

# The first three entries are the top 3 pairs
top_3_pairs = sorted_corr_pairs[:3]

In [None]:
# Report each of the top pairs with its correlation rounded to two decimals
for first_col, second_col, corr_value in top_3_pairs:
    print(f"Columns: {first_col}, {second_col} - Correlation: {corr_value:.2f}")

In [None]:
# Selecting the columns for clustering based on these three features
# Rows with a missing value in any of the three columns are removed. drop=True discards the
# old row labels rather than inserting them as an extra 'index' column, so X contains only
# the three features.
X = (
    df[['land_verti', 'land_perim', 'house_area']]
    .dropna()
    .reset_index(drop=True)
)
X

In [None]:
# Cluster the rows of X with k-means on the three selected features.
# NOTE(review): the previous comment said "5 type of clusters" while the code requested 2;
# the value that actually ran (2) is kept here - confirm the intended number of clusters.
feature_cols = ['land_verti', 'land_perim', 'house_area']
# random_state fixes the centroid initialisation so reruns give the same labels;
# n_init is spelled out because scikit-learn's default for it changed between releases.
kmeans = KMeans(n_clusters=2, n_init=10, random_state=42)
# Fit on the feature columns only, so helper columns in X (an 'index' column added by
# reset_index(), or a 'Cluster' column left by an earlier run of this cell) never enter
# the distance computation. This also makes the cell safe to re-run.
kmeans.fit(X[feature_cols])
labels = kmeans.predict(X[feature_cols])
X['Cluster'] = labels

In [None]:
# 2D plot to see clusters (land_verti against land_perim, one colour per cluster)

# Initialize a 2D figure
fig, ax = plt.subplots(figsize=(8, 6))

# Use the labels actually present in X instead of a hard-coded count, so the colours and
# legend entries always match the number of clusters that was fitted
cluster_ids = sorted(X['Cluster'].unique())

# Color map: one evenly spaced 'jet' colour per cluster
colors = plt.cm.jet(np.linspace(0, 1, len(cluster_ids)))

# Plot each cluster
for color, cluster in zip(colors, cluster_ids):
    cluster_data = X[X['Cluster'] == cluster]
    ax.scatter(cluster_data['land_verti'], cluster_data['land_perim'],
               color=color, label=f"Cluster {cluster}")

# Add labels and title
ax.set_xlabel('land_verti')
ax.set_ylabel('land_perim')
ax.legend()
ax.set_title('2D Clusters')

# Show the plot
plt.show()

In [None]:
# Visualize the clusters in a 3D plot (land_verti, land_perim, house_area)
fig = plt.figure(figsize=(8, 6))
ax = fig.add_subplot(111, projection='3d')

# Use the labels actually present in X instead of a hard-coded count, so the colours and
# legend entries always match the number of clusters that was fitted
cluster_ids = sorted(X['Cluster'].unique())

# Color map: one evenly spaced 'jet' colour per cluster
colors = plt.cm.jet(np.linspace(0, 1, len(cluster_ids)))

# Plot each cluster
for color, cluster in zip(colors, cluster_ids):
    cluster_data = X[X['Cluster'] == cluster]
    ax.scatter(cluster_data['land_verti'], cluster_data['land_perim'], cluster_data['house_area'],
               color=color, label=f"Cluster {cluster}")

ax.set_xlabel('land_verti')
ax.set_ylabel('land_perim')
ax.set_zlabel('house_area')
ax.legend()
ax.set_title('3D Clusters')
plt.show()

In [None]:
pip install plotly

In [None]:
# Interactive 3D view of the clusters, drawn with plotly express
import plotly.express as px

def plot_3d_clusters(data, x_col, y_col, z_col, cluster_col):
    """
    Show an interactive 3D scatter plot whose points are coloured by cluster label.

    Parameters:
    - data: DataFrame holding the points to plot
    - x_col: Name of the column placed on the x-axis
    - y_col: Name of the column placed on the y-axis
    - z_col: Name of the column placed on the z-axis
    - cluster_col: Name of the column holding the cluster labels

    Returns nothing; the figure is displayed through fig.show().
    """
    # The colour scale spans exactly the smallest to the largest label value
    cluster_values = data[cluster_col]
    colour_limits = [cluster_values.min(), cluster_values.max()]

    # Each axis is titled with the name of the column it shows
    axis_titles = {
        'xaxis_title': x_col,
        'yaxis_title': y_col,
        'zaxis_title': z_col,
    }

    scatter = px.scatter_3d(
        data,
        x=x_col,
        y=y_col,
        z=z_col,
        color=cluster_col,
        color_continuous_scale='jet',
        range_color=colour_limits,
    )
    scatter.update_layout(title='3D Clusters', scene=axis_titles)
    scatter.show()

# Draw the clustered points of X
plot_3d_clusters(
    X,
    x_col='land_verti',
    y_col='land_perim',
    z_col='house_area',
    cluster_col='Cluster',
)

In [None]:
# Number of rows assigned to each cluster label
cluster_counts = X.loc[:, 'Cluster'].value_counts()
print(cluster_counts)