In [74]:
#import library
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.cluster.hierarchy import cophenet, dendrogram, linkage
from scipy.spatial.distance import pdist
from scipy.cluster.hierarchy import fcluster
import plotly.express as px

In [75]:
# Read data
df = pd.read_csv("AUDIT-Data_Original_Update.csv")

In [76]:
# Select features for clustering
features_ahc = df[['2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020', '2021', '2022', '2023', 'Latitude', 'Longitude']]


In [77]:
df.describe()

Unnamed: 0,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023,Latitude,Longitude
count,192.0,192.0,192.0,192.0,192.0,192.0,192.0,192.0,192.0,192.0,192.0,192.0,192.0,192.0,192.0
mean,4440.697917,4419.0,4497.640625,4625.838542,4591.536458,4694.854167,4825.963542,4733.322917,4741.885417,4956.895833,5206.427083,5187.994792,5250.302083,-6.539152,107.452771
std,3841.158027,3340.687564,3867.613912,3284.134816,3334.472903,3342.755743,3917.880764,3941.242747,3759.810002,3994.787103,4128.481907,4139.63267,4152.40292,0.951827,0.076721
min,1123.0,1143.0,32.0,1309.0,239.0,421.0,272.0,278.0,303.0,1453.0,1500.0,1475.0,1508.0,-6.763,107.2607
25%,2571.25,2728.25,2650.5,2875.0,2842.0,2865.5,2879.5,2779.5,2742.5,2898.75,3090.0,3021.75,3131.75,-6.66945,107.40075
50%,3482.5,3540.5,3584.0,3855.5,3819.0,3834.0,3886.0,3726.5,3859.0,3873.0,4290.0,4290.0,4224.0,-6.61715,107.4632
75%,4874.25,5144.25,5136.75,5296.75,5314.75,5485.0,5494.5,5274.75,5426.25,5412.0,5657.5,5657.5,5679.75,-6.54635,107.5132
max,36489.0,28724.0,36489.0,27234.0,26849.0,26701.0,36401.0,36401.0,34238.0,37262.0,38370.0,38400.0,38661.0,6.5378,107.5949


In [78]:
df.head()

Unnamed: 0,DESA_1,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023,Latitude,Longitude
0,NANGERANG,1801,1957,2058,2057,2317,2086,2086,2044,2097,2144,2144,2092,2207,-6.6971,107.5357
1,SIMPANG,1905,1897,1953,1943,1897,1896,1896,1925,1891,2020,2020,1934,2038,-6.6828,107.5402
2,SAKAMBANG,1401,1596,1630,1641,1583,1583,1583,1498,1592,1567,1570,1496,1623,-6.711,107.5402
3,NAGROG,3342,2343,2353,2542,2524,2417,2417,2415,2429,2580,2580,2412,2659,-6.7132,107.552
4,CIBUNTU,1427,1444,1469,1524,1574,1574,1574,1571,1622,1642,1640,1605,1700,-6.7079,107.555


In [79]:
# Choose the column for the line chart
selected_column = '2020'

In [80]:
#Reset index to make the index a column
df.reset_index(inplace=True)

In [81]:

# Calculate quartiles
quartiles = df[selected_column].quantile([0.25, 0.5, 0.75])

In [82]:

# Create subplots
fig = make_subplots(rows=1, cols=3, subplot_titles=("25th Percentile (Q1)", "50th Percentile (Q2)", "75th Percentile (Q3)"))

# Plot for 25th Percentile (Q1)
fig.add_trace(
    go.Scatter(x=df['index'], y=df[selected_column], mode='lines', name='25th Percentile'),
    row=1, col=1
)
fig.add_annotation(x=0.5, y=-0.2, showarrow=False, text=f"Q1: {quartiles[0.25]}", xref="paper", yref="paper", row=1, col=1)

# Plot for 50th Percentile (Q2)
fig.add_trace(
    go.Scatter(x=df['index'], y=df[selected_column], mode='lines', name='50th Percentile'),
    row=1, col=2
)
fig.add_annotation(x=0.5, y=-0.2, showarrow=False, text=f"Q2: {quartiles[0.5]}", xref="paper", yref="paper", row=1, col=2)

# Plot for 75th Percentile (Q3)
fig.add_trace(
    go.Scatter(x=df['index'], y=df[selected_column], mode='lines', name='75th Percentile'),
    row=1, col=3
)
fig.add_annotation(x=0.5, y=-0.2, showarrow=False, text=f"Q3: {quartiles[0.75]}", xref="paper", yref="paper", row=1, col=3)

# Update layout
fig.update_layout(height=400, width=1200, title_text=f"Quartile Information for {selected_column}")
fig.show()

In [83]:

# Selected features for correlation matrix
selected_features = ['2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020', '2021', '2022', '2023']

# Calculate correlation matrix
correlation_matrix = df[selected_features].corr()

# Plot heatmap using Plotly Express
fig = px.imshow(correlation_matrix,
                labels=dict(x="Features", y="Features", color="Correlation"),
                x=selected_features,
                y=selected_features,
                color_continuous_scale="viridis",  
                title="Heatmap Korelasi")

# Adjust the size
fig.update_layout(height=600, width=800)

# Show the plot
fig.show()


In [84]:
print("Jumlah nilai yang hilang (NaN atau None) di setiap kolom dalam DataFrame:")
print(df.isnull().sum())

print("\nInsight ke dalam kecenderungan sentral, dispersi, dan distribusi data:")
print(df.describe().T)


Jumlah nilai yang hilang (NaN atau None) di setiap kolom dalam DataFrame:
index        0
DESA_1       0
2011         0
2012         0
2013         0
2014         0
2015         0
2016         0
2017         0
2018         0
2019         0
2020         0
2021         0
2022         0
2023         0
Latitude     0
Longitude    0
dtype: int64

Insight ke dalam kecenderungan sentral, dispersi, dan distribusi data:
           count         mean          std        min         25%         50%  \
index      192.0    95.500000    55.569776     0.0000    47.75000    95.50000   
2011       192.0  4440.697917  3841.158027  1123.0000  2571.25000  3482.50000   
2012       192.0  4419.000000  3340.687564  1143.0000  2728.25000  3540.50000   
2013       192.0  4497.640625  3867.613912    32.0000  2650.50000  3584.00000   
2014       192.0  4625.838542  3284.134816  1309.0000  2875.00000  3855.50000   
2015       192.0  4591.536458  3334.472903   239.0000  2842.00000  3819.00000   
2016       192.0  4

In [85]:
X_ahc = df[['2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020', '2021', '2022', '2023']]


In [86]:
# Menentukan jumlah cluster
n_clusters = 3  # Gantilah ini dengan jumlah cluster yang Anda inginkan

In [87]:
# Menghitung Linkage Matrix dan Cophenetic Correlation Coefficient untuk metode 'single'
numeric_data = X_ahc[['2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020', '2021', '2022', '2023']]
linkage_matrix_single = linkage(numeric_data, method='single')
cophenet_matrix_single, _ = cophenet(linkage_matrix_single, pdist(numeric_data))
ccc_single = cophenet_matrix_single.mean()
print("Cophenetic Correlation Coefficient (CCC) untuk metode 'single':", ccc_single)

Cophenetic Correlation Coefficient (CCC) untuk metode 'single': 0.9115400881971691


In [88]:
# Melakukan clustering menggunakan AgglomerativeClustering untuk metode 'single'
clusterer_single = AgglomerativeClustering(n_clusters=n_clusters, linkage='single')
X_ahc_single = X_ahc.copy()  # Buat salinan DataFrame untuk metode single
X_ahc_single['cluster_single'] = clusterer_single.fit_predict(numeric_data)



In [89]:
# Menghitung centroid untuk setiap cluster
centroids_single = X_ahc_single.groupby('cluster_single')[['2023']].mean()



In [103]:
# Menentukan kategori kepadatan berdasarkan nilai centroid untuk metode 'complete'
threshold_low_single = 10000  # Contoh threshold untuk "tidak padat"
threshold_high_single = 5000  # Contoh threshold untuk "padat"
X_ahc_single['Landslide Category Complete'] = X_ahc_single['cluster_single'].map(lambda cluster: 
    'Tidak Padat' if centroids_single.loc[cluster].mean() < threshold_low_single else (
    'Padat' if centroids_single.loc[cluster].mean() < threshold_high_single else 
    'Sangat Padat'))

print(X_ahc_single.head())


   2011  2012  2013  2014  2015  2016  2017  2018  2019  2020  2021  2022  \
0  1801  1957  2058  2057  2317  2086  2086  2044  2097  2144  2144  2092   
1  1905  1897  1953  1943  1897  1896  1896  1925  1891  2020  2020  1934   
2  1401  1596  1630  1641  1583  1583  1583  1498  1592  1567  1570  1496   
3  3342  2343  2353  2542  2524  2417  2417  2415  2429  2580  2580  2412   
4  1427  1444  1469  1524  1574  1574  1574  1571  1622  1642  1640  1605   

   2023  cluster_single Landslide Category Complete  
0  2207               0                 Tidak Padat  
1  2038               0                 Tidak Padat  
2  1623               0                 Tidak Padat  
3  2659               0                 Tidak Padat  
4  1700               0                 Tidak Padat  


In [91]:
# Menampilkan dendrogram menggunakan Plotly untuk metode 'single'
fig_single = ff.create_dendrogram(linkage_matrix_single)
fig_single.update_layout(title='Dendrogram (Single Linkage)',
                         xaxis_title='Sample index',
                         yaxis_title='Distance',
                         width=800, height=500)
fig_single.show()

In [92]:
# Menghitung Linkage Matrix dan Cophenetic Correlation Coefficient untuk metode 'average'
linkage_matrix_average = linkage(numeric_data, method='average')
cophenet_matrix_average, _ = cophenet(linkage_matrix_average, pdist(numeric_data))
ccc_average = cophenet_matrix_average.mean()
print("Cophenetic Correlation Coefficient (CCC) untuk metode 'average':", ccc_average)


Cophenetic Correlation Coefficient (CCC) untuk metode 'average': 0.9111267530477787


In [93]:
# Melakukan clustering menggunakan AgglomerativeClustering untuk metode 'average'
clusterer_average = AgglomerativeClustering(n_clusters=n_clusters, linkage='average')
X_ahc_average = X_ahc.copy()  # Buat salinan DataFrame untuk metode average
X_ahc_average['cluster_average'] = clusterer_average.fit_predict(numeric_data)

In [94]:
# Menghitung centroid untuk setiap cluster
centroids_average = X_ahc_average.groupby('cluster_average')[['2023']].mean()

In [95]:
# Menentukan kategori kepadatan berdasarkan nilai centroid untuk metode 'complete'
threshold_low_average = 100000  # Contoh threshold untuk "tidak padat"
threshold_high_average = 500000  # Contoh threshold untuk "padat"
X_ahc_average['Landslide Category Average'] = X_ahc_average['cluster_average'].map(lambda cluster: 
    'Tidak Padat' if centroids_average.loc[cluster].mean() < threshold_low_complete else (
    'Padat' if centroids_average.loc[cluster].mean() < threshold_high_complete else 
    'Sangat Padat'))

print(X_ahc_average.head())


   2011  2012  2013  2014  2015  2016  2017  2018  2019  2020  2021  2022  \
0  1801  1957  2058  2057  2317  2086  2086  2044  2097  2144  2144  2092   
1  1905  1897  1953  1943  1897  1896  1896  1925  1891  2020  2020  1934   
2  1401  1596  1630  1641  1583  1583  1583  1498  1592  1567  1570  1496   
3  3342  2343  2353  2542  2524  2417  2417  2415  2429  2580  2580  2412   
4  1427  1444  1469  1524  1574  1574  1574  1571  1622  1642  1640  1605   

   2023  cluster_average Landslide Category Average  
0  2207                0                Tidak Padat  
1  2038                0                Tidak Padat  
2  1623                0                Tidak Padat  
3  2659                0                Tidak Padat  
4  1700                0                Tidak Padat  


In [96]:
# Menampilkan dendrogram menggunakan Plotly untuk metode 'average'
fig_average = ff.create_dendrogram(linkage_matrix_average)
fig_average.update_layout(title='Dendrogram (Average Linkage)',
                          xaxis_title='Sample index',
                          yaxis_title='Distance',
                          width=800, height=500)
fig_average.show()

In [97]:
# Menampilkan Linkage Matrix dan Cophenetic Correlation Coefficient untuk metode 'complete'
linkage_matrix_complete = linkage(numeric_data, method='complete')
cophenet_matrix_complete, _ = cophenet(linkage_matrix_complete, pdist(numeric_data))
ccc_complete = cophenet_matrix_complete.mean()
print("Cophenetic Correlation Coefficient (CCC) untuk metode 'complete':", ccc_complete)


Cophenetic Correlation Coefficient (CCC) untuk metode 'complete': 0.8384788055867786


In [98]:

# Melakukan clustering menggunakan AgglomerativeClustering untuk metode 'complete'
clusterer_complete = AgglomerativeClustering(n_clusters=n_clusters, linkage='complete')
X_ahc_complete = X_ahc.copy()  # Buat salinan DataFrame untuk metode complete
X_ahc_complete['cluster_complete'] = clusterer_complete.fit_predict(numeric_data)


In [99]:
# Menghitung centroid untuk setiap cluster
centroids_complete = X_ahc_complete.groupby('cluster_complete')[['2023']].mean()


In [100]:
# Menentukan kategori kepadatan berdasarkan nilai centroid untuk metode 'complete'
threshold_low_complete = 100000  # Contoh threshold untuk "tidak padat"
threshold_high_complete = 500000  # Contoh threshold untuk "padat"
X_ahc_complete['Landslide Category Complete'] = X_ahc_complete['cluster_complete'].map(lambda cluster: 
    'Tidak Padat' if centroids_complete.loc[cluster].mean() < threshold_low_complete else (
    'Padat' if centroids_complete.loc[cluster].mean() < threshold_high_complete else 
    'Sangat Padat'))

print(X_ahc_complete.head())


   2011  2012  2013  2014  2015  2016  2017  2018  2019  2020  2021  2022  \
0  1801  1957  2058  2057  2317  2086  2086  2044  2097  2144  2144  2092   
1  1905  1897  1953  1943  1897  1896  1896  1925  1891  2020  2020  1934   
2  1401  1596  1630  1641  1583  1583  1583  1498  1592  1567  1570  1496   
3  3342  2343  2353  2542  2524  2417  2417  2415  2429  2580  2580  2412   
4  1427  1444  1469  1524  1574  1574  1574  1571  1622  1642  1640  1605   

   2023  cluster_complete Landslide Category Complete  
0  2207                 0                 Tidak Padat  
1  2038                 0                 Tidak Padat  
2  1623                 0                 Tidak Padat  
3  2659                 0                 Tidak Padat  
4  1700                 0                 Tidak Padat  


In [101]:
# Menampilkan dendrogram menggunakan Plotly untuk metode 'average'
fig_complete = ff.create_dendrogram(linkage_matrix_complete)
fig_complete.update_layout(title='Dendrogram (Complete Linkage)',
                          xaxis_title='Sample index',
                          yaxis_title='Distance',
                          width=800, height=500)
fig_complete.show()