<a href="https://colab.research.google.com/github/Alphaizz/Insurance/blob/main/FINAL_PROJECT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Part 1: Data Loading & All Variable Calculation

In [None]:
import csv
import matplotlib.pyplot as plt
import numpy as np
import math # Diperlukan untuk konversi float

file_name = 'data-1-Daily-Activity.csv'
total_records = 0
# CSV column index -> descriptive name of each metric whose quality is audited.
relevant_cols = {
    0: 'distance_meters',
    1: 'moving_time_seconds',
    6: 'average_speed_kmh',
    8: 'average_heartrate_bpm',
    9: 'average_cadence_spm'
}

# Per-column count of cells that are empty, equal to zero, or not numeric.
invalid_counts = {col_index: 0 for col_index in relevant_cols}

# --- 1. MANUAL COUNT OF ZERO OR MISSING VALUES ---
# newline='' lets the csv module itself handle line endings (including any
# embedded in quoted fields), as the csv documentation recommends.
with open(file_name, 'r', newline='') as f:
    reader = csv.reader(f)
    # Skip the header; the default keeps an empty file from raising StopIteration.
    next(reader, None)

    for row in reader:
        if not row:
            # csv.reader yields [] for blank lines; they are not data records.
            continue
        total_records += 1
        for index in relevant_cols:
            # A truncated row has no cell at this index: count it as missing
            # instead of crashing with IndexError.
            cell_value = row[index] if index < len(row) else ''
            try:
                is_invalid = cell_value == '' or float(cell_value) == 0.0
            except ValueError:
                # Text that cannot be converted (corrupt data) is invalid too.
                is_invalid = True

            if is_invalid:
                invalid_counts[index] += 1

# Prepare the series used by the plotting step.
column_names = list(relevant_cols.values())
invalid_data = list(invalid_counts.values())
clean_data = [total_records - invalid for invalid in invalid_data]

# --- 2. STACKED BAR CHART VISUALISATION ---
# Draws, for every audited column, the clean row count with the invalid
# (missing/zero) row count stacked on top, then saves the figure to disk.

x_pos = np.arange(len(column_names))
bar_width = 0.8
# Invalid segments thinner than 1% of all rows cannot hold a label inside the
# bar, so their count is written above the bar instead.
small_segment_limit = total_records * 0.01

plt.figure(figsize=(12, 7))

# Clean data (bottom of the stack)
plt.bar(x_pos, clean_data, color='#4CAF50', edgecolor='white', width=bar_width, label='Data Bersih (Nilai > 0)')

# Missing/zero data (top of the stack)
plt.bar(x_pos, invalid_data, color='#FF5722', edgecolor='white', width=bar_width, bottom=clean_data, label='Data Hilang/Nol (Invalid)')

# Labels: the total above every bar and the invalid count per column.
for i in range(len(column_names)):
    # Total
    plt.text(x_pos[i], total_records + 20, str(total_records), ha='center', va='bottom', fontsize=9)
    # Invalid/zero count
    if invalid_data[i] > 0:
        if invalid_data[i] >= small_segment_limit:
            # Centre the label inside the invalid segment. `>=` also labels a
            # count of exactly 1%, which previously matched neither branch.
            text_y = clean_data[i] + invalid_data[i] / 2
            plt.text(x_pos[i], text_y, str(invalid_data[i]), ha='center', va='center', color='white', fontsize=9)
        else:
            # Small text near the top of the bar when the segment is too thin.
            plt.text(x_pos[i], total_records + 20, f"({invalid_data[i]})", ha='center', va='top', color='#FF5722', fontsize=9)

plt.xlabel("Variabel Data Mentah", fontsize=12)
plt.ylabel(f"Jumlah Baris (Total: {total_records})", fontsize=12)
plt.title("Kualitas Data Mentah: Data Bersih vs. Data Hilang/Nol (0)", fontsize=14)
plt.xticks(x_pos, column_names, rotation=25, ha='right')
plt.legend()
plt.tight_layout()
plt.savefig('eda_data_quality_zero_bar_chart.png')
plt.close()

print(f"Total Baris Data Ditemukan: {total_records}")
print("Data Kualitas (termasuk nilai nol) telah disimpan sebagai 'eda_data_quality_zero_bar_chart.png'")

Total Baris Data Ditemukan: 22159
Data Kualitas (termasuk nilai nol) telah disimpan sebagai 'eda_data_quality_zero_bar_chart.png'


Part 2: Visualization - The "Why 3 Variables?" Heatmap

In [None]:
# Continuation of PART 1 (uses aggregated_data_dicts and the helper functions).
# NOTE(review): `aggregated_data_dicts` and `calculate_correlation` are defined
# in a cell that appears further down in this notebook; run that cell first.

# --- A. DATA PREPARATION FOR THE CORRELATION (ALL AGGREGATED FEATURES) ---

# Feature names shown on the heatmap, taken from the first user's record.
cols = list(aggregated_data_dicts[0])
num_cols = len(cols)

# Transpose the list of dicts into one list of values per feature, which is
# the input shape calculate_correlation() expects.
col_lists = {}
for col in cols:
    col_lists[col] = [record[col] for record in aggregated_data_dicts]

# --- B. MANUAL COMPUTATION OF THE CORRELATION MATRIX ---
corr_matrix = [[0 for _ in cols] for _ in cols]
for row_pos, row_name in enumerate(cols):
    # Only the upper triangle is computed; the matrix is symmetric, so each
    # coefficient is mirrored into the lower triangle.
    for col_pos in range(row_pos, num_cols):
        coefficient = calculate_correlation(col_lists[row_name], col_lists[cols[col_pos]])
        corr_matrix[row_pos][col_pos] = coefficient
        corr_matrix[col_pos][row_pos] = coefficient

print("Matriks Korelasi (6x6) berhasil dihitung secara manual.")


# --- C. CORRELATION HEATMAP VISUALISATION (plotting libraries are allowed) ---
# NOTE(review): `sns` is imported in a cell that appears further down in this
# notebook; run that cell first.

heatmap_options = {
    'annot': True,            # write the coefficient inside each cell
    'fmt': ".2f",             # two decimal places
    'cmap': 'coolwarm',       # colour scheme
    'linewidths': .5,         # separator lines between cells
    'xticklabels': cols,
    'yticklabels': cols,
    'cbar_kws': {'label': 'Koefisien Korelasi'},
}

plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, **heatmap_options)
plt.title('Correlation Heatmap (Justifikasi Variabel)', fontsize=14)
plt.tight_layout()
plt.savefig('manual_correlation_heatmap.png')
plt.close()

# The figure is only written to disk; it is not displayed inline.
print("Heatmap korelasi telah disimpan sebagai 'manual_correlation_heatmap.png'.")


# --- D. CONCLUSION & SELECTION OF THE 3 FINAL FEATURES ---

# Authors' reading of the heatmap: Total_Distance, Moving_Time and Frequency
# are strongly correlated (redundant).
#
# The 3 final features:
# 1. Volume:     Total_Distance (represents activity volume)
# 2. Intensity:  Avg_Speed      (represents activity intensity)
# 3. Physiology: Avg_Heart_Rate (represents the body's response)
#
# NOTE(review): `math` and `badges` come from a cell that appears further down
# in this notebook; run that cell first.

clustering_features = []  # Rows fed to K-Means: [log distance, log speed, heart rate] (scaled in Part 3)
original_features = []    # Untransformed values, kept for reporting
filtered_badges = []      # Badge_IDs of the users that pass the filter

features_to_select = ['Total_Distance', 'Avg_Speed', 'Avg_Heart_Rate']

# Walk the aggregated users together with their position in `badges`.
for i, data in enumerate(aggregated_data_dicts):

    # 1. STRICT RULE (COMPLETE DATA ONLY)
    # The user is skipped as soon as ANY of the three key features equals 0.
    if any(data[feature] == 0 for feature in features_to_select):
        continue

    # 2. LOG TRANSFORM OF THE VALID DATA, using log(x + 1).
    # The heart rate is deliberately left untransformed.
    total_distance = data['Total_Distance']
    avg_speed = data['Avg_Speed']
    avg_hr = data['Avg_Heart_Rate']

    clustering_features.append([math.log(total_distance + 1), math.log(avg_speed + 1), avg_hr])
    original_features.append({feature: data[feature] for feature in features_to_select})
    filtered_badges.append(badges[i])  # Badge_ID of the valid user

# List-of-lists view of the original values (intended for later CSV output).
display_data = [[item[feature] for feature in features_to_select] for item in original_features]

print(f"\nPART 2 SELESAI. Jumlah pengguna yang lolos 'Data Lengkap (Non-Zero)' adalah: {len(clustering_features)} dari {len(aggregated_data_dicts)}.")

Matriks Korelasi (6x6) berhasil dihitung secara manual.
Heatmap korelasi telah disimpan sebagai 'manual_correlation_heatmap.png'.

PART 2 SELESAI. Jumlah pengguna yang lolos 'Data Lengkap (Non-Zero)' adalah: 240 dari 658.


In [None]:
import csv
import math
import numpy as np # Already imported, but good practice to ensure.
import seaborn as sns # Needed for heatmap in next cell
import random # Needed for manual_kmeans later

# Helper: Pearson correlation coefficient
def calculate_correlation(list1, list2):
    """Return the Pearson correlation coefficient of two numeric sequences.

    Instead of raising, 0.0 is returned when either sequence is empty, when
    the lengths differ, or when either sequence is constant (zero variance,
    for which the coefficient is undefined).
    """
    if not list1 or not list2:
        return 0.0

    n = len(list1)
    if n != len(list2):
        return 0.0

    # A constant series has zero variance, so bail out before dividing.
    head1, head2 = list1[0], list2[0]
    if all(value == head1 for value in list1) or all(value == head2 for value in list2):
        return 0.0

    mean1 = sum(list1) / n
    mean2 = sum(list2) / n
    dev1 = [x - mean1 for x in list1]
    dev2 = [y - mean2 for y in list2]

    cross_sum = sum(a * b for a, b in zip(dev1, dev2))
    denom = sum(a ** 2 for a in dev1) ** 0.5 * sum(b ** 2 for b in dev2) ** 0.5

    # Normally unreachable after the constant-series check; kept as a guard.
    if denom == 0:
        return 0.0
    return cross_sum / denom


# --- Data Aggregation for `aggregated_data_dicts` and `badges` ---
# Reads the daily-activity CSV once and folds every row into per-user totals
# and averages.

def _optional_float(cell):
    """Return ``float(cell)``, or ``None`` when *cell* is the empty string."""
    return None if cell == '' else float(cell)


def _mean_or_zero(values):
    """Return the arithmetic mean of *values*, or 0.0 for an empty list."""
    return sum(values) / len(values) if values else 0.0


user_activity_data = {}  # Key: user_id, Value: dict of aggregated raw metrics

# `file_name` is expected to have been defined by the first cell.
with open(file_name, 'r', newline='') as f:
    reader = csv.reader(f)
    next(reader, None)  # Skip header (tolerates an empty file)

    for row in reader:
        # Parse every field BEFORE touching the accumulators, so a malformed
        # row is skipped as a whole. Previously a bad value in a later column
        # left the earlier columns already added to the user's sums while the
        # activity count was not incremented.
        try:
            # NOTE(review): assumes Badge_ID lives in column index 10 - confirm
            # against the CSV header.
            user_id = row[10]
            distance = _optional_float(row[0])     # 0: 'distance_meters'
            moving_time = _optional_float(row[1])  # 1: 'moving_time_seconds'
            speed = _optional_float(row[6])        # 6: 'average_speed_kmh'
            heart_rate = _optional_float(row[8])   # 8: 'average_heartrate_bpm'
            cadence = _optional_float(row[9])      # 9: 'average_cadence_spm'
        except (ValueError, IndexError):
            continue  # Skip problematic rows

        if user_id not in user_activity_data:
            user_activity_data[user_id] = {
                'Total_Distance_sum': 0.0,
                'Moving_Time_sum': 0.0,
                'Avg_Speed_vals': [],  # Values are collected, then averaged
                'Avg_Heart_Rate_vals': [],
                'Avg_Cadence_vals': [],
                'Activity_Count': 0  # For 'Frequency'
            }
        stats = user_activity_data[user_id]

        # Sums ignore empty cells only.
        if distance is not None:
            stats['Total_Distance_sum'] += distance
        if moving_time is not None:
            stats['Moving_Time_sum'] += moving_time

        # Averages ignore empty cells AND zero readings.
        for key, value in (
            ('Avg_Speed_vals', speed),
            ('Avg_Heart_Rate_vals', heart_rate),
            ('Avg_Cadence_vals', cadence),
        ):
            if value is not None and value != 0.0:
                stats[key].append(value)

        stats['Activity_Count'] += 1

aggregated_data_dicts = []
badges = []  # Badge_IDs (user_id), index-aligned with aggregated_data_dicts

for user_id, data in user_activity_data.items():
    badges.append(user_id)

    # A user without any usable reading for a metric gets 0.0 for its average.
    aggregated_data_dicts.append({
        'Total_Distance': data['Total_Distance_sum'],
        'Moving_Time': data['Moving_Time_sum'],
        'Avg_Speed': _mean_or_zero(data['Avg_Speed_vals']),
        'Avg_Heart_Rate': _mean_or_zero(data['Avg_Heart_Rate_vals']),
        'Avg_Cadence': _mean_or_zero(data['Avg_Cadence_vals']),
        'Frequency': data['Activity_Count']
    })

print(f"Data aggregation complete for {len(aggregated_data_dicts)} users.")

Data aggregation complete for 658 users.


Part 3: The Manual Algorithm

In [None]:
# Sambungan dari PART 2 (menggunakan clustering_features, original_features,
# dan helper functions calculate_mean, calculate_std_dev, transpose_data)

# --- Helper functions for manual calculations ---
def transpose_data(data):
    """Turn a list of rows into a list of columns.

    The number of columns is taken from the first row. An empty input gives
    an empty list.
    """
    if not data:
        return []

    columns = []
    for col_idx in range(len(data[0])):
        column = []
        for row in data:
            column.append(row[col_idx])
        columns.append(column)
    return columns

def calculate_mean(data_list):
    """Return the arithmetic mean of *data_list*, or 0.0 when it is empty."""
    if data_list:
        return sum(data_list) / len(data_list)
    return 0.0

def calculate_std_dev(data_list, mean):
    """Return the sample standard deviation (n - 1 denominator) around *mean*.

    The caller supplies the mean. Fewer than two data points give 0.0, since
    the sample deviation is undefined there.
    """
    sample_size = len(data_list) if data_list else 0
    if sample_size < 2:
        return 0.0

    squared_deviations = [(value - mean) ** 2 for value in data_list]
    return math.sqrt(sum(squared_deviations) / (sample_size - 1))

# 1. Manual min-max normalisation
def normalize(data):
    """Rescale every column of *data* to the range [0, 1].

    Column minima and maxima are seeded from the first row. A column whose
    range is not positive (all values equal) maps to 0 for every row.
    """
    if not data:
        return []

    lows = list(data[0])
    highs = list(data[0])
    for record in data:
        for pos, value in enumerate(record):
            lows[pos] = min(lows[pos], value)
            highs[pos] = max(highs[pos], value)

    normalized = []
    for record in data:
        scaled = []
        for pos, value in enumerate(record):
            span = highs[pos] - lows[pos]
            scaled.append((value - lows[pos]) / span if span > 0 else 0)
        normalized.append(scaled)
    return normalized


# 2. Euclidean distance
def get_dist(p1, p2):
    """Return the Euclidean distance between *p1* and *p2*.

    The dimensions compared are those of *p1*; *p2* is indexed at the same
    positions.
    """
    squared_gaps = [(p1[axis] - p2[axis]) ** 2 for axis in range(len(p1))]
    return sum(squared_gaps) ** 0.5

# 3. Component-wise mean of a group of vectors (used to update centroids)
def get_mean(vectors):
    """Return the component-wise mean of *vectors*, or [] for an empty group.

    The dimensionality is taken from the first vector.
    """
    if not vectors:
        return []

    count = len(vectors)
    totals = [0.0 for _ in vectors[0]]
    for vector in vectors:
        for axis in range(len(totals)):
            totals[axis] = totals[axis] + vector[axis]
    return [total / count for total in totals]

# 4. K-Means loop (also returns the WSS history for the loss curve)
def manual_kmeans(data, k=3, iters=20):
    """Cluster *data* with a hand-written K-Means and log every iteration.

    Args:
        data: list of equal-length numeric feature vectors.
        k: number of clusters; the initial centroids are sampled from *data*.
        iters: maximum number of assign/update iterations.

    Returns:
        A tuple ``(indices, centroids, wss_history)``: ``indices[c]`` lists the
        positions in *data* assigned to cluster ``c`` during the last
        assignment step, ``centroids`` are the final centroids, and
        ``wss_history`` holds the within-cluster sum of squares measured in
        each iteration right after assignment (before the centroid update).

    Raises:
        ValueError: from ``random.sample`` when *k* is negative or larger
            than ``len(data)``.
    """
    random.seed(42)  # Fixed seed for reproducible runs (reseeds the global RNG)
    centroids = random.sample(data, k)
    wss_history = []
    # Defined up front so the return statement also works when iters <= 0.
    indices = [[] for _ in range(k)]

    for iter_num in range(iters):
        clusters = [[] for _ in range(k)]
        indices = [[] for _ in range(k)]

        # Assign each data point to the closest centroid.
        for idx, p in enumerate(data):
            dists = [get_dist(p, c) for c in centroids]
            closest = dists.index(min(dists))
            clusters[closest].append(p)
            indices[closest].append(idx)

        # WSS of the current assignment, relative to the current centroids.
        current_wss = 0.0
        for centroid, members in zip(centroids, clusters):
            for member in members:
                # Points whose dimensionality differs from the centroid's are
                # left out of the sum.
                if len(centroid) == len(member):
                    current_wss += get_dist(member, centroid) ** 2
        wss_history.append(current_wss)

        # Update centroids; an empty cluster keeps its previous centroid.
        new_c = [
            get_mean(c_group) if c_group else centroids[i]
            for i, c_group in enumerate(clusters)
        ]

        print(f"\n--- Iterasi {iter_num + 1}/{iters} ---")
        for i, c in enumerate(new_c):
            # Format every component, so data of any dimensionality can be
            # logged (three features print exactly as before).
            centroid_str = "(" + ", ".join(f"{value:.4f}" for value in c) + ")"
            size = len(indices[i])
            print(f"Cluster {i}: Centroid = {centroid_str}, Ukuran = {size} pengguna")

        # Converged once no centroid moved by 1e-6 or more.
        converged = all(get_dist(centroids[i], new_c[i]) < 1e-6 for i in range(k))
        if converged:
            print("\nAlgoritma Konvergen, menghentikan iterasi lebih awal.")
        centroids = new_c
        if converged:
            break

    return indices, centroids, wss_history

# 5. Within-Cluster Sum of Squares (WSS): runs K-Means and returns the final WSS
def _assign_to_nearest(data, centroids):
    """Group the points of *data* by the index of their nearest centroid."""
    clusters = [[] for _ in centroids]
    for p in data:
        dists = [get_dist(p, c) for c in centroids]
        clusters[dists.index(min(dists))].append(p)
    return clusters


def calculate_wss(data, k, iters=20):
    """Run K-Means silently and return the final within-cluster sum of squares.

    Args:
        data: list of equal-length numeric feature vectors.
        k: requested number of clusters; reduced to ``len(data)`` (with a
            printed warning) when it exceeds the number of points.
        iters: maximum number of assign/update iterations.

    Returns:
        The sum of squared distances between every point and the final
        centroid nearest to it; 0.0 for empty *data*.
    """
    random.seed(42)  # Fixed seed for reproducible runs (reseeds the global RNG)
    if not data:
        return 0.0
    if len(data) < k:
        print("Warning: k is greater than the number of data points. Adjusting k.")
        k = len(data)

    centroids = random.sample(data, k)

    for _ in range(iters):
        clusters = _assign_to_nearest(data, centroids)

        # Update; an empty cluster keeps its previous centroid.
        new_c = [
            get_mean(c_group) if c_group else centroids[i]
            for i, c_group in enumerate(clusters)
        ]

        converged = all(get_dist(centroids[i], new_c[i]) < 1e-6 for i in range(k))
        centroids = new_c
        if converged:
            break

    # Assign once more against the FINAL centroids. This keeps the WSS
    # consistent with the centroids it is measured from when the iteration
    # budget runs out, and defines `clusters` even when iters <= 0.
    clusters = _assign_to_nearest(data, centroids)

    # Sum of squared errors.
    wss = 0.0
    for centroid, members in zip(centroids, clusters):
        for p in members:
            wss += get_dist(p, centroid) ** 2

    return wss

# --- 1. DATA PREPARATION FOR SCALING ---

# Transpose the rows into one list per feature, which is what the per-feature
# mean and standard deviation are computed from.
transposed_features = transpose_data(clustering_features)
means = []
stds = []

# --- 2. MANUAL MEAN AND STANDARD DEVIATION PER FEATURE ---
for col_data in transposed_features:
    mean = calculate_mean(col_data)
    means.append(mean)
    stds.append(calculate_std_dev(col_data, mean))

print(f"Mean Manual (Log Distance, Log Speed, Avg HR): {means}")
print(f"Std Dev Manual (Log Distance, Log Speed, Avg HR): {stds}")


# --- 3. MANUAL STANDARDISATION (Z = (X - Mu) / Sigma) ---
scaled_features = []
for raw_row in clustering_features:  # one row per user
    # A constant feature (std dev of 0) gets a z-score of 0.
    scaled_features.append([
        (val - means[j]) / stds[j] if stds[j] != 0 else 0
        for j, val in enumerate(raw_row)
    ])

print("\nPART 3 SELESAI. Data telah berhasil di Standardisasi (Scaled) dan siap untuk K-Means.")

Mean Manual (Log Distance, Log Speed, Avg HR): [14.0290378448561, 7.365519730191664, 1265.9262970798534]
Std Dev Manual (Log Distance, Log Speed, Avg HR): [1.339166853168847, 0.33647599920429117, 174.29644293394162]

PART 3 SELESAI. Data telah berhasil di Standardisasi (Scaled) dan siap untuk K-Means.


**ELBOW METHOD**

In [None]:
# Continuation of PART 3 (uses scaled_features and the helper functions)

# Chosen number of clusters and the range of K explored by the elbow method.
K_OPTIMAL = 3
max_k = 8
k_range = range(1, max_k + 1)
wcss_list = []        # Reset here so a partial re-run of the cell starts clean
wcss_history_k3 = []  # Filled later with the per-iteration WCSS for the loss curve

# --- 1. ELBOW METHOD (WCSS vs. K) ---

print("Memulai Perhitungan Manual Elbow Method (WCSS)...")
# One full K-Means run per candidate K; only the final WCSS is kept.
for k in k_range:
    wcss_list.append(calculate_wss(scaled_features, k))
    print(f"K={k}: WCSS={wcss_list[-1]:.2f}")

# Plot the elbow curve and write it to disk.
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(k_range, wcss_list, marker='o', linestyle='--')
ax.set_title('Metode Elbow Manual (WCSS vs. K)', fontsize=14)
ax.set_xlabel('Jumlah Klaster (K)')
ax.set_ylabel('WCSS (Within-Cluster Sum of Squares)')
ax.set_xticks(k_range)
ax.grid(True)
fig.savefig('elbow_method_manual.png')
plt.close(fig)


# --- 2. LOSS CURVE (WCSS vs. ITERATION) ---

# Run K-Means for the chosen K and keep its per-iteration WCSS history.
print(f"\nMenghitung Loss Curve (WCSS vs. Iterasi) untuk K={K_OPTIMAL}...")

# manual_kmeans returns (indices, centroids, wss_history); the first two are
# not used in this cell but remain available for debugging or later steps.
indices_k3, centroids_k3, wcss_history_k3 = manual_kmeans(scaled_features, K_OPTIMAL)

# X axis: 1 .. number of iterations that actually ran.
iteration_numbers = range(1, len(wcss_history_k3) + 1)

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(iteration_numbers, wcss_history_k3, marker='o', color='purple')
ax.set_title(f'Loss Curve (WCSS vs. Iterasi) untuk K={K_OPTIMAL}', fontsize=14)
ax.set_xlabel('Iterasi')
ax.set_ylabel('WCSS (Menurun Menuju Konvergensi)')
ax.grid(True)
fig.savefig('convergence_loss_curve.png')
plt.close(fig)

print(f"\nPART 4 SELESAI. K Optimal ditetapkan = {K_OPTIMAL}.")
print("1. Plot Elbow Method telah disimpan sebagai 'elbow_method_manual.png'.")
print("2. Plot Loss Curve (Konvergensi) telah disimpan sebagai 'convergence_loss_curve.png'.")

Memulai Perhitungan Manual Elbow Method (WCSS)...
K=1: WCSS=717.00
K=2: WCSS=445.72
K=3: WCSS=331.35
K=4: WCSS=280.89
K=5: WCSS=245.62
K=6: WCSS=227.86
K=7: WCSS=211.24
K=8: WCSS=197.45

Menghitung Loss Curve (WCSS vs. Iterasi) untuk K=3...

--- Iterasi 1/20 ---
Cluster 0: Centroid = (0.2693, 0.9119, 1.3601), Ukuran = 62 pengguna
Cluster 1: Centroid = (0.1903, -0.4891, -0.9083), Ukuran = 92 pengguna
Cluster 2: Centroid = (-0.3977, -0.1342, -0.0089), Ukuran = 86 pengguna

--- Iterasi 2/20 ---
Cluster 0: Centroid = (0.4627, 0.9142, 1.1587), Ukuran = 75 pengguna
Cluster 1: Centroid = (0.2206, -0.4993, -0.8985), Ukuran = 90 pengguna
Cluster 2: Centroid = (-0.7274, -0.3150, -0.0805), Ukuran = 75 pengguna

--- Iterasi 3/20 ---
Cluster 0: Centroid = (0.5025, 0.8766, 1.0341), Ukuran = 83 pengguna
Cluster 1: Centroid = (0.2734, -0.4046, -0.8030), Ukuran = 96 pengguna
Cluster 2: Centroid = (-1.1140, -0.5560, -0.1433), Ukuran = 61 pengguna

--- Iterasi 4/20 ---
Cluster 0: Centroid = (0.5147, 0.84

Part 5: Execution & File Generation

In [None]:
# Continuation of PART 4 (uses scaled_features, original_features, badges and
# the helper functions)

from mpl_toolkits.mplot3d import Axes3D  # For the 3D plot

# K was fixed by the elbow method in Part 4.
K_OPTIMAL = 3

# --- A. FINAL K-MEANS RUN (to obtain the cluster labels) ---
print(f"Menjalankan K-Means final dengan K={K_OPTIMAL}...")

# manual_kmeans returns (member indices per cluster, final centroids,
# WSS history); the history is not needed here.
cluster_indices_per_group, final_centroids, _ = manual_kmeans(scaled_features, K_OPTIMAL)

# Invert "cluster -> member rows" into one label per row of scaled_features.
label_by_row = {
    original_idx: cluster_id
    for cluster_id, indices_in_cluster in enumerate(cluster_indices_per_group)
    for original_idx in indices_in_cluster
}
final_labels = [label_by_row.get(row_idx, 0) for row_idx in range(len(scaled_features))]


# --- B. CLUSTER PROFILING (manual group-by and mean) ---

profile_features = ('Total_Distance', 'Avg_Speed', 'Avg_Heart_Rate')
cluster_profiles = {
    cluster_id: {feature: [] for feature in profile_features}
    for cluster_id in range(K_OPTIMAL)
}

# 1. Group the ORIGINAL (non-scaled) feature values by cluster label.
#    filtered_badges, final_labels and original_features are index-aligned.
for position in range(len(filtered_badges)):
    bucket = cluster_profiles[final_labels[position]]
    for feature in profile_features:
        bucket[feature].append(original_features[position][feature])

# 2. Mean of every feature per cluster.
profile_summary = [
    {
        'Cluster_Index': idx,
        'Mean_Total_Distance': calculate_mean(profile['Total_Distance']),
        'Mean_Avg_Speed': calculate_mean(profile['Avg_Speed']),
        'Mean_Avg_Heart_Rate': calculate_mean(profile['Avg_Heart_Rate']),
        'Count': len(profile['Total_Distance'])
    }
    for idx, profile in cluster_profiles.items()
]

# 3. Name the clusters by rank of mean total distance (lowest first).
profile_summary = sorted(profile_summary, key=lambda entry: entry['Mean_Total_Distance'])

tier_names = ["1-Bronze (Basic)", "2-Silver (Standard)", "3-Gold (Premium)"]
cluster_name_map = {}
for rank, tier in enumerate(tier_names):
    cluster_name_map[profile_summary[rank]['Cluster_Index']] = tier

print("\nProfil Klaster (Rata-rata Fitur Asli):")
for summary in profile_summary:
    name = cluster_name_map[summary['Cluster_Index']]
    print(f"- {name}: Jarak={summary['Mean_Total_Distance']:.0f}m, Speed={summary['Mean_Avg_Speed']:.2f} km/h, HR={summary['Mean_Avg_Heart_Rate']:.0f} bpm (N={summary['Count']})")


# --- C. SCATTER PLOTS OF THE CLUSTERING RESULT ---

# Colour used for each named cluster.
color_map = {"1-Bronze (Basic)": 'blue', "2-Silver (Standard)": 'green', "3-Gold (Premium)": 'red'}

# Coordinates and per-point colours; scaled_features and final_labels are
# index-aligned.
x_3d = [p[0] for p in scaled_features]  # Log distance
y_3d = [p[1] for p in scaled_features]  # Log speed
z_3d = [p[2] for p in scaled_features]  # Average heart rate
c_3d = [color_map[cluster_name_map[label]] for label in final_labels]

# 1. 3D scatter plot (all three features)
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')

# Data points
ax.scatter(x_3d, y_3d, z_3d, c=c_3d, marker='o', alpha=0.7)

# Centroids, listed in cluster-index order
centroid_x = [c[0] for c in final_centroids]
centroid_y = [c[1] for c in final_centroids]
centroid_z = [c[2] for c in final_centroids]
ax.scatter(centroid_x, centroid_y, centroid_z, marker='X', s=300, c='black')

ax.set_title('Hasil Clustering K-Means (3D Plot)', fontsize=14)
ax.set_xlabel('Log Distance (Scaled)')
ax.set_ylabel('Log Speed (Scaled)')
ax.set_zlabel('Avg Heart Rate (Scaled)')
plt.tight_layout()
plt.savefig('scatter_plot_3d.png')
plt.close()
print("\n3D Scatter Plot telah disimpan sebagai 'scatter_plot_3d.png'")


# 2. 2D scatter plot (a simpler view: first two features only)
plt.figure(figsize=(10, 8))
plt.scatter(x_3d, y_3d, c=c_3d, alpha=0.7)

# Centroid markers outlined in their cluster's colour. The colours must follow
# cluster-index order to line up with centroid_x / centroid_y; iterating
# cluster_name_map would follow its insertion order (rank by mean distance)
# and could outline a centroid with another cluster's colour.
centroid_colors = [color_map[cluster_name_map[i]] for i in range(len(final_centroids))]
plt.scatter(centroid_x, centroid_y, marker='X', s=200, c='black', edgecolors=centroid_colors, linewidths=2)

plt.title('Hasil Clustering K-Means (Log Distance vs. Log Speed)', fontsize=14)
plt.xlabel('Log Distance (Standardized)')
plt.ylabel('Log Speed (Standardized)')
plt.grid(True)
plt.savefig('scatter_plot_2d.png')
plt.close()
print("2D Scatter Plot telah disimpan sebagai 'scatter_plot_2d.png'")


# --- D. EXPORT OF THE FINAL RESULT (manual CSV writing) ---

csv_filename = "segmentasi_pengguna_manual_final.csv"
header_row = ["Badge_ID", "Total_Distance_Meters", "Avg_Speed_Kmh", "Avg_Heart_Rate_Bpm", "Cluster_Name"]

# One output row per retained user: badge, original (unscaled) features and
# the cluster name. filtered_badges, original_features and final_labels are
# index-aligned.
final_output_report = [
    [
        filtered_badges[i],
        original_features[i]['Total_Distance'],
        original_features[i]['Avg_Speed'],
        original_features[i]['Avg_Heart_Rate'],
        cluster_name_map[final_labels[i]],
    ]
    for i in range(len(filtered_badges))
]

with open(csv_filename, 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerows([header_row] + final_output_report)

print(f"\nPART 5 SELESAI. Hasil segmentasi akhir telah diexport ke: {csv_filename}")

Menjalankan K-Means final dengan K=3...

--- Iterasi 1/20 ---
Cluster 0: Centroid = (0.2693, 0.9119, 1.3601), Ukuran = 62 pengguna
Cluster 1: Centroid = (0.1903, -0.4891, -0.9083), Ukuran = 92 pengguna
Cluster 2: Centroid = (-0.3977, -0.1342, -0.0089), Ukuran = 86 pengguna

--- Iterasi 2/20 ---
Cluster 0: Centroid = (0.4627, 0.9142, 1.1587), Ukuran = 75 pengguna
Cluster 1: Centroid = (0.2206, -0.4993, -0.8985), Ukuran = 90 pengguna
Cluster 2: Centroid = (-0.7274, -0.3150, -0.0805), Ukuran = 75 pengguna

--- Iterasi 3/20 ---
Cluster 0: Centroid = (0.5025, 0.8766, 1.0341), Ukuran = 83 pengguna
Cluster 1: Centroid = (0.2734, -0.4046, -0.8030), Ukuran = 96 pengguna
Cluster 2: Centroid = (-1.1140, -0.5560, -0.1433), Ukuran = 61 pengguna

--- Iterasi 4/20 ---
Cluster 0: Centroid = (0.5147, 0.8437, 1.0095), Ukuran = 85 pengguna
Cluster 1: Centroid = (0.2688, -0.3487, -0.6985), Ukuran = 106 pengguna
Cluster 2: Centroid = (-1.4744, -0.7093, -0.2401), Ukuran = 49 pengguna

--- Iterasi 5/20 ---
C

In [None]:
import matplotlib.pyplot as plt
import numpy as np
# Bar chart of how many users fall into each named cluster.

# Ordered by mean total distance (lowest to highest).
cluster_names = ["1-Bronze (Basic)", "2-Silver (Standard)", "3-Gold (Premium)"]
cluster_colors = ['#8B4513', '#C0C0C0', '#FFD700']  # Bronze/brown, silver, gold

# Prefer the live counts from the profiling step (`profile_summary`, sorted by
# mean total distance there) so the chart cannot drift from the actual
# clustering. Fall back to the figures copied from an earlier run when that
# step has not been executed or its shape does not match:
# 1-Bronze = 37 users, 2-Silver = 117 users, 3-Gold = 86 users.
try:
    derived_sizes = [summary['Count'] for summary in profile_summary]
except (NameError, KeyError, TypeError):
    derived_sizes = []

if len(derived_sizes) == len(cluster_names):
    cluster_sizes_raw = derived_sizes
else:
    cluster_sizes_raw = [37, 117, 86]

total_users_final = sum(cluster_sizes_raw)
# Guard the division: live counts can sum to zero when no user was clustered.
percentages = [
    (size / total_users_final) * 100 if total_users_final else 0.0
    for size in cluster_sizes_raw
]

plt.figure(figsize=(9, 6))
bars = plt.bar(cluster_names, cluster_sizes_raw, color=cluster_colors, width=0.7)

# Absolute count and percentage above every bar.
for bar, size, percentage in zip(bars, cluster_sizes_raw, percentages):
    yval = bar.get_height()
    plt.text(bar.get_x() + bar.get_width()/2, yval + 5, f'{size}\n({percentage:.1f}%)',
             ha='center', va='bottom', fontsize=10)

plt.title(f'Distribusi Jumlah Pengguna di Klaster (Total N={total_users_final})', fontsize=14)
plt.ylabel('Jumlah Pengguna', fontsize=12)
plt.xlabel('Nama Klaster', fontsize=12)
plt.grid(axis='y', linestyle='--', alpha=0.6)
plt.tight_layout()
plt.savefig('cluster_size_distribution.png')
plt.close()

print("Bar Chart distribusi ukuran klaster telah disimpan sebagai 'cluster_size_distribution.png'.")

Bar Chart distribusi ukuran klaster telah disimpan sebagai 'cluster_size_distribution.png'.


In [None]:
import csv
import matplotlib.pyplot as plt
import numpy as np
import math

# --- 1. LOAD AND AGGREGATE THE RAW DATA (one entry per user) ---
file_name = 'data-1-Daily-Activity.csv'
users_agg_raw = {}

# Read the daily records and group distance, average speed and heart rate by
# badge (column 10). newline='' is how the csv module expects files to be
# opened, so that line endings inside quoted fields are handled by the reader.
with open(file_name, 'r', newline='') as f:
    reader = csv.reader(f)
    next(reader, None)  # skip the header row; tolerate an empty file

    for row in reader:
        try:
            badge = row[10]
            d = float(row[0]) if row[0] else 0
            mt = float(row[1]) if row[1] else 0
            asp = float(row[6]) if row[6] else 0
            hr = float(row[8]) if row[8] else 0
        except (ValueError, IndexError):
            # A non-numeric cell or a truncated row invalidates this record
            # only; the rest of the file is still processed.
            continue

        # Record-level filter: drop rows with neither distance nor moving time.
        if d == 0 and mt == 0:
            continue

        per_user = users_agg_raw.setdefault(badge, {'d': [], 'asp': [], 'hr': []})
        per_user['d'].append(d)
        per_user['asp'].append(asp)
        per_user['hr'].append(hr)

# Per-user aggregation. Empty cells were parsed as 0 above and are included in
# the averages, so an average of 0 means every kept record of that user had a
# zero/empty value (assuming readings are never negative).
initial_aggregated_data = []
for b, v in users_agg_raw.items():
    freq = len(v['d'])
    if freq == 0:
        continue

    initial_aggregated_data.append({
        'user_id': b,
        'Total_Distance': sum(v['d']),
        'Avg_Speed': sum(v['asp']) / freq,
        'Avg_Heart_Rate': sum(v['hr']) / freq,
    })

total_initial_users = len(initial_aggregated_data)

# --- 2. CLEAN VS. MISSING/ZERO COUNTS (USER LEVEL) ---

# Number of users whose aggregated value is exactly 0, per metric.
zero_users_count = dict.fromkeys(('Total_Distance', 'Avg_Speed', 'Avg_Heart_Rate'), 0)

for data in initial_aggregated_data:
    for key in zero_users_count:
        if data[key] == 0:
            zero_users_count[key] += 1

# Users with a clean (non-zero) value are simply the remainder of the total.
clean_users_count = {
    key: total_initial_users - zeros
    for key, zeros in zero_users_count.items()
}


# --- 3. STACKED BAR CHART (CLEAN VS. MISSING/ZERO USERS) ---

categories = ['Total_Distance', 'Avg_Speed', 'Avg_Heart_Rate']
clean_counts = [clean_users_count[cat] for cat in categories]
zero_counts = [zero_users_count[cat] for cat in categories]

# Green for clean users, orange for zero-valued users.
color_clean = '#4CAF50'
color_zero = '#FF9800'

plt.figure(figsize=(12, 7))

# Lower segment: clean users. Upper segment: zero users stacked on top of it.
plt.bar(categories, clean_counts, color=color_clean, label='Pengguna Data Bersih (Nilai > 0)')
plt.bar(categories, zero_counts, bottom=clean_counts, color=color_zero, label='Pengguna Data Hilang/Nol (0)')

for i, (n_clean, n_zero) in enumerate(zip(clean_counts, zero_counts)):
    # Total on top of the stack (each total should equal the number of users).
    total_val = n_clean + n_zero
    plt.text(i, total_val + 10, f'{total_val}', ha='center', va='bottom', fontsize=11, fontweight='bold')

    # Zero count, centred inside the orange segment.
    if n_zero > 0:
        plt.text(i, n_clean + (n_zero/2), f'{n_zero}', ha='center', va='center', color='white', fontsize=10)

    # Clean count, centred inside the green segment; skipped when the orange
    # segment takes up 90% or more of the bar.
    if n_clean > 0 and n_zero < (total_val * 0.9):
        plt.text(i, n_clean/2, f'{n_clean}', ha='center', va='center', color='white', fontsize=10)


plt.title(f'Kualitas Data Agregasi: Pengguna Bersih vs. Pengguna Nol (N={total_initial_users} Pengguna)', fontsize=16)
plt.ylabel('Jumlah Pengguna', fontsize=12)
plt.legend(loc='upper right')
plt.xticks(rotation=15, ha='right')
plt.ylim(0, total_initial_users + 50)  # headroom for the total labels
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.savefig('kualitas_data_agregasi_stacked.png')
plt.close()

print(f"Total Pengguna Awal: {total_initial_users}")
for cat in categories:
    print(f"Pengguna dengan {cat} = 0: {zero_users_count[cat]}")
print("Visualisasi Stacked Bar Chart telah disimpan sebagai 'kualitas_data_agregasi_stacked.png'.")

Total Pengguna Awal: 658
Pengguna dengan Total_Distance = 0: 0
Pengguna dengan Avg_Speed = 0: 0
Pengguna dengan Avg_Heart_Rate = 0: 418
Visualisasi Stacked Bar Chart telah disimpan sebagai 'kualitas_data_agregasi_stacked.png'.


In [None]:
import csv
import matplotlib.pyplot as plt
import numpy as np

file_name = 'data-1-Daily-Activity.csv'
raw_distances = []
user_activity_counts = {}

# --- 1. MANUAL COUNTING (LOAD RAW DATA & COUNT FREQUENCY) ---

# newline='' is how the csv module expects files to be opened.
with open(file_name, 'r', newline='') as f:
    reader = csv.reader(f)
    next(reader, None)  # skip header; tolerate an empty file

    for row in reader:
        # Only the parsing can fail, so only the parsing is guarded. A row is
        # used for both plots or for neither.
        try:
            distance = float(row[0]) if row[0] else 0
            badge = row[10]
        except (ValueError, IndexError):
            continue

        # Positive distances feed plot 1.
        if distance > 0:
            raw_distances.append(distance)

        # Records per user feed plot 2 (zero-distance rows are counted too).
        user_activity_counts[badge] = user_activity_counts.get(badge, 0) + 1

# Per-user record counts for plot 2.
frequencies = list(user_activity_counts.values())

# --- 2. TWO-PANEL FIGURE ---

fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 6))
fig.suptitle('Analisis Distribusi Data Aktivitas', fontsize=16)

# A. PLOT 1: DISTRIBUTION OF DAILY ACTIVITY DISTANCE (raw data)
# Both axes are linear here. The distances are expected to be right-skewed;
# note that hist's `log` flag would only put the count axis on a log scale,
# so a log-scaled distance axis would need set_xscale('log') plus log-spaced
# bins. The figure is deliberately left as originally rendered.
axes[0].hist(raw_distances, bins=50, color='#1f77b4', edgecolor='black', log=False)
axes[0].set_title('Distribusi Jarak Harian (Raw Data)', fontsize=13)
axes[0].set_xlabel('Jarak Aktivitas (meter)')
axes[0].set_ylabel('Jumlah Aktivitas (Record)', fontsize=11)
axes[0].ticklabel_format(style='plain', axis='y')  # no scientific notation on the Y axis
axes[0].grid(axis='y', linestyle='--', alpha=0.6)

# B. PLOT 2: ACTIVITY FREQUENCY PER USER
# Shows how many daily records each user logged. One bin per count value,
# capped at 50 bins.
axes[1].hist(frequencies, bins=min(max(frequencies), 50), color='#ff7f0e', edgecolor='black')
axes[1].set_title(f'Frekuensi Aktivitas per Pengguna (N={len(frequencies)} Pengguna)', fontsize=13)
axes[1].set_xlabel('Jumlah Record Harian per Pengguna')
axes[1].set_ylabel('Jumlah Pengguna', fontsize=11)
axes[1].grid(axis='y', linestyle='--', alpha=0.6)
# Clip the X axis at the 95th percentile so a few very active users do not
# stretch the plot.
axes[1].set_xlim(0, np.percentile(frequencies, 95))

plt.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.savefig('eda_daily_frequency_distribution.png')
plt.close()

print("Visualisasi Distribusi Data Harian dan Frekuensi Pengguna telah disimpan sebagai 'eda_daily_frequency_distribution.png'.")

Visualisasi Distribusi Data Harian dan Frekuensi Pengguna telah disimpan sebagai 'eda_daily_frequency_distribution.png'.


In [None]:
import csv
import pandas as pd
import numpy as np

file_name = 'data-1-Daily-Activity.csv'  # adjust if the file is named differently

# --- 1. PER-USER AGGREGATION (Badge_ID) ---
user_activity_data = {}

# newline='' is how the csv module expects files to be opened.
with open(file_name, 'r', newline='') as f:
    reader = csv.reader(f)
    next(reader, None)  # skip header; tolerate an empty file

    for row in reader:
        # Parse the whole row BEFORE touching user_activity_data: a malformed
        # or truncated row must not leave behind a user entry with
        # Activity_Count == 0, which would distort the descriptive statistics.
        try:
            user_id = row[10]  # Badge_ID
            dist, moving_time, speed, hr, cadence = (
                float(row[i]) if row[i] else 0.0 for i in (0, 1, 6, 8, 9)
            )
        except (ValueError, IndexError):
            continue

        if user_id not in user_activity_data:
            user_activity_data[user_id] = {
                'Total_Distance_sum': 0.0,
                'Moving_Time_sum': 0.0,
                'Avg_Speed_vals': [],
                'Avg_Heart_Rate_vals': [],
                'Avg_Cadence_vals': [],
                'Activity_Count': 0,
            }
        stats = user_activity_data[user_id]

        stats['Total_Distance_sum'] += dist
        stats['Moving_Time_sum'] += moving_time

        # Only positive readings take part in the averages; zero/empty cells
        # are treated as "not recorded".
        if speed > 0.0:
            stats['Avg_Speed_vals'].append(speed)
        if hr > 0.0:
            stats['Avg_Heart_Rate_vals'].append(hr)
        if cadence > 0.0:
            stats['Avg_Cadence_vals'].append(cadence)

        stats['Activity_Count'] += 1

# One flat record per user; a metric with no positive reading averages to 0.0.
aggregated_data_dicts = []
for user_id, data in user_activity_data.items():
    avg_speed = np.mean(data['Avg_Speed_vals']) if data['Avg_Speed_vals'] else 0.0
    avg_heart_rate = np.mean(data['Avg_Heart_Rate_vals']) if data['Avg_Heart_Rate_vals'] else 0.0
    avg_cadence = np.mean(data['Avg_Cadence_vals']) if data['Avg_Cadence_vals'] else 0.0

    aggregated_data_dicts.append({
        'Total_Distance': data['Total_Distance_sum'],
        'Moving_Time': data['Moving_Time_sum'],
        'Avg_Speed': avg_speed,
        'Avg_Heart_Rate': avg_heart_rate,
        'Avg_Cadence': avg_cadence,
        'Frequency': data['Activity_Count'],
    })

# --- 2. DESCRIPTIVE STATISTICS WITH PANDAS ---
df_agg = pd.DataFrame(aggregated_data_dicts)
descriptive_stats = df_agg.describe().T
print("\n--- STATISTIK DESKRIPTIF DATA TERAGREGASI ---")
print(descriptive_stats)
descriptive_stats.to_csv('descriptive_stats_agg.csv')
print("\nStatistik deskriptif telah disimpan sebagai 'descriptive_stats_agg.csv'")


--- STATISTIK DESKRIPTIF DATA TERAGREGASI ---
                count          mean           std      min            25%  \
Total_Distance  658.0  1.450069e+06  1.798777e+06  25100.0  220728.250000   
Moving_Time     658.0  8.187792e+04  7.934625e+04    840.0   14995.000000   
Avg_Speed       658.0  1.480819e+03  5.096931e+02     10.0    1184.475000   
Avg_Heart_Rate  658.0  4.617360e+02  6.188215e+02      0.0       0.000000   
Avg_Cadence     658.0  9.702703e+02  5.343824e+02      0.0    1002.426573   
Frequency       658.0  3.367629e+01  3.067049e+01      1.0       6.000000   

                         50%           75%           max  
Total_Distance  938841.50000  1.929158e+06  1.808272e+07  
Moving_Time      61111.50000  1.258072e+05  4.652980e+05  
Avg_Speed         1357.77027  1.673210e+03  5.083855e+03  
Avg_Heart_Rate       0.00000  1.158833e+03  1.767000e+03  
Avg_Cadence       1123.37500  1.278326e+03  2.066523e+03  
Frequency           26.50000  5.300000e+01  1.810000e+02  


In [None]:
import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math

file_name = 'data-1-Daily-Activity.csv' # Make sure this file has been uploaded to Colab

# --- HELPER FUNCTIONS (taken from the original Part 3) ---
def calculate_mean(data_list):
    """Return the arithmetic mean of *data_list*, or 0.0 when it is empty."""
    return sum(data_list) / len(data_list) if data_list else 0.0

def calculate_std_dev(data_list, mean):
    """Return the sample standard deviation of *data_list* around *mean*.

    The squared deviations are divided by (N - 1), i.e. the sample estimator.
    Inputs with fewer than two values (including an empty one) yield 0.0.
    """
    if len(data_list or ()) < 2:
        return 0.0
    squared_deviation_total = sum((x - mean) ** 2 for x in data_list)
    return math.sqrt(squared_deviation_total / (len(data_list) - 1))

# --- 1. DATA AGGREGATION & INITIAL DATA QUALITY CHECK (per daily row) ---

user_activity_data = {}
total_records = 0
relevant_cols_index = {
    0: 'distance_meters', 1: 'moving_time_seconds',
    6: 'average_speed_kmh', 8: 'average_heartrate_bpm', 9: 'average_cadence_spm'
}
invalid_counts = {k: 0 for k in relevant_cols_index}

# newline='' is how the csv module expects files to be opened.
with open(file_name, 'r', newline='') as f:
    reader = csv.reader(f)
    next(reader, None)  # skip header; tolerate an empty file

    for row in reader:
        total_records += 1

        # A. Daily data quality: a cell is invalid when it is empty or zero.
        #    A non-numeric cell or a column missing from a truncated row is
        #    counted as invalid as well instead of aborting the whole run.
        for index in relevant_cols_index:
            try:
                is_invalid = row[index] == '' or float(row[index]) == 0.0
            except (ValueError, IndexError):
                is_invalid = True
            if is_invalid:
                invalid_counts[index] += 1

        # B. Per-user aggregation. The row is parsed completely BEFORE
        #    user_activity_data is touched, so a malformed row cannot leave
        #    behind a user entry with Activity_Count == 0.
        try:
            user_id = row[10]  # Badge_ID
            # Empty cells are read as 0.0.
            dist, moving_time, speed, hr, cadence = (
                float(row[i]) if row[i] else 0.0 for i in (0, 1, 6, 8, 9)
            )
        except (ValueError, IndexError):
            continue

        if user_id not in user_activity_data:
            user_activity_data[user_id] = {
                'Total_Distance_sum': 0.0, 'Moving_Time_sum': 0.0,
                'Avg_Speed_vals': [], 'Avg_Heart_Rate_vals': [], 'Avg_Cadence_vals': [],
                'Activity_Count': 0
            }
        stats = user_activity_data[user_id]

        stats['Total_Distance_sum'] += dist
        stats['Moving_Time_sum'] += moving_time

        # Only positive readings are kept, so that the per-user averages are
        # not pulled down by zero/empty cells.
        if speed > 0.0:
            stats['Avg_Speed_vals'].append(speed)
        if hr > 0.0:
            stats['Avg_Heart_Rate_vals'].append(hr)
        if cadence > 0.0:
            stats['Avg_Cadence_vals'].append(cadence)

        stats['Activity_Count'] += 1

# --- 2. FINAL AGGREGATION & DATA FILTERING ---
# (the recorded run yields 658 aggregated users, 240 of them valid for K-Means)

aggregated_data_dicts = []
original_features = []  # valid users only, raw values
clustering_features_log = []  # valid users only, transformed values

# The three key features a user needs (all non-zero) to enter K-Means.
features_to_select = ['Total_Distance', 'Avg_Speed', 'Avg_Heart_Rate']

for data in user_activity_data.values():
    speeds = data['Avg_Speed_vals']
    heart_rates = data['Avg_Heart_Rate_vals']
    cadences = data['Avg_Cadence_vals']

    # A metric with no positive reading averages to 0.0.
    agg_dict = {
        'Total_Distance': data['Total_Distance_sum'],
        'Moving_Time': data['Moving_Time_sum'],
        'Avg_Speed': np.mean(speeds) if speeds else 0.0,
        'Avg_Heart_Rate': np.mean(heart_rates) if heart_rates else 0.0,
        'Avg_Cadence': np.mean(cadences) if cadences else 0.0,
        'Frequency': data['Activity_Count'],
    }
    aggregated_data_dicts.append(agg_dict)

    # Users with a zero in any key feature stay in the aggregate table but
    # are left out of the clustering inputs.
    if any(agg_dict[feature] == 0 for feature in features_to_select):
        continue

    # Raw values of the key features.
    original_features.append({feature: agg_dict[feature] for feature in features_to_select})

    # Log(X+1) is applied to distance and speed; heart rate is kept as is.
    clustering_features_log.append([
        math.log(agg_dict['Total_Distance'] + 1),
        math.log(agg_dict['Avg_Speed'] + 1),
        agg_dict['Avg_Heart_Rate'],
    ])

# --- 3. STANDARD SCALING (produces the final data used for the plots) ---

# Column-wise view of the transformed features, used for the statistics.
transposed_log_features = [
    [row[i] for row in clustering_features_log]
    for i in range(len(clustering_features_log[0]))
]
means = [calculate_mean(col) for col in transposed_log_features]
stds = [calculate_std_dev(col, mu) for col, mu in zip(transposed_log_features, means)]

# Z-score every value; a column without spread (std == 0) maps to 0.
scaled_features = [
    [(value - mu) / sigma if sigma != 0 else 0 for value, mu, sigma in zip(row, means, stds)]
    for row in clustering_features_log
]

# DataFrames for the distribution plots.
df_raw = pd.DataFrame(original_features)
df_log = pd.DataFrame(clustering_features_log, columns=['Log_Distance', 'Log_Speed', 'Avg_Heart_Rate'])
df_scaled = pd.DataFrame(scaled_features, columns=['Scaled_Log_Distance', 'Scaled_Log_Speed', 'Scaled_Avg_Heart_Rate'])

# --- 4. VISUALISATION 1: DAILY DATA QUALITY (STACKED BAR CHART) ---
column_names = list(relevant_cols_index.values())
invalid_data = list(invalid_counts.values())
clean_data = [total_records - invalid for invalid in invalid_data]

plt.figure(figsize=(12, 7))
x_pos = np.arange(len(column_names))
bar_width = 0.8
# Clean records at the bottom, invalid (empty/zero) records stacked on top.
plt.bar(x_pos, clean_data, color='#4CAF50', edgecolor='white', width=bar_width, label='Data Bersih (Nilai > 0)')
plt.bar(x_pos, invalid_data, color='#FF5722', edgecolor='white', width=bar_width, bottom=clean_data, label='Data Hilang/Nol (Invalid)')
# The record count in the title comes from the data actually read, so it
# cannot go stale when the input file changes.
plt.title(f"Kualitas Data Mentah: Data Bersih vs. Data Hilang/Nol (0) (N={total_records} Records)", fontsize=14)
plt.xticks(x_pos, column_names, rotation=25, ha='right')
plt.legend()
plt.tight_layout()
plt.savefig('eda_data_quality_daily_bar_chart.png')
plt.close()
print(f"Total Baris Data Ditemukan: {total_records}")
print("Visualisasi Kualitas Data Harian telah disimpan sebagai 'eda_data_quality_daily_bar_chart.png'")


# --- 5. DESCRIPTIVE STATISTICS FOR ALL AGGREGATED USERS ---
df_agg_all = pd.DataFrame(aggregated_data_dicts)
descriptive_stats = df_agg_all.describe().T

# The user count in the header is taken from the table itself rather than
# hard-coded, so it always matches the data that was aggregated.
print(f"\n--- STATISTIK DESKRIPTIF DATA TERAGREGASI (N={len(df_agg_all)} Pengguna) ---")
print(descriptive_stats)
descriptive_stats.to_csv('descriptive_stats_agg.csv')
print("Statistik deskriptif telah disimpan sebagai 'descriptive_stats_agg.csv'")


# --- 6. VISUALISATION 3: DISTRIBUTION COMPARISON (3x3 HISTOGRAM GRID) ---
# Rows: raw aggregate, after the log transform, after z-score scaling.
# Columns: the three clustering features.
feature_cols = ['Total_Distance', 'Avg_Speed', 'Avg_Heart_Rate']
log_cols = ['Log_Distance', 'Log_Speed', 'Avg_Heart_Rate']
scaled_cols = ['Scaled_Log_Distance', 'Scaled_Log_Speed', 'Scaled_Avg_Heart_Rate']

fig, axes = plt.subplots(3, 3, figsize=(15, 15))
# The user count is read from df_raw instead of being hard-coded.
fig.suptitle(f'Visualisasi Distribusi Fitur Sebelum dan Sesudah Transformasi (N={len(df_raw)} Pengguna)', fontsize=16, y=1.02)
bins_count = 30

dfs = [df_raw, df_log, df_scaled]
titles = ['Data Agregasi Mentah', 'Setelah Log(X+1) Transform', 'Setelah Standard Scaling (Z-Score)']
colors = ['#1f77b4', '#ff7f0e', '#2ca02c']
col_map = [feature_cols, log_cols, scaled_cols]

for i, (df, current_cols) in enumerate(zip(dfs, col_map)):
    for j, col in enumerate(current_cols):
        ax = axes[i, j]

        ax.hist(df[col], bins=bins_count, color=colors[i], edgecolor='black', alpha=0.7)

        # Dashed red line at the mean of the plotted column.
        mean_val = df[col].mean()
        ax.axvline(mean_val, color='r', linestyle='dashed', linewidth=2, label=f'Mean: {mean_val:.2f}')

        # Feature names label the top row, transformation stages label the
        # first column, and column names label the bottom row.
        if i == 0:
            ax.set_title(f'Distribusi {feature_cols[j]}', fontsize=12)
        if j == 0:
            ax.set_ylabel(titles[i], fontsize=11, fontweight='bold')
        if i == 2:
            ax.set_xlabel(col)

        ax.legend(fontsize=8)
        ax.grid(axis='y', linestyle='--', alpha=0.6)

plt.tight_layout(rect=[0, 0.03, 1, 0.98])
# The suptitle is anchored at y=1.02, i.e. above the figure canvas; without
# bbox_inches='tight' it is cut off in the saved image.
plt.savefig('eda_distribution_comparison_3x3.png', bbox_inches='tight')
plt.close()  # closing after savefig() is safe
print("Visualisasi perbandingan distribusi telah disimpan sebagai 'eda_distribution_comparison_3x3.png'")

Total Baris Data Ditemukan: 22159
Visualisasi Kualitas Data Harian telah disimpan sebagai 'eda_data_quality_daily_bar_chart.png'

--- STATISTIK DESKRIPTIF DATA TERAGREGASI (N=658 Pengguna) ---
                count          mean           std      min            25%  \
Total_Distance  658.0  1.450069e+06  1.798777e+06  25100.0  220728.250000   
Moving_Time     658.0  8.187792e+04  7.934625e+04    840.0   14995.000000   
Avg_Speed       658.0  1.480819e+03  5.096931e+02     10.0    1184.475000   
Avg_Heart_Rate  658.0  4.617360e+02  6.188215e+02      0.0       0.000000   
Avg_Cadence     658.0  9.702703e+02  5.343824e+02      0.0    1002.426573   
Frequency       658.0  3.367629e+01  3.067049e+01      1.0       6.000000   

                         50%           75%           max  
Total_Distance  938841.50000  1.929158e+06  1.808272e+07  
Moving_Time      61111.50000  1.258072e+05  4.652980e+05  
Avg_Speed         1357.77027  1.673210e+03  5.083855e+03  
Avg_Heart_Rate       0.00000  1