In [19]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [20]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

In [21]:
dataset_completions = "/content/drive/MyDrive/Colab Notebooks/pace_analysis/dev_completions.xlsx"
df_completions = pd.read_excel(dataset_completions)
df_completions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1032 entries, 0 to 1031
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   id                 1032 non-null   int64         
 1   user_id            1032 non-null   int64         
 2   journey_id         1032 non-null   int64         
 3   created_at         1032 non-null   datetime64[ns]
 4   updated_at         1032 non-null   datetime64[ns]
 5   enrolling_times    1032 non-null   int64         
 6   enrollments_at     1032 non-null   object        
 7   last_enrolled_at   1032 non-null   datetime64[ns]
 8   study_duration     1032 non-null   int64         
 9   Repeat_enrollment  1032 non-null   int64         
dtypes: datetime64[ns](3), int64(6), object(1)
memory usage: 80.8+ KB


In [22]:
df_completions.head()

Unnamed: 0,id,user_id,journey_id,created_at,updated_at,enrolling_times,enrollments_at,last_enrolled_at,study_duration,Repeat_enrollment
0,104582,96989,32,2020-04-11 14:39:11,2020-04-11 14:39:11,3,"2018-08-29 11:19:45,2019-02-05 09:20:57,2020-0...",2020-02-26 17:05:28,44,1
1,30951,96989,51,2019-08-10 02:42:35,2019-08-10 02:42:35,1,2019-06-20 21:01:59,2019-06-20 21:01:59,50,0
2,74313,96989,74,2019-12-23 06:22:02,2019-12-23 06:22:02,2,"2019-10-29 09:34:40,2019-10-31 09:38:30",2019-10-31 09:38:30,52,1
3,26219,96989,83,2019-06-20 05:45:36,2019-06-20 05:45:36,1,2019-06-14 18:15:40,2019-06-14 18:15:40,5,0
4,23887,96989,104,2019-05-18 08:58:46,2019-05-18 08:58:46,1,2019-05-10 14:30:08,2019-05-10 14:30:08,7,0


In [23]:
dataset_tracking= "/content/drive/MyDrive/Colab Notebooks/pace_analysis/dev_tracking.xlsx"
df_tracking =pd.read_excel(dataset_tracking)
df_tracking.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100176 entries, 0 to 100175
Data columns (total 12 columns):
 #   Column                         Non-Null Count   Dtype         
---  ------                         --------------   -----         
 0   id                             100176 non-null  int64         
 1   journey_id                     100176 non-null  int64         
 2   tutorial_id                    100176 non-null  int64         
 3   developer_id                   100176 non-null  int64         
 4   status                         100176 non-null  int64         
 5   last_viewed                    100176 non-null  datetime64[ns]
 6   first_opened_at                100176 non-null  datetime64[ns]
 7   completed_at                   78806 non-null   datetime64[ns]
 8   developer_journey_status_hash  100176 non-null  object        
 9   learning_duration              78806 non-null   float64       
 10  learning_hour                  100176 non-null  int64         
 11  

In [24]:
def parse_enroll_list(val):
    if pd.isna(val):
        return []
    parts = [p.strip() for p in str(val).split(",") if p.strip()]
    return [pd.to_datetime(p, errors="coerce") for p in parts]

In [25]:
df_completions["enroll_list"] = df_completions["enrollments_at"].apply(parse_enroll_list)
df_completions["first_enrolled_at"] = df_completions["enroll_list"].apply(
    lambda xs: min(xs) if xs else pd.NaT
)
df_completions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1032 entries, 0 to 1031
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   id                 1032 non-null   int64         
 1   user_id            1032 non-null   int64         
 2   journey_id         1032 non-null   int64         
 3   created_at         1032 non-null   datetime64[ns]
 4   updated_at         1032 non-null   datetime64[ns]
 5   enrolling_times    1032 non-null   int64         
 6   enrollments_at     1032 non-null   object        
 7   last_enrolled_at   1032 non-null   datetime64[ns]
 8   study_duration     1032 non-null   int64         
 9   Repeat_enrollment  1032 non-null   int64         
 10  enroll_list        1032 non-null   object        
 11  first_enrolled_at  1032 non-null   datetime64[ns]
dtypes: datetime64[ns](4), int64(6), object(2)
memory usage: 96.9+ KB


In [26]:
df_completions.head()

Unnamed: 0,id,user_id,journey_id,created_at,updated_at,enrolling_times,enrollments_at,last_enrolled_at,study_duration,Repeat_enrollment,enroll_list,first_enrolled_at
0,104582,96989,32,2020-04-11 14:39:11,2020-04-11 14:39:11,3,"2018-08-29 11:19:45,2019-02-05 09:20:57,2020-0...",2020-02-26 17:05:28,44,1,"[2018-08-29 11:19:45, 2019-02-05 09:20:57, 202...",2018-08-29 11:19:45
1,30951,96989,51,2019-08-10 02:42:35,2019-08-10 02:42:35,1,2019-06-20 21:01:59,2019-06-20 21:01:59,50,0,[2019-06-20 21:01:59],2019-06-20 21:01:59
2,74313,96989,74,2019-12-23 06:22:02,2019-12-23 06:22:02,2,"2019-10-29 09:34:40,2019-10-31 09:38:30",2019-10-31 09:38:30,52,1,"[2019-10-29 09:34:40, 2019-10-31 09:38:30]",2019-10-29 09:34:40
3,26219,96989,83,2019-06-20 05:45:36,2019-06-20 05:45:36,1,2019-06-14 18:15:40,2019-06-14 18:15:40,5,0,[2019-06-14 18:15:40],2019-06-14 18:15:40
4,23887,96989,104,2019-05-18 08:58:46,2019-05-18 08:58:46,1,2019-05-10 14:30:08,2019-05-10 14:30:08,7,0,[2019-05-10 14:30:08],2019-05-10 14:30:08


In [27]:
df_completions["completions_duration"] = df_completions["updated_at"] - df_completions["first_enrolled_at"]
df_completions["completions_duration_day"] = df_completions["completions_duration"].dt.days
df_completions["completions_duration_second"] = df_completions["completions_duration"].dt.total_seconds()
df_completions = df_completions[df_completions["completions_duration_second"].notna()]
df_completions = df_completions[df_completions["completions_duration_second"] >=0]

In [28]:
journey_median = (
    df_completions.groupby("journey_id")["completions_duration_day"]
        .median()
)
df_completions["journey_median_pace_day"] = df_completions["journey_id"].map(journey_median)
df_completions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1032 entries, 0 to 1031
Data columns (total 16 columns):
 #   Column                       Non-Null Count  Dtype          
---  ------                       --------------  -----          
 0   id                           1032 non-null   int64          
 1   user_id                      1032 non-null   int64          
 2   journey_id                   1032 non-null   int64          
 3   created_at                   1032 non-null   datetime64[ns] 
 4   updated_at                   1032 non-null   datetime64[ns] 
 5   enrolling_times              1032 non-null   int64          
 6   enrollments_at               1032 non-null   object         
 7   last_enrolled_at             1032 non-null   datetime64[ns] 
 8   study_duration               1032 non-null   int64          
 9   Repeat_enrollment            1032 non-null   int64          
 10  enroll_list                  1032 non-null   object         
 11  first_enrolled_at            1

In [29]:
# # Drop any existing columns that might be related to 'journey_median_pace_day' to ensure a clean state.
# # This handles cases where 'journey_median_pace_day_x', 'journey_median_pace_day_y', or
# # a 'journey_median_pace_day' column with duplicate content might exist.
# columns_to_drop = [col for col in df_completions.columns if 'journey_median_pace_day' in col]
# df_completions = df_completions.drop(columns=columns_to_drop, errors='ignore')

In [30]:
df_completions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1032 entries, 0 to 1031
Data columns (total 16 columns):
 #   Column                       Non-Null Count  Dtype          
---  ------                       --------------  -----          
 0   id                           1032 non-null   int64          
 1   user_id                      1032 non-null   int64          
 2   journey_id                   1032 non-null   int64          
 3   created_at                   1032 non-null   datetime64[ns] 
 4   updated_at                   1032 non-null   datetime64[ns] 
 5   enrolling_times              1032 non-null   int64          
 6   enrollments_at               1032 non-null   object         
 7   last_enrolled_at             1032 non-null   datetime64[ns] 
 8   study_duration               1032 non-null   int64          
 9   Repeat_enrollment            1032 non-null   int64          
 10  enroll_list                  1032 non-null   object         
 11  first_enrolled_at            1

In [31]:
df_completions["relative_pace"] = (
    df_completions["completions_duration_day"] / df_completions["journey_median_pace_day"])
df_completions["relative_pace"].replace([np.inf, -np.inf], np.nan, inplace=True)
df_completions["pace_percent_vs_median"] = (1 - df_completions["relative_pace"]) * 100


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_completions["relative_pace"].replace([np.inf, -np.inf], np.nan, inplace=True)


In [32]:
def categorize_pace(rp: float) -> str:
    if pd.isna(rp):
        return "unknown"
    if rp <= 0.75:
        return "fast"
    if rp <= 1.25:
        return "normal"
    return "slow"

df_completions["pace_category"] = df_completions["relative_pace"].apply(categorize_pace)
comp_clean = df_completions.drop(
    columns=["enroll_list", "study_duration"],
    errors="ignore")

In [33]:
comp_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1032 entries, 0 to 1031
Data columns (total 17 columns):
 #   Column                       Non-Null Count  Dtype          
---  ------                       --------------  -----          
 0   id                           1032 non-null   int64          
 1   user_id                      1032 non-null   int64          
 2   journey_id                   1032 non-null   int64          
 3   created_at                   1032 non-null   datetime64[ns] 
 4   updated_at                   1032 non-null   datetime64[ns] 
 5   enrolling_times              1032 non-null   int64          
 6   enrollments_at               1032 non-null   object         
 7   last_enrolled_at             1032 non-null   datetime64[ns] 
 8   Repeat_enrollment            1032 non-null   int64          
 9   first_enrolled_at            1032 non-null   datetime64[ns] 
 10  completions_duration         1032 non-null   timedelta64[ns]
 11  completions_duration_day     1

Kolom `pace_percent_vs_median` dihitung berdasarkan `relative_pace`. Mari kita bedah satu per satu:

1.  **`relative_pace` (Kecepatan Relatif):**
    *   Ini adalah rasio antara `completions_duration_day` (durasi penyelesaian sebenarnya dalam hari) dengan `journey_median_pace_day` (durasi median penyelesaian untuk perjalanan/journey yang sama). Durasi median ini berfungsi sebagai patokan atau rata-rata tipikal untuk menyelesaikan suatu perjalanan.
    *   **Rumus:** `relative_pace` = `completions_duration_day` / `journey_median_pace_day`
    *   **Fungsi:** Menunjukkan seberapa cepat atau lambat seorang pengguna menyelesaikan suatu perjalanan dibandingkan dengan durasi median perjalanan tersebut. Misalnya:
        *   Jika `relative_pace` = 1, berarti pengguna menyelesaikan perjalanan dalam waktu yang sama dengan median.
        *   Jika `relative_pace` < 1 (misal 0.5), berarti pengguna menyelesaikan lebih cepat dari median.
        *   Jika `relative_pace` > 1 (misal 1.5), berarti pengguna menyelesaikan lebih lambat dari median.

2.  **`pace_percent_vs_median` (Persentase Kecepatan vs Median):**
    *   Kolom ini mengukur seberapa jauh, dalam persentase, kecepatan penyelesaian seorang pengguna berbeda dari kecepatan median perjalanan tersebut.
    *   **Rumus:** `pace_percent_vs_median` = (1 - `relative_pace`) * 100
    *   **Fungsi:** Memberikan indikator yang lebih mudah dibaca tentang performa pengguna dibandingkan dengan patokan median:
        *   **Nilai positif:** Pengguna menyelesaikan perjalanan lebih cepat dari median. Semakin besar nilainya, semakin cepat pengguna tersebut.
            *   Contoh: Jika `pace_percent_vs_median` = 50, berarti pengguna 50% lebih cepat dari median.
        *   **Nilai negatif:** Pengguna menyelesaikan perjalanan lebih lambat dari median. Semakin kecil (lebih negatif) nilainya, semakin lambat pengguna tersebut.
            *   Contoh: Jika `pace_percent_vs_median` = -25, berarti pengguna 25% lebih lambat dari median.
        *   **Nilai nol:** Pengguna menyelesaikan perjalanan tepat pada durasi median.

Secara keseluruhan, `pace_percent_vs_median` membantu kita mengidentifikasi siapa saja pengguna yang sangat cepat, normal, atau lambat dalam menyelesaikan suatu perjalanan dibandingkan dengan rata-rata pengguna lain pada perjalanan yang sama.

In [34]:
comp_clean.to_excel("clean_compeltions.xlsx", index=False)

In [35]:
trk = df_tracking.copy()
trk.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100176 entries, 0 to 100175
Data columns (total 12 columns):
 #   Column                         Non-Null Count   Dtype         
---  ------                         --------------   -----         
 0   id                             100176 non-null  int64         
 1   journey_id                     100176 non-null  int64         
 2   tutorial_id                    100176 non-null  int64         
 3   developer_id                   100176 non-null  int64         
 4   status                         100176 non-null  int64         
 5   last_viewed                    100176 non-null  datetime64[ns]
 6   first_opened_at                100176 non-null  datetime64[ns]
 7   completed_at                   78806 non-null   datetime64[ns]
 8   developer_journey_status_hash  100176 non-null  object        
 9   learning_duration              78806 non-null   float64       
 10  learning_hour                  100176 non-null  int64         
 11  

In [36]:
trk.head()

Unnamed: 0,id,journey_id,tutorial_id,developer_id,status,last_viewed,first_opened_at,completed_at,developer_journey_status_hash,learning_duration,learning_hour,learning_day
0,456484121,555,30195,5410865,1,2025-10-31 09:12:47,2025-10-31 09:12:47,2025-10-31 09:13:03,0x047CFFC6,0.000185,9,Friday
1,456484151,555,30200,5410865,1,2025-10-31 09:13:03,2025-10-31 09:13:03,2025-10-31 09:13:06,0x047CFFC6,3.5e-05,9,Friday
2,456484160,555,30205,5410865,1,2025-10-31 09:13:06,2025-10-31 09:13:06,2025-10-31 09:13:09,0x047CFFC6,3.5e-05,9,Friday
3,456484178,555,30210,5410865,1,2025-10-31 09:13:09,2025-10-31 09:13:09,2025-10-31 09:13:15,0x047CFFC6,6.9e-05,9,Friday
4,456484202,555,30215,5410865,1,2025-10-31 09:13:15,2025-10-31 09:13:15,2025-10-31 09:13:20,0x047CFFC6,5.8e-05,9,Friday


In [37]:
trk["learning_duration_seconds"] = trk["learning_duration"].astype(float) * 86400.0

In [38]:
agg_active = (
    trk.groupby(["developer_id", "journey_id"])
       .agg(
           total_active_seconds=("learning_duration_seconds", "sum"),
           mean_active_seconds_per_tutorial=("learning_duration_seconds", "mean"),
           n_tutorials=("tutorial_id", "nunique"),
           n_completed_events=("completed_at", "count"),
       )
       .reset_index()
)
# 2) Durasi antar-materi: selisih antara first_opened_at tutorial sebelumnya dan berikutnya
trk_sorted = trk.sort_values(["developer_id", "journey_id", "first_opened_at"])
trk_sorted["gap_between_tutorials_seconds"] = (
    trk_sorted.groupby(["developer_id", "journey_id"])["first_opened_at"]
    .diff()
    .dt.total_seconds()
)

gap_agg = (
    trk_sorted.groupby(["developer_id", "journey_id"])
    .agg(
        mean_gap_between_tutorials_seconds=("gap_between_tutorials_seconds", "mean"),
        median_gap_between_tutorials_seconds=("gap_between_tutorials_seconds", "median"),
    )
    .reset_index()
)

# Gabungkan fitur active time + gap antar-materi
tracking_features = agg_active.merge(
    gap_agg, on=["developer_id", "journey_id"], how="left"
)

In [39]:
pace_table = comp_clean.merge(
    tracking_features,
    left_on=["user_id", "journey_id"],
    right_on=["developer_id", "journey_id"],
    how="left",
    suffixes=("", "_trk"),
)
pace_table.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1032 entries, 0 to 1031
Data columns (total 24 columns):
 #   Column                                Non-Null Count  Dtype          
---  ------                                --------------  -----          
 0   id                                    1032 non-null   int64          
 1   user_id                               1032 non-null   int64          
 2   journey_id                            1032 non-null   int64          
 3   created_at                            1032 non-null   datetime64[ns] 
 4   updated_at                            1032 non-null   datetime64[ns] 
 5   enrolling_times                       1032 non-null   int64          
 6   enrollments_at                        1032 non-null   object         
 7   last_enrolled_at                      1032 non-null   datetime64[ns] 
 8   Repeat_enrollment                     1032 non-null   int64          
 9   first_enrolled_at                     1032 non-null   datetime6

### Mengapa `mean_active_seconds_per_tutorial` memiliki nilai kosong (NaN)?

Kolom `mean_active_seconds_per_tutorial` dihitung dengan mengambil rata-rata `learning_duration_seconds` untuk setiap kombinasi `developer_id` (pengguna) dan `journey_id` (perjalanan).

Jika untuk suatu `developer_id` dan `journey_id` tertentu, tidak ada data `learning_duration_seconds` yang valid (semuanya kosong atau NaN), maka nilai rata-rata yang dihitung juga akan menjadi NaN. Ini berarti bahwa untuk kombinasi pengguna-perjalanan tersebut, tidak ada informasi durasi belajar yang tercatat untuk tutorial-tutorial yang ada di dalamnya.

Hal ini bisa terjadi jika:
*   Pengguna tidak menyelesaikan atau bahkan tidak memulai tutorial apapun dalam perjalanan tersebut.
*   Data `learning_duration` (yang kemudian dikonversi menjadi `learning_duration_seconds`) memang tidak tercatat untuk tutorial tersebut.

### Apa tujuan dari kolom `mean_active_seconds_per_tutorial`?

Tujuan utama dari kolom ini adalah untuk mengukur **rata-rata waktu aktif seorang pengguna dalam menyelesaikan setiap tutorial dalam suatu perjalanan**. Ini adalah metrik penting untuk memahami:

*   **Keterlibatan Pengguna:** Seberapa banyak waktu yang secara rata-rata diinvestasikan pengguna pada setiap bagian kecil (tutorial) dari suatu perjalanan belajar.
*   **Pacing (Kecepatan Belajar):** Bersama dengan metrik lain, ini bisa membantu menganalisis pola kecepatan belajar pengguna. Misalnya, apakah pengguna yang cepat cenderung menghabiskan waktu rata-rata yang lebih sedikit per tutorial, atau sebaliknya?
*   **Identifikasi Anomali:** Nilai yang sangat tinggi atau sangat rendah bisa mengindikasikan perilaku pengguna yang tidak biasa atau masalah dengan data itu sendiri.

In [40]:
pace_table.head()

Unnamed: 0,id,user_id,journey_id,created_at,updated_at,enrolling_times,enrollments_at,last_enrolled_at,Repeat_enrollment,first_enrolled_at,...,relative_pace,pace_percent_vs_median,pace_category,developer_id,total_active_seconds,mean_active_seconds_per_tutorial,n_tutorials,n_completed_events,mean_gap_between_tutorials_seconds,median_gap_between_tutorials_seconds
0,104582,96989,32,2020-04-11 14:39:11,2020-04-11 14:39:11,3,"2018-08-29 11:19:45,2019-02-05 09:20:57,2020-0...",2020-02-26 17:05:28,1,2018-08-29 11:19:45,...,18.184615,-1718.461538,slow,96989,0.0,,26,0,2042828.0,3063.0
1,30951,96989,51,2019-08-10 02:42:35,2019-08-10 02:42:35,1,2019-06-20 21:01:59,2019-06-20 21:01:59,0,2019-06-20 21:01:59,...,0.226244,77.375566,fast,96989,901415.0,69339.615385,28,13,190567.7,613.0
2,74313,96989,74,2019-12-23 06:22:02,2019-12-23 06:22:02,2,"2019-10-29 09:34:40,2019-10-31 09:38:30",2019-10-31 09:38:30,1,2019-10-29 09:34:40,...,1.173913,-17.391304,normal,96989,0.0,,59,0,554729.2,255.5
3,26219,96989,83,2019-06-20 05:45:36,2019-06-20 05:45:36,1,2019-06-14 18:15:40,2019-06-14 18:15:40,0,2019-06-14 18:15:40,...,0.104167,89.583333,fast,96989,424079.0,12850.878788,50,33,731610.1,295.0
4,23887,96989,104,2019-05-18 08:58:46,2019-05-18 08:58:46,1,2019-05-10 14:30:08,2019-05-10 14:30:08,0,2019-05-10 14:30:08,...,0.194444,80.555556,fast,96989,496351.0,8272.516667,72,60,9276.789,343.0


In [41]:
pace_table["mean_active_seconds_per_tutorial"] = (
    pace_table["mean_active_seconds_per_tutorial"].fillna(0)
)
pace_table.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1032 entries, 0 to 1031
Data columns (total 24 columns):
 #   Column                                Non-Null Count  Dtype          
---  ------                                --------------  -----          
 0   id                                    1032 non-null   int64          
 1   user_id                               1032 non-null   int64          
 2   journey_id                            1032 non-null   int64          
 3   created_at                            1032 non-null   datetime64[ns] 
 4   updated_at                            1032 non-null   datetime64[ns] 
 5   enrolling_times                       1032 non-null   int64          
 6   enrollments_at                        1032 non-null   object         
 7   last_enrolled_at                      1032 non-null   datetime64[ns] 
 8   Repeat_enrollment                     1032 non-null   int64          
 9   first_enrolled_at                     1032 non-null   datetime6

In [42]:
features = [
    "completions_duration_day",
    "pace_percent_vs_median",
    "total_active_seconds",
    "mean_active_seconds_per_tutorial",
    "mean_gap_between_tutorials_seconds",
    "enrolling_times",
]

X = pace_table[features].fillna(0)

In [43]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [44]:
results = {}
for k in range(2, 8):
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    labels = kmeans.fit_predict(X_scaled)
    sil = silhouette_score(X_scaled, labels)
    results[k] = sil
    print(f"K = {k} -> Silhouette Score = {sil:.6f}")

print("\nHasil Silhouette per K:", results)

K = 2 -> Silhouette Score = 0.657360
K = 3 -> Silhouette Score = 0.659203
K = 4 -> Silhouette Score = 0.560424
K = 5 -> Silhouette Score = 0.562431
K = 6 -> Silhouette Score = 0.574374
K = 7 -> Silhouette Score = 0.566247

Hasil Silhouette per K: {2: np.float64(0.6573600973009626), 3: np.float64(0.659202615433781), 4: np.float64(0.5604238366894891), 5: np.float64(0.5624309399258433), 6: np.float64(0.5743740008021899), 7: np.float64(0.5662467746624349)}


In [45]:
best_k = 3
kmeans_final = KMeans(n_clusters=best_k, random_state=42, n_init=10)
pace_table["pace_cluster"] = kmeans_final.fit_predict(X_scaled)

In [46]:
score = silhouette_score(X_scaled, pace_table["pace_cluster"])
print("Silhouette Score:", score)

Silhouette Score: 0.659202615433781


In [47]:
cluster_summary = pace_table.groupby("pace_cluster")[features].mean()
print("\nRata-rata fitur per cluster:")
print(cluster_summary)


Rata-rata fitur per cluster:
              completions_duration_day  pace_percent_vs_median  \
pace_cluster                                                     
0                          1132.770642            -1003.088517   
1                           235.322126             -178.744890   
2                            19.000000                0.000000   

              total_active_seconds  mean_active_seconds_per_tutorial  \
pace_cluster                                                           
0                     2.857082e+09                      4.530658e+07   
1                     1.281650e+08                      1.878343e+06   
2                     0.000000e+00                      0.000000e+00   

              mean_gap_between_tutorials_seconds  enrolling_times  
pace_cluster                                                       
0                                   1.154260e+06         1.623853  
1                                   5.131076e+05         1.196312  
2     

In [48]:
# 3. Urutkan cluster berdasarkan durasi (semakin kecil -> semakin cepat)
order = cluster_summary["completions_duration_day"].sort_values().index.tolist()
# Mapping: cluster dengan durasi paling kecil = Fast, dst.
cluster_label_map = {
    order[0]: "Fast Learner",
    order[1]: "Normal Learner",
    order[2]: "Slow Learner",
}

pace_table["pace_cluster_label"] = pace_table["pace_cluster"].map(cluster_label_map)

In [49]:
pace_table.head()

Unnamed: 0,id,user_id,journey_id,created_at,updated_at,enrolling_times,enrollments_at,last_enrolled_at,Repeat_enrollment,first_enrolled_at,...,pace_category,developer_id,total_active_seconds,mean_active_seconds_per_tutorial,n_tutorials,n_completed_events,mean_gap_between_tutorials_seconds,median_gap_between_tutorials_seconds,pace_cluster,pace_cluster_label
0,104582,96989,32,2020-04-11 14:39:11,2020-04-11 14:39:11,3,"2018-08-29 11:19:45,2019-02-05 09:20:57,2020-0...",2020-02-26 17:05:28,1,2018-08-29 11:19:45,...,slow,96989,0.0,0.0,26,0,2042828.0,3063.0,1,Normal Learner
1,30951,96989,51,2019-08-10 02:42:35,2019-08-10 02:42:35,1,2019-06-20 21:01:59,2019-06-20 21:01:59,0,2019-06-20 21:01:59,...,fast,96989,901415.0,69339.615385,28,13,190567.7,613.0,1,Normal Learner
2,74313,96989,74,2019-12-23 06:22:02,2019-12-23 06:22:02,2,"2019-10-29 09:34:40,2019-10-31 09:38:30",2019-10-31 09:38:30,1,2019-10-29 09:34:40,...,normal,96989,0.0,0.0,59,0,554729.2,255.5,1,Normal Learner
3,26219,96989,83,2019-06-20 05:45:36,2019-06-20 05:45:36,1,2019-06-14 18:15:40,2019-06-14 18:15:40,0,2019-06-14 18:15:40,...,fast,96989,424079.0,12850.878788,50,33,731610.1,295.0,1,Normal Learner
4,23887,96989,104,2019-05-18 08:58:46,2019-05-18 08:58:46,1,2019-05-10 14:30:08,2019-05-10 14:30:08,0,2019-05-10 14:30:08,...,fast,96989,496351.0,8272.516667,72,60,9276.789,343.0,1,Normal Learner


In [50]:
pace_table.to_excel("pace_analisys.xlsx", index=False)

In [51]:
import joblib
# Save model + scaler + mapping
joblib.dump(kmeans_final, "pace_model_kmeans.pkl")
joblib.dump(scaler, "pace_scaler.pkl")
joblib.dump(cluster_label_map, "pace_cluster_map.pkl")

print("Model, scaler, label mapping saved!")

Model, scaler, label mapping saved!


# inference

In [52]:

# Load model components
kmeans = joblib.load("pace_model_kmeans.pkl")
scaler = joblib.load("pace_scaler.pkl")
cluster_label_map = joblib.load("pace_cluster_map.pkl")

# Features used for inference
features = [
    "completions_duration_day",
    "pace_percent_vs_median",
    "total_active_seconds",
    "mean_active_seconds_per_tutorial",
    "mean_gap_between_tutorials_seconds",
    "enrolling_times",
]

def predict_pace(user_data: dict):
    """
    user_data: dict dengan 6 fitur utama pace
    return: label pace + cluster id + scaled features
    """

    # Convert dict â†’ DataFrame
    df = pd.DataFrame([user_data])

    # Fill missing with 0 (tracking often missing)
    df = df.fillna(0)

    # Extract features
    X = df[features]

    # Scale
    X_scaled = scaler.transform(X)

    # Predict cluster
    cluster_id = int(kmeans.predict(X_scaled)[0])

    # Map cluster to label
    label = cluster_label_map[cluster_id]

    return {
        "cluster_id": cluster_id,
        "pace_label": label,
        "features_scaled": X_scaled.tolist(),
    }


In [60]:
fast_dummy = [
    {
        "completions_duration_day": 19,
        "pace_percent_vs_median": 0,
        "total_active_seconds": 0,
        "mean_active_seconds_per_tutorial": 0,
        "mean_gap_between_tutorials_seconds": 8.28e+07,
        "enrolling_times": 1
    },
    {
        "completions_duration_day": 10,
        "pace_percent_vs_median": 45,
        "total_active_seconds": 0,
        "mean_active_seconds_per_tutorial": 0,
        "mean_gap_between_tutorials_seconds": 7.0e+07,
        "enrolling_times": 1
    },
    {
        "completions_duration_day": 25,
        "pace_percent_vs_median": -10,
        "total_active_seconds": 0,
        "mean_active_seconds_per_tutorial": 0,
        "mean_gap_between_tutorials_seconds": 9.0e+07,
        "enrolling_times": 1
    }
]

normal_dummy = [
    {
        "completions_duration_day": 235,
        "pace_percent_vs_median": -178,
        "total_active_seconds": 1.28e+08,
        "mean_active_seconds_per_tutorial": 1.87e+06,
        "mean_gap_between_tutorials_seconds": 5.13e+05,
        "enrolling_times": 1
    },
    {
        "completions_duration_day": 180,
        "pace_percent_vs_median": -100,
        "total_active_seconds": 9.0e+07,
        "mean_active_seconds_per_tutorial": 1.5e+06,
        "mean_gap_between_tutorials_seconds": 4.0e+05,
        "enrolling_times": 1
    },
    {
        "completions_duration_day": 280,
        "pace_percent_vs_median": -220,
        "total_active_seconds": 1.5e+08,
        "mean_active_seconds_per_tutorial": 2.0e+06,
        "mean_gap_between_tutorials_seconds": 6.0e+05,
        "enrolling_times": 2
    }
]
slow_dummy = [
    {
        "completions_duration_day": 1132,
        "pace_percent_vs_median": -1003,
        "total_active_seconds": 2.85e+09,
        "mean_active_seconds_per_tutorial": 4.53e+07,
        "mean_gap_between_tutorials_seconds": 1.15e+06,
        "enrolling_times": 2
    },
    {
        "completions_duration_day": 1000,
        "pace_percent_vs_median": -900,
        "total_active_seconds": 2.5e+09,
        "mean_active_seconds_per_tutorial": 4.0e+07,
        "mean_gap_between_tutorials_seconds": 1.0e+06,
        "enrolling_times": 2
    },
    {
        "completions_duration_day": 1300,
        "pace_percent_vs_median": -1100,
        "total_active_seconds": 3.0e+09,
        "mean_active_seconds_per_tutorial": 5.0e+07,
        "mean_gap_between_tutorials_seconds": 1.3e+06,
        "enrolling_times": 3
    }
]
dummy_data = {
    "fast": fast_dummy,
    "normal": normal_dummy,
    "slow": slow_dummy
}

In [55]:
def generate_insight(user_data, prediction):
    pace = prediction["pace_label"]
    pct = user_data["pace_percent_vs_median"]
    gap = user_data["mean_gap_between_tutorials_seconds"]

    if pace == "Fast Learner":
        return f"Kamu termasuk Fast Learner! Kamu belajar {pct}% lebih cepat dari median peserta, dan jeda antar materi sangat pendek ({gap} detik). Pertahankan performamu!"

    elif pace == "Normal Learner":
        return f"Kamu termasuk Normal Learner. Kecepatan belajarmu mendekati rata-rata pengguna lain, dengan jeda antar materi {gap} detik. Kamu memiliki ritme belajar yang stabil."

    else:
        return f"Kamu termasuk Slow Learner. Kamu membutuhkan lebih banyak waktu dibanding rata-rata peserta. Cobalah mengurangi jeda antar materi (saat ini {gap} detik) agar tempomu meningkat."


In [61]:
from pprint import pprint

def generate_insight(user_data, prediction):
    pace = prediction["pace_label"]
    pct = user_data["pace_percent_vs_median"]
    gap = user_data["mean_gap_between_tutorials_seconds"]

    if pace == "Fast Learner":
        return f"Kamu termasuk Fast Learner! Kamu belajar {pct}% lebih cepat dari median peserta, dan jeda antar materi sangat pendek ({gap} detik). Pertahankan performamu!"

    elif pace == "Normal Learner":
        return f"Kamu termasuk Normal Learner. Kecepatan belajarmu mendekati rata-rata pengguna lain, dengan jeda antar materi {gap} detik. Kamu memiliki ritme belajar yang stabil."

    else:
        return f"Kamu termasuk Slow Learner. Kamu membutuhkan lebih banyak waktu dibanding rata-rata peserta. Cobalah mengurangi jeda antar materi (saat ini {gap} detik) agar tempomu meningkat."

for sample in dummy_data["fast"]:
    pred = predict_pace(sample)
    insight = generate_insight(sample, pred)
    print("\nFAST SAMPLE")
    pprint(sample)
    print(pred)
    print(insight)

for sample in dummy_data["normal"]:
    pred = predict_pace(sample)
    insight = generate_insight(sample, pred)
    print("\nNORMAL SAMPLE")
    pprint(sample)
    print(pred)
    print(insight)

for sample in dummy_data["slow"]:
    pred = predict_pace(sample)
    insight = generate_insight(sample, pred)
    print("\nSLOW SAMPLE")
    pprint(sample)
    print(pred)
    print(insight)


FAST SAMPLE
{'completions_duration_day': 19,
 'enrolling_times': 1,
 'mean_active_seconds_per_tutorial': 0,
 'mean_gap_between_tutorials_seconds': 82800000.0,
 'pace_percent_vs_median': 0,
 'total_active_seconds': 0}
{'cluster_id': 2, 'pace_label': 'Fast Learner', 'features_scaled': [[-0.6297360082195537, 0.22097859112022733, -0.36821832998608145, -0.3689209420874012, 27.827131286423054, -0.37081541925310213]]}
Kamu termasuk Fast Learner! Kamu belajar 0% lebih cepat dari median peserta, dan jeda antar materi sangat pendek (82800000.0 detik). Pertahankan performamu!

FAST SAMPLE
{'completions_duration_day': 10,
 'enrolling_times': 1,
 'mean_active_seconds_per_tutorial': 0,
 'mean_gap_between_tutorials_seconds': 70000000.0,
 'pace_percent_vs_median': 45,
 'total_active_seconds': 0}
{'cluster_id': 2, 'pace_label': 'Fast Learner', 'features_scaled': [[-0.6479656733102291, 0.25888256276669736, -0.36821832998608145, -0.3689209420874012, 23.490756173422582, -0.37081541925310213]]}
Kamu terma

In [57]:
for sample in dummy_data["slow"]:
    print("\nSLOW SAMPLE")
    pred = predict_pace(sample)
    insight = generate_insight(sample, pred)
    pprint(sample)
    print(pred)
    print(insight)


SLOW SAMPLE
{'completions_duration_day': 250,
 'enrolling_times': 2,
 'mean_active_seconds_per_tutorial': 50,
 'mean_gap_between_tutorials_seconds': 200000,
 'pace_percent_vs_median': -180,
 'total_active_seconds': 500}
{'cluster_id': 1, 'pace_label': 'Normal Learner', 'features_scaled': [[-0.16184127089221745, 0.06936270453434716, -0.36821788770244634, -0.36891808817290456, -0.1560393646581016, 1.1660581256031286]]}
Kamu termasuk Normal Learner. Kecepatan belajarmu mendekati rata-rata pengguna lain, dengan jeda antar materi 200000 detik. Kamu memiliki ritme belajar yang stabil.

SLOW SAMPLE
{'completions_duration_day': 500,
 'enrolling_times': 3,
 'mean_active_seconds_per_tutorial': 2000,
 'mean_gap_between_tutorials_seconds': 800000,
 'pace_percent_vs_median': -400,
 'total_active_seconds': 100000}
{'cluster_id': 1, 'pace_label': 'Normal Learner', 'features_scaled': [[0.344538314959878, -0.11594560129283972, -0.368129873259065, -0.3688067855075337, 0.04722821876379542, 2.70293167045

In [58]:
for sample in dummy_data["slow"]:
    print("\nSLOW SAMPLE")
    pred = predict_pace(sample)
    insight = generate_insight(sample, pred)
    pprint(sample)
    print(pred)
    print(insight)


SLOW SAMPLE
{'completions_duration_day': 250,
 'enrolling_times': 2,
 'mean_active_seconds_per_tutorial': 50,
 'mean_gap_between_tutorials_seconds': 200000,
 'pace_percent_vs_median': -180,
 'total_active_seconds': 500}
{'cluster_id': 1, 'pace_label': 'Normal Learner', 'features_scaled': [[-0.16184127089221745, 0.06936270453434716, -0.36821788770244634, -0.36891808817290456, -0.1560393646581016, 1.1660581256031286]]}
Kamu termasuk Normal Learner. Kecepatan belajarmu mendekati rata-rata pengguna lain, dengan jeda antar materi 200000 detik. Kamu memiliki ritme belajar yang stabil.

SLOW SAMPLE
{'completions_duration_day': 500,
 'enrolling_times': 3,
 'mean_active_seconds_per_tutorial': 2000,
 'mean_gap_between_tutorials_seconds': 800000,
 'pace_percent_vs_median': -400,
 'total_active_seconds': 100000}
{'cluster_id': 1, 'pace_label': 'Normal Learner', 'features_scaled': [[0.344538314959878, -0.11594560129283972, -0.368129873259065, -0.3688067855075337, 0.04722821876379542, 2.70293167045

In [59]:
user_sample = {
    "completions_duration_day": 25,
    "pace_percent_vs_median": 30,
    "total_active_seconds": 18000,
    "mean_active_seconds_per_tutorial": 900,
    "mean_gap_between_tutorials_seconds": 300,
    "enrolling_times": 1
}

output = predict_pace(user_sample)
print(output)

{'cluster_id': 1, 'pace_label': 'Normal Learner', 'features_scaled': [[-0.6175828981591034, 0.24624790555120735, -0.3682024077752185, -0.36886957162646083, -0.22369359200702296, -0.37081541925310213]]}


In [None]:
pred = predict_pace(user_sample)
insight = generate_insight(user_sample, pred)
print(insight)


### Karakteristik 'Fast Learner'

Seorang pengguna dikategorikan sebagai 'Fast Learner' jika mereka memiliki profil yang mirip dengan **Cluster 2** dari model K-Means:

*   **`completions_duration_day` (Durasi Penyelesaian)**: Ini adalah indikator utama. Rata-rata durasi penyelesaian untuk 'Fast Learner' adalah yang paling rendah, yaitu sekitar **19 hari**. Ini menunjukkan bahwa mereka menyelesaikan perjalanan belajar jauh lebih cepat dibandingkan dengan kelompok 'Normal' dan 'Slow' Learner.

*   **`pace_percent_vs_median` (Persentase Kecepatan vs Median)**: Untuk cluster 'Fast Learner', nilai ini rata-rata **0.0**. Ini bisa berarti bahwa meskipun mereka menyelesaikan secara keseluruhan paling cepat, mereka menyelesaikan perjalanannya dalam waktu yang *sama* dengan median durasi untuk perjalanan tersebut. Ini mungkin disebabkan oleh karakteristik journey yang memang singkat atau data yang dominan pada nilai median.

*   **`total_active_seconds` (Total Waktu Aktif)**: Rata-rata waktu aktif untuk kelompok ini cenderung **sangat rendah atau bahkan 0 detik**. Ini bisa mengindikasikan:
    *   Pengguna sudah familiar dengan materi dan langsung menyelesaikan tutorial tanpa menghabiskan banyak waktu aktif.
    *   Ada kemungkinan data tracking aktivitas belajar mereka tidak tercatat secara lengkap atau efektif.

*   **`mean_active_seconds_per_tutorial` (Rata-rata Waktu Aktif per Tutorial)**: Mirip dengan total waktu aktif, nilai ini juga **sangat rendah atau 0**, menunjukkan keterlibatan yang minimal pada setiap tutorial secara individual.

*   **`mean_gap_between_tutorials_seconds` (Rata-rata Jeda Antar Tutorial)**: Kelompok ini menunjukkan rata-rata jeda antar tutorial yang **sangat tinggi**, yaitu sekitar **82 juta detik** (sekitar 2.5 tahun). Ini adalah nilai yang *counter-intuitive* untuk 'Fast Learner' jika diartikan sebagai jeda sebenarnya. Lebih mungkin, ini menunjukkan situasi di mana:
    *   Ada jeda yang sangat panjang karena tutorial pertama dan terakhir yang tercatat berjauhan padahal aktivitas sebenarnya singkat.
    *   Data jeda ini mungkin didominasi oleh kasus-kasus ekstrem atau data tracking yang tidak konsisten.

*   **`enrolling_times` (Jumlah Pendaftaran)**: Rata-rata jumlah pendaftaran untuk 'Fast Learner' adalah **1 kali**, menunjukkan bahwa mereka cenderung menyelesaikan perjalanan dalam satu kali upaya pendaftaran.

**Kesimpulan:**

Kategori 'Fast Learner' sebagian besar ditentukan oleh **waktu penyelesaian keseluruhan yang sangat singkat (`completions_duration_day`)**. Meskipun metrik aktivitas (`total_active_seconds`, `mean_active_seconds_per_tutorial`) dan jeda antar tutorial (`mean_gap_between_tutorials_seconds`) mungkin menunjukkan anomali (kemungkinan karena data tracking atau pola belajar yang unik seperti skip materi), durasi penyelesaian yang rendah adalah faktor penentu utama untuk cluster ini. Ini menunjukkan pengguna yang efisien dalam menyelesaikan modul, terlepas dari detail interaksi mikro mereka dengan setiap tutorial.

### Bagaimana Kategori 'Fast', 'Normal', dan 'Slow Learner' Ditentukan?

Penentuan kategori pembelajar (Fast, Normal, Slow) dilakukan melalui langkah-langkah berikut:

1.  **K-Means Clustering**: Data pengguna, yang telah di-feature engineer (misalnya `completions_duration_day`, `pace_percent_vs_median`, `total_active_seconds`, dll.), dikelompokkan menjadi 3 cluster menggunakan algoritma K-Means.

2.  **Analisis Rata-rata Durasi Penyelesaian**: Untuk setiap cluster yang terbentuk, dihitung rata-rata `completions_duration_day` (durasi penyelesaian dalam hari). Durasi ini menjadi indikator utama seberapa cepat atau lambat anggota cluster tersebut menyelesaikan pembelajaran.

3.  **Pemberian Label Berdasarkan Urutan Durasi**: Setelah mendapatkan rata-rata durasi untuk setiap cluster, cluster-cluster tersebut kemudian diurutkan dari durasi terkecil hingga terbesar:
    *   **Cluster dengan rata-rata `completions_duration_day` TERKECIL** diberi label **'Fast Learner'**.
    *   **Cluster dengan rata-rata `completions_duration_day` MENENGAH** diberi label **'Normal Learner'**.
    *   **Cluster dengan rata-rata `completions_duration_day` TERBESAR** diberi label **'Slow Learner'**.

Dengan metode ini, setiap pengguna yang masuk ke dalam salah satu dari 3 cluster tersebut secara otomatis akan mendapatkan label kecepatan belajar yang sesuai.