In [2]:
import pandas as pd

# Load dataset hasil cleaning
df = pd.read_csv('merged_cleaned.csv')

# Cek ukuran dan 5 baris awal
print("✅ Dataset berhasil dimuat:", df.shape)
df.head()


✅ Dataset berhasil dimuat: (2013, 22)


Unnamed: 0,developer_id,journey_id,tutorial_id,status,first_opened_at,completed_at,last_viewed,reviewer_id,rating,enrolling_times,...,exam_registration_id,total_questions,score,is_passed,display_name,email,user_role,created_at,city_id,learning_duration_min
0,96989,26,599,1,,,2017-05-26 21:34:00,,,,...,,,,,igihcksn,igihcksn@gmail.com,2,2017-04-03 19:16:00,428,0.0
1,96989,32,1152,1,2018-08-29 11:19:00,,,1295.0,4.0,3.0,...,,,,,igihcksn,igihcksn@gmail.com,2,2017-04-03 19:16:00,428,0.0
2,96989,72,2183,1,2018-09-02 17:11:00,,,160797.0,3.0,,...,,,,,igihcksn,igihcksn@gmail.com,2,2017-04-03 19:16:00,428,0.0
3,96989,92,2246,1,2018-10-04 23:09:00,,,,,,...,,,,,igihcksn,igihcksn@gmail.com,2,2017-04-03 19:16:00,428,0.0
4,96989,14,1160,1,2018-11-29 11:38:00,,,,,,...,,,,,igihcksn,igihcksn@gmail.com,2,2017-04-03 19:16:00,428,0.0


In [3]:
# Cek tipe data & missing values
df.info()

# Cek jumlah missing value di tiap kolom
df.isnull().sum().sort_values(ascending=False).head(15)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2013 entries, 0 to 2012
Data columns (total 22 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   developer_id           2013 non-null   int64  
 1   journey_id             2013 non-null   int64  
 2   tutorial_id            2013 non-null   int64  
 3   status                 2013 non-null   int64  
 4   first_opened_at        1512 non-null   object 
 5   completed_at           176 non-null    object 
 6   last_viewed            208 non-null    object 
 7   reviewer_id            529 non-null    float64
 8   rating                 549 non-null    float64
 9   enrolling_times        1032 non-null   float64
 10  study_duration         1032 non-null   object 
 11  avg_submission_rating  474 non-null    float64
 12  exam_registration_id   9 non-null      float64
 13  total_questions        9 non-null      float64
 14  score                  9 non-null      float64
 15  is_p

Unnamed: 0,0
total_questions,2004
exam_registration_id,2004
score,2004
is_passed,2004
completed_at,1837
last_viewed,1805
avg_submission_rating,1539
reviewer_id,1484
rating,1464
study_duration,981


In [4]:
# Konversi kolom waktu ke datetime
time_cols = ['first_opened_at', 'completed_at', 'last_viewed', 'created_at']
for col in time_cols:
    df[col] = pd.to_datetime(df[col], errors='coerce')

# Konversi study_duration ke menit (kalau ada string format jam:menit:detik)
df['study_duration'] = pd.to_timedelta(df['study_duration'], errors='coerce').dt.total_seconds() / 60

# Cek hasil konversi
df[time_cols + ['study_duration']].head()


Unnamed: 0,first_opened_at,completed_at,last_viewed,created_at,study_duration
0,NaT,NaT,2017-05-26 21:34:00,2017-04-03 19:16:00,
1,2018-08-29 11:19:00,NaT,NaT,2017-04-03 19:16:00,7.333333e-10
2,2018-09-02 17:11:00,NaT,NaT,2017-04-03 19:16:00,
3,2018-10-04 23:09:00,NaT,NaT,2017-04-03 19:16:00,
4,2018-11-29 11:38:00,NaT,NaT,2017-04-03 19:16:00,


In [5]:
# Hitung jumlah tutorial unik yang sudah diakses per user
materials_completed = (
    df.groupby('developer_id')['tutorial_id']
    .nunique()
    .reset_index()
    .rename(columns={'tutorial_id': 'materials_completed'})
)

# Cek hasilnya
materials_completed.head(10)


Unnamed: 0,developer_id,materials_completed
0,3390,87
1,5774,66
2,11836,80
3,17833,64
4,32258,114
5,96989,46
6,102556,93
7,110119,64
8,113842,60
9,153116,60


In [6]:
# Pastikan kolom first_opened_at sudah datetime
df['first_opened_at'] = pd.to_datetime(df['first_opened_at'], errors='coerce')

# Hitung jumlah hari unik user aktif (berdasarkan tanggal)
active_days = (
    df.dropna(subset=['first_opened_at'])
      .assign(active_date=df['first_opened_at'].dt.date)
      .groupby('developer_id')['active_date']
      .nunique()
      .reset_index()
      .rename(columns={'active_date': 'active_days'})
)

# Cek hasilnya
active_days.head(10)


Unnamed: 0,developer_id,active_days
0,3390,46
1,5774,34
2,11836,33
3,17833,40
4,32258,59
5,96989,32
6,102556,61
7,110119,30
8,113842,26
9,153116,32


In [7]:
# Gabungkan materials_completed dan active_days jadi satu dataframe
features = pd.merge(
    materials_completed,
    active_days,
    on='developer_id',
    how='outer'
).fillna(0)

# Cek hasil
features.head(10)


Unnamed: 0,developer_id,materials_completed,active_days
0,3390,87,46
1,5774,66,34
2,11836,80,33
3,17833,64,40
4,32258,114,59
5,96989,46,32
6,102556,93,61
7,110119,64,30
8,113842,60,26
9,153116,60,32


In [8]:
# Hitung rata-rata rating per user
avg_rating = (
    df.groupby('developer_id')['rating']
      .mean()
      .reset_index()
      .rename(columns={'rating': 'avg_rating'})
)

# Cek hasil
avg_rating.head(10)


Unnamed: 0,developer_id,avg_rating
0,3390,1.344828
1,5774,1.0
2,11836,2.0
3,17833,3.12
4,32258,2.642857
5,96989,2.590909
6,102556,2.242424
7,110119,2.75
8,113842,1.95
9,153116,2.3125


In [9]:
# Gabungkan avg_rating ke dataframe fitur
features = pd.merge(
    features,
    avg_rating,
    on='developer_id',
    how='left'
).fillna(0)

# Cek hasil
features.head(10)


Unnamed: 0,developer_id,materials_completed,active_days,avg_rating
0,3390,87,46,1.344828
1,5774,66,34,1.0
2,11836,80,33,2.0
3,17833,64,40,3.12
4,32258,114,59,2.642857
5,96989,46,32,2.590909
6,102556,93,61,2.242424
7,110119,64,30,2.75
8,113842,60,26,1.95
9,153116,60,32,2.3125


In [10]:
# Hitung rata-rata nilai ujian per user
avg_score = (
    df.groupby('developer_id')['score']
      .mean()
      .reset_index()
      .rename(columns={'score': 'avg_score'})
)

# Cek hasil
avg_score.head(10)


Unnamed: 0,developer_id,avg_score
0,3390,
1,5774,
2,11836,80.0
3,17833,
4,32258,
5,96989,
6,102556,100.0
7,110119,
8,113842,
9,153116,


In [11]:
# Gabungkan avg_score ke dataframe fitur
features = pd.merge(
    features,
    avg_score,
    on='developer_id',
    how='left'
).fillna(0)

# Cek hasil
features.head(10)


Unnamed: 0,developer_id,materials_completed,active_days,avg_rating,avg_score
0,3390,87,46,1.344828,0.0
1,5774,66,34,1.0,0.0
2,11836,80,33,2.0,80.0
3,17833,64,40,3.12,0.0
4,32258,114,59,2.642857,0.0
5,96989,46,32,2.590909,0.0
6,102556,93,61,2.242424,100.0
7,110119,64,30,2.75,0.0
8,113842,60,26,1.95,0.0
9,153116,60,32,2.3125,0.0


In [12]:
# Hitung total durasi belajar per user (dalam menit)
study_duration_total = (
    df.groupby('developer_id')['study_duration']
      .sum()
      .reset_index()
      .rename(columns={'study_duration': 'study_duration_total'})
)

# Cek hasil
study_duration_total.head(10)


Unnamed: 0,developer_id,study_duration_total
0,3390,4.58e-08
1,5774,2.473333e-08
2,11836,7.88e-08
3,17833,3.111667e-08
4,32258,7.976667e-08
5,96989,1.44e-08
6,102556,1.414333e-07
7,110119,2.313333e-08
8,113842,1.31e-08
9,153116,4.37e-08


In [13]:
# Gabungkan total durasi belajar ke dataframe fitur
features = pd.merge(
    features,
    study_duration_total,
    on='developer_id',
    how='left'
).fillna(0)

# Cek hasil
features.head(10)


Unnamed: 0,developer_id,materials_completed,active_days,avg_rating,avg_score,study_duration_total
0,3390,87,46,1.344828,0.0,4.58e-08
1,5774,66,34,1.0,0.0,2.473333e-08
2,11836,80,33,2.0,80.0,7.88e-08
3,17833,64,40,3.12,0.0,3.111667e-08
4,32258,114,59,2.642857,0.0,7.976667e-08
5,96989,46,32,2.590909,0.0,1.44e-08
6,102556,93,61,2.242424,100.0,1.414333e-07
7,110119,64,30,2.75,0.0,2.313333e-08
8,113842,60,26,1.95,0.0,1.31e-08
9,153116,60,32,2.3125,0.0,4.37e-08


In [14]:
# Tambahkan consistency_score (dibatasi maksimum 1)
features['consistency_score'] = (features['active_days'] / 30).clip(upper=1)

# Cek hasil
features.head(10)


Unnamed: 0,developer_id,materials_completed,active_days,avg_rating,avg_score,study_duration_total,consistency_score
0,3390,87,46,1.344828,0.0,4.58e-08,1.0
1,5774,66,34,1.0,0.0,2.473333e-08,1.0
2,11836,80,33,2.0,80.0,7.88e-08,1.0
3,17833,64,40,3.12,0.0,3.111667e-08,1.0
4,32258,114,59,2.642857,0.0,7.976667e-08,1.0
5,96989,46,32,2.590909,0.0,1.44e-08,1.0
6,102556,93,61,2.242424,100.0,1.414333e-07,1.0
7,110119,64,30,2.75,0.0,2.313333e-08,1.0
8,113842,60,26,1.95,0.0,1.31e-08,0.866667
9,153116,60,32,2.3125,0.0,4.37e-08,1.0


In [15]:
# Tambahkan flag perilaku
features['fast_learner_flag'] = (features['materials_completed'] > 5).astype(int)
features['reflective_learner_flag'] = (features['avg_rating'] < 3).astype(int)

# Lihat hasilnya
features.head(10)


Unnamed: 0,developer_id,materials_completed,active_days,avg_rating,avg_score,study_duration_total,consistency_score,fast_learner_flag,reflective_learner_flag
0,3390,87,46,1.344828,0.0,4.58e-08,1.0,1,1
1,5774,66,34,1.0,0.0,2.473333e-08,1.0,1,1
2,11836,80,33,2.0,80.0,7.88e-08,1.0,1,1
3,17833,64,40,3.12,0.0,3.111667e-08,1.0,1,0
4,32258,114,59,2.642857,0.0,7.976667e-08,1.0,1,1
5,96989,46,32,2.590909,0.0,1.44e-08,1.0,1,1
6,102556,93,61,2.242424,100.0,1.414333e-07,1.0,1,1
7,110119,64,30,2.75,0.0,2.313333e-08,1.0,1,1
8,113842,60,26,1.95,0.0,1.31e-08,0.866667,1,1
9,153116,60,32,2.3125,0.0,4.37e-08,1.0,1,1


In [16]:
# Simpan hasil feature engineering
features.to_csv('features_ready.csv', index=False)

print("✅ features_ready.csv berhasil disimpan di folder data/processed/")
print("📊 Ukuran data:", features.shape)
print("🧠 Kolom fitur:", features.columns.tolist())


✅ features_ready.csv berhasil disimpan di folder data/processed/
📊 Ukuran data: (31, 9)
🧠 Kolom fitur: ['developer_id', 'materials_completed', 'active_days', 'avg_rating', 'avg_score', 'study_duration_total', 'consistency_score', 'fast_learner_flag', 'reflective_learner_flag']


In [22]:
import pandas as pd

# Load kedua dataset
features = pd.read_csv("features_ready.csv")
study_duration = pd.read_csv("study_duration_recomputed.csv")

print("✅ Data berhasil dimuat:")
print("features:", features.shape)
print("study_duration:", study_duration.shape)
print("\nKolom di features:", features.columns.tolist())
print("Kolom di study_duration:", study_duration.columns.tolist())


✅ Data berhasil dimuat:
features: (31, 9)
study_duration: (31, 2)

Kolom di features: ['developer_id', 'materials_completed', 'active_days', 'avg_rating', 'avg_score', 'study_duration_total', 'consistency_score', 'fast_learner_flag', 'reflective_learner_flag']
Kolom di study_duration: ['developer_id', 'study_duration_total']


In [23]:
# Hapus kolom durasi lama
features = features.drop(columns=['study_duration_total'], errors='ignore')

# Gabungkan dengan hasil durasi baru
features = pd.merge(features, study_duration, on='developer_id', how='left')

# Pastikan kolom numerik dibulatkan ke 2 angka di belakang koma
cols_to_round = ['avg_rating', 'avg_score', 'study_duration_total', 'consistency_score']
features[cols_to_round] = features[cols_to_round].round(2)

# Flag tetap integer
features[['fast_learner_flag', 'reflective_learner_flag']] = (
    features[['fast_learner_flag', 'reflective_learner_flag']].astype(int)
)

# Lihat 5 baris pertama hasil akhir
print("📊 Contoh hasil setelah update kolom durasi belajar:")
print(features.head())


📊 Contoh hasil setelah update kolom durasi belajar:
   developer_id  materials_completed  active_days  avg_rating  avg_score  \
0          3390                   87           46        1.34        0.0   
1          5774                   66           34        1.00        0.0   
2         11836                   80           33        2.00       80.0   
3         17833                   64           40        3.12        0.0   
4         32258                  114           59        2.64        0.0   

   consistency_score  fast_learner_flag  reflective_learner_flag  \
0                1.0                  1                        1   
1                1.0                  1                        1   
2                1.0                  1                        1   
3                1.0                  1                        0   
4                1.0                  1                        1   

   study_duration_total  
0                 961.0  
1                 962.0  
2   

In [24]:
max_days = features['active_days'].max()
features['consistency_score'] = (features['active_days'] / max_days).round(2)


In [25]:
print(features[['developer_id', 'active_days', 'consistency_score']].sort_values(by='consistency_score', ascending=False).head(10))


    developer_id  active_days  consistency_score
14        531259           73               1.00
6         102556           61               0.84
4          32258           59               0.81
24       3557828           58               0.79
23       2395575           55               0.75
18       1202367           54               0.74
22       2135328           53               0.73
21       2131303           52               0.71
19       1319457           52               0.71
10        433737           50               0.68


In [27]:
features.to_csv("features_ready_cleaned.csv", index=False)
print("✅ File features_ready_cleaned.csv berhasil disimpan!")


✅ File features_ready_cleaned.csv berhasil disimpan!
