In [1]:
import pandas as pd

# Load data dari folder raw/
users = pd.read_csv('users_clean.csv')
tracking = pd.read_csv('tracking_clean.csv')
submission = pd.read_csv('submission_clean.csv')
complete = pd.read_csv('complete_clean.csv')
registration = pd.read_csv('registration_clean.csv')
exam = pd.read_csv('exam_clean.csv')

# Cek struktur awal
print("Users:", users.shape)
print("Tracking:", tracking.shape)
print("Submission:", submission.shape)
print("Complete:", complete.shape)
print("Registration:", registration.shape)
print("Exam:", exam.shape)



Users: (31, 10)
Tracking: (101736, 7)
Submission: (2262, 19)
Complete: (1032, 10)
Registration: (16759, 11)
Exam: (17438, 7)


In [2]:
# 1Ô∏è‚É£ Base utama dari tracking (aktivitas belajar)
df = tracking.copy()

# 2Ô∏è‚É£ Gabung dengan submission (nilai tugas)
df = pd.merge(
    df, submission,
    left_on=['developer_id', 'journey_id'],
    right_on=['submitter_id', 'journey_id'],
    how='left'
)

# 3Ô∏è‚É£ Gabung dengan complete (hasil journey)
df = pd.merge(
    df, complete,
    left_on=['developer_id', 'journey_id'],
    right_on=['user_id', 'journey_id'],
    how='left'
)

# 4Ô∏è‚É£ Gabung registration + exam jadi satu dulu
exam_full = pd.merge(
    registration, exam,
    left_on='id',
    right_on='exam_registration_id',
    how='left'
)

# 5Ô∏è‚É£ Gabungkan hasil ujian ke data utama
df = pd.merge(
    df, exam_full,
    left_on=['developer_id', 'journey_id'],
    right_on=['examinees_id', 'tutorial_id'],
    how='left'
)

# 6Ô∏è‚É£ Tambahkan identitas user (aman dari duplikat kolom)
users_cleaned = users.copy()

# Drop kolom yang sering bentrok
dupes = [c for c in users_cleaned.columns if c in df.columns]
if dupes:
    users_cleaned = users_cleaned.drop(columns=dupes)

df = pd.merge(
    df,
    users_cleaned,
    left_on='developer_id',
    right_on='id',
    how='left',
    suffixes=('', '_user')
)

df.to_csv('merged_data.csv', index=False)
print("‚úÖ merged_data.csv created:", df.shape)



‚úÖ merged_data.csv created: (292493, 61)


In [3]:
drop_cols = [c for c in df.columns if any(x in c.lower() for x in ['applink', 'appcomment', 'note'])]
if drop_cols:
    df = df.drop(columns=drop_cols)
    print(f"üßΩ Kolom berikut dihapus karena tidak diperlukan: {drop_cols}")
else:
    print("‚ÑπÔ∏è Tidak ditemukan kolom aplink, appcomment, atau note.")

üßΩ Kolom berikut dihapus karena tidak diperlukan: ['note']


In [4]:
print("\nüßπ Mulai cleaning dan transformasi...")

# Konversi kolom waktu ke datetime
datetime_cols = [
    'first_opened_at', 'completed_at', 'last_viewed',
    'created_at_x', 'updated_at_x', 'created_at_y', 'updated_at_y'
]
for col in datetime_cols:
    if col in df.columns:
        df[col] = pd.to_datetime(df[col], errors='coerce')

# Hapus duplikat berdasarkan kombinasi unik
expected_cols = ['developer_id', 'journey_id', 'tutorial_id']
available_cols = [c for c in expected_cols if c in df.columns]
if len(available_cols) > 0:
    df = df.drop_duplicates(subset=available_cols, keep='first')
    print(f"‚úÖ Duplikat dihapus berdasarkan: {available_cols}")
else:
    print("‚ö†Ô∏è Tidak ada kolom dari daftar kunci duplikat ditemukan!")

# Hitung durasi belajar per materi (dalam menit)
if 'completed_at' in df.columns and 'first_opened_at' in df.columns:
    df['learning_duration_min'] = (
        (df['completed_at'] - df['first_opened_at']).dt.total_seconds() / 60
    )
    df['learning_duration_min'] = df['learning_duration_min'].fillna(0).clip(lower=0)
    print("‚úÖ learning_duration_min berhasil dihitung!")
else:
    print("‚ö†Ô∏è Kolom waktu tidak lengkap, durasi belajar tidak dihitung.")


üßπ Mulai cleaning dan transformasi...
‚úÖ Duplikat dihapus berdasarkan: ['developer_id', 'journey_id']
‚úÖ learning_duration_min berhasil dihitung!


In [6]:
df.to_csv('merged_data.csv', index=False)
print("\nüéâ Cleaning selesai! Hasil akhir disimpan di: data/processed/merged_data.csv")
print("Ukuran akhir dataset:", df.shape)


üéâ Cleaning selesai! Hasil akhir disimpan di: data/processed/merged_data.csv
Ukuran akhir dataset: (2013, 61)


In [7]:
drop_cols = [
    'id_x_x', 'id_y_x', 'id_x_y', 'id_y_y',
    'quiz_id', 'submitter_id', 'version_id',
    'app_link', 'app_comment', 'admin_comment',
    'status_y', 'as_trial_subscriber', 'current_reviewer',
    'started_review_at', 'ended_review_at',
    'submission_duration', 'created_at_x_x', 'updated_at_x',
    'created_at_y_x', 'updated_at_y',
    'enrollments_at', 'last_enrolled_at',
    'deleted_at', 'deadline_at', 'retake_limit_at',
    'exam_finished_at', 'look_report_at',
    'exam_module_id', 'tutorial_id_y', 'examinees_id',
    'status', 'created_at_x_y', 'created_at_y_y', 'updated_at',
    'id', 'name', 'phone', 'user_verification_status'
]


In [8]:
# Hapus kolom tidak penting
drop_cols = [c for c in drop_cols if c in df.columns]
df = df.drop(columns=drop_cols)

print(f"üßΩ {len(drop_cols)} kolom dihapus: {drop_cols}")


üßΩ 38 kolom dihapus: ['id_x_x', 'id_y_x', 'id_x_y', 'id_y_y', 'quiz_id', 'submitter_id', 'version_id', 'app_link', 'app_comment', 'admin_comment', 'status_y', 'as_trial_subscriber', 'current_reviewer', 'started_review_at', 'ended_review_at', 'submission_duration', 'created_at_x_x', 'updated_at_x', 'created_at_y_x', 'updated_at_y', 'enrollments_at', 'last_enrolled_at', 'deleted_at', 'deadline_at', 'retake_limit_at', 'exam_finished_at', 'look_report_at', 'exam_module_id', 'tutorial_id_y', 'examinees_id', 'status', 'created_at_x_y', 'created_at_y_y', 'updated_at', 'id', 'name', 'phone', 'user_verification_status']


In [9]:
df.rename(columns={
    'tutorial_id_x': 'tutorial_id',
    'status_x': 'status'
}, inplace=True)


In [10]:
if 'user_id' in df.columns:
    if df['developer_id'].equals(df['user_id']):
        print("‚úÖ developer_id dan user_id identik, user_id dihapus.")
        df.drop(columns=['user_id'], inplace=True)
    else:
        print("‚ö†Ô∏è Nilai berbeda, jangan hapus dulu! Periksa manual.")


‚ö†Ô∏è Nilai berbeda, jangan hapus dulu! Periksa manual.


In [11]:
mask_diff = df['developer_id'] != df['user_id']
diff_rows = df.loc[mask_diff, ['developer_id', 'user_id', 'journey_id']]

print(f"üîé Jumlah baris berbeda: {mask_diff.sum()}")
print("Beberapa contoh perbedaan:")
print(diff_rows.head(10))


üîé Jumlah baris berbeda: 981
Beberapa contoh perbedaan:
      developer_id  user_id  journey_id
0            96989      NaN          26
8            96989      NaN          72
106          96989      NaN          92
118          96989      NaN          14
126          96989      NaN          60
421          96989      NaN          80
442          96989      NaN         135
1191         96989      NaN         116
1306         96989      NaN         202
1582         96989      NaN          65


In [12]:
# Isi user_id yang kosong dengan developer_id
df['user_id'] = df['user_id'].fillna(df['developer_id'])
print("üß© user_id yang kosong diisi dengan developer_id.")

# Hapus kolom user_id karena sudah redundant
df.drop(columns=['user_id'], inplace=True)
print("‚úÖ Kolom user_id dihapus, developer_id tetap dipakai sebagai ID utama.")


üß© user_id yang kosong diisi dengan developer_id.
‚úÖ Kolom user_id dihapus, developer_id tetap dipakai sebagai ID utama.


In [14]:
df.to_csv('merged_cleaned.csv', index=False)
print("‚úÖ merged_cleaned.csv diperbarui dengan hasil cleaning (user_id dihapus)")


‚úÖ merged_cleaned.csv diperbarui dengan hasil cleaning (user_id dihapus)
