 Tahap 1: Pengumpulan & Eksplorasi Data

In [185]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import joblib

In [186]:
# Load the raw student-performance dataset.
df = pd.read_csv('../data/StudentsPerformance.csv')

# Per-student mean of the three exam scores.
score_columns = ['math score', 'reading score', 'writing score']
df['average_score'] = df[score_columns].mean(axis=1)

# Pass/fail label: 1 when the average is at least 60, otherwise 0.
# Vectorised comparison instead of a row-wise lambda; a NaN average compares
# False and therefore still maps to 0.
df['Passed'] = (df['average_score'] >= 60).astype('int64')

# Replace the raw category codes with more descriptive labels.
# NOTE(review): the source column is named 'race/ethnicity' while the new
# labels describe socio-economic tiers -- confirm this relabelling is intended.
label_maps = {
    'race/ethnicity': {
        'group A': 'Kelompok Sosial Ekonomi 1 (Rendah)',
        'group B': 'Kelompok Sosial Ekonomi 2',
        'group C': 'Kelompok Sosial Ekonomi 3 (Menengah)',
        'group D': 'Kelompok Sosial Ekonomi 4',
        'group E': 'Kelompok Sosial Ekonomi 5 (Tinggi)',
    },
    'lunch': {
        'standard': 'Bayar Sendiri',
        'free/reduced': 'Subsidi Sekolah',
    },
    'test preparation course': {
        'none': 'Tidak Ikut Bimbel',
        'completed': 'Selesai Ikut Bimbel',
    },
}
for column, mapping in label_maps.items():
    # Series.map turns every value that is missing from the dict into NaN, so
    # verify the column first rather than silently corrupting the data.
    unmapped = set(df[column].dropna().unique()) - set(mapping)
    if unmapped:
        raise ValueError(
            f"Unexpected values in column '{column}': {sorted(map(str, unmapped))}"
        )
    df[column] = df[column].map(mapping)

# Persist the cleaned, labelled data for later stages.
df.to_csv('../data/student_cleaned.csv', index=False)

Tahap 2: Feature Selection & Split

In [187]:
# Model inputs: five categorical attributes plus the three raw exam scores.
# NOTE(review): 'Passed' is assigned earlier in this notebook from the mean of
# these same three scores, so the target is fully determined by the inputs.
# The evaluation metrics therefore likely reflect target leakage -- confirm
# whether the score columns should really be used as features.
feature_columns = [
    'gender',
    'race/ethnicity',
    'parental level of education',
    'lunch',
    'test preparation course',
    'math score',
    'reading score',
    'writing score',
]
X = df[feature_columns]
y = df['Passed']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

Tahap 3: Preprocessing & Pipeline

In [188]:
# Columns that hold categorical values and need encoding.
cat_features = [
    'gender',
    'race/ethnicity',
    'parental level of education',
    'lunch',
    'test preparation course',
]

# One-hot encode the categorical columns; handle_unknown='ignore' keeps
# prediction from failing on categories that were not seen during fitting.
# Every column not listed in cat_features is passed through unchanged.
one_hot = OneHotEncoder(handle_unknown='ignore')
preprocessor = ColumnTransformer(
    transformers=[('cat', one_hot, cat_features)],
    remainder='passthrough',
)

# Full pipeline: preprocessing followed by a seeded random forest.
forest = RandomForestClassifier(random_state=42)
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', forest),
])

# Fit on the training split.
pipeline.fit(X_train, y_train)

# Score the held-out split and report the results.
y_pred = pipeline.predict(X_test)
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Confusion Matrix:
 [[ 61   1]
 [  3 135]]

Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.98      0.97        62
           1       0.99      0.98      0.99       138

    accuracy                           0.98       200
   macro avg       0.97      0.98      0.98       200
weighted avg       0.98      0.98      0.98       200



Tahap 4: Save Model & Pipeline

In [189]:
import os
import joblib

# Make sure the output directory exists (no error if it is already there).
os.makedirs('../model', exist_ok=True)

# Objects to persist: the fitted classifier on its own and the ordered list of
# input column names.
classifier = pipeline.named_steps['classifier']
feature_names = list(X.columns)

# Complete pipeline (preprocessor + model).
joblib.dump(pipeline, '../model/pipeline.pkl')

# (Optional) classifier step only.
joblib.dump(classifier, '../model/model.pkl')

# (Optional) feature names; kept as the last expression so the cell still
# displays the path list returned by joblib.dump.
joblib.dump(feature_names, '../model/feature_names.pkl')


['../model/feature_names.pkl']