 Tahap 1: Pengumpulan & Eksplorasi Data

1. Load Dataset

In [152]:
import pandas as pd

# Read the raw student-performance table from the project's data folder.
df = pd.read_csv(filepath_or_buffer='../data/StudentsPerformance.csv')

# Preview the first five rows as the cell's rich output.
df.head(n=5)


Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [153]:
# Row-wise mean of the three exam scores.
df['average_score'] = df[['math score', 'reading score', 'writing score']].mean(axis='columns')

# Binary label: 1 when the average reaches 60, otherwise 0.
df['Passed'] = df['average_score'].apply(lambda avg: int(avg >= 60))


In [154]:
# Relabel the 'race/ethnicity' groups with the Indonesian display labels.
#
# `replace` is used instead of `map` so the cell is safe to re-run: `map`
# turns every value that is not a key of the dict into NaN, which means a
# second execution (when the column already holds the translated labels)
# would wipe the whole column. `replace` leaves non-matching values untouched.
#
# NOTE(review): the source column is race/ethnicity, while the new labels
# describe socio-economic tiers - confirm this relabelling is intended.
df['race/ethnicity'] = df['race/ethnicity'].replace({
    'group A': 'Kelompok Sosial Ekonomi 1 (Rendah)',
    'group B': 'Kelompok Sosial Ekonomi 2',
    'group C': 'Kelompok Sosial Ekonomi 3 (Menengah)',
    'group D': 'Kelompok Sosial Ekonomi 4',
    'group E': 'Kelompok Sosial Ekonomi 5 (Tinggi)'
})


In [155]:
# Translate the 'lunch' categories to the Indonesian display labels.
#
# `replace` (rather than `map`) keeps the cell idempotent: values that are
# not keys of the dict - including labels already translated by a previous
# run of this cell - are left as they are instead of becoming NaN.
df['lunch'] = df['lunch'].replace({
    'standard': 'Bayar Sendiri',
    'free/reduced': 'Subsidi Sekolah'
})


In [156]:
# Translate the 'test preparation course' categories to the Indonesian
# display labels.
#
# `replace` (rather than `map`) keeps the cell idempotent: values that are
# not keys of the dict - including labels already translated by a previous
# run of this cell - are left as they are instead of becoming NaN.
df['test preparation course'] = df['test preparation course'].replace({
    'none': 'Tidak Ikut Bimbel',
    'completed': 'Selesai Ikut Bimbel'
})


In [157]:
# Write the relabelled frame (with average_score and Passed) to disk,
# without the row index, for the modelling stage to reload.
df.to_csv(path_or_buf='../data/student_cleaned.csv', index=False)


Tahap 2: Pemodelan Machine Learning

In [158]:
import pandas as pd

# Reload the cleaned table written at the end of the previous stage.
df = pd.read_csv(filepath_or_buffer='../data/student_cleaned.csv')
df.head(n=5)


Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score,average_score,Passed
0,female,Kelompok Sosial Ekonomi 2,bachelor's degree,Bayar Sendiri,Tidak Ikut Bimbel,72,72,74,72.666667,1
1,female,Kelompok Sosial Ekonomi 3 (Menengah),some college,Bayar Sendiri,Selesai Ikut Bimbel,69,90,88,82.333333,1
2,female,Kelompok Sosial Ekonomi 2,master's degree,Bayar Sendiri,Tidak Ikut Bimbel,90,95,93,92.666667,1
3,male,Kelompok Sosial Ekonomi 1 (Rendah),associate's degree,Subsidi Sekolah,Tidak Ikut Bimbel,47,57,44,49.333333,0
4,male,Kelompok Sosial Ekonomi 3 (Menengah),some college,Bayar Sendiri,Tidak Ikut Bimbel,76,78,75,76.333333,1


In [159]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import joblib

# String-valued columns that need one-hot encoding.
cat_features = ['gender', 'race/ethnicity', 'parental level of education', 'lunch', 'test preparation course']
# Numeric score columns; they reach the classifier unchanged via remainder='passthrough'.
num_features = ['math score', 'reading score', 'writing score']

# Build the training split inside this cell. Previously the cell fitted on
# X_train / y_train, which are only created in a later cell, so it failed on
# a fresh kernel (Restart & Run All). The column order, test_size and
# random_state match the later cells, so they rebind these names to the same
# values.
X = df[cat_features + num_features]
y = df['Passed']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# One-hot encode the categorical columns; categories unseen during fit are
# encoded as all zeros instead of raising at predict time.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_features)
    ],
    remainder='passthrough'
)

# Preprocessing and classifier bundled together so a single artifact can be
# loaded for inference on raw feature rows.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

# Train on the 80% training split.
pipeline.fit(X_train, y_train)

# Save the whole fitted pipeline (preprocessing + model) in one file.
joblib.dump(pipeline, '../model/pipeline.pkl')


['../model/pipeline.pkl']

In [160]:
# Feature matrix: five categorical descriptors followed by the three exam scores.
X = df.loc[:, [
    'gender', 'race/ethnicity', 'parental level of education',
    'lunch', 'test preparation course',
    'math score', 'reading score', 'writing score',
]]

# Target label.
# NOTE(review): 'Passed' was computed earlier in this notebook as
# (mean of math/reading/writing score >= 60), and those three scores are also
# features above, so the label is a deterministic function of the inputs.
# Near-perfect metrics are expected - confirm this is the intended setup.
y = df.loc[:, 'Passed']


In [161]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

# Columns that hold string categories and therefore need one-hot encoding.
categorical_features = [
    'gender',
    'race/ethnicity',
    'parental level of education',
    'lunch',
    'test preparation course',
]

# One-hot encode the categorical columns (unknown categories are ignored at
# transform time); every other column is passed through unchanged.
# This rebinds `preprocessor`, replacing the object built in the earlier
# pipeline cell.
preprocessor = ColumnTransformer(
    transformers=[('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)],
    remainder='passthrough',
)


In [162]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [163]:
# Chain preprocessing and a seeded random forest, then fit on the training
# split. `fit` returns the pipeline itself, so `clf` is the fitted estimator.
clf = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42)),
]).fit(X_train, y_train)

# Show the fitted pipeline as the cell output.
clf


The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [164]:
# Predict on the held-out rows.
y_pred = clf.predict(X_test)

# Confusion matrix first, then the per-class precision/recall/F1 report.
print(
    confusion_matrix(y_test, y_pred),
    classification_report(y_test, y_pred),
    sep='\n',
)


[[ 61   1]
 [  3 135]]
              precision    recall  f1-score   support

           0       0.95      0.98      0.97        62
           1       0.99      0.98      0.99       138

    accuracy                           0.98       200
   macro avg       0.97      0.98      0.98       200
weighted avg       0.98      0.98      0.98       200



In [165]:
import joblib

# Save the fitted Random Forest.
# No cell visible in this notebook defines `model`, so the previous
# `joblib.dump(model, ...)` only worked with a leftover kernel variable (and
# could have saved an estimator unrelated to `clf`). Take the classifier from
# the pipeline that was just trained and evaluated.
model = clf.named_steps['classifier']
joblib.dump(model, '../model/model.pkl')

# Save the feature names (in case they are needed at inference time).
joblib.dump(X.columns.tolist(), '../model/feature_names.pkl')

# Save the ColumnTransformer as well. It is read from the fitted pipeline so
# the saved object is guaranteed to be the fitted one that `model` was
# trained with.
joblib.dump(clf.named_steps['preprocessor'], '../model/preprocessor.pkl')


['../model/preprocessor.pkl']