In [3]:

# 1. Import library
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import joblib

# 2. Load dataset
train = pd.read_csv("data_training.csv")
test = pd.read_csv("data_testing.csv")

print("Data Training Shape:", train.shape)
print("Data Testing Shape:", test.shape)
print("\nKolom Training:", train.columns.tolist())
print("Kolom Testing:", test.columns.tolist())

# 3. Pengecekan missing value
print("\nCek Missing Value:\n", train.isnull().sum())
train = train.dropna()  # jika ada NA, hapus atau bisa juga imputasi

# 4. Pisahkan fitur dan target
X = train.drop(columns=["quality", "Id"]) # Drop 'Id' column from training data
y = train["quality"]

# 5. Normalisasi (karena fitur kimiawi memiliki skala berbeda)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_test_scaled = scaler.transform(test.drop(columns=["Id"], errors="ignore"))

# 6. Split data training menjadi train dan validasi
X_train, X_val, y_train, y_val = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42, stratify=y
)

# 7. Buat model klasifikasi (contoh: Random Forest)
model = RandomForestClassifier(
    n_estimators=200,
    random_state=42,
    max_depth=None,
    min_samples_split=2,
    n_jobs=-1
)

# 8. Latih model
model.fit(X_train, y_train)

# 9. Evaluasi model di data validasi
y_pred = model.predict(X_val)
acc = accuracy_score(y_val, y_pred)
print("\nAkurasi Validasi:", round(acc, 4))
print("\nConfusion Matrix:\n", confusion_matrix(y_val, y_pred))
print("\nLaporan Klasifikasi:\n", classification_report(y_val, y_pred))

# 10. Cross-validation untuk melihat kestabilan model
cv_scores = cross_val_score(model, X_scaled, y, cv=5, scoring='accuracy')
print("\nRata-rata Akurasi Cross-Val:", np.mean(cv_scores).round(4))

# 11. Prediksi pada data testing
y_test_pred = model.predict(X_test_scaled)

# 12. Simpan hasil prediksi
hasil_prediksi = pd.DataFrame({
    "Id": test["Id"],
    "quality": y_test_pred
})

# Ganti 028 dengan 3 digit terakhir NIM kamu
hasil_prediksi.to_csv("hasilprediksi_028.csv", index=False)
print("\nFile hasilprediksi_028.csv berhasil disimpan!")

# 13. Simpan model dan scaler untuk deployment
joblib.dump(model, "model_wine_quality.pkl")
joblib.dump(scaler, "scaler_wine.pkl")
print("Model dan scaler disimpan.")

Data Training Shape: (857, 13)
Data Testing Shape: (286, 12)

Kolom Training: ['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol', 'quality', 'Id']
Kolom Testing: ['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol', 'Id']

Cek Missing Value:
 fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
Id                      0
dtype: int64

Akurasi Validasi: 0.6105

Confusion Matrix:
 [[ 0  0  0  1  0  0]
 [ 0  0  5  0  0  0]
 [ 0  0 58 14  1  0]
 [ 0  0 24 40  4  0]
 [ 0  0  0 16  6  0]
 [ 0  0  0  0  2 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Rata-rata Akurasi Cross-Val: 0.6371

File hasilprediksi_028.csv berhasil disimpan!
Model dan scaler disimpan.
