In [5]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [6]:
training_df = pd.read_csv('training_dataset.csv')
validation_df = pd.read_csv('validation_set.csv')

training_df.head(), validation_df.head()

(   customer_number  usia               pekerjaan status_perkawinan  \
 0           531036    63  sosial media specialis           menikah   
 1           999241    43                 teknisi           menikah   
 2           995002    29  sosial media specialis            lajang   
 3           932750    40           pekerja kasar           menikah   
 4           684699    40  sosial media specialis            lajang   
 
           pendidikan gagal_bayar_sebelumnya pinjaman_rumah pinjaman_pribadi  \
 0  Pendidikan Tinggi                     no            yes               no   
 1  Pendidikan Tinggi                     no            yes               no   
 2  Pendidikan Tinggi                     no            yes              yes   
 3                SMA                     no             no               no   
 4  Pendidikan Tinggi                     no             no               no   
 
   jenis_kontak bulan_kontak_terakhir  ... hari_sejak_kontak_sebelumnya  \
 0     cellular

In [7]:
X = training_df.drop(columns=['berlangganan_deposito'])
y = training_df['berlangganan_deposito']

In [8]:
# Identifikasi kolom numerik dan kategorikal
numeric_features = X.select_dtypes(include=["int64", "float64"]).columns.tolist()
numeric_features.remove("customer_number")  # Jangan ubah customer_number
categorical_features = X.select_dtypes(include=["object"]).columns.tolist()

# Buat preprocessing pipeline
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer(transformers=[
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features)
])

# Buat pipeline model
model = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", RandomForestClassifier(random_state=42))
])

# Latih model
model.fit(X.drop(columns=["customer_number"]), y)

# Prediksi probabilitas pada validation set
X_val = validation_df.drop(columns=["customer_number"])
prob_predictions = model.predict_proba(X_val)[:, 1]  # Probabilitas kelas 1

# Buat dataframe hasil
submission_df = pd.DataFrame({
    "customer_number": validation_df["customer_number"],
    "berlangganan_deposito": prob_predictions
})

# Simpan sebagai CSV
submission_path = "prediksi_berlangganan.csv"
submission_df.to_csv(submission_path, index=False)

submission_df.head(), submission_path

(   customer_number  berlangganan_deposito
 0           445420                   0.04
 1           585604                   0.07
 2           888824                   0.02
 3           816820                   0.02
 4           542716                   0.05,
 'prediksi_berlangganan.csv')

In [9]:
from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier

# Ganti model dalam pipeline menjadi XGBoostClassifier untuk akurasi lebih tinggi
xgb_model = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", XGBClassifier(
        objective="binary:logistic",
        eval_metric="logloss",
        use_label_encoder=False,
        random_state=42,
        n_estimators=100,
        max_depth=5,
        learning_rate=0.1
    ))
])

# Cross-validation untuk estimasi akurasi
scores = cross_val_score(
    xgb_model,
    X.drop(columns=["customer_number"]),
    y,
    cv=5,
    scoring="roc_auc"
)

# Latih ulang model di seluruh data
xgb_model.fit(X.drop(columns=["customer_number"]), y)

# Prediksi probabilitas pada validation set
xgb_prob_predictions = xgb_model.predict_proba(X_val)[:, 1]

# Buat file hasil prediksi
xgb_submission_df = pd.DataFrame({
    "customer_number": validation_df["customer_number"],
    "berlangganan_deposito": xgb_prob_predictions
})

xgb_submission_path = "/mnt/data/prediksi_xgboost_berlangganan.csv"
xgb_submission_df.to_csv(xgb_submission_path, index=False)

scores.mean(), xgb_submission_df.head(), xgb_submission_path


Parameters: { "use_label_encoder" } are not used.

Traceback (most recent call last):
  File "C:\Users\62822\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\sklearn\metrics\_scorer.py", line 140, in __call__
    score = scorer._score(
            ^^^^^^^^^^^^^^
  File "C:\Users\62822\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\sklearn\metrics\_scorer.py", line 380, in _score
    y_pred = method_caller(
             ^^^^^^^^^^^^^^
  File "C:\Users\62822\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\sklearn\metrics\_scorer.py", line 90, in _cached_call
    result, _ = _get_response_values(
                ^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\62822\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\

AttributeError: 'super' object has no attribute '__sklearn_tags__'

In [10]:
%pip install XGBClassifier

Note: you may need to restart the kernel to use updated packages.


ERROR: Could not find a version that satisfies the requirement XGBClassifier (from versions: none)

[notice] A new release of pip is available: 24.3.1 -> 25.1.1
[notice] To update, run: C:\Users\62822\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip
ERROR: No matching distribution found for XGBClassifier
