In [4]:
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

# Step 1: Load the data from the URL
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.data"
columns = [
    "Status of existing checking account", "Duration in month", "Credit history", "Purpose",
    "Credit amount", "Savings account/bonds", "Present employment since", "Installment rate in percentage of disposable income",
    "Personal status and sex", "Other debtors / guarantors", "Present residence since", "Property",
    "Age in years", "Other installment plans", "Housing", "Number of existing credits at this bank",
    "Job", "Number of people being liable to provide maintenance for", "Telephone", "Foreign worker", "Target"
]
df = pd.read_csv(url, delim_whitespace=True, header=None, names=columns)

# Step 2: Select relevant columns
df_selected = df[["Age in years", "Credit amount", "Duration in month", "Present employment since", "Housing", "Target"]]

# Step 3: Preprocess categorical columns (encode "Present employment since" and "Housing")
le_employment = LabelEncoder()
le_housing = LabelEncoder()

df_selected['Present employment since'] = le_employment.fit_transform(df_selected['Present employment since'])
df_selected['Housing'] = le_housing.fit_transform(df_selected['Housing'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_selected['Present employment since'] = le_employment.fit_transform(df_selected['Present employment since'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_selected['Housing'] = le_housing.fit_transform(df_selected['Housing'])


In [12]:
le_employment.classes_

array(['A71', 'A72', 'A73', 'A74', 'A75'], dtype=object)

In [5]:
# Step 4: Fix target labels: map 1 -> 0 and 2 -> 1
df_selected['Target'] = df_selected['Target'].map({1: 0, 2: 1})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_selected['Target'] = df_selected['Target'].map({1: 0, 2: 1})


In [6]:
# Step 5: Split the data into train and test sets
X = df_selected.drop("Target", axis=1)
y = df_selected["Target"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
# Step 6: Initialize the XGBoost classifier
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')

In [8]:
# Step 7: Set up GridSearchCV to tune hyperparameters
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.7, 0.8, 1.0],
    'colsample_bytree': [0.7, 0.8, 1.0]
}

grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2, scoring='accuracy')
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 243 candidates, totalling 729 fits


Parameters: { "use_label_encoder" } are not used.



In [9]:
# Step 8: Get the best model and evaluate its performance
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

In [10]:
# Step 9: Output the best hyperparameters and accuracy
print("Best Hyperparameters:", grid_search.best_params_)
print("Accuracy:", accuracy)

Best Hyperparameters: {'colsample_bytree': 1.0, 'learning_rate': 0.01, 'max_depth': 7, 'n_estimators': 200, 'subsample': 0.8}
Accuracy: 0.705


In [11]:
import pickle

# Save the best model to a file
with open('best_model.pkl', 'wb') as file:
    pickle.dump(best_model, file)