In [69]:
import pandas as pd

# Location of the raw, space-delimited data file on the UCI archive.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.data"

# The file is read with header=None, so a name is supplied for each of its
# 21 columns: 20 attributes followed by the label column.
columns = [
    "Status_Checking_Account",
    "Duration_Months",
    "Credit_History",
    "Purpose",
    "Credit_Amount",
    "Savings_Account",
    "Employment_Duration",
    "Installment_Rate",
    "Personal_Status_Sex",
    "Other_Debtors",
    "Residence_Duration",
    "Property",
    "Age",
    "Other_Installment_Plans",
    "Housing",
    "Existing_Credits",
    "Job",
    "Dependents",
    "Telephone",
    "Foreign_Worker",
    "Target",
]

df = pd.read_csv(url, header=None, names=columns, sep=" ")


In [86]:
# Report the frame's dimensions, then how many rows carry each Target label.
print("Shape:", df.shape)
target_counts = df["Target"].value_counts()
print("\nTarget distribution:", target_counts, sep="\n")

Shape: (1000, 21)

Target distribution:
Target
1    700
0    300
Name: count, dtype: int64


In [71]:
print("Before conversion:", df["Target"].unique())

# Recode the raw labels {1, 2} to binary {1, 0}.
# NOTE(review): per the UCI dataset description 1 = good credit and 2 = bad
# credit -- confirm against the dataset documentation.
#
# The cell must be safe to re-run: once converted, the column holds {1, 0},
# and mapping only "1"/"2" would silently turn every 0 into NaN. An already
# binary column is therefore accepted as-is, and anything else is rejected
# loudly instead of being mapped to NaN.
target_as_text = df["Target"].astype(str)
observed_labels = set(target_as_text.unique())

if observed_labels <= {"1", "2"}:
    # Raw file encoding: first conversion.
    df["Target"] = target_as_text.map({"1": 1, "2": 0})
elif observed_labels <= {"0", "1"}:
    # Already converted by an earlier run of this cell: keep values unchanged.
    df["Target"] = target_as_text.map({"0": 0, "1": 1})
else:
    raise ValueError(
        f"Unexpected Target labels {sorted(observed_labels)}; "
        "expected raw {1, 2} or already-converted {0, 1}."
    )

print("After conversion:", df["Target"].unique())
print("Null values:", df["Target"].isnull().sum())


Before conversion: [1 2]
After conversion: [1 0]
Null values: 0


In [72]:
# Separate the label vector from the feature matrix (every column but the label).
y = df["Target"]
X = df.drop(columns=["Target"])

In [73]:
# Partition the feature columns by dtype: object-typed columns are treated as
# categorical, every other column as numerical.
numerical_cols = X.select_dtypes(exclude="object").columns
categorical_cols = X.select_dtypes(include="object").columns

print("Categorical:", categorical_cols.size)
print("Numerical:", numerical_cols.size)

Categorical: 13
Numerical: 7


In [74]:
#encode categorical features

from sklearn.preprocessing import LabelEncoder

X_encoded = X.copy()

# Keep every fitted encoder, keyed by column name. Without this the
# category -> integer mapping is lost when `le` is rebound on the next
# iteration, so raw records could not be encoded consistently later and the
# codes could not be decoded via `inverse_transform`.
label_encoders = {}

for col in categorical_cols:
    le = LabelEncoder()
    # Each categorical column gets its own encoder, fitted on that column only.
    X_encoded[col] = le.fit_transform(X_encoded[col])
    label_encoders[col] = le


In [75]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. The split is stratified on y and uses
# a fixed random_state so it is repeatable.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, **split_options)

print(X_train.shape, X_test.shape)


(800, 20) (200, 20)


In [76]:
from sklearn.preprocessing import StandardScaler

# Standardise the numerical columns only. The scaler is fitted on the training
# rows alone and then applied unchanged to both splits, so no statistics from
# the test rows enter the fit.
scaler = StandardScaler().fit(X_train[numerical_cols])

X_train_scaled = X_train.copy()
X_train_scaled[numerical_cols] = scaler.transform(X_train[numerical_cols])

X_test_scaled = X_test.copy()
X_test_scaled[numerical_cols] = scaler.transform(X_test[numerical_cols])


In [77]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

from xgboost import XGBClassifier

from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score, matthews_corrcoef,
    confusion_matrix, classification_report
)


In [78]:
def get_model(model_name):
    """Build a fresh, unfitted classifier for a display name.

    Parameters
    ----------
    model_name : str
        One of "Logistic Regression", "Decision Tree", "KNN", "Naive Bayes",
        "Random Forest" or "XGBoost".

    Returns
    -------
    estimator
        A new estimator instance configured with this notebook's
        hyperparameters. Nothing is fitted here.

    Raises
    ------
    ValueError
        If ``model_name`` is not one of the supported names. Previously the
        function fell through and returned ``None``, which only failed later
        as an ``AttributeError`` on ``.fit``.
    """
    # Factories are lambdas so that only the requested estimator is built.
    factories = {
        "Logistic Regression": lambda: LogisticRegression(
            max_iter=1000, random_state=42
        ),
        "Decision Tree": lambda: DecisionTreeClassifier(
            max_depth=5, min_samples_split=10, random_state=42
        ),
        "KNN": lambda: KNeighborsClassifier(n_neighbors=7),
        "Naive Bayes": lambda: GaussianNB(),
        "Random Forest": lambda: RandomForestClassifier(
            n_estimators=100, random_state=42
        ),
        # `use_label_encoder` is intentionally not passed: the installed
        # XGBoost reports it as an unused parameter and emits a warning.
        "XGBoost": lambda: XGBClassifier(
            n_estimators=200,
            max_depth=4,
            learning_rate=0.05,
            random_state=42,
            eval_metric='logloss',
        ),
    }

    if model_name not in factories:
        supported = ", ".join(factories)
        raise ValueError(
            f"Unknown model name {model_name!r}. Supported names: {supported}"
        )

    return factories[model_name]()


In [79]:
def evaluate_model(model, X_test, y_test):
    """Score a fitted classifier on a held-out set.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict`` and ``predict_proba``.
    X_test
        Feature rows to score.
    y_test
        True labels for ``X_test``.

    Returns
    -------
    tuple
        ``(metrics, cm, report)``: a dict with the keys "Accuracy", "AUC",
        "Precision", "Recall", "F1 Score" and "MCC"; the confusion matrix;
        and the text classification report.
    """
    predicted_labels = model.predict(X_test)

    # ROC AUC needs a continuous score rather than hard labels, so it is
    # computed from the second column (index 1) of predict_proba.
    positive_scores = model.predict_proba(X_test)[:, 1]

    # Each metric is paired with the estimate it is computed from; the dict
    # order fixes the key order of the returned metrics.
    scorers = {
        "Accuracy": (accuracy_score, predicted_labels),
        "AUC": (roc_auc_score, positive_scores),
        "Precision": (precision_score, predicted_labels),
        "Recall": (recall_score, predicted_labels),
        "F1 Score": (f1_score, predicted_labels),
        "MCC": (matthews_corrcoef, predicted_labels),
    }
    metrics = {
        label: scorer(y_test, estimate)
        for label, (scorer, estimate) in scorers.items()
    }

    return (
        metrics,
        confusion_matrix(y_test, predicted_labels),
        classification_report(y_test, predicted_labels),
    )


In [80]:
model_name = "Logistic Regression"
model = get_model(model_name)

# Logistic Regression and KNN are trained and scored on the standardised
# features; every other model uses the unscaled ones.
if model_name in ["Logistic Regression", "KNN"]:
    fit_features, eval_features = X_train_scaled, X_test_scaled
else:
    fit_features, eval_features = X_train, X_test

model.fit(fit_features, y_train)
metrics, cm, report = evaluate_model(model, eval_features, y_test)

print("Metrics:", metrics, sep="\n")
print("\nConfusion Matrix:", cm, sep="\n")
print("\nClassification Report:", report, sep="\n")


Metrics:
{'Accuracy': 0.745, 'AUC': np.float64(0.783452380952381), 'Precision': 0.8068965517241379, 'Recall': 0.8357142857142857, 'F1 Score': 0.8210526315789474, 'MCC': np.float64(0.3787535437491282)}

Confusion Matrix:
[[ 32  28]
 [ 23 117]]

Classification Report:
              precision    recall  f1-score   support

           0       0.58      0.53      0.56        60
           1       0.81      0.84      0.82       140

    accuracy                           0.74       200
   macro avg       0.69      0.68      0.69       200
weighted avg       0.74      0.74      0.74       200



In [81]:
# Display names of every model to train; each must be accepted by get_model.
models_list = [
    "Logistic Regression", "Decision Tree", "KNN",
    "Naive Bayes", "Random Forest", "XGBoost",
]

# results collects one metrics dict per model; trained_models keeps the fitted
# estimators by name so they can be saved afterwards.
results, trained_models = [], {}

for name in models_list:
    model = get_model(name)

    # Logistic Regression and KNN get the standardised features; the other
    # models are fitted and scored on the unscaled ones.
    uses_scaled = name in ("Logistic Regression", "KNN")
    train_features = X_train_scaled if uses_scaled else X_train
    test_features = X_test_scaled if uses_scaled else X_test

    model.fit(train_features, y_train)
    metrics, _, _ = evaluate_model(model, test_features, y_test)
    metrics["Model"] = name

    results.append(metrics)
    trained_models[name] = model

results_df = pd.DataFrame(results)
results_df


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Unnamed: 0,Accuracy,AUC,Precision,Recall,F1 Score,MCC,Model
0,0.745,0.783452,0.806897,0.835714,0.821053,0.378754,Logistic Regression
1,0.72,0.757262,0.868421,0.707143,0.779528,0.423146,Decision Tree
2,0.71,0.714167,0.769737,0.835714,0.80137,0.270803,KNN
3,0.695,0.75119,0.831933,0.707143,0.764479,0.348959,Naive Bayes
4,0.775,0.809464,0.806452,0.892857,0.847458,0.431124,Random Forest
5,0.76,0.78869,0.798701,0.878571,0.836735,0.394089,XGBoost


In [82]:
import joblib

# One pickle per fitted model, named after the model with spaces replaced by
# underscores.
for name, model in trained_models.items():
    file_name = f'{name.replace(" ", "_")}.pkl'
    joblib.dump(model, file_name)

# Also persist the fitted scaler and the list of columns it was fitted on.
for artifact, path in ((scaler, "scaler.pkl"), (numerical_cols, "numerical_cols.pkl")):
    joblib.dump(artifact, path)

print("All models saved successfully!")


All models saved successfully!


In [83]:
import joblib

# Round-trip check: reload one of the saved models and show its repr.
# joblib.load unpickles the file, so only load files this notebook wrote.
saved_model_path = "Random_Forest.pkl"
test_model = joblib.load(saved_model_path)
print(test_model)

RandomForestClassifier(random_state=42)


In [84]:
# Confirm the reloaded model kept its configured number of trees.
print(test_model.n_estimators)


100


In [85]:
# Show the reloaded model's importance scores; the array is printed without
# feature names.
print(test_model.feature_importances_)


[0.10879693 0.1057941  0.05604292 0.06797536 0.13281018 0.04408836
 0.04979409 0.04026274 0.03625761 0.02176261 0.04148979 0.04751329
 0.10116703 0.02950183 0.02480064 0.0212145  0.02953641 0.01484674
 0.02131467 0.00503022]


In [87]:
# Bundle the held-out features (label-encoded, not standardised) with their
# true labels in a single frame; the label goes in a trailing "Target" column.
test_df = X_test.assign(Target=y_test.values)

# Write the frame to disk without the row index.
test_df.to_csv("2025AA05848_assignment2_test_data.csv", index=False)

print("Test data CSV created successfully!")


Test data CSV created successfully!
