In [22]:
import pandas as pd

class DataLoader:
    """Load a German-credit style data file and recode selected columns as integers.

    The file is read as a header-less, whitespace-separated table. Eleven
    positional columns are dropped, the ten remaining ones are renamed, and
    eight categorical columns have their ``A..`` codes replaced by small
    integers using the mapping dictionaries created in ``__init__``.
    """

    def __init__(self, file_path):
        """
        Initialise the DataLoader with the path to the data file.

        :param file_path: Path to the whitespace-separated data file
        """
        self.file_path = file_path
        self.df = None
        # NOTE(review): the category labels in the comments below are the
        # notebook author's groupings; verify them against the dataset codebook.
        self.checking_account_mapping = {
            "A14": 3,   # No checking account
            "A13": 2,   # No balance
            "A12": 2,   # No balance
            "A11": 1    # Some Balance
        }
        self.credit_history_mapping = {
            "A34": 1,   # Some Problems
            "A33": 1,   # Some Problems
            "A32": 3,   # No Problems (in this bank)
            "A31": 3,   # No Problems (in this bank)
            "A30": 2,   # Paid Up
        }
        self.purpose_mapping = {
            "A40": 1,   # New car
            "A41": 2,   # Used car
            "A42": 3,   # Home Related
            "A43": 3,   # Home Related
            "A44": 3,   # Home Related
            "A45": 3,   # Home Related
            "A46": 4,   # Other
            "A47": 4,   # Other
            "A48": 4,   # Other
            "A49": 4,   # Other
            "A410": 4   # Other
        }
        self.savings_mapping = {
            "A61": 2,   # Below 100 DM
            "A62": 3,   # [100, 1000] DM
            "A63": 3,   # [100, 1000] DM
            "A64": 4,   # Above 1000 DM
            "A65": 1    # None
        }
        self.employment_length_mapping = {
            "A71": 1,   # Unemployed
            "A72": 2,   # Below 1 year
            "A73": 3,   # [1, 4)
            "A74": 4,   # [4, 7)
            "A75": 5    # Above 7
        }
        self.sex_marital_status_mapping = {
            "A91": 1,   # Male Divorced/Single
            "A92": 3,   # Female
            "A93": 1,   # Male Divorced/Single
            "A94": 2,   # Male Married/Widowed
            "A95": 3    # Female
        }
        self.guarantor = {
            "A101": 1,  # None
            "A102": 2,  # Yes
            "A103": 2   # Yes
        }
        self.other_installment_plans_mapping = {
            "A141": 1,  # Other Banks or Dept Stores
            "A142": 1,  # Other Banks or Dept Stores
            "A143": 2   # None
        }

    def load_data(self):
        """
        Read the data file and preprocess it.

        :raises ValueError: If a mapped column contains a code that has no
            entry in its mapping (raised by ``_preprocess_data``).
        """
        self.df = pd.read_csv(self.file_path, sep='\\s+', header=None)
        try:
            self._preprocess_data()
        except Exception:
            # Never leave raw, unprocessed data behind for get_data() to return.
            self.df = None
            raise

    def _preprocess_data(self):
        """
        Drop unused columns, rename the rest and apply the code mappings.

        Expects ``self.df`` to hold the raw table with integer column labels
        (11 dropped + 10 kept = 21 columns). ``self.df`` is replaced only after
        every column has been recoded successfully.

        :raises ValueError: If a mapped column contains a non-null code that is
            missing from its mapping.
        """
        # Drop the columns that are not used as features
        frame = self.df.drop(labels=[1,4,7,10,11,12,14,16,17,18,19], axis=1)

        # Rename the remaining columns
        frame.columns = [
            "Checking account balance",
            "Credit history",
            "Purpose",
            "Savings",
            "Employment Length",
            "Sex/Marital Status",
            "Guarantor",
            "Other installment plans",
            "No of Credits at this bank",
            "Mark"
        ]

        # Column name -> mapping; built here so that mappings edited on the
        # instance after construction are still honoured.
        column_mappings = {
            "Checking account balance": self.checking_account_mapping,
            "Credit history": self.credit_history_mapping,
            "Purpose": self.purpose_mapping,
            "Savings": self.savings_mapping,
            "Employment Length": self.employment_length_mapping,
            "Sex/Marital Status": self.sex_marital_status_mapping,
            "Guarantor": self.guarantor,
            "Other installment plans": self.other_installment_plans_mapping,
        }

        for column, mapping in column_mappings.items():
            raw_codes = frame[column]
            recoded = raw_codes.map(mapping)
            # Series.map yields NaN for codes absent from the dict; report them
            # instead of letting NaN leak silently into the feature matrix.
            unmapped = raw_codes[recoded.isna() & raw_codes.notna()]
            if not unmapped.empty:
                codes = sorted(str(code) for code in unmapped.unique())
                raise ValueError(
                    f"Column '{column}' contains codes with no mapping: {codes}"
                )
            frame[column] = recoded

        self.df = frame

    def get_data(self):
        """
        Return the preprocessed data.

        :return: The preprocessed data as a DataFrame
        :raises ValueError: If no data has been loaded yet
        """
        if self.df is None:
            raise ValueError("Data not loaded. Please call `load_data()` first.")
        return self.df

# Example usage: load 'german.data' through DataLoader and preview the first rows.
# `df` is reused by the following cells as the modelling table.
data_loader = DataLoader('german.data')
data_loader.load_data()
df = data_loader.get_data()
print(df.head())


   Checking account balance  Credit history  Purpose  Savings  \
0                         1               1        3        1   
1                         2               3        3        2   
2                         3               1        4        2   
3                         1               3        3        2   
4                         1               1        1        2   

   Employment Length  Sex/Marital Status  Guarantor  Other installment plans  \
0                  5                   1          1                        2   
1                  3                   3          1                        2   
2                  4                   1          1                        2   
3                  4                   1          2                        2   
4                  3                   1          1                        2   

   No of Credits at this bank  Mark  
0                           2     1  
1                           1     2  
2             

In [23]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier

# Split the data into features (X) and labels (y)
y = df["Mark"]
X = df.drop("Mark", axis=1)

# Recode the labels for XGBoost: 1 -> 0, 2 -> 1
y_binary = y.map({1: 0, 2: 1})

# Series.map turns any label outside {1, 2} into NaN; fail here with a clear
# message instead of deep inside a model fit.
if y_binary.isna().any():
    unexpected_labels = sorted(str(label) for label in y[y_binary.isna()].unique())
    raise ValueError(f"Unexpected values in 'Mark' column: {unexpected_labels}")

# Split the data into training and test sets
# NOTE(review): the classes are imbalanced; consider `stratify=y_binary` so both
# splits keep the same class ratio (this would change the reported numbers).
X_train, X_test, y_train, y_test = train_test_split(X, y_binary, test_size=0.2, random_state=42)

In [24]:
def evaluate_model(pipeline, X, y, model_name):
    """Print the 5-fold cross-validated accuracy of ``pipeline`` on ``(X, y)``.

    :param pipeline: Estimator or pipeline passed to ``cross_val_score``
    :param X: Feature matrix
    :param y: Target labels
    :param model_name: Label used as a prefix in the printed messages
    :return: None; the per-fold scores and their mean are printed
    """
    print(f"Evaluating {model_name}...")
    fold_accuracies = cross_val_score(pipeline, X, y, cv=5, scoring='accuracy')
    scores_line = f"{model_name} Cross-Validation Accuracy Scores: {fold_accuracies}"
    # The trailing newline leaves a blank line after each model's summary.
    mean_line = f"{model_name} Mean Cross-Validation Accuracy: {fold_accuracies.mean()}\n"
    print(scores_line, mean_line, sep="\n")

In [25]:
def make_scaled_pipeline(classifier):
    """Wrap ``classifier`` in a pipeline that standardises the features first."""
    return Pipeline([
        ('scaler', StandardScaler()),   # Feature scaling
        ('classifier', classifier)      # Classifier
    ])


def report_test_performance(fitted_model, X_eval, y_eval, report_name, accuracy_name=None):
    """Print the classification report and accuracy of a fitted model.

    :param fitted_model: Already fitted estimator exposing ``predict``
    :param X_eval: Features to predict on
    :param y_eval: True labels matching ``X_eval``
    :param report_name: Prefix for the "Classification Report:" heading
    :param accuracy_name: Prefix for the "Accuracy:" line; defaults to ``report_name``
    :return: The predictions made for ``X_eval``
    """
    predictions = fitted_model.predict(X_eval)
    print(f"{report_name} Classification Report:")
    print(classification_report(y_eval, predictions))
    print(f"{accuracy_name or report_name} Accuracy:", accuracy_score(y_eval, predictions))
    return predictions


# One scale-then-classify pipeline per candidate model
pipeline_rf = make_scaled_pipeline(RandomForestClassifier(random_state=42))
pipeline_lr = make_scaled_pipeline(LogisticRegression(max_iter=1000, random_state=42))
pipeline_gb = make_scaled_pipeline(GradientBoostingClassifier(random_state=42))
pipeline_xgb = make_scaled_pipeline(XGBClassifier(random_state=42))

candidate_pipelines = {
    'Random Forest': pipeline_rf,
    'Logistic Regression': pipeline_lr,
    'Gradient Boosting': pipeline_gb,
    'XGBoost': pipeline_xgb,
}

# For every candidate: cross-validate on the full data set, then fit on the
# training split and report performance on the held-out test split.
for candidate_name, candidate_pipeline in candidate_pipelines.items():
    evaluate_model(candidate_pipeline, X, y_binary, candidate_name)
    candidate_pipeline.fit(X_train, y_train)
    y_pred = report_test_performance(candidate_pipeline, X_test, y_test, candidate_name)

# GridSearchCV for RandomForestClassifier
param_grid = {
    'classifier__n_estimators': [100, 200, 300],
    'classifier__max_depth': [None, 10, 20, 30],
    'classifier__min_samples_split': [2, 5, 10]
}

# Grid search over the Random Forest pipeline, using the training split only
grid_search_rf = GridSearchCV(pipeline_rf, param_grid, cv=5, scoring='accuracy')
grid_search_rf.fit(X_train, y_train)

# Best model found by the search
best_model = grid_search_rf.best_estimator_

# Cross-validate the best model on the full data set
evaluate_model(best_model, X, y_binary, 'Best Model (Random Forest with GridSearchCV)')

# Predict on the test set; best_model is used as returned by the grid search (no extra fit here)
y_pred = report_test_performance(
    best_model,
    X_test,
    y_test,
    'Best Model (Random Forest with GridSearchCV)',
    accuracy_name='Best Model',
)

Evaluating Random Forest...
Random Forest Cross-Validation Accuracy Scores: [0.675 0.72  0.74  0.715 0.7  ]
Random Forest Mean Cross-Validation Accuracy: 0.71

Random Forest Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.81      0.79       141
           1       0.49      0.44      0.46        59

    accuracy                           0.70       200
   macro avg       0.63      0.62      0.63       200
weighted avg       0.69      0.70      0.70       200

Random Forest Accuracy: 0.7
Evaluating Logistic Regression...
Logistic Regression Cross-Validation Accuracy Scores: [0.675 0.705 0.73  0.685 0.73 ]
Logistic Regression Mean Cross-Validation Accuracy: 0.705

Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.87      0.81       141
           1       0.50      0.32      0.39        59

    accuracy                           0.70       200
   macro