In [23]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, KBinsDiscretizer
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report
import warnings
from sklearn.exceptions import UndefinedMetricWarning
warnings.filterwarnings("ignore", category=UndefinedMetricWarning)



In [9]:
# Wczytanie danych
data_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/glass/glass.data"
column_names = [
    "Id", "RI", "Na", "Mg", "Al", "Si", "K", "Ca", "Ba", "Fe", "Type"
]
data = pd.read_csv(data_url, names=column_names)
data = data.drop(['Id'], axis=1)

In [10]:
# Podstawowe statystyki
print(data.describe())

               RI          Na          Mg          Al          Si           K  \
count  214.000000  214.000000  214.000000  214.000000  214.000000  214.000000   
mean     1.518365   13.407850    2.684533    1.444907   72.650935    0.497056   
std      0.003037    0.816604    1.442408    0.499270    0.774546    0.652192   
min      1.511150   10.730000    0.000000    0.290000   69.810000    0.000000   
25%      1.516522   12.907500    2.115000    1.190000   72.280000    0.122500   
50%      1.517680   13.300000    3.480000    1.360000   72.790000    0.555000   
75%      1.519157   13.825000    3.600000    1.630000   73.087500    0.610000   
max      1.533930   17.380000    4.490000    3.500000   75.410000    6.210000   

               Ca          Ba          Fe        Type  
count  214.000000  214.000000  214.000000  214.000000  
mean     8.956963    0.175047    0.057009    2.780374  
std      1.423153    0.497219    0.097439    2.103739  
min      5.430000    0.000000    0.000000    1

In [11]:
# Liczba wystąpień każdej klasy
print(data["Type"].value_counts())

2    76
1    70
7    29
3    17
5    13
6     9
Name: Type, dtype: int64


In [13]:
# Informacje o typach danych
print(data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 214 entries, 0 to 213
Data columns (total 10 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   RI      214 non-null    float64
 1   Na      214 non-null    float64
 2   Mg      214 non-null    float64
 3   Al      214 non-null    float64
 4   Si      214 non-null    float64
 5   K       214 non-null    float64
 6   Ca      214 non-null    float64
 7   Ba      214 non-null    float64
 8   Fe      214 non-null    float64
 9   Type    214 non-null    int64  
dtypes: float64(9), int64(1)
memory usage: 16.8 KB
None


In [16]:
# Podział na cechy (X) i etykiety (y)
X = data.drop("Type", axis=1)
y = data["Type"]

# Podział na zestaw uczący i walidacyjny
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=39)

In [17]:
# Porównanie wyników bez przetworzenia danych
tree_clf = DecisionTreeClassifier(random_state=42)
tree_clf.fit(X_train, y_train)
y_pred = tree_clf.predict(X_val)
print("Wyniki bez przetworzenia danych:")
print(classification_report(y_val, y_pred))

Wyniki bez przetworzenia danych:
              precision    recall  f1-score   support

           1       0.71      0.91      0.80        11
           2       0.64      0.50      0.56        14
           3       0.60      1.00      0.75         3
           5       0.50      0.25      0.33         4
           6       1.00      0.67      0.80         3
           7       0.89      1.00      0.94         8

    accuracy                           0.72        43
   macro avg       0.72      0.72      0.70        43
weighted avg       0.71      0.72      0.70        43



In [18]:
# Przetworzenie danych - przykłady różnych metod
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

discretizer = KBinsDiscretizer(n_bins=5, encode="ordinal", strategy="uniform")
X_train_discretized = discretizer.fit_transform(X_train)
X_val_discretized = discretizer.transform(X_val)

pca = PCA(n_components=5)
X_train_pca = pca.fit_transform(X_train)
X_val_pca = pca.transform(X_val)

kbest = SelectKBest(chi2, k=5)
X_train_kbest = kbest.fit_transform(X_train, y_train)
X_val_kbest = kbest.transform(X_val)

In [20]:
# Porównanie wyników po przetworzeniu danych
tree_clf_scaled = DecisionTreeClassifier(random_state=42)
tree_clf_scaled.fit(X_train_scaled, y_train)
y_pred_scaled = tree_clf_scaled.predict(X_val_scaled)
print("Wyniki po standaryzacji:")
print(classification_report(y_val, y_pred_scaled))

Wyniki po standaryzacji:
              precision    recall  f1-score   support

           1       0.71      0.91      0.80        11
           2       0.64      0.50      0.56        14
           3       0.60      1.00      0.75         3
           5       0.50      0.25      0.33         4
           6       1.00      0.67      0.80         3
           7       0.89      1.00      0.94         8

    accuracy                           0.72        43
   macro avg       0.72      0.72      0.70        43
weighted avg       0.71      0.72      0.70        43



In [25]:
tree_clf_discretized = DecisionTreeClassifier(random_state=42)
tree_clf_discretized.fit(X_train_discretized, y_train)
y_pred_discretized = tree_clf_discretized.predict(X_val_discretized)
print("Wyniki po dyskretyzacji:")
print(classification_report(y_val, y_pred_discretized))

Wyniki po dyskretyzacji:
              precision    recall  f1-score   support

           1       0.50      0.64      0.56        11
           2       0.60      0.64      0.62        14
           3       0.00      0.00      0.00         3
           5       1.00      0.75      0.86         4
           6       0.67      0.67      0.67         3
           7       0.88      0.88      0.88         8

    accuracy                           0.65        43
   macro avg       0.61      0.60      0.60        43
weighted avg       0.63      0.65      0.63        43



In [27]:
tree_clf_pca = DecisionTreeClassifier(random_state=42)
tree_clf_pca.fit(X_train_pca, y_train)
y_pred_pca = tree_clf_pca.predict(X_val_pca)
print("Wyniki po redukcji wymiarowości (PCA):")
print(classification_report(y_val, y_pred_pca))

Wyniki po redukcji wymiarowości (PCA):
              precision    recall  f1-score   support

           1       0.62      0.73      0.67        11
           2       0.78      0.50      0.61        14
           3       0.17      0.33      0.22         3
           5       1.00      0.75      0.86         4
           6       0.75      1.00      0.86         3
           7       0.88      0.88      0.88         8

    accuracy                           0.67        43
   macro avg       0.70      0.70      0.68        43
weighted avg       0.73      0.67      0.69        43



In [29]:
tree_clf_kbest = DecisionTreeClassifier(random_state=42)
tree_clf_kbest.fit(X_train_kbest, y_train)
y_pred_kbest = tree_clf_kbest.predict(X_val_kbest)
print("Wyniki po selekcji cech (SelectKBest):")
print(classification_report(y_val, y_pred_kbest))

Wyniki po selekcji cech (SelectKBest):
              precision    recall  f1-score   support

           1       0.50      0.55      0.52        11
           2       0.56      0.64      0.60        14
           3       0.33      0.33      0.33         3
           5       1.00      0.25      0.40         4
           6       1.00      0.67      0.80         3
           7       0.89      1.00      0.94         8

    accuracy                           0.63        43
   macro avg       0.71      0.57      0.60        43
weighted avg       0.66      0.63      0.62        43



In [31]:
from sklearn.model_selection import GridSearchCV

# Naiwny klasyfikator Bayesa
nb_clf = GaussianNB()
nb_clf.fit(X_train, y_train)
y_pred_nb = nb_clf.predict(X_val)
print("Wyniki dla naiwnego klasyfikatora Bayesa:")
print(classification_report(y_val, y_pred_nb))

Wyniki dla naiwnego klasyfikatora Bayesa:
              precision    recall  f1-score   support

           1       0.41      0.64      0.50        11
           2       0.43      0.21      0.29        14
           3       0.40      0.67      0.50         3
           5       0.50      0.25      0.33         4
           6       1.00      1.00      1.00         3
           7       0.89      1.00      0.94         8

    accuracy                           0.56        43
   macro avg       0.60      0.63      0.59        43
weighted avg       0.55      0.56      0.53        43



In [32]:
# Drzewo decyzyjne - hiperparametry
param_grid = {
    "max_depth": [None, 5, 10],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 5],
}

tree_clf_grid = DecisionTreeClassifier(random_state=42)
grid_search = GridSearchCV(tree_clf_grid, param_grid, cv=5)
grid_search.fit(X_train, y_train)
y_pred_grid = grid_search.predict(X_val)
print("Wyniki dla drzewa decyzyjnego (GridSearchCV):")
print(classification_report(y_val, y_pred_grid))

Wyniki dla drzewa decyzyjnego (GridSearchCV):
              precision    recall  f1-score   support

           1       0.71      0.91      0.80        11
           2       0.71      0.71      0.71        14
           3       0.50      0.33      0.40         3
           5       1.00      0.75      0.86         4
           6       1.00      0.67      0.80         3
           7       1.00      1.00      1.00         8

    accuracy                           0.79        43
   macro avg       0.82      0.73      0.76        43
weighted avg       0.80      0.79      0.79        43

