In [3]:
from little_help import load_csv, get_train_and_test_sets_target_encoded
# Teraz standaryzacja
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score
from scipy.stats import mode
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split


In [4]:
print("Wczytanie danych ...")
data = load_csv('income_evaluation.csv')
print(data.head())

Wczytanie danych ...
   age          workclass   fnlwgt   education   education-num  \
0   39          State-gov    77516   Bachelors              13   
1   50   Self-emp-not-inc    83311   Bachelors              13   
2   38            Private   215646     HS-grad               9   
3   53            Private   234721        11th               7   
4   28            Private   338409   Bachelors              13   

        marital-status          occupation    relationship    race      sex  \
0        Never-married        Adm-clerical   Not-in-family   White     Male   
1   Married-civ-spouse     Exec-managerial         Husband   White     Male   
2             Divorced   Handlers-cleaners   Not-in-family   White     Male   
3   Married-civ-spouse   Handlers-cleaners         Husband   Black     Male   
4   Married-civ-spouse      Prof-specialty            Wife   Black   Female   

    capital-gain   capital-loss   hours-per-week  native-country  income  
0           2174              0 

In [5]:
print(data.columns.tolist())


X_train, X_test, y_train, y_test = get_train_and_test_sets_target_encoded(data, " income")

['age', ' workclass', ' fnlwgt', ' education', ' education-num', ' marital-status', ' occupation', ' relationship', ' race', ' sex', ' capital-gain', ' capital-loss', ' hours-per-week', ' native-country', ' income']


In [6]:

#1. Normalizacja
encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
X_encoded_train = encoder.fit_transform(X_train)
X_encoded_test = encoder.transform(X_test)

# 2. Standaryzacja
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_encoded_train)
X_test_scaled = scaler.transform(X_encoded_test)

# 3. PCA - redukcja wymiarowości
pca = PCA(n_components=10)  # redukujemy do 10 komponentów
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

# 4. KMeans - klasteryzacja
kmeans = KMeans(n_clusters=2, random_state=42)
kmeans.fit(X_train_pca)

# 5. Majority voting - przypisanie etykiet do klastrów
cluster_labels = np.zeros_like(kmeans.labels_)
for i in range(2):
    mask = (kmeans.labels_ == i)
    values, counts = np.unique(y_train[mask], return_counts=True)
    cluster_labels[mask] = values[np.argmax(counts)]

# 6. Predykcja na danych testowych
test_clusters = kmeans.predict(X_test_pca)

test_labels = np.zeros_like(test_clusters)
for i in range(2):
    mask = (kmeans.labels_ == i)
    values, counts = np.unique(y_train[mask], return_counts=True)
    test_labels[test_clusters == i] = values[np.argmax(counts)]

# 7. Ocena
print("Dokładność KMeans + PCA:", accuracy_score(y_test, test_labels))


Dokładność KMeans + PCA: 0.7564870259481038


In [7]:
print("Wczytanie danych ...")
df = load_csv('income_evaluation.csv')

df = df.dropna()

print(df.head())

X = df.drop(" income", axis=1)
y = df[" income"]
cat_cols = X.select_dtypes(include=['object']).columns.tolist()

    # Zastosuj OrdinalEncoder do kolumn kategorycznych
encoder = OrdinalEncoder()
if cat_cols:
    X[cat_cols] = encoder.fit_transform(X[cat_cols])

# Podział na dane treningowe i testowe
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Trening modelu
model = LogisticRegression()
model.fit(X_train, y_train)

preds = model.predict(X_test)

acc = accuracy_score(y_test, preds)

print(f"Accuracy: {acc}")

Wczytanie danych ...
   age          workclass   fnlwgt   education   education-num  \
0   39          State-gov    77516   Bachelors              13   
1   50   Self-emp-not-inc    83311   Bachelors              13   
2   38            Private   215646     HS-grad               9   
3   53            Private   234721        11th               7   
4   28            Private   338409   Bachelors              13   

        marital-status          occupation    relationship    race      sex  \
0        Never-married        Adm-clerical   Not-in-family   White     Male   
1   Married-civ-spouse     Exec-managerial         Husband   White     Male   
2             Divorced   Handlers-cleaners   Not-in-family   White     Male   
3   Married-civ-spouse   Handlers-cleaners         Husband   Black     Male   
4   Married-civ-spouse      Prof-specialty            Wife   Black   Female   

    capital-gain   capital-loss   hours-per-week  native-country  income  
0           2174              0 

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
