### 1. Wczytanie bibliotek, danych i opis data frame'u

#### 1.1. Biblioteki

In [1454]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import mean_absolute_error,root_mean_squared_error, mean_squared_error, r2_score, accuracy_score, classification_report, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import svm

#### 1.2. Wczytanie danych

In [1455]:
df = pd.read_csv('adult.data', sep=',', skipinitialspace=True)

In [1456]:
df.columns = ['age', 'workclass', 'fnlwgt', 'education', 'education_num', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'capital_gain', 'capital_loss', 'hours_per_week', 'native_country', 'income']
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
1,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
2,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
3,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
4,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K


#### 1.3. Opis DF

In [1457]:
df.describe()

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week
count,32560.0,32560.0,32560.0,32560.0,32560.0,32560.0
mean,38.581634,189781.8,10.08059,1077.615172,87.306511,40.437469
std,13.640642,105549.8,2.572709,7385.402999,402.966116,12.347618
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117831.5,9.0,0.0,0.0,40.0
50%,37.0,178363.0,10.0,0.0,0.0,40.0
75%,48.0,237054.5,12.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


In [1458]:
df.isnull().sum()

age               0
workclass         0
fnlwgt            0
education         0
education_num     0
marital_status    0
occupation        0
relationship      0
race              0
sex               0
capital_gain      0
capital_loss      0
hours_per_week    0
native_country    0
income            0
dtype: int64

In [1459]:
df.dtypes

age                int64
workclass         object
fnlwgt             int64
education         object
education_num      int64
marital_status    object
occupation        object
relationship      object
race              object
sex               object
capital_gain       int64
capital_loss       int64
hours_per_week     int64
native_country    object
income            object
dtype: object

### 2. Przygotowanie danych

#### 2.1. Usunięcie niepotrzebnych kolumn

- fnlwgt: Ponieważ nie jest potrzebna w analizie ani modelowaniu, ponieważ reprezentuje wagę próby (nie wnosi dodatkowych informacji na temat relacji między predyktorami a zmienną docelową).
- education: Istnieje już kolumna education-num
- relationship: Zbędna, mniej istotna niż martial_status
- capital_gain i capital_loss: Zbędna w kontekście analizy zysków zawodowych

In [1460]:
df = df[['age', 'workclass', 'education_num','marital_status', 'occupation', 'race', 'sex', 'hours_per_week', 'native_country', 'income']]
df.head()

Unnamed: 0,age,workclass,education_num,marital_status,occupation,race,sex,hours_per_week,native_country,income
0,50,Self-emp-not-inc,13,Married-civ-spouse,Exec-managerial,White,Male,13,United-States,<=50K
1,38,Private,9,Divorced,Handlers-cleaners,White,Male,40,United-States,<=50K
2,53,Private,7,Married-civ-spouse,Handlers-cleaners,Black,Male,40,United-States,<=50K
3,28,Private,13,Married-civ-spouse,Prof-specialty,Black,Female,40,Cuba,<=50K
4,37,Private,14,Married-civ-spouse,Exec-managerial,White,Female,40,United-States,<=50K


#### 2.2. Obsługa brakujących danych 

In [1461]:
df.isin(['?']).sum() # Braki znajdują się w kolumnach workclass, occupation i native_country. Native_country mogą tak pozostać ponieważ i tak później
# są pogrupowane jako other

age                  0
workclass         1836
education_num        0
marital_status       0
occupation        1843
race                 0
sex                  0
hours_per_week       0
native_country     583
income               0
dtype: int64

In [1462]:
df.workclass.value_counts()

workclass
Private             22696
Self-emp-not-inc     2541
Local-gov            2093
?                    1836
State-gov            1297
Self-emp-inc         1116
Federal-gov           960
Without-pay            14
Never-worked            7
Name: count, dtype: int64

In [1463]:
df = df[~df['workclass'].isin(['Without-pay', 'Never-worked'])] # Usunięcie osób nie zarabiających ponieważ nie powinni być brani pod uwagę

df['workclass'].value_counts()

workclass
Private             22696
Self-emp-not-inc     2541
Local-gov            2093
?                    1836
State-gov            1297
Self-emp-inc         1116
Federal-gov           960
Name: count, dtype: int64

In [1464]:
df['workclass'].replace('?', 'Other', inplace=True)

df['workclass'].value_counts()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['workclass'].replace('?', 'Other', inplace=True)


workclass
Private             22696
Self-emp-not-inc     2541
Local-gov            2093
Other                1836
State-gov            1297
Self-emp-inc         1116
Federal-gov           960
Name: count, dtype: int64

In [1465]:
df.occupation.value_counts()

occupation
Prof-specialty       4140
Craft-repair         4098
Exec-managerial      4066
Adm-clerical         3766
Sales                3650
Other-service        3294
Machine-op-inspct    2001
?                    1836
Transport-moving     1596
Handlers-cleaners    1369
Farming-fishing       988
Tech-support          928
Protective-serv       649
Priv-house-serv       149
Armed-Forces            9
Name: count, dtype: int64

In [1466]:
df['occupation'].replace('Armed-Forces', '?', inplace=True) # Dodanie armed forces do ? ponieważ jest ich niewiele

df['occupation'].replace('?', 'Other', inplace=True)

df['occupation'].replace(['Protective-serv', 'Priv-house-serv'], 'Other-service', inplace=True) # Połączenie usług

df['occupation'].value_counts()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['occupation'].replace('Armed-Forces', '?', inplace=True) # Dodanie armed forces do ? ponieważ jest ich niewiele
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['occupation'].replace('?', 'Other', inplace=True)
The behavior will change in pandas 3.0. This inplace method will

occupation
Prof-specialty       4140
Craft-repair         4098
Other-service        4092
Exec-managerial      4066
Adm-clerical         3766
Sales                3650
Machine-op-inspct    2001
Other                1845
Transport-moving     1596
Handlers-cleaners    1369
Farming-fishing       988
Tech-support          928
Name: count, dtype: int64

In [1467]:
df.isin(['?']).sum()

age                 0
workclass           0
education_num       0
marital_status      0
occupation          0
race                0
sex                 0
hours_per_week      0
native_country    583
income              0
dtype: int64

#### 2.3. Przekształcenia

In [1468]:
df['marital_status'].value_counts() # Dużo niepotrzebnych klas

marital_status
Married-civ-spouse       14967
Never-married            10673
Divorced                  4442
Separated                 1025
Widowed                    992
Married-spouse-absent      417
Married-AF-spouse           23
Name: count, dtype: int64

In [1469]:
married_status = ['Married-civ-spouse', 'Married-spouse-absent', 'Married-AF-spouse']
single_status = ['Never-married', 'Divorced', 'Widowed', 'Separated']

df['marital_status'].replace(married_status, 'Taken', inplace=True)
df['marital_status'].replace(single_status, 'Single', inplace=True)
df['marital_status'].value_counts() # Zamiana statusów na singla i w związku w celu zmniejszenia ilości kolumn

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['marital_status'].replace(married_status, 'Taken', inplace=True)


marital_status
Single    17132
Taken     15407
Name: count, dtype: int64

In [1470]:
df['native_country'].value_counts() # Dużo niepotrzebnych klas

native_country
United-States                 29149
Mexico                          643
?                               583
Philippines                     197
Germany                         137
Canada                          121
Puerto-Rico                     114
El-Salvador                     106
India                           100
Cuba                             95
England                          90
Jamaica                          81
South                            80
China                            75
Italy                            73
Dominican-Republic               70
Vietnam                          67
Guatemala                        64
Japan                            62
Poland                           60
Columbia                         59
Taiwan                           51
Haiti                            44
Iran                             43
Portugal                         37
Nicaragua                        34
Peru                             31
France       

In [1471]:
df['native_country'] = np.where(df['native_country'].isin(['United-States']),
                                 df['native_country'],
                                 'Other')  # Podział na mieszkańców i obcokrajowców 

df['native_country'].value_counts()

native_country
United-States    29149
Other             3390
Name: count, dtype: int64

In [1472]:
df['race'].value_counts()

race
White                 27798
Black                  3121
Asian-Pac-Islander     1038
Amer-Indian-Eskimo      311
Other                   271
Name: count, dtype: int64

In [1473]:
other_races = ['Amer-Indian-Eskimo', 'Asian-Pac-Islander', 'Other'] # Połączenie mniej licznych ras

df['race'].replace(other_races, 'Other', inplace=True)

df['race'].value_counts()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['race'].replace(other_races, 'Other', inplace=True)


race
White    27798
Black     3121
Other     1620
Name: count, dtype: int64

In [1474]:
df = pd.get_dummies(df, columns=['income'], drop_first=True) # Zamiana na bool większe/równe 50k = True

df = pd.get_dummies(df, columns=['sex'], drop_first=True) # Zamiana na bool mężczyzna = True

df = pd.get_dummies(df, columns=['workclass','marital_status', 'occupation', 'race', 'native_country']) # one hot encoding

df.head()

Unnamed: 0,age,education_num,hours_per_week,income_>50K,sex_Male,workclass_Federal-gov,workclass_Local-gov,workclass_Other,workclass_Private,workclass_Self-emp-inc,...,occupation_Other-service,occupation_Prof-specialty,occupation_Sales,occupation_Tech-support,occupation_Transport-moving,race_Black,race_Other,race_White,native_country_Other,native_country_United-States
0,50,13,13,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,True
1,38,9,40,False,True,False,False,False,True,False,...,False,False,False,False,False,False,False,True,False,True
2,53,7,40,False,True,False,False,False,True,False,...,False,False,False,False,False,True,False,False,False,True
3,28,13,40,False,False,False,False,False,True,False,...,False,True,False,False,False,True,False,False,True,False
4,37,14,40,False,False,False,False,False,True,False,...,False,False,False,False,False,False,False,True,False,True


### 3. Podział na zbiory, trening i ocena

In [1476]:
y = df['income_>50K']
X = df.drop(['income_>50K'], axis = 1)

In [1477]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify = y, random_state=42)

### 4. Modele

#### 4.1. Linear regression

In [1478]:
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [1479]:
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = root_mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Absolute Error: {mae}")
print(f"Mean Squared Error: {mse}")
print(f"Root Mean Squared Error: {rmse}")
print(f"R² Score: {r2}")

Mean Absolute Error: 0.27766673806632225
Mean Squared Error: 0.12390618930860198
Root Mean Squared Error: 0.3520031097996181
R² Score: 0.3224909795127444


#### 4.2 Logistic regression

In [1480]:
logistic_model = LogisticRegression(max_iter=1000)

logistic_model.fit(X_train, y_train)

y_pred = logistic_model.predict(X_test)


In [1481]:
conf_matrix = confusion_matrix(y_test, y_pred)

class_report = classification_report(y_test, y_pred)

accuracy = accuracy_score(y_test, y_pred)

print("Confusion Matrix:\n", conf_matrix)
print("\nClassification Report:\n", class_report)
print(f"Accuracy: {accuracy}")

Confusion Matrix:
 [[6812  598]
 [1083 1269]]

Classification Report:
               precision    recall  f1-score   support

       False       0.86      0.92      0.89      7410
        True       0.68      0.54      0.60      2352

    accuracy                           0.83      9762
   macro avg       0.77      0.73      0.75      9762
weighted avg       0.82      0.83      0.82      9762

Accuracy: 0.8278016799836099


#### 4.3. KNN

In [1482]:
knn_model = KNeighborsClassifier(n_neighbors=5)

knn_model.fit(X_train, y_train)

y_pred = knn_model.predict(X_test)


In [1483]:
conf_matrix = confusion_matrix(y_test, y_pred)

class_report = classification_report(y_test, y_pred)

accuracy = accuracy_score(y_test, y_pred)

print("=== Wyniki Klasyfikacji KNN ===")
print("Confusion Matrix:\n", conf_matrix)
print("\nClassification Report:\n", class_report)
print(f"Accuracy: {accuracy:.2f}")


=== Wyniki Klasyfikacji KNN ===
Confusion Matrix:
 [[6579  831]
 [1040 1312]]

Classification Report:
               precision    recall  f1-score   support

       False       0.86      0.89      0.88      7410
        True       0.61      0.56      0.58      2352

    accuracy                           0.81      9762
   macro avg       0.74      0.72      0.73      9762
weighted avg       0.80      0.81      0.81      9762

Accuracy: 0.81


#### 4.4. SVM

In [1484]:
model_svc = svm.SVC()

model_svc.fit(X_train, y_train)

y_pred = model_svc.predict(X_test)

In [1485]:
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

print("Confusion Matrix:\n", conf_matrix)
print("\nClassification Report:\n", class_report)
print(f"Accuracy: {accuracy}")


Confusion Matrix:
 [[7034  376]
 [1347 1005]]

Classification Report:
               precision    recall  f1-score   support

       False       0.84      0.95      0.89      7410
        True       0.73      0.43      0.54      2352

    accuracy                           0.82      9762
   macro avg       0.78      0.69      0.71      9762
weighted avg       0.81      0.82      0.81      9762

Accuracy: 0.823499282933825
