In [1]:
import pandas as pd
import numpy as np 

df = pd.read_csv("adult-kaggle.csv")

In [2]:
df.head(5)

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K


In [3]:
# Nahrazení chybejících hodnot
df=df.replace('?',np.nan)
# Dropnutí nepotřebných sloupců
df.drop(['fnlwgt'],axis=1,inplace=True)

# Přejměnování cílové proměnné
df = df.rename(columns={'income': 'target'})
df.isnull().sum()

age                   0
workclass          2799
education             0
educational-num       0
marital-status        0
occupation         2809
relationship          0
race                  0
gender                0
capital-gain          0
capital-loss          0
hours-per-week        0
native-country      857
target                0
dtype: int64

In [4]:
df['workclass'].value_counts()

workclass
Private             33906
Self-emp-not-inc     3862
Local-gov            3136
State-gov            1981
Self-emp-inc         1695
Federal-gov          1432
Without-pay            21
Never-worked           10
Name: count, dtype: int64

In [5]:
df['workclass'] = df['workclass'].fillna("Private")

In [6]:
df['occupation'].value_counts()

occupation
Prof-specialty       6172
Craft-repair         6112
Exec-managerial      6086
Adm-clerical         5611
Sales                5504
Other-service        4923
Machine-op-inspct    3022
Transport-moving     2355
Handlers-cleaners    2072
Farming-fishing      1490
Tech-support         1446
Protective-serv       983
Priv-house-serv       242
Armed-Forces           15
Name: count, dtype: int64

In [7]:
df['occupation']=df['occupation'].fillna("Prof-specialty")

In [8]:
df['native-country'].value_counts()

native-country
United-States                 43832
Mexico                          951
Philippines                     295
Germany                         206
Puerto-Rico                     184
Canada                          182
El-Salvador                     155
India                           151
Cuba                            138
England                         127
China                           122
South                           115
Jamaica                         106
Italy                           105
Dominican-Republic              103
Japan                            92
Guatemala                        88
Poland                           87
Vietnam                          86
Columbia                         85
Haiti                            75
Portugal                         67
Taiwan                           65
Iran                             59
Greece                           49
Nicaragua                        49
Peru                             46
Ecuador      

In [9]:
df['native-country']=df['native-country'].fillna('United-States')

In [10]:
df.isnull().sum()

age                0
workclass          0
education          0
educational-num    0
marital-status     0
occupation         0
relationship       0
race               0
gender             0
capital-gain       0
capital-loss       0
hours-per-week     0
native-country     0
target             0
dtype: int64

In [11]:
df['target'] = df['target'].apply(lambda x: 1 if '>50K' in x else 0)


y = df['target'].astype('int').values
X = df.drop('target', axis=1)

In [12]:
from sklearn.model_selection import train_test_split

# saving the dataframe
df.to_csv('adult-processed.csv', index=False)
train, test = train_test_split(df, test_size=0.1)
train.to_csv('adult-processed-train.csv', index=False)
test.to_csv('adult-processed-test.csv', index=False)



In [13]:
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()
numeric_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()


In [14]:
from sklearn.preprocessing import StandardScaler

# One-hot kodování kategorických proměnných 
X_encoded = pd.get_dummies(X, columns=categorical_cols)

# Škálování numerických hodnot
scaler = StandardScaler()
X_encoded[numeric_cols] = scaler.fit_transform(X_encoded[numeric_cols])


In [None]:
import joblib

# Save for Swift
joblib.dump(X_encoded.columns.tolist(), "feature_columns.pkl")
joblib.dump(scaler.mean_, "scaler_mean.pkl")
joblib.dump(scaler.scale_, "scaler_scale.pkl")
joblib.dump(numeric_cols, "numeric_cols.pkl")

column_names = X_encoded.columns.tolist()

import json
with open("feature_columns.json", "w") as f:
    json.dump(X_encoded.columns.tolist(), f)

with open("scaler_params.json", "w") as f:
    json.dump({
        "mean": scaler.mean_.tolist(),
        "scale": scaler.scale_.tolist(),
        "numeric_cols": numeric_cols
    }, f)

In [None]:
X_encoded
y

In [15]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X_encoded.astype("float32").values, y, test_size=0.2, random_state=42
)


X_train, X_test, y_train, y_test = train_test_split(X_encoded.values.astype('float32'), y, test_size=0.2, random_state=42)


In [16]:
X_train.shape[1]

104

In [17]:
import tensorflow as tf

model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(X_train.shape[1],)),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=[ 'accuracy',                         
                        tf.keras.metrics.Precision(),        # precision = TP / (TP + FP)
                        tf.keras.metrics.Recall(),           # recall = TP / (TP + FN)  
                      ])
model.fit(X_train, y_train, epochs=20, batch_size=32, validation_split=0.1)

Epoch 1/20
[1m1099/1099[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 903us/step - accuracy: 0.8378 - loss: 0.3494 - precision: 0.7146 - recall: 0.5363 - val_accuracy: 0.8506 - val_loss: 0.3194 - val_precision: 0.7598 - val_recall: 0.5927
Epoch 2/20
[1m1099/1099[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 645us/step - accuracy: 0.8602 - loss: 0.3063 - precision: 0.7496 - recall: 0.6228 - val_accuracy: 0.8488 - val_loss: 0.3189 - val_precision: 0.7239 - val_recall: 0.6436
Epoch 3/20
[1m1099/1099[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 681us/step - accuracy: 0.8570 - loss: 0.3085 - precision: 0.7392 - recall: 0.6094 - val_accuracy: 0.8506 - val_loss: 0.3174 - val_precision: 0.7513 - val_recall: 0.6059
Epoch 4/20
[1m1099/1099[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 696us/step - accuracy: 0.8584 - loss: 0.3070 - precision: 0.7496 - recall: 0.6228 - val_accuracy: 0.8516 - val_loss: 0.3189 - val_precision: 0.7610 - val_recall: 0.5967
Epoch 5/

<keras.src.callbacks.history.History at 0x316175dd0>

In [18]:
model.evaluate(X_test, y_test)


[1m306/306[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 444us/step - accuracy: 0.8549 - loss: 0.3444 - precision: 0.7324 - recall: 0.6191


[0.3366887867450714,
 0.8557682633399963,
 0.7253196835517883,
 0.6192139983177185]

In [22]:
converter = tf.lite.TFLiteConverter.from_keras_model(model)
tflite_model = converter.convert()
with open("adult_income_model.tflite", "wb") as f:
    f.write(tflite_model)

INFO:tensorflow:Assets written to: /var/folders/0g/xmt_dj710tx7_k26h6r2tlf40000gn/T/tmp8aw1dovf/assets


INFO:tensorflow:Assets written to: /var/folders/0g/xmt_dj710tx7_k26h6r2tlf40000gn/T/tmp8aw1dovf/assets


Saved artifact at '/var/folders/0g/xmt_dj710tx7_k26h6r2tlf40000gn/T/tmp8aw1dovf'. The following endpoints are available:

* Endpoint 'serve'
  args_0 (POSITIONAL_ONLY): TensorSpec(shape=(None, 104), dtype=tf.float32, name='keras_tensor')
Output Type:
  TensorSpec(shape=(None, 1), dtype=tf.float32, name=None)
Captures:
  13259640464: TensorSpec(shape=(), dtype=tf.resource, name=None)
  13259641616: TensorSpec(shape=(), dtype=tf.resource, name=None)
  13259638352: TensorSpec(shape=(), dtype=tf.resource, name=None)
  13259641232: TensorSpec(shape=(), dtype=tf.resource, name=None)
  13259642384: TensorSpec(shape=(), dtype=tf.resource, name=None)
  13259643344: TensorSpec(shape=(), dtype=tf.resource, name=None)


W0000 00:00:1750619862.517632 5587799 tf_tfl_flatbuffer_helpers.cc:365] Ignored output_format.
W0000 00:00:1750619862.517648 5587799 tf_tfl_flatbuffer_helpers.cc:368] Ignored drop_control_dependency.
2025-06-22 21:17:42.517785: I tensorflow/cc/saved_model/reader.cc:83] Reading SavedModel from: /var/folders/0g/xmt_dj710tx7_k26h6r2tlf40000gn/T/tmp8aw1dovf
2025-06-22 21:17:42.518062: I tensorflow/cc/saved_model/reader.cc:52] Reading meta graph with tags { serve }
2025-06-22 21:17:42.518067: I tensorflow/cc/saved_model/reader.cc:147] Reading SavedModel debug info (if present) from: /var/folders/0g/xmt_dj710tx7_k26h6r2tlf40000gn/T/tmp8aw1dovf
2025-06-22 21:17:42.520230: I tensorflow/cc/saved_model/loader.cc:236] Restoring SavedModel bundle.
2025-06-22 21:17:42.532177: I tensorflow/cc/saved_model/loader.cc:220] Running initialization op on SavedModel bundle at path: /var/folders/0g/xmt_dj710tx7_k26h6r2tlf40000gn/T/tmp8aw1dovf
2025-06-22 21:17:42.535794: I tensorflow/cc/saved_model/loader.cc:

In [20]:
from sklearn.metrics import confusion_matrix, classification_report

# Predict labels
y_pred_probs = model.predict(X_test)
y_pred = (y_pred_probs > 0.5).astype(int)

# Print confusion matrix and classification report
print(classification_report(y_test, y_pred))



[1m306/306[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 308us/step
              precision    recall  f1-score   support

           0       0.89      0.93      0.91      7479
           1       0.73      0.62      0.67      2290

    accuracy                           0.86      9769
   macro avg       0.81      0.77      0.79      9769
weighted avg       0.85      0.86      0.85      9769

