In [26]:
# Import Required Libraries

from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from ucimlrepo import fetch_ucirepo 
import tensorflow as tf
import pandas as pd
import numpy as np
  
# fetch dataset 
adult = fetch_ucirepo(id=2) 
  
# data (as pandas dataframes) 
X = adult.data.features 
Y = adult.data.targets 

df = pd.concat([X, Y], axis=1)
df.dropna(inplace=True)

df.reset_index(inplace=True)

X = df.drop('income', axis=1)
Y = df[['income']]

df

Unnamed: 0,index,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47616,48836,33,Private,245211,Bachelors,13,Never-married,Prof-specialty,Own-child,White,Male,0,0,40,United-States,<=50K.
47617,48837,39,Private,215419,Bachelors,13,Divorced,Prof-specialty,Not-in-family,White,Female,0,0,36,United-States,<=50K.
47618,48839,38,Private,374983,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,50,United-States,<=50K.
47619,48840,44,Private,83891,Bachelors,13,Divorced,Adm-clerical,Own-child,Asian-Pac-Islander,Male,5455,0,40,United-States,<=50K.


In [27]:
# Keep copy of data

x = X.copy()
y = Y.copy()

In [28]:
# Drop Irrelevant features

x.drop(['capital-gain', 'capital-loss', 'fnlwgt', 'education-num', 'marital-status'], axis=1, inplace=True)

In [29]:
# Print features

x

Unnamed: 0,index,age,workclass,education,occupation,relationship,race,sex,hours-per-week,native-country
0,0,39,State-gov,Bachelors,Adm-clerical,Not-in-family,White,Male,40,United-States
1,1,50,Self-emp-not-inc,Bachelors,Exec-managerial,Husband,White,Male,13,United-States
2,2,38,Private,HS-grad,Handlers-cleaners,Not-in-family,White,Male,40,United-States
3,3,53,Private,11th,Handlers-cleaners,Husband,Black,Male,40,United-States
4,4,28,Private,Bachelors,Prof-specialty,Wife,Black,Female,40,Cuba
...,...,...,...,...,...,...,...,...,...,...
47616,48836,33,Private,Bachelors,Prof-specialty,Own-child,White,Male,40,United-States
47617,48837,39,Private,Bachelors,Prof-specialty,Not-in-family,White,Female,36,United-States
47618,48839,38,Private,Bachelors,Prof-specialty,Husband,White,Male,50,United-States
47619,48840,44,Private,Bachelors,Adm-clerical,Own-child,Asian-Pac-Islander,Male,40,United-States


In [30]:
# Print output

y

Unnamed: 0,income
0,<=50K
1,<=50K
2,<=50K
3,<=50K
4,<=50K
...,...
47616,<=50K.
47617,<=50K.
47618,<=50K.
47619,<=50K.


In [31]:
# Check number of unique values in a given column

col = 'workclass'
x[col].nunique()

9

In [32]:
# Prepare features with One-Hot-Encoding

cols = ['workclass', 'education', 'relationship', 'race', 'sex', 'native-country', 'occupation']
numerical_cols = x.drop(cols, axis=1)
one_hot_encoder = OneHotEncoder(sparse=False)
scaler = MinMaxScaler()
normalized_numerical_cols = scaler.fit_transform(numerical_cols)
normalized_numerical_df = pd.DataFrame(normalized_numerical_cols, columns=numerical_cols.columns)
one_hot_encoded_data = one_hot_encoder.fit_transform(x[cols])
x_encoded_df = pd.DataFrame(one_hot_encoded_data, columns=one_hot_encoder.get_feature_names_out(cols))
# print(numerical_cols)
# print(x_encoded_df)
x_encoded_df = pd.concat([x_encoded_df, normalized_numerical_df], axis=1)

In [33]:
# Process output

print(y['income'].unique())
y['income'] = y['income'].str.replace('.', '', regex=False)
print(y['income'].unique())
y['income'] = y['income'].str.replace('<=50K', '0', regex=False)
y['income'] = y['income'].str.replace('>50K', '1', regex=False)
y['income'] = y['income'].astype(int)
y_encoded_df = y

['<=50K' '>50K' '<=50K.' '>50K.']
['<=50K' '>50K']


In [34]:
x_encoded_df.drop(['workclass_?', 'native-country_?', 'occupation_?', 'index'], axis=1, inplace=True)
# x_encoded_df.columns
x_encoded_df

Unnamed: 0,workclass_Federal-gov,workclass_Local-gov,workclass_Never-worked,workclass_Private,workclass_Self-emp-inc,workclass_Self-emp-not-inc,workclass_State-gov,workclass_Without-pay,education_10th,education_11th,...,occupation_Machine-op-inspct,occupation_Other-service,occupation_Priv-house-serv,occupation_Prof-specialty,occupation_Protective-serv,occupation_Sales,occupation_Tech-support,occupation_Transport-moving,age,hours-per-week
0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.301370,0.397959
1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.452055,0.122449
2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.287671,0.397959
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.493151,0.397959
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.150685,0.397959
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47616,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.219178,0.397959
47617,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.301370,0.357143
47618,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.287671,0.500000
47619,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.369863,0.397959


In [35]:
y_encoded_df

Unnamed: 0,income
0,0
1,0
2,0
3,0
4,0
...,...
47616,0
47617,0
47618,0
47619,0


In [36]:
X_train, X_test, y_train, y_test = train_test_split(x_encoded_df, y_encoded_df, test_size=0.15, random_state=42)
y_test_2 = y_test
X_train = X_train.values.reshape(-1, X_train.shape[1], 1)
X_test = X_test.values.reshape(-1, X_test.shape[1], 1)

y_train = to_categorical(y_train, num_classes=2)
y_test = to_categorical(y_test, num_classes=2)

model = Sequential()
model.add(Dense(8, activation='relu', input_shape=(X_train.shape[1],)))
model.add(Dropout(0.3))
model.add(Dense(8, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(2, activation='softmax')) 

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

model.fit(X_train, y_train, epochs=7, batch_size=32, validation_split=0.15)

Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


<keras.callbacks.History at 0x76c4d8b35a10>

In [37]:
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Accuracy: {accuracy}')

 28/224 [==>...........................] - ETA: 0s - loss: 0.3701 - accuracy: 0.8359

Accuracy: 0.8376259803771973


In [38]:
print(model.summary())

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_3 (Dense)             (None, 8)                 760       
                                                                 
 dropout_2 (Dropout)         (None, 8)                 0         
                                                                 
 dense_4 (Dense)             (None, 8)                 72        
                                                                 
 dropout_3 (Dropout)         (None, 8)                 0         
                                                                 
 dense_5 (Dense)             (None, 2)                 18        
                                                                 
Total params: 850
Trainable params: 850
Non-trainable params: 0
_________________________________________________________________
None


In [39]:
model.save("Salary.h5")

In [40]:
def representative_dataset():
    for _ in range(100):
      data =  X_test
      yield [data.astype(np.float32)]

print(representative_dataset())

<generator object representative_dataset at 0x76c4d8a519d0>


In [41]:
name = "salary_model_keras_dir"

tf.saved_model.save(model, name)
converter = tf.lite.TFLiteConverter.from_saved_model(name)

converter.optimizations = [tf.lite.Optimize.DEFAULT]
converter.representative_dataset = representative_dataset

tflite_model = converter.convert()

INFO:tensorflow:Assets written to: salary_model_keras_dir/assets


2024-11-02 01:18:09.842828: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:362] Ignored output_format.
2024-11-02 01:18:09.842883: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:365] Ignored drop_control_dependency.
2024-11-02 01:18:09.843052: I tensorflow/cc/saved_model/reader.cc:45] Reading SavedModel from: salary_model_keras_dir
2024-11-02 01:18:09.844314: I tensorflow/cc/saved_model/reader.cc:89] Reading meta graph with tags { serve }
2024-11-02 01:18:09.844352: I tensorflow/cc/saved_model/reader.cc:130] Reading SavedModel debug info (if present) from: salary_model_keras_dir
2024-11-02 01:18:09.847271: I tensorflow/cc/saved_model/loader.cc:229] Restoring SavedModel bundle.
2024-11-02 01:18:09.886879: I tensorflow/cc/saved_model/loader.cc:213] Running initialization op on SavedModel bundle at path: salary_model_keras_dir
2024-11-02 01:18:09.894624: I tensorflow/cc/saved_model/loader.cc:305] SavedModel load for tags { serve }; Status: success

In [42]:
with open('SalaryClassifyModel.tflite', 'wb') as f:
  f.write(tflite_model)

In [43]:
interpreter = tf.lite.Interpreter(model_path="SalaryClassifyModel.tflite")
interpreter.allocate_tensors()

In [44]:
input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()
print('input_details:\n', input_details)
print('output_details:\n', output_details)

input_details:
 [{'name': 'serving_default_dense_3_input:0', 'index': 0, 'shape': array([ 1, 94], dtype=int32), 'shape_signature': array([-1, 94], dtype=int32), 'dtype': <class 'numpy.float32'>, 'quantization': (0.0, 0), 'quantization_parameters': {'scales': array([], dtype=float32), 'zero_points': array([], dtype=int32), 'quantized_dimension': 0}, 'sparsity_parameters': {}}]
output_details:
 [{'name': 'StatefulPartitionedCall:0', 'index': 12, 'shape': array([1, 2], dtype=int32), 'shape_signature': array([-1,  2], dtype=int32), 'dtype': <class 'numpy.float32'>, 'quantization': (0.0, 0), 'quantization_parameters': {'scales': array([], dtype=float32), 'zero_points': array([], dtype=int32), 'quantized_dimension': 0}, 'sparsity_parameters': {}}]


In [45]:
# Test the model on random input data.
input_shape = input_details[0]['shape']
#print(input_shape)
#print(type(X_test))
#print(X_test.iloc[1])
#print(X_test.iloc[0])
input0_data = np.random.random_sample(input_shape)
print(input0_data)
input0_data = np.array(input0_data, dtype=np.float32)
interpreter.set_tensor(input_details[0]['index'], input0_data)

interpreter.invoke()
# The function `get_tensor()` returns a copy of the tensor data.
# Use `tensor()` in order to get a pointer to the tensor.
output0_data = interpreter.get_tensor(output_details[0]['index'])
print(output0_data)

# Verify if the same data is given to the original model what is the output
output0_data = model.predict(input0_data)
print(output0_data)

[[2.95951916e-01 5.32851849e-01 3.86487110e-01 7.95557084e-01
  2.78047953e-01 4.12001508e-01 6.05124476e-01 6.55820633e-01
  6.15358114e-02 9.35736668e-01 5.39017694e-01 3.29844630e-01
  7.95203826e-01 4.48090083e-01 2.92828939e-01 3.23620213e-01
  4.17441714e-01 8.58466536e-01 7.64871735e-01 4.29981767e-01
  8.52747170e-01 4.92543008e-02 4.84261778e-02 4.70806943e-01
  8.47063132e-01 5.88311463e-01 6.49652934e-01 2.35362878e-01
  8.13894262e-01 9.12471603e-01 8.10033086e-01 4.19192293e-01
  4.38143873e-01 6.83991957e-01 7.99081179e-01 7.95912704e-01
  6.62760726e-01 9.35737277e-01 4.95662727e-01 4.54716986e-01
  2.11071097e-01 5.51379358e-01 6.52489764e-01 9.16512851e-01
  2.75188411e-01 5.60154885e-02 1.49000761e-01 7.42509618e-02
  3.16563059e-04 6.53563825e-01 6.03498981e-01 3.01027721e-01
  2.38562202e-01 2.05406406e-01 5.18911064e-01 3.25385111e-01
  8.43024697e-01 4.29035269e-01 7.40366649e-01 2.09593585e-01
  8.15430855e-01 2.78576634e-01 7.31444564e-01 2.19653865e-01
  8.4689

In [46]:
NUM_OF_EPOCHS = 7
BATCH_SIZE = 32
DENSE1_SIZE = 8
DENSE2_SIZE = 8

# Function to convert some hex values into an array for C programming
import time, sys

# Function to convert some hex values into an array for C programming
def hex_to_c_array(hex_data, var_name):
    c_str = ""

    # Create header guard
    c_str += '#ifndef ' + var_name.upper() + '_H\n'
    c_str += "#define " + var_name.upper() + '_H\n\n'

    c_str += "/*\n Author: Mouli Sankaran \n"
    c_str += " CAUTION: This is an auto generated file.\n DO NOT EDIT OR MAKE ANY CHANGES TO IT.\n"

# Time stamping of this model data in the generated file
    localtime = time.asctime( time.localtime(time.time()) )
    c_str += " This model data was generated on " + localtime+ '\n\n'
    print("This model data was generated on:", localtime)

# Add information about the verisons of tools and packages used in generating this header file
    c_str += " Tools used:\n Python:" + str(sys.version) + "\n Numpy:" + str(np.version.version) + \
          "\n TensorFlow:" + str(sys.version) + "\n Keras: "+ str(tf.keras.__version__) + "\n\n"
    print("Tools used: Python:", sys.version, "\n Numpy:", np.version.version, \
          "\n TensorFlow:", sys.version, "\n Keras: ", tf.keras.__version__, "\n\n")

# Training details of the model
    c_str += ' Model details are:\n'
    c_str += ' NUM_OF_EPOCHS = ' + str(NUM_OF_EPOCHS) + '\n'
    c_str += ' BATCH_SIZE    = ' + str(BATCH_SIZE) + '\n*/\n'
    
# Generate 'C' constants for the no. of nodes in each layer
    c_str += '\nconst int ' + 'DENSE1_SIZE' + ' = ' + str(DENSE1_SIZE) + ';\n'
    c_str +=   'const int ' + 'DENSE2_SIZE' + ' = ' + str(DENSE2_SIZE) + ';\n'      
    
    # Add array length at the top of the file
    c_str += '\nconst unsigned int ' + var_name + '_len = ' + str(len(hex_data)) + ';\n'

    # Declare C variable
    c_str += 'alignas(8) const unsigned char ' + var_name + '[] = {'
    hex_array = []
    for i, val in enumerate(hex_data):
        # Construct string from hex
        hex_str = format(val, '#04x')

        # Add formating so each line stays within 80 characters
        if (i + 1) < len(hex_data):
          hex_str += ','
        if (i + 1) % 12 == 0:
          hex_str += '\n'
        hex_array.append(hex_str)

    # Add closing brace
    c_str += '\n' + format(''.join(hex_array)) + '\n};\n\n'

    # Close out header guard
    c_str += '#endif //' + var_name.upper() + '_H'

    return c_str

In [47]:
# Write TFLite model to a C source (or header) file
with open("salary_model_esp32" + '.h', 'w') as file:
  file.write(hex_to_c_array(tflite_model, "salary_model_esp32"))

This model data was generated on: Sat Nov  2 01:18:10 2024
Tools used: Python: 3.7.16 (default, Jan 17 2023, 22:20:44) 
[GCC 11.2.0] 
 Numpy: 1.21.5 
 TensorFlow: 3.7.16 (default, Jan 17 2023, 22:20:44) 
[GCC 11.2.0] 
 Keras:  2.10.0 




In [48]:
# Test tf_lite model
interpreter = tf.lite.Interpreter(model_path='SalaryClassifyModel.tflite')
interpreter.allocate_tensors()

# Get input and output details
input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()

# Store predictions
predictions = []

for i in range(len(X_test)):
    input_data = X_test[i:i + 1].astype(np.float32)  # Ensure data is FLOAT32
    input_data = input_data.reshape((1, 94))
    interpreter.set_tensor(input_details[0]['index'], input_data)
    interpreter.invoke()
    
    output_data = interpreter.get_tensor(output_details[0]['index'])
    predictions.append(np.argmax(output_data))  # Get the predicted class

# Convert predictions list to a NumPy array
predictions = np.array(predictions)

# Ensure y_test_2 is in the correct format (convert from one-hot if necessary)
y_test_2_labels = np.argmax(y_test, axis=1) if y_test.ndim > 1 else y_test.values.flatten()
y_test_2_labels = y_test_2_labels

# Calculate accuracy
accuracy = np.sum(predictions == y_test_2_labels) / len(y_test_2_labels)
print(f'Accuracy: {accuracy}')


Accuracy: 0.8372060470324748


In [49]:
print(predictions.shape)
print(y_test_2_labels.shape)


(7144,)
(7144,)


In [None]:
# Model Test Accuracy                  0.8376259803771973
# Tflite Model Test Accuracy           0.8372060470324748