In [19]:
# Import Required Libraries

from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from ucimlrepo import fetch_ucirepo 
import tensorflow as tf
import pandas as pd
import numpy as np
  
# Fetch dataset id 2 from the UCI ML repository (bound to `adult`)
adult = fetch_ucirepo(id=2)

# Put features and target side by side, discard rows containing missing
# values, then renumber the rows. reset_index() is called without drop=True,
# so the previous row labels are kept in a new 'index' column.
df = pd.concat([adult.data.features, adult.data.targets], axis=1).dropna().reset_index()

# Split the cleaned frame back into inputs (X) and the single-column target (Y)
X = df.drop(columns='income')
Y = df[['income']]

df

Unnamed: 0,index,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47616,48836,33,Private,245211,Bachelors,13,Never-married,Prof-specialty,Own-child,White,Male,0,0,40,United-States,<=50K.
47617,48837,39,Private,215419,Bachelors,13,Divorced,Prof-specialty,Not-in-family,White,Female,0,0,36,United-States,<=50K.
47618,48839,38,Private,374983,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,50,United-States,<=50K.
47619,48840,44,Private,83891,Bachelors,13,Divorced,Adm-clerical,Own-child,Asian-Pac-Islander,Male,5455,0,40,United-States,<=50K.


In [20]:
# Work on copies so that X and Y keep the cleaned data unmodified

x, y = X.copy(), Y.copy()

In [21]:
# Remove the features this notebook does not feed to the model.
# The drop mutates `x` in place, so re-running this cell alone raises KeyError.

x.drop(
    columns=['capital-gain', 'capital-loss', 'fnlwgt', 'education-num', 'marital-status'],
    inplace=True,
)

In [22]:
# Display the feature frame `x` (the bare last expression renders as the cell output)

x

Unnamed: 0,index,age,workclass,education,occupation,relationship,race,sex,hours-per-week,native-country
0,0,39,State-gov,Bachelors,Adm-clerical,Not-in-family,White,Male,40,United-States
1,1,50,Self-emp-not-inc,Bachelors,Exec-managerial,Husband,White,Male,13,United-States
2,2,38,Private,HS-grad,Handlers-cleaners,Not-in-family,White,Male,40,United-States
3,3,53,Private,11th,Handlers-cleaners,Husband,Black,Male,40,United-States
4,4,28,Private,Bachelors,Prof-specialty,Wife,Black,Female,40,Cuba
...,...,...,...,...,...,...,...,...,...,...
47616,48836,33,Private,Bachelors,Prof-specialty,Own-child,White,Male,40,United-States
47617,48837,39,Private,Bachelors,Prof-specialty,Not-in-family,White,Female,36,United-States
47618,48839,38,Private,Bachelors,Prof-specialty,Husband,White,Male,50,United-States
47619,48840,44,Private,Bachelors,Adm-clerical,Own-child,Asian-Pac-Islander,Male,40,United-States


In [23]:
# Display the target frame `y` (single 'income' column, still holding the raw labels here)

y

Unnamed: 0,income
0,<=50K
1,<=50K
2,<=50K
3,<=50K
4,<=50K
...,...
47616,<=50K.
47617,<=50K.
47618,<=50K.
47619,<=50K.


In [24]:
# Prepare features: one-hot encode the categorical columns and min-max scale
# every remaining column, then join both parts into a single frame.

cols = ['workclass', 'education', 'relationship', 'race', 'sex', 'native-country', 'occupation']
numerical_cols = x.drop(cols, axis=1)

# scikit-learn renamed `sparse` to `sparse_output` in 1.2 and removed the old
# keyword in 1.4. Try the current spelling first and fall back for older
# releases, so the encoder returns a dense array on every version.
try:
    one_hot_encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    one_hot_encoder = OneHotEncoder(sparse=False)

scaler = MinMaxScaler()
normalized_numerical_cols = scaler.fit_transform(numerical_cols)
# Both transformers return plain arrays; re-attach x's index explicitly so the
# column-wise concat below pairs rows by label rather than relying on x having
# a default RangeIndex.
normalized_numerical_df = pd.DataFrame(
    normalized_numerical_cols, columns=numerical_cols.columns, index=x.index
)
one_hot_encoded_data = one_hot_encoder.fit_transform(x[cols])
x_encoded_df = pd.DataFrame(
    one_hot_encoded_data, columns=one_hot_encoder.get_feature_names_out(cols), index=x.index
)
# One-hot columns first, scaled numeric columns last
x_encoded_df = pd.concat([x_encoded_df, normalized_numerical_df], axis=1)

In [25]:
# Process output: turn the raw income labels into integer classes
# ('<=50K' -> 0, '>50K' -> 1).
#
# The labels are always read from the untouched `Y` frame rather than from
# `y` itself. After the first run `y['income']` holds integers, and the `.str`
# accessor would raise on a re-run; reading from `Y` keeps this cell re-runnable.

raw_income = Y['income']
print(raw_income.unique())

# Some labels carry a trailing '.', so strip every '.' to merge the duplicates
clean_income = raw_income.str.replace('.', '', regex=False)
print(clean_income.unique())

y['income'] = (
    clean_income
    .str.replace('<=50K', '0', regex=False)
    .str.replace('>50K', '1', regex=False)
    .astype(int)  # raises if any label other than the two above is present
)
# Alias, not a copy: y_encoded_df and y refer to the same frame
y_encoded_df = y

['<=50K' '>50K' '<=50K.' '>50K.']
['<=50K' '>50K']


In [26]:
# Discard the dummy columns generated for the '?' category and the scaled
# 'index' helper column. The drop is in place, so this cell cannot be re-run
# on its own without raising KeyError.
x_encoded_df.drop(
    columns=['workclass_?', 'native-country_?', 'occupation_?', 'index'],
    inplace=True,
)
x_encoded_df

Unnamed: 0,workclass_Federal-gov,workclass_Local-gov,workclass_Never-worked,workclass_Private,workclass_Self-emp-inc,workclass_Self-emp-not-inc,workclass_State-gov,workclass_Without-pay,education_10th,education_11th,...,occupation_Machine-op-inspct,occupation_Other-service,occupation_Priv-house-serv,occupation_Prof-specialty,occupation_Protective-serv,occupation_Sales,occupation_Tech-support,occupation_Transport-moving,age,hours-per-week
0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.301370,0.397959
1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.452055,0.122449
2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.287671,0.397959
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.493151,0.397959
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.150685,0.397959
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47616,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.219178,0.397959
47617,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.301370,0.357143
47618,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.287671,0.500000
47619,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.369863,0.397959


In [27]:
# Load the two saved models: the Keras model and its TensorFlow Lite counterpart.
# Both files are looked up relative to the current working directory.

from tensorflow.keras.models import load_model
from tensorflow import convert_to_tensor  # NOTE(review): not used in the cells visible here
import os

# Keras model stored in HDF5 format
model_path = os.path.join('.', 'Salary.h5')
model = load_model(model_path)

# Create a TFLite interpreter for the converted model and allocate its tensors
# (this cell only sets the interpreter up; it does not run inference)
interpreter = tf.lite.Interpreter(model_path='SalaryClassifyModel.tflite')
interpreter.allocate_tensors()

# Get input and output details
input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()

In [28]:
# Show the first rows of the features and of the target; the tuple is rendered as plain text
x.head(), y.head()

(   index  age         workclass  education         occupation   relationship  \
 0      0   39         State-gov  Bachelors       Adm-clerical  Not-in-family   
 1      1   50  Self-emp-not-inc  Bachelors    Exec-managerial        Husband   
 2      2   38           Private    HS-grad  Handlers-cleaners  Not-in-family   
 3      3   53           Private       11th  Handlers-cleaners        Husband   
 4      4   28           Private  Bachelors     Prof-specialty           Wife   
 
     race     sex  hours-per-week native-country  
 0  White    Male              40  United-States  
 1  White    Male              13  United-States  
 2  White    Male              40  United-States  
 3  Black    Male              40  United-States  
 4  Black  Female              40           Cuba  ,
    income
 0       0
 1       0
 2       0
 3       0
 4       0)

In [29]:
# Evaluate the loaded Keras model on a held-out 15% split

X_train, X_test, y_train, y_test = train_test_split(
    x_encoded_df,
    y_encoded_df,
    test_size=0.15,
    random_state=42,
)
# Keep a handle on the integer labels before they are one-hot encoded below
y_test_2 = y_test

# Append a trailing singleton axis: (samples, features) -> (samples, features, 1)
X_train = X_train.values[:, :, np.newaxis]
X_test = X_test.values[:, :, np.newaxis]

# Integer labels -> two-column one-hot arrays
y_train = to_categorical(y_train, num_classes=2)
y_test = to_categorical(y_test, num_classes=2)

loss, accuracy = model.evaluate(X_test, y_test)
print(f'Accuracy: {accuracy}')

Accuracy: 0.8376259803771973


In [40]:
# Run the Keras model on the first 10 test samples, one sample per call.
# `predictions` ends up as a list of per-sample model outputs.
predictions = []

for i in range(10):
    # X_test[i] carries the trailing singleton axis added before evaluation.
    # reshape(1, -1) flattens it into a single-row batch without hard-coding
    # the feature count, so the cell keeps working if the encoded columns change.
    predict = model.predict(X_test[i].reshape(1, -1))
    predictions.append(predict)



In [41]:
# Reduce each score vector / one-hot row to the position of its largest entry
predicted_classes = list(map(np.argmax, predictions))
actual_classes = list(map(np.argmax, y_test))

In [42]:
# Side-by-side table of predicted vs. true class for the sampled rows.
# `actual_classes` covers the whole test set, so it is trimmed to however many
# predictions were made instead of repeating the sample count as a literal.
comparison_df = pd.DataFrame({
    'Predicted': predicted_classes,
    'Actual': actual_classes[:len(predicted_classes)]
})

In [None]:
# In the recorded run, 8 of the 10 sampled predictions match the true class (about 80%).
# Per the original note, the first 4 rows are the ones used as input in the .ino file.
comparison_df

Unnamed: 0,Predicted,Actual
0,1,0
1,0,0
2,1,0
3,1,1
4,1,1
5,0,0
6,0,0
7,0,0
8,1,1
9,1,1


In [44]:
# NOTE: despite its name, this holds the first ten test samples
first_five_rows = X_test[:10]

# Write one bracketed, comma-separated line per sample to Features.txt
with open('Features.txt', 'w') as f:
    for row in first_five_rows:
        # Each value is rendered with its default string form followed by the
        # literal suffix '0f' (e.g. '[0.]' -> '[0.]0f').
        # NOTE(review): this looks intended to yield C-style float literals such
        # as '0.0f' once the brackets are stripped - confirm against the consumer.
        formatted_row = ', '.join(f'{value}0f' for value in row)
        # Drop the brackets that come from the array string form
        formatted_row = formatted_row.replace('[', '').replace(']', '')
        f.write('[' + formatted_row + ']\n')

In [45]:
# NOTE: despite its name, this holds the first ten model outputs
first_five_rows = predictions[:10]

# Write one bracketed line per prediction to Output.txt
with open('Output.txt', 'w') as f:
    for row in first_five_rows:
        # Render every element of the prediction with its default string form
        formatted_row = ', '.join(map(str, row))
        # Strip the brackets from the array string form before re-wrapping the line
        formatted_row = formatted_row.replace('[', '').replace(']', '')
        f.write('[' + formatted_row + ']\n')