# ***Clustering Models for kNowy***

# Setting the environment

In [None]:
!pip install tensorflow-model-optimization

Collecting tensorflow-model-optimization
  Downloading tensorflow_model_optimization-0.8.0-py2.py3-none-any.whl (242 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m242.5/242.5 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tensorflow-model-optimization
Successfully installed tensorflow-model-optimization-0.8.0


In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score

# Read the data

In [None]:
# Load the dataset; the 'Career' column is the label and every other column is a feature.
data = pd.read_csv('Data_final.csv')

y = data['Career']
x = data.drop('Career', axis='columns')

# Standardize the data using StandardScaler

In [None]:
# Fit the scaler on the feature table, then standardize that same table with
# the fitted statistics. `scaler` is kept so new inputs can be transformed later.
scaler = StandardScaler()
scaler.fit(x)
x_scaled = scaler.transform(x)

# Creating AutoEncoder Models

In [None]:
# Build the Conv1D autoencoder graph.
# Intermediate tensors are bound to `hidden` instead of `x`, so the feature
# DataFrame `x` created when the data was read is not overwritten and the
# scaling cell can still be re-run after this one.
input_shape = (10,)
inputs = keras.Input(shape=input_shape)
hidden = layers.Reshape(input_shape + (1,))(inputs)  # add a channel axis: (features, 1)
hidden = layers.Conv1D(32, kernel_size=3, activation='relu')(hidden)  # 1D convolutional layer
hidden = layers.MaxPooling1D(pool_size=2)(hidden)  # max pooling layer
hidden = layers.Flatten()(hidden)  # flatten output
hidden = layers.Dense(64, activation='relu')(hidden)  # dense layer
# One linear unit per input feature so the output can be compared with the input.
outputs = layers.Dense(input_shape[0], activation='linear')(hidden)  # output layer for reconstruction

# Create Custom Clustering Layer

In [None]:
class ClusteringLayer(keras.layers.Layer):
    """Trainable linear projection of the inputs onto ``number_of_clusters`` outputs.

    The layer owns a single weight matrix ``clusters`` of shape
    ``(input_dim, number_of_clusters)`` and returns ``inputs @ clusters``.

    Args:
        number_of_clusters: Size of the last axis of the layer output.
        **kwargs: Forwarded to ``keras.layers.Layer``.
    """

    def __init__(self, number_of_clusters=10, **kwargs):
        super().__init__(**kwargs)
        self.number_of_clusters = number_of_clusters

    def build(self, input_shape):
        """Create the ``clusters`` weight once the last input dimension is known."""
        self.clusters = self.add_weight(shape=(input_shape[-1], self.number_of_clusters),
                                         initializer='glorot_uniform',
                                         trainable=True,
                                         name='clusters')

    def call(self, inputs):
        """Return the matrix product of ``inputs`` and the ``clusters`` weight."""
        return tf.matmul(inputs, self.clusters)

    def get_config(self):
        """Return the layer config including ``number_of_clusters``.

        Without this, a saved model containing the layer would be rebuilt with
        the default cluster count instead of the one it was created with.
        """
        config = super().get_config()
        config.update({'number_of_clusters': self.number_of_clusters})
        return config

# Apply the Custom Clustering Layer

In [None]:
# Instantiate the custom layer and apply it to the reconstruction tensor.
# NOTE(review): the model assembled in the training cell is built from
# `outputs`, so `clustered_output` is not part of the trained graph.
number_of_clusters = 10
clustering_layer = ClusteringLayer(number_of_clusters=number_of_clusters)
clustered_output = clustering_layer(outputs)

# Compile and Train the Model

In [None]:
from tensorflow.keras.losses import MeanSquaredError

# Assemble the autoencoder from the input tensor and the reconstruction output.
model = keras.Model(inputs=inputs, outputs=outputs)
# Reconstruction is a regression objective, so track mean absolute error;
# classification 'accuracy' is not meaningful for continuous targets.
model.compile(loss=MeanSquaredError(), optimizer='adam', metrics=['mae'])

# The scaled features are both the input and the target (self-reconstruction).
model.fit(x_scaled, x_scaled, epochs=50, batch_size=16, shuffle=True, verbose=2)

Epoch 1/50
7/7 - 3s - 403ms/step - accuracy: 0.2952 - loss: 0.9120
Epoch 2/50
7/7 - 0s - 4ms/step - accuracy: 0.4095 - loss: 0.7220
Epoch 3/50
7/7 - 0s - 4ms/step - accuracy: 0.4571 - loss: 0.5698
Epoch 4/50
7/7 - 0s - 5ms/step - accuracy: 0.5143 - loss: 0.4424
Epoch 5/50
7/7 - 0s - 5ms/step - accuracy: 0.6190 - loss: 0.3527
Epoch 6/50
7/7 - 0s - 8ms/step - accuracy: 0.6952 - loss: 0.2920
Epoch 7/50
7/7 - 0s - 5ms/step - accuracy: 0.7238 - loss: 0.2542
Epoch 8/50
7/7 - 0s - 5ms/step - accuracy: 0.7143 - loss: 0.2257
Epoch 9/50
7/7 - 0s - 5ms/step - accuracy: 0.7333 - loss: 0.2034
Epoch 10/50
7/7 - 0s - 5ms/step - accuracy: 0.7333 - loss: 0.1860
Epoch 11/50
7/7 - 0s - 8ms/step - accuracy: 0.7238 - loss: 0.1719
Epoch 12/50
7/7 - 0s - 5ms/step - accuracy: 0.7524 - loss: 0.1595
Epoch 13/50
7/7 - 0s - 8ms/step - accuracy: 0.8000 - loss: 0.1477
Epoch 14/50
7/7 - 0s - 8ms/step - accuracy: 0.8095 - loss: 0.1388
Epoch 15/50
7/7 - 0s - 8ms/step - accuracy: 0.8381 - loss: 0.1307
Epoch 16/50
7/7 -

<keras.src.callbacks.history.History at 0x7ed2cd295060>

# Extract Encoder part from model and its features

In [None]:
# Build a sub-model that shares the trained layers and stops at the third
# layer from the end.
# NOTE(review): with the layer stack defined above (..., Flatten, Dense(64),
# Dense(10)), `model.layers[-3]` is the Flatten layer, so the "encoded"
# features are the flattened conv/pool activations rather than the Dense(64)
# output — confirm this is the intended representation.
encoder = keras.Model(inputs=model.input, outputs=model.layers[-3].output)

# Encoded representation of every training row; used as the K-Means input.
encoded_features = encoder.predict(x_scaled)

[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step


# Using K-Means Clustering

In [None]:
# One cluster per distinct career label.
num_clusters = len(np.unique(y))
# n_init is set explicitly so the number of restarts does not depend on the
# installed scikit-learn version's default (and no warning about the changing
# default is emitted); random_state keeps the run reproducible.
kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=42)
kmeans.fit(encoded_features)

  kmeans.fit(encoded_features)


# Get the cluster labels and encode the career names as integers

In [None]:
# Cluster id assigned to each training row by the fitted K-Means model.
labels = kmeans.labels_

# Integer-encode the career names (position of each name among the sorted
# unique names). The result is indexed instead of unpacked into `_` so that
# IPython's `_` (last cell output) variable is not overwritten.
y_encoded = np.unique(y, return_inverse=True)[1]

# Evaluate the models using ARI [Adjusted Rand Index]

In [None]:
# Agreement between the true careers and the K-Means partition.
ari_score = adjusted_rand_score(labels_true=y_encoded, labels_pred=labels)
print(f"Adjusted Rand Index: {ari_score}")

Adjusted Rand Index: -0.00034352456200618345


# Clustering results as a DataFrame

In [None]:
# Tabulate the scaled features next to the true career and the assigned cluster.
# Column names are derived from the actual width of `x_scaled` instead of a
# hard-coded 10, so the cell keeps working if the feature count changes.
df = pd.DataFrame(x_scaled, columns=[f'feature_{i+1}' for i in range(x_scaled.shape[1])])
df['career'] = y.values
df['cluster'] = labels

print(df.head())

   feature_1  feature_2  feature_3  feature_4  feature_5  feature_6  \
0  -1.325510   0.935916  -1.283532  -1.010057  -1.429708   1.688228   
1   1.067059  -1.545052  -0.604755  -0.274248  -1.429708  -0.420530   
2  -0.844122  -0.627094   2.318267   0.461562  -0.527743   0.148989   
3   1.311345   1.026884  -0.806554  -0.791303  -1.048107   1.000701   
4  -0.607021   0.017957   0.074022   1.495672  -1.429708  -0.307652   

   feature_7  feature_8  feature_9  feature_10              career  cluster  
0   0.094643   0.929447   1.161923   -0.453172          Accountant       25  
1   2.100720   1.567791   0.330372   -0.527090    Graphic Designer       24  
2  -0.075538   0.739669   0.507412    1.112546         Salesperson        4  
3   0.496890   0.739669   1.939826    0.588400  Research Scientist       36  
4  -0.188992   0.613150   0.029941    1.260382             Teacher       20  


# Cluster assigned to each career

In [None]:
# Raise the display limit so the whole career/cluster table is printed.
pd.set_option('display.max_rows', 105)

# Keep only the career and its cluster id, ordered by cluster.
df_sorted = df.loc[:, ['career', 'cluster']].sort_values(by='cluster')

print(df_sorted)

                                    career  cluster
102                         Civil Engineer        0
57                      Physical Therapist        1
43                        Speech Therapist        1
73                       Genetic Counselor        1
85                         Fashion Stylist        2
67                       Video Game Tester        2
53                       Financial Advisor        3
2                              Salesperson        4
87                   Public Health Analyst        5
41                    Marketing Copywriter        6
82                        Technical Writer        7
25                   Marketing Coordinator        8
104            Electronics Design Engineer        9
16                              Astronomer       10
14                   Construction Engineer       11
19                           Event Planner       12
101  Customs and Border Protection Officer       13
97                          Police Officer       13
37          

# **Testing the models**

In [None]:
def predict_career(new_inputs):
    """Return the K-Means cluster label for each row of ``new_inputs``.

    The rows are passed through the notebook-level ``scaler``, then the
    ``encoder`` model, and the encoded rows are assigned to clusters by
    ``kmeans``.

    Args:
        new_inputs: Rows of raw (unscaled) feature values.

    Returns:
        Whatever ``kmeans.predict`` returns for the encoded rows.
    """
    scaled = scaler.transform(new_inputs)
    return kmeans.predict(encoder.predict(scaled))

In [None]:
def map_cluster_to_career(cluster_label):
    """Return a representative career for ``cluster_label``.

    The representative is the first career listed for that cluster in the
    notebook-level ``df_sorted`` table.

    Args:
        cluster_label: Cluster id to look up.

    Returns:
        The career name, or ``'Unknown'`` when the cluster id is not present.
    """
    first_career_by_cluster = {
        cluster: careers.iloc[0]
        for cluster, careers in df_sorted.groupby('cluster')['career']
    }
    return first_career_by_cluster.get(cluster_label, 'Unknown')

In [None]:
# Score a single hand-written 10-value profile and report the career mapped to its cluster.
new_inputs = np.array([1.78, 2.45, 2.45, 5.67, 3.56, 4.00, 6.00, 1.00, 6.00, 1.00]).reshape(1, -1)
predicted_cluster = predict_career(new_inputs)[0]
predicted_career = map_cluster_to_career(predicted_cluster)
print(f"Predicted career: {predicted_career}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 128ms/step
Predicted career: Construction Engineer




# **Saving the Models**

In [None]:
def save_model(model, filename):
    """Save ``model`` to ``filename`` by delegating to ``model.save``.

    Args:
        model: Object exposing a ``save(filename)`` method.
        filename: Destination path, passed through unchanged.
    """
    model.save(filename)

In [None]:
# Print the autoencoder architecture, then write the full model to disk.
model.summary()
save_model(model, 'career_prediction.h5')

print("Model saved to career_prediction.h5")



Model saved to career_prediction.h5


In [None]:
# Print the encoder sub-model architecture, then write it to its own file.
encoder.summary()
save_model(encoder, 'encoder_career_prediction.h5')

print("Model saved to encoder_career_prediction.h5")



Model saved to encoder_career_prediction.h5


# **Convert Models to TFLite**

In [None]:
# Show the model that is about to be converted.
model.summary()

# Convert the in-memory Keras model; `tflite_model` holds the converter's output.
converter = tf.lite.TFLiteConverter.from_keras_model(model)
tflite_model = converter.convert()

# **Using TensorFlow JS**

In [None]:
!pip install tensorflowjs

In [None]:
import tensorflow as tf
import tensorflowjs as tfjs

In [None]:
class ClusteringLayer(tf.keras.layers.Layer):
    """Trainable linear projection of the inputs onto ``number_of_clusters`` outputs.

    The layer owns a single weight matrix ``clusters`` of shape
    ``(input_dim, number_of_clusters)`` and returns ``inputs @ clusters``.

    Args:
        number_of_clusters: Size of the last axis of the layer output.
        **kwargs: Forwarded to ``tf.keras.layers.Layer``.
    """

    def __init__(self, number_of_clusters=10, **kwargs):
        super().__init__(**kwargs)
        self.number_of_clusters = number_of_clusters

    def build(self, input_shape):
        """Create the ``clusters`` weight once the last input dimension is known."""
        self.clusters = self.add_weight(shape=(input_shape[-1], self.number_of_clusters),
                                         initializer='glorot_uniform',
                                         trainable=True,
                                         name='clusters')

    def call(self, inputs):
        """Return the matrix product of ``inputs`` and the ``clusters`` weight."""
        return tf.matmul(inputs, self.clusters)

    def get_config(self):
        """Return the layer config including ``number_of_clusters``.

        Required so that a model containing this layer is restored with the
        cluster count it was created with rather than the default.
        """
        config = super().get_config()
        config.update({'number_of_clusters': self.number_of_clusters})
        return config

In [None]:
# Load the two models written by the saving cells: career_prediction.h5 (the
# autoencoder) and encoder_career_prediction.h5 (the encoder). Both files must
# exist in the working directory, otherwise load_model raises FileNotFoundError.
custom_objects = {'ClusteringLayer': ClusteringLayer, 'mse': tf.keras.losses.MeanSquaredError}

# compile=False: the models are only converted here, never trained further, so
# the saved loss/optimizer configuration does not need to be restored.
# The same custom_objects mapping is passed to both loads for consistency.
modeljs1 = tf.keras.models.load_model('career_prediction.h5', custom_objects=custom_objects, compile=False)
modeljs2 = tf.keras.models.load_model('encoder_career_prediction.h5', custom_objects=custom_objects, compile=False)

modeljs1.summary()
modeljs2.summary()

# Convert the models
tfjs.converters.save_keras_model(modeljs1, 'model1_tfjs')
tfjs.converters.save_keras_model(modeljs2, 'model2_tfjs')



FileNotFoundError: [Errno 2] Unable to synchronously open file (unable to open file: name = 'encoder_career_prediction.h5', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)

In [None]:
import tensorflow as tf
import tensorflowjs as tfjs

# Define the custom objects with ClusteringLayer and MeanSquaredError
class ClusteringLayer(tf.keras.layers.Layer):
    """Trainable linear projection of the inputs onto ``number_of_clusters`` outputs.

    The layer owns a single weight matrix ``clusters`` of shape
    ``(input_dim, number_of_clusters)`` and returns ``inputs @ clusters``.

    Args:
        number_of_clusters: Size of the last axis of the layer output.
        **kwargs: Forwarded to ``tf.keras.layers.Layer``.
    """

    def __init__(self, number_of_clusters=10, **kwargs):
        super().__init__(**kwargs)
        self.number_of_clusters = number_of_clusters

    def build(self, input_shape):
        """Create the ``clusters`` weight once the last input dimension is known."""
        self.clusters = self.add_weight(shape=(input_shape[-1], self.number_of_clusters),
                                         initializer='glorot_uniform',
                                         trainable=True,
                                         name='clusters')

    def call(self, inputs):
        """Return the matrix product of ``inputs`` and the ``clusters`` weight."""
        return tf.matmul(inputs, self.clusters)

    def get_config(self):
        """Return the layer config including ``number_of_clusters``.

        Required so that a model containing this layer is restored with the
        cluster count it was created with rather than the default.
        """
        config = super().get_config()
        config.update({'number_of_clusters': self.number_of_clusters})
        return config

# Map the names that may appear in the saved files to real objects. The 'mse'
# entry points at the Keras loss class (not the string 'mse'), matching what
# this comment has always described.
custom_objects = {'ClusteringLayer': ClusteringLayer, 'mse': tf.keras.losses.MeanSquaredError}

# Load the models. compile=False skips restoring the loss/optimizer, which is
# not needed because the models are only converted below.
modeljs1 = tf.keras.models.load_model('career_prediction.h5', custom_objects=custom_objects, compile=False)
modeljs2 = tf.keras.models.load_model('encoder_career_prediction.h5', custom_objects=custom_objects, compile=False)

# Print summaries of the loaded models
modeljs1.summary()
modeljs2.summary()

# Convert the models to TensorFlow.js format
tfjs.converters.save_keras_model(modeljs1, 'model1_tfjs')
tfjs.converters.save_keras_model(modeljs2, 'model2_tfjs')


In [None]:
from tensorflow import keras

# Reload the saved autoencoder from disk; this rebinds the notebook-level `model`.
model = keras.models.load_model('career_prediction.h5')

In [None]:
import tensorflow as tf

# Convert the reloaded Keras model; `tflite_model` is written to disk in the next cell.
converter = tf.lite.TFLiteConverter.from_keras_model(model)
tflite_model = converter.convert()

In [None]:
# Write the converted TFLite model to a file in the working directory.
with open('career_prediction_model.tflite', mode='wb') as tflite_file:
    tflite_file.write(tflite_model)

print("Model saved to career_prediction_model.tflite")

Model saved to career_prediction_model.tflite


In [None]:
import tensorflow as tf

# Reload the autoencoder from its H5 file; this rebinds the notebook-level `model`.
# NOTE(review): this uses the absolute path '/content/career_prediction.h5',
# while the saving cell writes the relative path 'career_prediction.h5' — the
# two only coincide when the working directory is /content.
model = tf.keras.models.load_model('/content/career_prediction.h5')

# Export the model in SavedModel format.
# NOTE(review): '/path/to/saved_model' looks like a placeholder; the
# tensorflowjs_converter cell below reads from the same path, so change both together.
tf.saved_model.save(model, '/path/to/saved_model')




In [None]:
import subprocess

# Command line for the tensorflowjs_converter CLI: SavedModel in, TF.js graph model out.
command = [
    'tensorflowjs_converter',
    '--input_format=tf_saved_model', '--output_format=tfjs_graph_model',
    '/path/to/saved_model', '/path/to/save/tfjs_model',
]

# Run the converter, capturing both output streams as text. A non-zero exit
# status does not raise; it is only reported below.
result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)

print(f"Return code: {result.returncode}")
print(f"stdout: {result.stdout}")
print(f"stderr: {result.stderr}")


Return code: 0
stdout: 
2024-06-14 15:00:50.221435: W tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc:47] Overriding orig_value setting because the TF_FORCE_GPU_ALLOW_GROWTH environment variable is set. Original config value was 0.



In [None]:
import tensorflow as tf

# Convert a SavedModel directory to TensorFlow.js format.
# The previous code called `tf.converter.saved_model.convert`, which is not a
# TensorFlow API; SavedModel -> TF.js conversion is provided by the
# tensorflowjs package instead.
import tensorflowjs as tfjs

# NOTE(review): 'career_prediction' must be an existing SavedModel directory.
# The SavedModel export cell in this notebook writes to '/path/to/saved_model',
# so confirm which directory is intended here.
tfjs.converters.convert_tf_saved_model('career_prediction', './web_model')

print("Model converted to TensorFlow.js format")

Model Summary

Inputs:
  Name: serving_default_input_1:0, Shape: [ 1 10], Type: <class 'numpy.float32'>

Outputs:
  Name: StatefulPartitionedCall:0, Shape: [ 1 10], Type: <class 'numpy.float32'>

Layers:
  Name: serving_default_input_1:0, Shape: [ 1 10], Type: <class 'numpy.float32'>
  Name: model/conv1d/BiasAdd/ReadVariableOp, Shape: [32], Type: <class 'numpy.float32'>
  Name: model/conv1d/Conv1D/Squeeze, Shape: [3], Type: <class 'numpy.int32'>
  Name: model/conv1d/Conv1D, Shape: [32  1  3  1], Type: <class 'numpy.float32'>
  Name: model/dense/BiasAdd/ReadVariableOp, Shape: [64], Type: <class 'numpy.float32'>
  Name: model/dense_1/BiasAdd/ReadVariableOp, Shape: [10], Type: <class 'numpy.float32'>
  Name: model/reshape/strided_slice/stack_1, Shape: [1], Type: <class 'numpy.int32'>
  Name: model/reshape/strided_slice/stack, Shape: [1], Type: <class 'numpy.int32'>
  Name: model/reshape/Reshape/shape/2, Shape: [], Type: <class 'numpy.int32'>
  Name: model/reshape/Reshape/shape/1, Shape: [

In [None]:
import tensorflow as tf

# Open the exported TFLite file and allocate its tensors so that the
# input/output metadata can be queried.
tflite_model_path = '/content/career_prediction_model.tflite'
interpreter = tf.lite.Interpreter(model_path=tflite_model_path)
interpreter.allocate_tensors()

# Metadata dictionaries describing the model's input and output tensors.
input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()

# Print each section heading followed by one metadata dictionary per line.
print("Model Summary:")
for heading, details in (("Inputs:", input_details), ("\nOutputs:", output_details)):
    print(heading)
    for detail in details:
        print(detail)


Model Summary:
Inputs:
{'name': 'serving_default_input_1:0', 'index': 0, 'shape': array([ 1, 10], dtype=int32), 'shape_signature': array([-1, 10], dtype=int32), 'dtype': <class 'numpy.float32'>, 'quantization': (0.0, 0), 'quantization_parameters': {'scales': array([], dtype=float32), 'zero_points': array([], dtype=int32), 'quantized_dimension': 0}, 'sparsity_parameters': {}}

Outputs:
{'name': 'StatefulPartitionedCall:0', 'index': 26, 'shape': array([ 1, 10], dtype=int32), 'shape_signature': array([-1, 10], dtype=int32), 'dtype': <class 'numpy.float32'>, 'quantization': (0.0, 0), 'quantization_parameters': {'scales': array([], dtype=float32), 'zero_points': array([], dtype=int32), 'quantized_dimension': 0}, 'sparsity_parameters': {}}
