# ***Clustering Models for kNowy***

# Setting the environment

In [None]:
!pip install tensorflow-model-optimization

Collecting tensorflow-model-optimization
  Downloading tensorflow_model_optimization-0.8.0-py2.py3-none-any.whl (242 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/242.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━[0m [32m133.1/242.5 kB[0m [31m3.8 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m235.5/242.5 kB[0m [31m4.3 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m242.5/242.5 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tensorflow-model-optimization
Successfully installed tensorflow-model-optimization-0.8.0


In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score

# Read the data

In [None]:
# Raw CSV served from the project's GitHub repository.
url = 'https://raw.githubusercontent.com/Capstone-project-Knowy/machine-learn/main/Dataset/Data_final.csv'

data = pd.read_csv(url)

# The 'Career' column is the label; every other column is a feature.
y = data['Career']
x = data.drop('Career', axis=1)

# Standardize the data using StandardScaler

In [None]:
# Fit the scaler on the whole feature table and keep the fitted object around:
# predict_career() reuses it to transform new inputs the same way.
scaler = StandardScaler()
x_scaled = scaler.fit_transform(x)

# Creating the autoencoder model

In [None]:
# Seed Python, NumPy and TensorFlow so weight initialisation and batch
# shuffling start from a fixed state on every Restart & Run All.
keras.utils.set_random_seed(42)

# Take the feature count from the scaled data instead of hard-coding it.
n_features = x_scaled.shape[1]
input_shape = (n_features,)
inputs = keras.Input(shape=input_shape)

# Intermediate tensors use their own name so the feature DataFrame `x`
# created when the data was read is not overwritten by a Keras tensor.
hidden = layers.Reshape((n_features, 1))(inputs)  # add a channel axis for Conv1D
hidden = layers.Conv1D(32, kernel_size=3, activation='relu')(hidden)  # 1D convolutional layer
hidden = layers.MaxPooling1D(pool_size=2)(hidden)  # max pooling layer
hidden = layers.Flatten()(hidden)  # flatten output
hidden = layers.Dense(64, activation='relu')(hidden)  # dense layer
# Linear output with one unit per input feature: the reconstruction target.
outputs = layers.Dense(n_features, activation='linear')(hidden)

# Compile and train the model

In [None]:
model = keras.Model(inputs=inputs, outputs=outputs)

# Reconstructing real-valued features is a regression task, so mean absolute
# error is tracked alongside the MSE loss. Classification 'accuracy' is not a
# meaningful score for continuous targets.
model.compile(optimizer='adam', loss='mse', metrics=['mae'])

# Input and target are the same array: the network learns to reproduce its
# input. The History object is kept so the loss curve can be inspected later
# and its repr is not echoed as cell output.
history = model.fit(x_scaled, x_scaled, epochs=50, batch_size=16, shuffle=True, verbose=2)

Epoch 1/50
7/7 - 6s - loss: 0.9640 - accuracy: 0.1333 - 6s/epoch - 832ms/step
Epoch 2/50
7/7 - 0s - loss: 0.8133 - accuracy: 0.4762 - 22ms/epoch - 3ms/step
Epoch 3/50
7/7 - 0s - loss: 0.6996 - accuracy: 0.5714 - 22ms/epoch - 3ms/step
Epoch 4/50
7/7 - 0s - loss: 0.5945 - accuracy: 0.6000 - 22ms/epoch - 3ms/step
Epoch 5/50
7/7 - 0s - loss: 0.5000 - accuracy: 0.6667 - 22ms/epoch - 3ms/step
Epoch 6/50
7/7 - 0s - loss: 0.4247 - accuracy: 0.6667 - 21ms/epoch - 3ms/step
Epoch 7/50
7/7 - 0s - loss: 0.3705 - accuracy: 0.7143 - 23ms/epoch - 3ms/step
Epoch 8/50
7/7 - 0s - loss: 0.3282 - accuracy: 0.7143 - 22ms/epoch - 3ms/step
Epoch 9/50
7/7 - 0s - loss: 0.2925 - accuracy: 0.7524 - 29ms/epoch - 4ms/step
Epoch 10/50
7/7 - 0s - loss: 0.2612 - accuracy: 0.7905 - 27ms/epoch - 4ms/step
Epoch 11/50
7/7 - 0s - loss: 0.2316 - accuracy: 0.8095 - 27ms/epoch - 4ms/step
Epoch 12/50
7/7 - 0s - loss: 0.2077 - accuracy: 0.7810 - 22ms/epoch - 3ms/step
Epoch 13/50
7/7 - 0s - loss: 0.1857 - accuracy: 0.7905 - 22ms

<keras.src.callbacks.History at 0x7a45c85dfee0>

# Extract the encoder part of the model and compute its features

In [None]:
# The encoder shares the autoencoder's input and ends at the third layer from
# the end of model.layers.
# NOTE(review): according to the encoder summary printed further down, that
# layer is Flatten (128 values), so the trained Dense(64) layer is not part of
# the encoder — confirm this cut point is intended.
encoder = keras.Model(model.input, model.layers[-3].output)

# Features used for clustering: the encoder's output for every training row.
encoded_features = encoder.predict(x_scaled)



# Using K-Means Clustering

In [None]:
# One cluster per distinct career label.
num_clusters = len(np.unique(y))

# n_init is set explicitly. Left unset, scikit-learn 1.2/1.3 emits a
# FutureWarning because the default changes from 10 to 'auto' in 1.4; pinning
# it keeps the number of restarts (and so the clustering) stable across versions.
kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=42)
kmeans.fit(encoded_features)

  kmeans.fit(encoded_features)


# Get the cluster labels and encode the career names as integers

In [None]:
# Cluster index that k-means assigned to each training row.
labels = kmeans.labels_

# Integer-encode the careers: the inverse index returned by np.unique gives,
# for every row, the position of its career among the sorted unique values.
y_encoded = np.unique(y, return_inverse=True)[1]

# Evaluate the clustering using the Adjusted Rand Index (ARI)

In [None]:
# Score the agreement between the true career labels and the k-means clusters.
# NOTE(review): the output recorded with this notebook is roughly -0.0003,
# which looks like chance-level agreement — confirm and discuss in the text.
ari_score = adjusted_rand_score(labels_true=y_encoded, labels_pred=labels)
print(f"Adjusted Rand Index: {ari_score}")

Adjusted Rand Index: -0.0003402517863218782


# Clustering results as a DataFrame

In [None]:
# One generic column name per scaled feature. The count is read from the data
# rather than hard-coded, so the cell keeps working if the feature set changes.
feature_columns = [f'feature_{i+1}' for i in range(x_scaled.shape[1])]

df = pd.DataFrame(x_scaled, columns=feature_columns)
# Attach the original career name and the assigned cluster to every row.
# .values drops y's index so the assignment is positional.
df['career'] = y.values
df['cluster'] = labels

print(df.head())

   feature_1  feature_2  feature_3  feature_4  feature_5  feature_6  \
0  -1.325510   0.935916  -1.283532  -1.010057  -1.429708   1.688228   
1   1.067059  -1.545052  -0.604755  -0.274248  -1.429708  -0.420530   
2  -0.844122  -0.627094   2.318267   0.461562  -0.527743   0.148989   
3   1.311345   1.026884  -0.806554  -0.791303  -1.048107   1.000701   
4  -0.607021   0.017957   0.074022   1.495672  -1.429708  -0.307652   

   feature_7  feature_8  feature_9  feature_10              career  cluster  
0   0.094643   0.929447   1.161923   -0.453172          Accountant       40  
1   2.100720   1.567791   0.330372   -0.527090    Graphic Designer       24  
2  -0.075538   0.739669   0.507412    1.112546         Salesperson        4  
3   0.496890   0.739669   1.939826    0.588400  Research Scientist       46  
4  -0.188992   0.613150   0.029941    1.260382             Teacher       14  


# Cluster assigned to each career

In [None]:
# Let pandas print up to 105 rows before it starts truncating the output.
# NOTE(review): 105 is a magic number, presumably the dataset's row count —
# confirm, or derive it from the frame.
pd.set_option('display.max_rows', 105)

# Career/cluster pairs ordered by cluster id; map_cluster_to_career() reads
# this frame.
df_sorted = df.loc[:, ['career', 'cluster']].sort_values(by='cluster')

print(df_sorted)

                                    career  cluster
102                         Civil Engineer        0
56                       Interior Designer        2
49                      Wildlife Biologist        3
42                               Zoologist        3
2                              Salesperson        4
104            Electronics Design Engineer        5
98                  Administrative Officer        6
80                        Database Analyst        7
99                           Tax Collector        7
25                   Marketing Coordinator        8
21                 Environmental Scientist        9
96                                Diplomat       10
100                Foreign Service Officer       10
14                   Construction Engineer       11
26                     Biomedical Engineer       12
9                                     Chef       13
4                                  Teacher       14
34       Software Quality Assurance Tester       15
97          

# **Testing the models**

In [None]:
def predict_career(new_inputs):
    """Return the k-means cluster label for each row of ``new_inputs``.

    Despite the name, the result holds cluster ids, not career names;
    ``map_cluster_to_career`` turns a single id into a career.

    The function relies on the notebook-level ``scaler``, ``encoder`` and
    ``kmeans`` objects and applies them in that order.

    Args:
        new_inputs: Raw (unscaled) feature values. The example call in this
            notebook passes a 2-D NumPy array with one row per sample.

    Returns:
        Whatever ``kmeans.predict`` returns for the encoded rows: one cluster
        label per input row.
    """
    scaled_rows = scaler.transform(new_inputs)
    latent_rows = encoder.predict(scaled_rows)
    return kmeans.predict(latent_rows)

In [None]:
def map_cluster_to_career(cluster_label):
    """Translate a cluster id into a career name.

    Builds a lookup from the notebook-level ``df_sorted`` frame on every call.
    Each cluster is represented by the career of its first row in that frame,
    so clusters containing several careers collapse to a single name.

    Args:
        cluster_label: Cluster id to look up.

    Returns:
        The representative career for the cluster, or ``'Unknown'`` when the
        id does not occur in ``df_sorted``.
    """
    first_career_by_cluster = {
        cluster_id: careers.iloc[0]
        for cluster_id, careers in df_sorted.groupby('cluster')['career']
    }
    return first_career_by_cluster.get(cluster_label, 'Unknown')

In [None]:
# A single hand-written sample: one row of ten raw feature values
# (predict_career applies the scaler itself).
new_inputs = np.array([
    [1.78, 2.45, 2.45, 5.67, 3.56, 4.00, 6.00, 1.00, 6.00, 1.00],
])

# predict_career returns one cluster id per row; take the id of the only row.
predicted_cluster = predict_career(new_inputs)[0]
predicted_career = map_cluster_to_career(predicted_cluster)
print(f"Predicted career: {predicted_career}")





Predicted career: Construction Engineer


# **Saving the Models**

In [None]:
def save_model(model, filename):
    """Persist ``model`` by calling its own ``save`` method.

    Args:
        model: Any object exposing ``save(path)``; the notebook passes Keras
            models.
        filename: Destination path, forwarded to ``model.save`` unchanged.

    Returns:
        None. The return value of ``model.save`` is not propagated.
    """
    model.save(filename)

In [None]:
# Print the full autoencoder architecture, then write the model to disk.
# NOTE(review): the visible cells save only the Keras models; the fitted
# scaler and k-means that predict_career() also depends on are not persisted.
model.summary()
save_model(model, 'career_prediction.h5')

print("Model saved to career_prediction.h5")

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 10)]              0         
                                                                 
 reshape_1 (Reshape)         (None, 10, 1)             0         
                                                                 
 conv1d_1 (Conv1D)           (None, 8, 32)             128       
                                                                 
 max_pooling1d_1 (MaxPoolin  (None, 4, 32)             0         
 g1D)                                                            
                                                                 
 flatten_1 (Flatten)         (None, 128)               0         
                                                                 
 dense_2 (Dense)             (None, 64)                8256      
                                                             

  saving_api.save_model(


Model saved to career_prediction.h5


In [None]:
# Print the encoder sub-model (the autoencoder truncated at model.layers[-3])
# and save it to its own file.
encoder.summary()
save_model(encoder, 'encoder_career_prediction.h5')

print("Model saved to encoder_career_prediction.h5")

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 10)]              0         
                                                                 
 reshape_1 (Reshape)         (None, 10, 1)             0         
                                                                 
 conv1d_1 (Conv1D)           (None, 8, 32)             128       
                                                                 
 max_pooling1d_1 (MaxPoolin  (None, 4, 32)             0         
 g1D)                                                            
                                                                 
 flatten_1 (Flatten)         (None, 128)               0         
                                                                 
Total params: 128 (512.00 Byte)
Trainable params: 128 (512.00 Byte)
Non-trainable params: 0 (0.00 Byte)
_____________________



Model saved to encoder_career_prediction.h5
