Steps for model training
1. Initial Model training:
    1. Scaled the data using Standard Scaler
    2. Trained a k-means model to cluster data and measured the Silhoutte Score  
    3. Removed the clusters with very few data points from further model training
    4. Used the clusters obtained by k-means to classify the data using knn after splitting train and test sets and measured metrics like accuracy, precision, f1 score and recall and discovered data is imbalanced

2. Preprocessing Steps:
    1. Applied SMOTE
    2. Scaled the data using Standard Scaler
    3. Reduced dimensionality using truncated SVD

3. Retrained the models k-means to obtain the cluster and knn to classify data along those clusters
4. Saved the model in .onnx format

In [1]:
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix
import onnx
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType


In [2]:
df = pd.read_csv(r"D:\hackathons\CaseVault\recommendation-engine\Data\preprocessed\preprocessed.csv")

In [3]:
df

Unnamed: 0.1,Unnamed: 0,component_1,component_2,component_3,component_4,component_5,component_6,component_7,component_8,component_9,...,4990,4991,4992,4993,4994,4995,4996,4997,4998,4999
0,0,1.632583,-0.377532,0.421133,-0.572584,-0.131109,-0.001156,-0.595789,0.847357,0.465964,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
1,1,0.789583,-0.498816,-0.599629,-0.231379,0.016958,-0.591930,0.450830,-0.418475,0.848181,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
2,2,0.849633,-0.589849,-0.655112,0.592220,-0.308401,-0.145752,-0.058529,-0.074482,-0.161176,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
3,3,0.783255,-0.403721,-0.562864,-0.392344,0.119531,-0.196019,-0.411185,0.621779,0.221999,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
4,4,1.219752,-0.394178,-0.025055,-0.374735,0.006981,-0.212935,-0.390794,0.635433,0.208640,...,0.0,0.0,0.0,0.0,0.025314,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10995,10995,0.778033,-0.405470,-0.574456,-0.375656,0.141227,-0.197933,-0.382736,0.574690,0.182558,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
10996,10996,0.768899,-0.398344,-0.559442,-0.361337,0.128174,-0.201218,-0.381377,0.576052,0.197655,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
10997,10997,1.639517,-0.587959,0.457809,0.675293,-0.637100,-0.181253,-0.134810,0.019063,-0.071621,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
10998,10998,0.976004,-0.870889,-0.440822,1.160238,0.708630,0.226552,0.098795,0.148528,-0.160238,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0


In [4]:
df = df.drop('Unnamed: 0',axis=1)

In [5]:
df

Unnamed: 0,component_1,component_2,component_3,component_4,component_5,component_6,component_7,component_8,component_9,component_10,...,4990,4991,4992,4993,4994,4995,4996,4997,4998,4999
0,1.632583,-0.377532,0.421133,-0.572584,-0.131109,-0.001156,-0.595789,0.847357,0.465964,-0.257247,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
1,0.789583,-0.498816,-0.599629,-0.231379,0.016958,-0.591930,0.450830,-0.418475,0.848181,-0.488616,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
2,0.849633,-0.589849,-0.655112,0.592220,-0.308401,-0.145752,-0.058529,-0.074482,-0.161176,0.132598,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
3,0.783255,-0.403721,-0.562864,-0.392344,0.119531,-0.196019,-0.411185,0.621779,0.221999,0.307248,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
4,1.219752,-0.394178,-0.025055,-0.374735,0.006981,-0.212935,-0.390794,0.635433,0.208640,0.287776,...,0.0,0.0,0.0,0.0,0.025314,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10995,0.778033,-0.405470,-0.574456,-0.375656,0.141227,-0.197933,-0.382736,0.574690,0.182558,0.283897,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
10996,0.768899,-0.398344,-0.559442,-0.361337,0.128174,-0.201218,-0.381377,0.576052,0.197655,0.280361,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
10997,1.639517,-0.587959,0.457809,0.675293,-0.637100,-0.181253,-0.134810,0.019063,-0.071621,0.150423,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
10998,0.976004,-0.870889,-0.440822,1.160238,0.708630,0.226552,0.098795,0.148528,-0.160238,-0.200619,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0


In [6]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df.iloc[:, :-1])  


In [7]:
from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters=4, random_state=42)
df['cluster'] = kmeans.fit_predict(X_scaled)  

In [8]:
from sklearn.metrics import silhouette_score

silhouette_avg = silhouette_score(X_scaled, df['cluster'])
print(f"Silhouette Score: {silhouette_avg:.2f}")


Silhouette Score: 0.07


In [9]:
print(df['cluster'].value_counts())


cluster
1    10730
0      267
3        2
2        1
Name: count, dtype: int64


In [10]:
valid_clusters = df['cluster'].value_counts()[df['cluster'].value_counts() > 1].index
df_filtered = df[df['cluster'].isin(valid_clusters)]

train_data, test_data = train_test_split(df_filtered, test_size=0.3, stratify=df_filtered['cluster'], random_state=42)

X_train, y_train = train_data.iloc[:, :-1], train_data['cluster']
X_test, y_test = test_data.iloc[:, :-1], test_data['cluster']


In [11]:
print(df['cluster'].value_counts())


cluster
1    10730
0      267
3        2
2        1
Name: count, dtype: int64


In [12]:
knn = KNeighborsClassifier(n_neighbors=3, metric='cosine')
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)

In [13]:
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1-Score: {f1:.2f}")

Accuracy: 1.00
Precision: 0.65
Recall: 0.65
F1-Score: 0.65


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [14]:
from imblearn.over_sampling import SMOTE
X = df.drop(columns=['cluster'])  
y = df['cluster']

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


In [16]:
min_class_size = 5
class_counts = y_train.value_counts()

valid_classes = class_counts[class_counts >= min_class_size].index
X_train_filtered = X_train[y_train.isin(valid_classes)]
y_train_filtered = y_train[y_train.isin(valid_classes)]

smote = SMOTE(sampling_strategy='auto', random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train_filtered, y_train_filtered)


In [17]:
scaler = StandardScaler()
X_resampled_scaled = scaler.fit_transform(X_resampled)


In [18]:
X_resampled_scaled.shape

(15018, 5101)

In [19]:
X_test.shape

(3300, 5101)

In [20]:
X_train.shape

(7700, 5101)

In [21]:
X_resampled.shape

(15018, 5101)

In [22]:
from sklearn.decomposition import TruncatedSVD

n_components = 300  

svd = TruncatedSVD(n_components=n_components, random_state=42)

X_train_reduced = svd.fit_transform(X_resampled_scaled)  

X_test_reduced = svd.transform(X_test) 



In [23]:
X_train_reduced.shape

(15018, 300)

In [24]:
X_test_reduced.shape

(3300, 300)

In [25]:
kmeans = KMeans(n_clusters=5, random_state=42)
kmeans.fit(X_train_reduced)

cluster_assignments = kmeans.labels_

print("Cluster Centers:\n", kmeans.cluster_centers_)
print("Number of samples in each cluster:", np.bincount(cluster_assignments))

Cluster Centers:
 [[-8.11438707e+00 -5.93399439e+00  1.91056423e+00 ... -9.95348681e-03
  -1.15643707e-01 -3.87733614e-02]
 [-5.92913522e+00 -9.61871828e+00  3.31982716e+00 ... -5.75955896e-02
   3.46736996e-02 -7.54095684e-02]
 [-1.13541170e+01 -8.83961749e+00  8.20260442e-01 ...  3.02092751e-02
   2.37585080e-02 -5.74879239e-03]
 [ 9.96436532e+00  3.33363416e-01 -4.07365678e-01 ...  9.30082398e-03
  -4.35840056e-03  9.42246655e-04]
 [-1.06691815e+01  5.49890608e+00 -5.87419462e-01 ... -1.02472440e-02
  -9.37799562e-03  2.24155888e-02]]
Number of samples in each cluster: [  72 1269 1655 7520 4502]


In [26]:
silhouette_avg = silhouette_score(X_train_reduced, kmeans.labels_)

print(f"Silhouette Score: {silhouette_avg:.4f}")


Silhouette Score: 0.0582


In [27]:
y_resampled.shape

(15018,)

In [28]:
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train_reduced, y_resampled)  

y_pred = knn.predict(X_test_reduced) 

In [29]:
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')
conf_matrix = confusion_matrix(y_test, y_pred)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")
print("Confusion Matrix:")
print(conf_matrix)

Accuracy: 0.9761
Precision: 0.9527
Recall: 0.9761
F1-Score: 0.9642
Confusion Matrix:
[[   0   77    0    0]
 [   0 3221    0    0]
 [   0    1    0    0]
 [   0    1    0    0]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [30]:
kmeans_onnx = convert_sklearn(kmeans, initial_types=[('input', FloatTensorType([None, X_train_reduced.shape[1]]))])
with open("kmeans_model.onnx", "wb") as f:
    f.write(kmeans_onnx.SerializeToString())

In [31]:
knn_onnx = convert_sklearn(knn, initial_types=[('input', FloatTensorType([None, X_train_reduced.shape[1]]))])
with open("knn_model.onnx", "wb") as f:
    f.write(knn_onnx.SerializeToString())