In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

# Sample skill data: short free-text skill names that will be grouped by the
# tokens they share.
skills = [
    "html", "css", "javascript", "react", "angular", "python", "data analysis", "machine learning",
    "project management", "database", "sql", "nosql", "web development", "front-end", "back-end"
]

# Feature extraction using TF-IDF: one row per skill, one column per token.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(skills)

# Clustering using KMeans.
# `random_state` pins the centroid initialisation so that Restart & Run All
# reproduces the same clusters; `n_init` is set explicitly so the result does
# not depend on the library's version-specific default (which also triggers a
# FutureWarning when left implicit).
n_clusters = 3
kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
kmeans.fit(X)

# Determine the cluster centers
cluster_centers = kmeans.cluster_centers_

# Determine which cluster each skill belongs to. The fitted estimator already
# stores the training assignments, so there is no need to call predict() again.
skill_clusters = kmeans.labels_

# Determine the demand level of each cluster.
# NOTE: "demand" here is purely a proxy based on cluster size (number of skills
# assigned to the cluster); no external demand data is involved.
cluster_sizes = [int((skill_clusters == i).sum()) for i in range(n_clusters)]
highest_demand_cluster = cluster_sizes.index(max(cluster_sizes))
lowest_demand_cluster = cluster_sizes.index(min(cluster_sizes))
# First cluster that is neither the largest nor the smallest; None when no such
# cluster exists (e.g. fewer than three clusters) instead of an IndexError.
middle_demand_cluster = next(
    (i for i in range(n_clusters) if i not in (highest_demand_cluster, lowest_demand_cluster)),
    None,
)

# Determine the demand level of each skill from the cluster it was assigned to.
# The largest cluster is checked first, so if every cluster has the same size
# (largest == smallest) its members are labelled "High demand".
demand_levels = []
for cluster in skill_clusters:
    if cluster == highest_demand_cluster:
        demand_levels.append("High demand")
    elif cluster == lowest_demand_cluster:
        demand_levels.append("Low demand")
    else:
        demand_levels.append("Middle demand")

# Now you have the demand level of each skill
for skill, demand_level in zip(skills, demand_levels):
    print(f"Skill: {skill}, Demand Level: {demand_level}")


  super()._check_params_vs_input(X, default_n_init=10)


Skill: html, Demand Level: High demand
Skill: css, Demand Level: High demand
Skill: javascript, Demand Level: High demand
Skill: react, Demand Level: High demand
Skill: angular, Demand Level: High demand
Skill: python, Demand Level: High demand
Skill: data analysis, Demand Level: Low demand
Skill: machine learning, Demand Level: High demand
Skill: project management, Demand Level: High demand
Skill: database, Demand Level: High demand
Skill: sql, Demand Level: High demand
Skill: nosql, Demand Level: High demand
Skill: web development, Demand Level: High demand
Skill: front-end, Demand Level: Middle demand
Skill: back-end, Demand Level: Middle demand


In [2]:
import numpy as np
from numpy.linalg import norm

class Kmeans:
    '''Small NumPy implementation of k-means clustering.

    The estimator alternates between assigning every sample to its nearest
    centroid and recomputing each centroid as the mean of its members, until
    the centroids stop changing or ``max_iter`` iterations have run.

    Note on the encoding: ``fit`` and ``predict`` take a sequence of skills but
    only use its *length*; every skill is represented by a one-hot row of an
    identity matrix. The text of a skill therefore has no influence on the
    clustering, and ``predict`` is only defined for a sequence of the same
    length as the one passed to ``fit``.

    Parameters
    ----------
    n_clusters : int
        Number of clusters / centroids.
    max_iter : int, default 100
        Upper bound on assignment/update iterations.
    random_state : int, default 123
        Seed for the private random generator used to pick initial centroids.

    Attributes (set by ``fit``)
    ---------------------------
    centroids : ndarray of shape (n_clusters, n_features)
    labels : ndarray of shape (n_samples,)
    error : float
        Sum of squared distances of samples to their assigned centroid.
    '''

    def __init__(self, n_clusters, max_iter=100, random_state=123):
        self.n_clusters = n_clusters
        self.max_iter = max_iter
        self.random_state = random_state

    @staticmethod
    def _one_hot(skills):
        '''Return the identity-matrix (one-hot by position) encoding of ``skills``.'''
        return np.eye(len(skills))

    def initializ_centroids(self, X):
        '''Pick ``n_clusters`` distinct rows of ``X`` as the starting centroids.

        A dedicated ``RandomState`` seeded with ``self.random_state`` is used,
        so the choice is reproducible and independent of NumPy's global RNG.
        '''
        rng = np.random.RandomState(self.random_state)
        random_idx = rng.permutation(len(X))
        centroids = X[random_idx[:self.n_clusters]]
        return centroids

    def compute_centroids(self, X, labels, previous_centroids=None):
        '''Return the mean of the rows of ``X`` assigned to each cluster.

        Parameters
        ----------
        X : ndarray of shape (n_samples, n_features)
        labels : ndarray of shape (n_samples,)
            Cluster index of every row of ``X``.
        previous_centroids : ndarray of shape (n_clusters, n_features), optional
            Centroids from the previous iteration. When given, a cluster that
            received no samples keeps its previous centroid; otherwise its
            centroid is NaN (the mean of an empty set).
        '''
        centroids = np.zeros((self.n_clusters, X.shape[1]))
        for k in range(self.n_clusters):
            members = X[labels == k, :]
            if len(members) == 0:
                # Avoid np.mean on an empty slice (RuntimeWarning + NaN row).
                if previous_centroids is None:
                    centroids[k, :] = np.nan
                else:
                    centroids[k, :] = previous_centroids[k, :]
                continue
            centroids[k, :] = np.mean(members, axis=0)
        return centroids

    def compute_distance(self, X, centroids):
        '''Return squared Euclidean distances, shape (n_samples, n_clusters).'''
        distance = np.zeros((len(X), self.n_clusters))
        for k in range(self.n_clusters):
            row_norm = norm(X - centroids[k, :], axis=1)
            distance[:, k] = np.square(row_norm)
        return distance

    def find_closest_cluster(self, distance):
        '''Return, per row of ``distance``, the column index of its minimum.

        Ties resolve to the lowest cluster index (``np.argmin`` behaviour).
        '''
        return np.argmin(distance, axis=1)

    def compute_sse(self, X, labels, centroids):
        '''Return the sum of squared distances of samples to their centroid.'''
        distance = np.zeros(len(X))
        for k in range(self.n_clusters):
            distance[labels == k] = norm(X[labels == k] - centroids[k], axis=1)
        return np.sum(np.square(distance))

    def fit(self, skills):
        '''Cluster ``skills`` and store ``centroids``, ``labels`` and ``error``.

        Only ``len(skills)`` is used: the samples are the rows of an identity
        matrix of that size.
        '''
        X = self._one_hot(skills)
        self.centroids = self.initializ_centroids(X)
        for _ in range(self.max_iter):
            old_centroids = self.centroids
            distance = self.compute_distance(X, old_centroids)
            self.labels = self.find_closest_cluster(distance)
            self.centroids = self.compute_centroids(X, self.labels, old_centroids)
            # Converged: the update step reproduced the same centroids.
            if np.array_equal(old_centroids, self.centroids):
                break
        self.error = self.compute_sse(X, self.labels, self.centroids)

    def predict(self, skills):
        '''Return the index of the nearest fitted centroid for every skill.

        Raises
        ------
        ValueError
            If ``len(skills)`` differs from the number of skills used in
            ``fit``; the positional one-hot encoding would not line up with
            the fitted centroids.
        '''
        X = self._one_hot(skills)
        if X.shape[1] != self.centroids.shape[1]:
            raise ValueError(
                f"predict() received {X.shape[1]} skills but the model was "
                f"fitted on {self.centroids.shape[1]}"
            )
        distance = self.compute_distance(X, self.centroids)
        return self.find_closest_cluster(distance)

# Skill names to cluster with the hand-written Kmeans estimator.
skills = [
    "html",
    "css",
    "javascript",
    "react",
    "angular",
    "python",
    "data analysis",
    "machine learning",
    "project management",
    "database",
    "sql",
    "nosql",
    "web development",
    "front-end",
    "back-end",
]

# Fit a 3-cluster model, then ask it for the cluster index of every skill.
kmeans = Kmeans(n_clusters=3)
kmeans.fit(skills)
clusters = kmeans.predict(skills)

# Report the assignment of each skill, in input order.
for skill, cluster in zip(skills, clusters):
    message = f"{skill} belongs to cluster {cluster}"
    print(message)


html belongs to cluster 1
css belongs to cluster 0
javascript belongs to cluster 0
react belongs to cluster 0
angular belongs to cluster 0
python belongs to cluster 0
data analysis belongs to cluster 0
machine learning belongs to cluster 0
project management belongs to cluster 0
database belongs to cluster 0
sql belongs to cluster 0
nosql belongs to cluster 0
web development belongs to cluster 0
front-end belongs to cluster 2
back-end belongs to cluster 0
