In [25]:
import numpy as np
from numpy.linalg import norm

class Kmeans:
    '''Implementing Kmeans algorithm.'''

    def __init__(self, n_clusters, max_iter=100, random_state=123):
        self.n_clusters = n_clusters
        self.max_iter = max_iter
        self.random_state = random_state

    def initializ_centroids(self, X):
        np.random.RandomState(self.random_state)
        random_idx = np.random.permutation(len(X))
        centroids = X[random_idx[:self.n_clusters]]
        return centroids

    def compute_centroids(self, X, labels):
        centroids = np.zeros((self.n_clusters, len(X[0])))
        for k in range(self.n_clusters):
            centroids[k, :] = np.mean(X[labels == k, :], axis=0)
        return centroids

    def compute_distance(self, X, centroids):
        distance = np.zeros((len(X), self.n_clusters))
        for k in range(self.n_clusters):
            row_norm = norm(X - centroids[k, :], axis=1)
            distance[:, k] = np.square(row_norm)
        return distance

    def find_closest_cluster(self, distance):
        return np.argmin(distance, axis=1)

    def compute_sse(self, X, labels, centroids):
        distance = np.zeros(len(X))
        for k in range(self.n_clusters):
            distance[labels == k] = norm(X[labels == k] - centroids[k], axis=1)
        return np.sum(np.square(distance))

    def fit(self, skills):
        skill_to_index = {skill: i for i, skill in enumerate(skills)}
        X = np.eye(len(skills))
        self.centroids = self.initializ_centroids(X)
        for i in range(self.max_iter):
            old_centroids = self.centroids
            distance = self.compute_distance(X, old_centroids)
            self.labels = self.find_closest_cluster(distance)
            self.centroids = self.compute_centroids(X, self.labels)
            if np.all(old_centroids == self.centroids):
                break
        self.error = self.compute_sse(X, self.labels, self.centroids)

    def predict(self, skills):
        skill_to_index = {skill: i for i, skill in enumerate(skills)}
        X = np.eye(len(skills))
        distance = self.compute_distance(X, self.centroids)
        clusters = self.find_closest_cluster(distance)
        
        # Assign each skill to a demand category based on the cluster
        demand_levels = ["low demand", "middle demand", "high demand"]
        skill_demand = {skill: demand_levels[cluster] for skill, cluster in zip(skills, clusters)}
        return skill_demand

skills = [
    "html", "css", "javascript", "react", "angular", "python", "data analysis", "machine learning",
    "project management", "database", "sql", "nosql", "web development", "front-end", "back-end"
]

kmeans = Kmeans(n_clusters=3)
kmeans.fit(skills)
skill_demand = kmeans.predict(skills)

for skill, demand in skill_demand.items():
    print(f"Skill: {skill} {demand} level")


Skill: html low demand level
Skill: css low demand level
Skill: javascript low demand level
Skill: react low demand level
Skill: angular low demand level
Skill: python low demand level
Skill: data analysis low demand level
Skill: machine learning low demand level
Skill: project management low demand level
Skill: database low demand level
Skill: sql low demand level
Skill: nosql high demand level
Skill: web development low demand level
Skill: front-end middle demand level
Skill: back-end low demand level


In [30]:
import numpy as np
from numpy.linalg import norm

class Kmeans:
    '''Implementing Kmeans algorithm.'''

    def __init__(self, n_clusters, max_iter=100, random_state=123):
        self.n_clusters = n_clusters
        self.max_iter = max_iter
        self.random_state = random_state

    def initializ_centroids(self, X):
        rng = np.random.default_rng(self.random_state)
        random_idx = rng.permutation(len(X))
        centroids = X[random_idx[:self.n_clusters]]
        return centroids



    def compute_centroids(self, X, labels):
        centroids = np.zeros((self.n_clusters, len(X[0])))
        for k in range(self.n_clusters):
            centroids[k, :] = np.mean(X[labels == k, :], axis=0)
        return centroids

    def compute_distance(self, X, centroids):
        distance = np.zeros((len(X), self.n_clusters))
        for k in range(self.n_clusters):
            centroid_tile = np.tile(centroids[k, :], (len(X), 1)).T  # Transpose centroid_tile
            row_norm = norm(X - centroid_tile, axis=1)
            distance[:, k] = np.square(row_norm)
        return distance



    def find_closest_cluster(self, distance):
        return np.argmin(distance, axis=1)

    def compute_sse(self, X, labels, centroids):
        distance = np.zeros(len(X))
        for k in range(self.n_clusters):
            distance[labels == k] = norm(X[labels == k] - centroids[k], axis=1)
        return np.sum(np.square(distance))

    def fit(self, skills):
        flattened_skills = [skill for sublist in skills for skill in sublist]
        skill_to_index = {skill: i for i, skill in enumerate(flattened_skills)}
        X = np.eye(len(flattened_skills))
        self.centroids = self.initializ_centroids(X)
        for i in range(self.max_iter):
            old_centroids = self.centroids
            distance = self.compute_distance(X, old_centroids)
            self.labels = self.find_closest_cluster(distance)
            self.centroids = self.compute_centroids(X, self.labels)
            if np.all(old_centroids == self.centroids):
                break
        self.error = self.compute_sse(X, self.labels, self.centroids)


    def predict(self, skills):
        flattened_skills = [skill for sublist in skills for skill in sublist]
        X = np.eye(len(flattened_skills))
        distance = self.compute_distance(X, self.centroids)
        clusters = self.find_closest_cluster(distance)

        # Assign each skill to a demand category based on the cluster
        demand_levels = ["low demand", "middle demand", "high demand"]
        skill_demand = {skill: [] for skill in flattened_skills}
        for skill, cluster in zip(flattened_skills, clusters):
            skill_demand[skill].append(demand_levels[cluster])
        return skill_demand


    

skills = [
      ['social media platforms', 'content creation', 'and scheduling', 'social media', 'community engagement', 'paid social advertising'],
    ['html', 'css', 'javascript', 'frontend frameworks', 'user experience ux'],
    ['quality control processes and methodologies', 'statistical process control', 'spc root', 'cause analysis and corrective', 'quality management systems eg', 'regulatory knowledge'],
    ['wireless network design', 'wifi standards and protocols', 'rf radio frequency', 'and optimization', 'wireless security protocols', 'troubleshooting', 'wireless network issues'],
    ['event planning', 'conference logistics', 'budget management', 'vendor coordination', 'marketing and promotion', 'client relations'],
    ['ui design', 'graphic design', 'adobe photoshop', 'typography and color theory', 'visual design and layout', 'responsive design'],
    ['account management', 'client relations', 'marketing strategies', 'campaign optimization', 'data analysis', 'communication skills'],
    ['product innovation', 'prototype development'],
    ['test automation', 'test framework development']
]

kmeans = Kmeans(n_clusters=3)
kmeans.fit(skills)
skill_demand = kmeans.predict(skills)

for skill, demand in skill_demand.items():
    print(f"Skills: {skill}: {demand} level ")


Skills: social media platforms: ['low demand'] level 
Skills: content creation: ['low demand'] level 
Skills: and scheduling: ['low demand'] level 
Skills: social media: ['low demand'] level 
Skills: community engagement: ['low demand'] level 
Skills: paid social advertising: ['low demand'] level 
Skills: html: ['low demand'] level 
Skills: css: ['low demand'] level 
Skills: javascript: ['low demand'] level 
Skills: frontend frameworks: ['low demand'] level 
Skills: user experience ux: ['low demand'] level 
Skills: quality control processes and methodologies: ['low demand'] level 
Skills: statistical process control: ['low demand'] level 
Skills: spc root: ['low demand'] level 
Skills: cause analysis and corrective: ['low demand'] level 
Skills: quality management systems eg: ['low demand'] level 
Skills: regulatory knowledge: ['low demand'] level 
Skills: wireless network design: ['low demand'] level 
Skills: wifi standards and protocols: ['low demand'] level 
Skills: rf radio frequen

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = um.true_divide(


In [52]:
import numpy as np
from numpy.linalg import norm

class Kmeans:
    '''Implementing Kmeans algorithm.'''

    def __init__(self, n_clusters, max_iter=100, random_state=123):
        self.n_clusters = n_clusters
        self.max_iter = max_iter
        self.random_state = random_state

    def initializ_centroids(self, X):
        rng = np.random.default_rng(self.random_state)
        random_idx = rng.permutation(len(X))
        centroids = X[random_idx[:self.n_clusters]]
        return centroids

    def compute_centroids(self, X, labels):
        centroids = np.zeros((self.n_clusters, len(X[0])))
        for k in range(self.n_clusters):
            centroids[k, :] = np.mean(X[labels == k, :], axis=0)
        return centroids

    def compute_distance(self, X, centroids):
        distance = np.zeros((len(X), self.n_clusters))
        for k in range(self.n_clusters):
            row_norm = norm(X - centroids[k, :], axis=1)
            distance[:, k] = np.square(row_norm)
        return distance

    def find_closest_cluster(self, distance):
        return np.argmin(distance, axis=1)

    def compute_sse(self, X, labels, centroids):
        distance = np.zeros(len(X))
        for k in range(self.n_clusters):
            distance[labels == k] = norm(X[labels == k] - centroids[k], axis=1)
        return np.sum(np.square(distance))

    def fit(self, skills):
        skill_to_index = {skill: i for i, skill in enumerate(skills)}
        X = np.eye(len(skills))
        self.centroids = self.initializ_centroids(X)
        for i in range(self.max_iter):
            old_centroids = self.centroids
            distance = self.compute_distance(X, old_centroids)
            self.labels = self.find_closest_cluster(distance)
            self.centroids = self.compute_centroids(X, self.labels)
            if np.all(old_centroids == self.centroids):
                break
        self.error = self.compute_sse(X, self.labels, self.centroids)

    def predict(self, skills):
        skill_to_index = {skill: i for i, skill in enumerate(skills)}
        X = np.eye(len(skills))
        distance = self.compute_distance(X, self.centroids)
        clusters = self.find_closest_cluster(distance)
        
        # Assign each skill to a demand category based on the cluster
        demand_levels = ["low demand", "middle demand", "high demand"]
        skill_demand = {skill: demand_levels[cluster] for skill, cluster in zip(skills, clusters)}
        return skill_demand

skills = [
    "html", "css", "javascript", "react", "angular", "python", "data analysis", "machine learning",
    "project management", "database", "sql", "nosql", "web development", "front-end", "back-end"
]

kmeans = Kmeans(n_clusters=3)
kmeans.fit(skills)
skill_demand = kmeans.predict(skills)

for skill, demand in skill_demand.items():
    print(f" Skill: {skill} : {demand}")


 Skill: html : low demand
 Skill: css : low demand
 Skill: javascript : middle demand
 Skill: react : low demand
 Skill: angular : low demand
 Skill: python : low demand
 Skill: data analysis : low demand
 Skill: machine learning : low demand
 Skill: project management : low demand
 Skill: database : low demand
 Skill: sql : low demand
 Skill: nosql : low demand
 Skill: web development : low demand
 Skill: front-end : low demand
 Skill: back-end : high demand


In [104]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize

class Kmeans:
    '''Implementing Kmeans algorithm.'''

    def __init__(self, n_clusters, max_iter=100, random_state=123):
        self.n_clusters = n_clusters
        self.max_iter = max_iter
        self.random_state = random_state

    def initializ_centroids(self, X):
        rng = np.random.default_rng(self.random_state)
        random_idx = rng.permutation(len(X))
        centroids = X[random_idx[:self.n_clusters]]
        return centroids

    def compute_centroids(self, X, labels):
        centroids = np.zeros((self.n_clusters, len(X[0])))
        for k in range(self.n_clusters):
            centroids[k, :] = np.mean(X[labels == k, :], axis=0)
        return centroids

    def compute_distance(self, X, centroids):
        distance = np.zeros((len(X), self.n_clusters))
        for k in range(self.n_clusters):
            row_norm = norm(X - centroids[k, :], axis=1)
            distance[:, k] = np.square(row_norm)
        return distance

    def find_closest_cluster(self, distance):
        return np.argmin(distance, axis=1)

    def compute_sse(self, X, labels, centroids):
        distance = np.zeros(len(X))
        for k in range(self.n_clusters):
            distance[labels == k] = norm(X[labels == k] - centroids[k], axis=1)
        return np.sum(np.square(distance))

    def fit(self, skills):
        flattened_skills = [item for sublist in skills for item in sublist]
        skill_to_index = {skill: i for i, skill in enumerate(flattened_skills)}
        X = np.eye(len(flattened_skills))
        self.centroids = self.initializ_centroids(X)
        for i in range(self.max_iter):
            old_centroids = self.centroids
            distance = self.compute_distance(X, old_centroids)
            self.labels = self.find_closest_cluster(distance)
            self.centroids = self.compute_centroids(X, self.labels)
            if np.all(old_centroids == self.centroids):
                break
        self.error = self.compute_sse(X, self.labels, self.centroids)


    def predict(self, skills):
        flattened_skills = [item for sublist in skills for item in sublist]
        X = np.eye(len(flattened_skills))
        distance = self.compute_distance(X, self.centroids)
        clusters = self.find_closest_cluster(distance)
        
        # Assign each skill to a demand category based on the cluster
        demand_levels = ["low demand", "middle demand", "high demand"]
        skill_demand = {}
        for sublist, cluster in zip(skills, clusters):
            for skill in sublist:
                skill_demand[skill] = demand_levels[cluster]
        return skill_demand



skills = [
    ['social media platforms', 'content creation', 'and scheduling', 'social media', 'community engagement', 'paid social advertising'],
    ['html', 'css', 'javascript', 'frontend frameworks', 'user experience ux'],
    ['quality control processes and methodologies', 'statistical process control', 'spc root', 'cause analysis and corrective', 'quality management systems eg', 'regulatory knowledge'],
    ['wireless network design', 'wifi standards and protocols', 'rf radio frequency', 'and optimization', 'wireless security protocols', 'troubleshooting', 'wireless network issues'],
    ['event planning', 'conference logistics', 'budget management', 'vendor coordination', 'marketing and promotion', 'client relations'],
    ['ui design', 'graphic design', 'adobe photoshop', 'typography and color theory', 'visual design and layout', 'responsive design'],
    ['account management', 'client relations', 'marketing strategies', 'campaign optimization', 'data analysis', 'communication skills'],
    ['product innovation', 'prototype development'],
    ['test automation', 'test framework development'],
]

kmeans = Kmeans(n_clusters=3)
kmeans.fit(skills)
skill_demand = kmeans.predict(skills)

for skill, demand in skill_demand.items():
    print(f" Skill: {skill} : {demand}")


 Skill: social media platforms : low demand
 Skill: content creation : low demand
 Skill: and scheduling : low demand
 Skill: social media : low demand
 Skill: community engagement : low demand
 Skill: paid social advertising : low demand
 Skill: html : low demand
 Skill: css : low demand
 Skill: javascript : low demand
 Skill: frontend frameworks : low demand
 Skill: user experience ux : low demand
 Skill: quality control processes and methodologies : low demand
 Skill: statistical process control : low demand
 Skill: spc root : low demand
 Skill: cause analysis and corrective : low demand
 Skill: quality management systems eg : low demand
 Skill: regulatory knowledge : low demand
 Skill: wireless network design : low demand
 Skill: wifi standards and protocols : low demand
 Skill: rf radio frequency : low demand
 Skill: and optimization : low demand
 Skill: wireless security protocols : low demand
 Skill: troubleshooting : low demand
 Skill: wireless network issues : low demand
 Skil

In [124]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize
import numpy as np


class Kmeans:
    '''Implementing Kmeans algorithm.'''

    def __init__(self, n_clusters, max_iter=100, random_state=123):
        self.n_clusters = n_clusters
        self.max_iter = max_iter
        self.random_state = random_state

    def initializ_centroids(self, X):
        rng = np.random.default_rng(self.random_state)
        random_idx = rng.permutation(X.shape[0])
        centroids = X[random_idx[:self.n_clusters]]
        return centroids


    def compute_centroids(self, X, labels):
        centroids = np.zeros((self.n_clusters, X.shape[1]))
        for k in range(self.n_clusters):
            centroids[k, :] = np.mean(X[labels == k, :], axis=0)
        return centroids

    def compute_distance(self, X, centroids):
        distance = np.zeros((X.shape[0], self.n_clusters))
        for k in range(self.n_clusters):
            centroid_broadcasted = np.tile(centroids[k], (X.shape[0], 1))
            row_norm = norm(X - centroid_broadcasted, axis=1)
            distance[:, k] = np.square(row_norm)
        return distance






    def find_closest_cluster(self, distance):
        return np.argmin(distance, axis=1)

    def compute_sse(self, X, labels, centroids):
        distance = np.zeros(len(X))
        for k in range(self.n_clusters):
            distance[labels == k] = norm(X[labels == k] - centroids[k], axis=1)
        return np.sum(np.square(distance))

    def fit(self, skills):
        # Flatten the list of skills
        flattened_skills = [skill for sublist in skills for skill in sublist]

        # Feature extraction using TF-IDF
        vectorizer = TfidfVectorizer()
        X = vectorizer.fit_transform(flattened_skills)
        X = normalize(X)

        # Initialize centroids
        self.centroids = self.initializ_centroids(X)

        # Perform KMeans clustering
        for i in range(self.max_iter):
            old_centroids = self.centroids
            distance = self.compute_distance(X, old_centroids)
            self.labels = self.find_closest_cluster(distance)
            self.centroids = self.compute_centroids(X, self.labels)
            if np.all(old_centroids == self.centroids):
                break
        self.error = self.compute_sse(X, self.labels, self.centroids)

    def predict(self, skills):
        # Flatten the list of skills
        flattened_skills = [skill for sublist in skills for skill in sublist]

        # Feature extraction using TF-IDF
        vectorizer = TfidfVectorizer()
        X = vectorizer.fit_transform(flattened_skills)
        X = normalize(X)

        # Compute distance to centroids
        distance = self.compute_distance(X, self.centroids)
        clusters = self.find_closest_cluster(distance)

        # Assign each skill to a demand category based on the cluster
        demand_levels = ["low demand", "middle demand", "high demand"]
        skill_demand = {}
        for sublist, cluster in zip(skills, clusters):
            for skill in sublist:
                skill_demand[skill] = demand_levels[cluster]
        return skill_demand


skills = [
    ['social media platforms', 'content creation', 'and scheduling', 'social media', 'community engagement', 'paid social advertising'],
    ['html', 'css', 'javascript', 'frontend frameworks', 'user experience ux'],
    ['quality control processes and methodologies', 'statistical process control', 'spc root', 'cause analysis and corrective', 'quality management systems eg', 'regulatory knowledge'],
    ['wireless network design', 'wifi standards and protocols', 'rf radio frequency', 'and optimization', 'wireless security protocols', 'troubleshooting', 'wireless network issues'],
    ['event planning', 'conference logistics', 'budget management', 'vendor coordination', 'marketing and promotion', 'client relations'],
    ['ui design', 'graphic design', 'adobe photoshop', 'typography and color theory', 'visual design and layout', 'responsive design'],
    ['account management', 'client relations', 'marketing strategies', 'campaign optimization', 'data analysis', 'communication skills'],
    ['product innovation', 'prototype development'],
    ['test automation', 'test framework development'],
]

kmeans = Kmeans(n_clusters=3)
kmeans.fit(skills)
skill_demand = kmeans.predict(skills)

for skill, demand in skill_demand.items():
    print(f" Skill: {skill} : {demand}")


NotImplementedError: subtracting a sparse matrix from a nonzero scalar is not supported