$$ \min \; f = \sum_{j=1}^{K} \sum_{x_{i} \in C_{j}} \lVert x_{i}-\mu_{j} \rVert^2 $$

In [84]:
import numpy as np

class Particle:
    """One swarm member: a candidate set of cluster centroids.

    Attributes are all set in ``__init__``:

    * ``position``      -- array of shape (num_clusters, num_features),
                           one row per centroid, drawn uniformly from [0, 1).
    * ``velocity``      -- array of the same shape, also drawn from [0, 1).
    * ``best_position`` -- independent copy of the best position seen so far
                           (initially the starting position).
    * ``best_fitness``  -- fitness of ``best_position``; starts at +inf so
                           the first evaluated fitness always replaces it.
    """

    def __init__(self, num_clusters, num_features):
        shape = (num_clusters, num_features)

        # Position is drawn before velocity; the order matters for
        # reproducibility under a fixed NumPy seed.
        self.position = np.random.rand(*shape)
        self.velocity = np.random.rand(*shape)

        # No fitness has been computed yet.
        self.best_fitness = np.inf
        self.best_position = self.position.copy()

In [85]:
def fitness_function(data, particles, num_clusters):
    """Score every particle and refresh its personal best.

    The fitness of a particle is the within-cluster sum of squared errors:
    each sample is assigned to its nearest centroid (a row of
    ``particle.position``) and the squared Euclidean distances to those
    centroids are summed, i.e. ``sum_j sum_{x_i in C_j} ||x_i - mu_j||^2``.
    Lower is better.

    Args:
        data: 2-D array, one sample per row.
        particles: iterable of objects exposing ``position`` (indexable by
            cluster, each entry broadcastable against ``data``),
            ``best_position`` and ``best_fitness``.
        num_clusters: number of centroids read from each ``position``.

    Returns:
        The same ``particles`` object that was passed in; particles are
        updated in place.
    """
    # Scratch buffer reused for every particle; each column is fully
    # overwritten before it is read.
    distances = np.zeros((len(data), num_clusters))

    for particle in particles:
        for j in range(num_clusters):
            offset = data - particle.position[j]
            # Squared distance (not the plain norm) so the score is the
            # sum of squared errors rather than a sum of distances.
            distances[:, j] = np.sum(offset * offset, axis=1)

        # Nearest-centroid assignment: keep the smallest squared distance
        # per sample, then total over all samples.
        particle_fitness = np.sum(np.min(distances, axis=1))

        if particle_fitness < particle.best_fitness:
            particle.best_fitness = particle_fitness
            # Copy so later position updates do not alter the recorded best.
            particle.best_position = particle.position.copy()

    return particles

In [86]:
def update_velocity_position(particles, inertia, c1, c2, global_best_position):
    """Advance every particle by one PSO step, in place.

    The new velocity is the sum of three parts: the old velocity scaled by
    ``inertia``, a pull towards the particle's own best position scaled by
    ``c1``, and a pull towards ``global_best_position`` scaled by ``c2``.
    Each pull is additionally scaled by its own scalar random factor drawn
    from [0, 1). The position is then moved by the new velocity.

    Args:
        particles: iterable of objects with ``position``, ``velocity`` and
            ``best_position`` arrays.
        inertia: weight applied to the previous velocity.
        c1: weight of the pull towards the personal best.
        c2: weight of the pull towards the global best.
        global_best_position: array broadcastable against ``position``.

    Returns:
        None.
    """
    for member in particles:
        here = member.position

        # One scalar draw per term per particle; the personal-best draw
        # comes first, then the global-best draw.
        r_personal = np.random.rand()
        to_personal_best = c1 * r_personal * (member.best_position - here)

        r_global = np.random.rand()
        to_global_best = c2 * r_global * (global_best_position - here)

        momentum = inertia * member.velocity
        step = momentum + to_personal_best + to_global_best

        member.velocity = step
        # Rebind to a new array instead of adding in place, so any other
        # reference to the old position array is left untouched.
        member.position = here + step

In [90]:
def initialize_particles(num_particles, num_clusters, num_features):
    """Build the initial swarm.

    Args:
        num_particles: how many ``Particle`` objects to create.
        num_clusters: forwarded to each ``Particle``.
        num_features: forwarded to each ``Particle``.

    Returns:
        A list of ``num_particles`` freshly constructed ``Particle`` objects.
    """
    swarm = []
    for _ in range(num_particles):
        swarm.append(Particle(num_clusters, num_features))
    return swarm


In [91]:
# Read the data matrix from disk (np.loadtxt parses whitespace-delimited
# text by default). Rows are treated as samples by fitness_function.
data = np.loadtxt("./data/wikipedia_td.txt")

# Problem and swarm sizes; the feature count is taken from the data itself.
num_particles, num_clusters = 30, 10
num_features = data.shape[1]
max_iterations = 100

# PSO coefficients: velocity inertia, personal-best pull, global-best pull.
inertia, c1, c2 = 0.5, 2.0, 2.0

# Nothing has been evaluated yet, so there is no global best so far.
global_best_fitness = float('inf')
global_best_position = None

# Create the swarm with random starting positions and velocities.
particles = initialize_particles(num_particles, num_clusters, num_features)

In [92]:
# PSO main loop: evaluate, record the best solution seen, then move the swarm.
for iteration in range(max_iterations):
    # Score every particle; this also refreshes each particle's personal best.
    particles = fitness_function(data, particles, num_clusters)

    # Promote any personal best that beats the current global best.
    for particle in particles:
        improved = particle.best_fitness < global_best_fitness
        if improved:
            global_best_fitness, global_best_position = (
                particle.best_fitness,
                particle.best_position.copy(),
            )

    # Move every particle using the (possibly updated) global best.
    update_velocity_position(
        particles,
        inertia=inertia,
        c1=c1,
        c2=c2,
        global_best_position=global_best_position,
    )

In [93]:
import csv

# Build an index -> word lookup from the first column of the dictionary CSV.
# The row number in the file (0-based) is used as the word's index.
dictionary_path = "./data/dictionary.csv"  # Point this at your dictionary CSV file
dictionary = {}

with open(dictionary_path, "r", encoding="utf-8") as csvfile:
    reader = csv.reader(csvfile)
    dictionary.update(enumerate(row[0] for row in reader))

# For each cluster (row of the global best position), rank the feature
# indices by descending value; argsort of the negated array does exactly that.
sorted_indices = np.argsort(-global_best_position, axis=1)

# Keep the ten highest-ranked indices per cluster and translate them to words.
top_words = [
    [dictionary[idx] for idx in leading]
    for leading in sorted_indices[:, :10]
]

print("Top words for each cluster:")
for cluster_index, words in enumerate(top_words):
    joined = ', '.join(words)
    print(f"Cluster {cluster_index + 1}: {joined}")



Top words for each cluster:
Cluster 1: share, pass, government, defeat, shortly, track, control, james, critic, cite
Cluster 2: overall, enter, level, select, community, private, require, element, half, choice
Cluster 3: record, heart, launch, broadcast, hour, sometimes, king, black, request, explain
Cluster 4: parent, attention, movie, writer, feel, board, mixed, decision, writing, officially
Cluster 5: england, political, campaign, test, sell, program, company, question, engage, play
Cluster 6: lot, san, common, paul, city, support, express, able, appearance, canada
Cluster 7: left, force, travel, charge, believe, talk, marry, enter, accord, official
Cluster 8: regular, subsequently, average, variety, defeat, addition, system, professional, contain, office
Cluster 9: drive, attend, hospital, george, challenge, manage, studio, title, agree, response
Cluster 10: entertainment, hear, explain, believe, determine, spend, potential, american, capture, richard


In [82]:
# Read the titles CSV without a header row, so the columns are labelled
# 0, 1, ...; the first column is then extracted as a plain Python list.
titles_df = pd.read_csv("./data/titles.csv", header=None)
titles = titles_df[titles_df.columns[0]].tolist()