$$ min \ f = \sum_{j=1}^K \sum_{x_{i}\epsilon C_{j}} || x_{j}-\mu_{j}||^2 $$

In [1]:
import csv
import numpy as np
import pandas as pd
class Particle:
    def __init__(self, num_clusters, num_features):
        self.position = np.random.rand(num_clusters, num_features)
        self.velocity = np.random.rand(num_clusters, num_features)
        self.best_position = self.position.copy()
        self.best_fitness = float('inf')

In [2]:
def fitness_function(data, particles, num_clusters):
    distances = np.zeros((len(data), num_clusters))

    for i, particle in enumerate(particles):
        for j in range(num_clusters):
            distances[:, j] = np.linalg.norm(data - particle.position[j], axis=1)

        particle_fitness = np.sum(np.min(distances, axis=1))
        
        if particle_fitness < particle.best_fitness:
            particle.best_fitness = particle_fitness
            particle.best_position = particle.position.copy()

    return particles

In [3]:
def update_velocity_position(particles, inertia, c1, c2, global_best_position):
    for particle in particles:
        inertia_term = inertia * particle.velocity
        cognitive_term = c1 * np.random.rand() * (particle.best_position - particle.position)
        social_term = c2 * np.random.rand() * (global_best_position - particle.position)

        particle.velocity = inertia_term + cognitive_term + social_term
        particle.position = particle.position + particle.velocity

In [4]:
def initialize_particles(num_particles, num_clusters, num_features):
    particles = [Particle(num_clusters, num_features) for _ in range(num_particles)]
    return particles

In [5]:
# Load your data and set parameters
data = np.loadtxt("./data/wikipedia_td.txt")
num_particles = 30
num_clusters = 10
num_features = data.shape[1]
max_iterations = 100
inertia = 0.5
c1 = 2.0
c2 = 2.0


# Initialize particles
particles = initialize_particles(num_particles, num_clusters, num_features)
global_best_position = None
global_best_fitness = float('inf')

In [6]:
# PSO Main Loop
for iteration in range(max_iterations):
    particles = fitness_function(data, particles, num_clusters)
    
    # Update global best
    for particle in particles:
        if particle.best_fitness < global_best_fitness:
            global_best_fitness = particle.best_fitness
            global_best_position = particle.best_position.copy()

    # Update particles' velocity and position
    update_velocity_position(particles, inertia, c1, c2, global_best_position)

In [7]:
import csv

# Load dictionary from CSV file
dictionary_path = "./data/dictionary.csv"  # Replace with the actual path to your dictionary CSV file
dictionary = {}

with open(dictionary_path, "r", encoding="utf-8") as csvfile:
    reader = csv.reader(csvfile)
    for index, word in enumerate(reader):
        dictionary[index] = word[0]

# print(dictionary)

# Print the words corresponding to the top 10 entries of the global best position
sorted_indices = np.argsort(-global_best_position, axis=1)
top_words = [[dictionary[idx] for idx in row[:10]] for row in sorted_indices]
print("Top words for each cluster:")
for cluster_index, words in enumerate(top_words):
    print(f"Cluster {cluster_index + 1}: {', '.join(words)}")



Top words for each cluster:
Cluster 1: raise, mark, associate, focus, base, claim, network, maintain, statement, track
Cluster 2: california, former, occur, enter, capture, learn, east, dead, date, week
Cluster 3: response, website, cover, front, marriage, job, appearance, range, contain, arrive
Cluster 4: approach, version, move, college, question, request, premier, establish, paul, class
Cluster 5: program, approach, hour, artist, question, college, result, decide, washington, injury
Cluster 6: theme, suffer, pay, decline, earlier, british, boy, special, shortly, true
Cluster 7: promote, confirm, develop, sister, dead, theme, kingdom, level, argue, respond
Cluster 8: organization, common, heart, half, result, publish, finish, feel, land, hospital
Cluster 9: victory, sister, middle, top, stage, image, lack, initial, town, board
Cluster 10: originally, respectively, feature, fire, popular, broadcast, john, require, hope, strike


In [8]:
import csv

# Load dictionary from CSV file
titles_path = "./data/titles.csv"  # Replace with the actual path to your dictionary CSV file
titles = {}

with open(titles_path, "r", encoding="utf-8") as csvfile:
    reader = csv.reader(csvfile)
    for index, title in enumerate(reader):
        titles[index] = title[0]
print(titles)

{0: 'Unfinished_portrait_of_Franklin_D._Roosevelt', 1: 'Negan', 2: 'Cam_Newton', 3: 'Beyonce', 4: 'Coachella_Valley_Music_and_Arts_Festival', 5: 'Charlie_Sheen', 6: 'Keanu_Reeves', 7: 'Crimson_Peak', 8: 'Lisa_Brennan-Jobs', 9: 'Rodrigo_Duterte', 10: 'Conor_McGregor', 11: 'The_Life_of_Pablo', 12: 'Memorial_Day', 13: 'Veterans_Day', 14: 'Labor_Day', 15: 'UEFA_Euro_2016', 16: 'Ides_of_March', 17: 'Pat_Bowlen', 18: 'The_Martian_(film)', 19: 'David_Bowie', 20: 'Hanukkah', 21: '2016_ICC_World_Twenty20', 22: 'Whitey_Bulger', 23: 'Chris_Stapleton', 24: 'House_of_Cards_(season_4)', 25: 'Ted_Cruz', 26: 'Clara_Rockmore', 27: 'Flip_Saunders', 28: 'Channing_Tatum', 29: 'Meldonium', 30: 'Fear_the_Walking_Dead', 31: 'Terry_Wogan', 32: 'Negasonic_Teenage_Warhead', 33: 'Leap_year', 34: 'Good_Friday', 35: 'Mahatma_Gandhi', 36: 'Michael_J._Fox', 37: 'Lupe_Fuentes', 38: 'Lucy_(Australopithecus)', 39: 'Von_Miller', 40: 'Blac_Chyna', 41: 'Prince_(musician)', 42: 'The_Walk_(2015_film)', 43: 'Tami_Erin', 44: 

In [9]:
sorted_indices = np.argsort(-global_best_position, axis=0)
top_titles = [[titles[idx] for idx in row[:10]] for row in sorted_indices]
print("Top 10 titles for each cluster:")
for cluster_index, titles in enumerate(top_titles):
    print(f"Cluster {cluster_index + 1}: {', '.join(titles)}")

Top 10 titles for each cluster:
Cluster 1: Crimson_Peak, Lisa_Brennan-Jobs, Negan, Cam_Newton, Keanu_Reeves, Lisa_Brennan-Jobs, Cam_Newton, Keanu_Reeves, Negan, Coachella_Valley_Music_and_Arts_Festival
Cluster 2: Lisa_Brennan-Jobs, Coachella_Valley_Music_and_Arts_Festival, Charlie_Sheen, Crimson_Peak, Negan, Negan, Negan, Rodrigo_Duterte, Cam_Newton, Lisa_Brennan-Jobs
Cluster 3: Rodrigo_Duterte, Unfinished_portrait_of_Franklin_D._Roosevelt, Cam_Newton, Charlie_Sheen, Charlie_Sheen, Keanu_Reeves, Keanu_Reeves, Crimson_Peak, Charlie_Sheen, Charlie_Sheen
Cluster 4: Beyonce, Charlie_Sheen, Lisa_Brennan-Jobs, Beyonce, Unfinished_portrait_of_Franklin_D._Roosevelt, Charlie_Sheen, Coachella_Valley_Music_and_Arts_Festival, Charlie_Sheen, Unfinished_portrait_of_Franklin_D._Roosevelt, Crimson_Peak
Cluster 5: Keanu_Reeves, Cam_Newton, Rodrigo_Duterte, Unfinished_portrait_of_Franklin_D._Roosevelt, Lisa_Brennan-Jobs, Coachella_Valley_Music_and_Arts_Festival, Beyonce, Beyonce, Keanu_Reeves, Unfinishe

In [48]:
# Load titles from CSV file
titles_path = "./data/titles.csv"  # Replace with the actual path to your titles CSV file
titles_df = pd.read_csv(titles_path, header=None, names=['Titles'])

# Find the top 10 titles for each cluster
top_titles_indices = np.argsort(-global_best_position, axis=0)
top_titles = titles_df.iloc[top_titles_indices.flatten()]['Titles'].values.reshape(10, -1)

print(top_titles_indices.shape)
# Print the top 10 titles for each cluster
# print("\nTop titles for each cluster:")
# for cluster_index, cluster_titles in enumerate(top_titles):
#     print(f"Cluster {cluster_index + 1}:")
#     for title in cluster_titles:
#         print(f"  - {title}")

(10, 500)


## GREY WOLF OPTIMIZATION(GWO)

In [10]:
import numpy as np
import pandas as pd

class GreyWolf:
    def __init__(self, num_clusters, num_features):
        self.position = np.random.rand(num_clusters, num_features)
        self.best_position = self.position.copy()
        self.best_fitness = float('inf')

In [11]:
def gwo_fitness_function(data, wolves, num_clusters):
    distances = np.zeros((len(data), num_clusters))

    for i, wolf in enumerate(wolves):
        for j in range(num_clusters):
            distances[:, j] = np.linalg.norm(data - wolf.position[j], axis=1)

        wolf_fitness = np.sum(np.min(distances, axis=1))

        if wolf_fitness < wolf.best_fitness:
            wolf.best_fitness = wolf_fitness
            wolf.best_position = wolf.position.copy()

    return wolves

In [12]:
def gwo_update_position(wolves, a, num_clusters):
    alpha, beta, delta = wolves[0], wolves[1], wolves[2]

    for wolf in wolves:
        r1, r2 = np.random.rand(), np.random.rand()
        A1, A2, A3 = 2 * a * r1 - a, 2 * a * r2 - a, 2 * r2

        D_alpha = np.abs(A1 * alpha.position - wolf.position)
        D_beta = np.abs(A2 * beta.position - wolf.position)
        D_delta = np.abs(A3 * delta.position - wolf.position)

        X1 = alpha.position - A1 * D_alpha
        X2 = beta.position - A2 * D_beta
        X3 = delta.position - A3 * D_delta

        wolf.position = (X1 + X2 + X3) / 3

In [13]:
def gwo_initialize_wolves(num_wolves, num_clusters, num_features):
    wolves = [GreyWolf(num_clusters, num_features) for _ in range(num_wolves)]
    return wolves

In [14]:
# Load your data and set parameters
data = np.loadtxt("./data/wikipedia_td.txt")
num_wolves = 3  # Number of wolves in the GWO algorithm
num_clusters = 10
num_features = data.shape[1]
max_iterations = 100
a = 2.0  # Parameter for GWO algorithm

# Initialize wolves
wolves = gwo_initialize_wolves(num_wolves, num_clusters, num_features)

In [15]:
# GWO Main Loop
for iteration in range(max_iterations):
    wolves = gwo_fitness_function(data, wolves, num_clusters)

    # Update wolves' positions
    gwo_update_position(wolves, a, num_clusters)

In [21]:
# Retrieve the best wolf's position
global_best_position = wolves[np.argmin([wolf.best_fitness for wolf in wolves])].best_position


# Load dictionary from CSV file
dictionary_path = "./data/dictionary.csv"  # Replace with the actual path to your dictionary CSV file
dictionary = {}

with open(dictionary_path, "r", encoding="utf-8") as csvfile:
    reader = csv.reader(csvfile)
    for index, word in enumerate(reader):
        dictionary[index] = word[0]


(10, 500)


In [22]:
# Print the words corresponding to the top 10 entries of the global best position
sorted_indices = np.argsort(-global_best_position, axis=1)
print(sorted_indices)
top_words = [[dictionary[idx] for idx in row[:10]] for row in sorted_indices]
print("Top words for each cluster (GWO):")
for cluster_index, words in enumerate(top_words):
    print(f"Cluster {cluster_index + 1}: {', '.join(words)}")


[[171 318 208 ... 453  24 149]
 [287 335 182 ... 381  74 348]
 [353 435 425 ... 205 225  83]
 ...
 [367 207 274 ... 203 292 484]
 [360 289 403 ...   0 278  68]
 [454 312  20 ... 110 477  18]]
Top words for each cluster (GWO):
Cluster 1: writing, annual, personal, version, argue, system, sometimes, local, website, cause
Cluster 2: fellow, radio, seek, university, request, track, article, travel, conclude, cross
Cluster 3: private, church, reviews, local, president, spend, decline, achieve, white, fail
Cluster 4: significant, addition, minute, series, reference, debut, develop, website, plan, effect
Cluster 5: accord, suffer, miss, executive, book, decline, commercial, website, debut, rest
Cluster 6: san, suggest, central, bill, offer, game, total, canada, manager, reach
Cluster 7: lead, occur, set, boy, war, private, mixed, ability, wear, sense
Cluster 8: pick, promote, style, attention, history, king, activity, publish, produce, originally
Cluster 9: mention, meeting, guest, participat

In [18]:
# Load titles from CSV file
titles_path = "./data/titles.csv"  # Replace with the actual path to your titles CSV file
titles_df = pd.read_csv(titles_path, header=None, names=['Titles'])

In [26]:
sorted_indices = np.argsort(-global_best_position, axis=0)
print(sorted_indices)
top_titles = [[titles[idx] for idx in row[:10]] for row in sorted_indices]
print("Top 10 titles for each cluster:")
for cluster_index, titles in enumerate(top_titles):
    print(f"Cluster {cluster_index + 1}: {', '.join(titles)}")

[[6 1 7 ... 9 1 4]
 [9 4 6 ... 3 3 0]
 [0 2 2 ... 6 0 8]
 ...
 [3 8 5 ... 1 2 1]
 [5 0 8 ... 0 5 7]
 [8 9 4 ... 4 4 9]]
Top 10 titles for each cluster:
Cluster 1: Rodrigo_Duterte, Cam_Newton, Cam_Newton, Rodrigo_Duterte, Rodrigo_Duterte, Cam_Newton, Cam_Newton, Rodrigo_Duterte, Rodrigo_Duterte, Cam_Newton
Cluster 2: Cam_Newton, Cam_Newton, Rodrigo_Duterte, Cam_Newton, Rodrigo_Duterte, Cam_Newton, Cam_Newton, Rodrigo_Duterte, Cam_Newton, Cam_Newton
Cluster 3: Rodrigo_Duterte, Cam_Newton, Cam_Newton, Cam_Newton, Crimson_Peak, Cam_Newton, Rodrigo_Duterte, Cam_Newton, Crimson_Peak, Cam_Newton
Cluster 4: Cam_Newton, Rodrigo_Duterte, Rodrigo_Duterte, Cam_Newton, Cam_Newton, Cam_Newton, Cam_Newton, Cam_Newton, Cam_Newton, Rodrigo_Duterte
Cluster 5: Cam_Newton, Rodrigo_Duterte, Cam_Newton, Rodrigo_Duterte, Cam_Newton, Cam_Newton, Rodrigo_Duterte, Crimson_Peak, Cam_Newton, Rodrigo_Duterte
Cluster 6: Cam_Newton, Cam_Newton, Cam_Newton, Crimson_Peak, Rodrigo_Duterte, Rodrigo_Duterte, Cam_Newton, 