In [18]:
import os
import ast
import math
import json
import torch
import openai
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
from collections import Counter
from scipy.spatial.distance import cdist
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
from langchain.embeddings import OpenAIEmbeddings
from transformers import BertTokenizer, BertModel
from get_embedding import getEmbeddingDict

corpus_path = "corpus/emo_map_list.txt"
aug_corpus_path = "corpus/2_aug_emo_map_list.txt"
embedding_path = "embedding/emomap_embeddings_ada002.json"
aug_embedding_path = "embedding/aug_emomap_embeddings_ada002.json"

In [105]:
## Feature
def data_augmentation(corpus, scale):
    with open(corpus, 'r') as f:
        words = eval(f.read())
    aug_words = [word.strip() * scale for word in words]

    aug_path = f"corpus/{scale}_aug_emo_map_list.txt"
    with open(aug_path, "w") as f:
        f.write(str(aug_words))
    
    print("the augmented data is stored in: ", aug_path)

def dim_reduction(n_components, embedding_dict): # input: dict; output: dict
    keys = list(embedding_dict.keys())
    embeddings = list(embedding_dict.values())
    pca = PCA(n_components=n_components)
    pca_embeddings = pca.fit_transform(embeddings).tolist()
    pca_embedding_dict = dict(zip(keys, pca_embeddings))
    return pca_embedding_dict

def kmeans_cls(num_clusters, tags, embeddings): # K-means

    print('cluster=', num_clusters)
    kmeans = KMeans(n_clusters=num_clusters, random_state=42)
    kmeans.fit(embeddings)
    cluster_labels = kmeans.labels_

    return dict(zip(tags, cluster_labels))

emo_embedding_dict = getEmbeddingDict(corpus_path, embedding_path)
augemo_embedding_dict = getEmbeddingDict(aug_corpus_path, aug_embedding_path)
pca_embedding_dict = dim_reduction(150, emo_embedding_dict)

In [106]:
## Kmeans Clustering
a = kmeans_cls(150, list(pca_embedding_dict.keys()), list(pca_embedding_dict.values()))
emotion_df = pd.DataFrame(list(a.items()), columns=['tags_emotion', 'label'])
emotion_df.sort_values('label').to_excel('emotion_cluster.xlsx')

cluster= 150


  super()._check_params_vs_input(X, default_n_init=10)


In [104]:
## Supervised Classification
df = pd.read_excel('data/emotion_keep.xlsx')
df['tags_emotion'] = df['tags_emotion'].map(pca_embedding_dict)
df = df[['tags_emotion', 'label']]

In [107]:
with open('pcaemo_center_embedding.json', 'r') as file:
    cluster_centers_data = json.load(file)
with open('pcaemo_embeddings_ada002.json', 'r') as file:
    embeddings_data = json.load(file)

cluster_centers_data_sample = {k: cluster_centers_data[k] for k in list(cluster_centers_data)[:5]}
embeddings_data_sample = {k: embeddings_data[k] for k in list(embeddings_data)[:5]}

In [None]:
with open(corpus_path, 'r') as f:
    tags = eval(f.read()) # load theme tags
unique_tags = list(set(tags)) # unique tags

# LLM-ada002 configuration
embedding = OpenAIEmbeddings(
    deployment = "embedding-ada-002-2",
    model = "text-embedding-ada-002",
    openai_api_key = "6e25ec6fa59d44f8af091db59e6db6d7",
    openai_api_base = 'https://tcl-ai.openai.azure.com/',
    openai_api_type = 'azure',
    openai_api_version = '2023-07-01-preview',
    chunk_size=1,
)

# generate LLM-ada002 embedding
tag_documents = unique_tags
tag_embeddings = embedding.embed_documents(tag_documents)

# save LLM-ada002 embedding
embedding_dictionary = {}
for tagDoc, tagEmbedding in zip(tag_documents, tag_embeddings):
    embedding_dictionary[tagDoc] = (tagEmbedding)
file_path = embedding_path
with open(file_path, 'w') as json_file:
    json.dump(embedding_dictionary, json_file)

In [117]:
with open('embedding/emo_center_embedding.json', 'r') as f:
    theme_embeddings_dict = json.load(f)
    pca_embedding_dict = dim_reduction(10, theme_embeddings_dict)
with open('emo_center_embedding.json', 'w') as json_file:
        json.dump(pca_embedding_dict, json_file)
with open('embedding/emomap_embeddings_ada002.json', 'r') as f:
    theme_embeddings_dict = json.load(f)
    pca_embedding_dict = dim_reduction(10, theme_embeddings_dict)
with open('pcaemo_embeddings_ada002.json', 'w') as json_file:
        json.dump(pca_embedding_dict, json_file)

In [126]:
cluster_centers = list(cluster_centers_data.values())

embeddings = list(embeddings_data.values())
embedding_keys = list(embeddings_data.keys())

distances = cdist(embeddings, cluster_centers, 'euclidean')
closest_cluster_indices = np.argmin(distances, axis=1)

cluster_names = list(cluster_centers_data.keys())
classified_embeddings = {embedding_keys[i]: cluster_names[closest_cluster_indices[i]] for i in range(len(embedding_keys))}

In [128]:
df_embeddings = pd.DataFrame(list(classified_embeddings.items()), columns=['Key', 'Cluster'])
df_embeddings.sort_values('Cluster').to_excel('emotion_class.xlsx')

In [72]:
df

tuple_pca_embedding_dict = {k: tuple(v) for k, v in pca_embedding_dict.items()}
inverse_pca_embedding_dict = {v: k for k, v in tuple_pca_embedding_dict.items()}
df['tags_emotion'] = df['tags_emotion'].apply(lambda x: inverse_pca_embedding_dict[tuple(x)] if tuple(x) in inverse_pca_embedding_dict else None)
df = df[['tags_emotion', 'label', 'predicted_labels']]

In [93]:
df = pd.read_excel('data/emotion_human.xlsx')
df['tags_emotion'] = df['tags_emotion'].map(pca_embedding_dict)
df = df[['tags_emotion', 'label']]
# df.to_excel('example.xlsx')

In [94]:
# Check for unique values in the 'label' column to identify if there are unlabeled instances
data = df
unique_labels = data['label'].unique()
# Determine the format of the 'tags_emotion' entries
first_entry = data['tags_emotion'][0]
(unique_labels, type(first_entry), first_entry)


(array([ 0,  2,  1, -1]),
 list,
 [0.13341506551551466, 0.1432317717360748, -0.03549755231311173])

In [95]:

# Separate the labeled and unlabeled data
labeled_data = data[data['label'] != -1]
unlabeled_data = data[data['label'] == -1]

# Check the conversion and separation
labeled_data.head(), unlabeled_data.head(), labeled_data.shape, unlabeled_data.shape


(                                        tags_emotion  label
 0  [0.13341506551551466, 0.1432317717360748, -0.0...      0
 1  [-0.09357420371434641, -0.0716346806852909, 0....      2
 2  [-0.0925421567144721, 0.0029044354921829853, 0...      2
 3  [0.011553421165230922, -0.14130647648026587, 0...      0
 4  [-0.15345890136918972, 0.09529602782552032, -0...      2,
                                          tags_emotion  label
 35  [0.07027609450656774, -0.007064140992634334, -...     -1
 36  [0.10195614545193399, -0.05001554543857868, -0...     -1
 37  [0.09053770850020393, -0.12463684019605807, 0....     -1
 38  [-0.11437076270344095, 0.11104853225452627, -0...     -1
 39  [0.08974735908814718, 0.10794493759273345, -0....     -1,
 (35, 2),
 (157, 2))

In [96]:
from sklearn.cluster import KMeans
import numpy as np

# Extract feature arrays
features_labeled = np.array(labeled_data['tags_emotion'].tolist())
features_unlabeled = np.array(unlabeled_data['tags_emotion'].tolist())

# Number of unique labels in the labeled data (excluding -1)
num_clusters = len(labeled_data['label'].unique())

# Apply K-means clustering
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
kmeans.fit(features_unlabeled)

# Assign the cluster labels to the unlabeled instances
unlabeled_data['label'] = kmeans.labels_

# Combine the labeled and newly labeled data
combined_data = pd.concat([labeled_data, unlabeled_data], ignore_index=True)

# Check the combined data
combined_data.tail(), combined_data['label'].value_counts()


  super()._check_params_vs_input(X, default_n_init=10)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unlabeled_data['label'] = kmeans.labels_


(                                          tags_emotion  label
 187  [0.018564552356162806, 0.0507342450573382, -0....      2
 188  [0.10348100315428825, -0.06091558393554987, -0...      2
 189  [0.07735890584167626, 0.08248571595580048, -0....      2
 190  [-0.11199147190327581, 0.03439855435454477, -0...      1
 191  [0.08519220961603167, -0.05897965962294559, 0....      2,
 label
 2    74
 0    70
 1    48
 Name: count, dtype: int64)

In [97]:
combined_data

tuple_pca_embedding_dict = {k: tuple(v) for k, v in pca_embedding_dict.items()}
inverse_pca_embedding_dict = {v: k for k, v in tuple_pca_embedding_dict.items()}
combined_data['tags_emotion'] = combined_data['tags_emotion'].apply(lambda x: inverse_pca_embedding_dict[tuple(x)] if tuple(x) in inverse_pca_embedding_dict else None)
combined_data = combined_data[['tags_emotion', 'label']]

In [98]:
combined_data.to_excel('example.xlsx')