In [None]:
import pandas as pd
import os
from tqdm import tqdm

# Directory (relative to the working directory) that holds the CSV files.
dataset = "dataset"

# Read the train and test splits into DataFrames.
df_train = pd.read_csv(os.path.join(dataset, "train.csv"))
df_test = pd.read_csv(os.path.join(dataset, "test.csv"))

In [None]:
from matplotlib import pyplot as plt

# Number of literal "<mask>" placeholders found in each training text.
df_train["mask_count"] = df_train["text"].str.count("<mask>")

# How many rows contain 0, 1, 2, ... placeholders, ordered by placeholder count.
mask_count_distribution = df_train["mask_count"].value_counts().sort_index()

# Bar chart of that distribution, drawn through the object-oriented API.
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(mask_count_distribution.index, mask_count_distribution.values, alpha=0.7)
ax.set_title("Frequency of <mask> Token Counts")
ax.set_xlabel("Number of <mask> Tokens")
ax.set_ylabel("Number of Items")
ax.set_xticks(mask_count_distribution.index)  # one tick per observed mask count
ax.grid(axis="y", linestyle="--", alpha=0.7)
fig.tight_layout()
plt.show()

In [None]:
# Display the raw counts per number of "<mask>" placeholders.
mask_count_distribution

In [None]:
# Inspect one training example: the j-th row among those with exactly i masks.
i = 12
j = 0
example = df_train.loc[df_train["mask_count"] == i].iloc[j]
print(example["text"])
print(example["emotion"])

In [None]:
from transformers import AutoTokenizer, AutoModelForMaskedLM, pipeline
import torch

# Load tokenizer and masked-language model, then move the model to the GPU
# when one is available (CPU otherwise).
model_name = "twitter/twhin-bert-large"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForMaskedLM.from_pretrained(model_name)
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)

# Demo sentence containing twelve whitespace-separated "<mask>" placeholders.
sentence = "<user> 🐝AMAZED <user> <mask> <mask> <mask> <mask> <mask> <mask> <mask> <mask> <mask> <mask> <mask> <mask>"

# Fill-mask pipeline built on the already-loaded model and tokenizer.
fill_mask = pipeline("fill-mask", model=model, tokenizer=tokenizer, device=device)


def fill_multiple_masks(sentence, fill_mask):
    """
    Resolve every whitespace-delimited "<mask>" word in a sentence, one at a time.

    While one mask is being predicted, every other still-unresolved mask is
    temporarily shown to the predictor as "<pad>", so the predictor always
    receives exactly one mask. The "token_str" of the first returned
    prediction then permanently replaces that mask.

    Args:
        sentence: Text in which masks appear as standalone "<mask>" words.
        fill_mask: Callable that takes a sentence and returns a list of
            prediction dicts, each carrying a "token_str" key; the first
            entry is used.

    Returns:
        The words re-joined with single spaces, with every mask replaced.
    """
    words = sentence.split()

    # A second pass only happens if a prediction itself came back as "<mask>".
    while "<mask>" in words:
        pending = [pos for pos, word in enumerate(words) if word == "<mask>"]
        for target in pending:
            # Hide every other unresolved mask behind "<pad>".
            probe_words = []
            for pos, word in enumerate(words):
                if word == "<mask>" and pos != target:
                    probe_words.append("<pad>")
                else:
                    probe_words.append(word)

            best = fill_mask(" ".join(probe_words))[0]
            words[target] = best["token_str"]

    return " ".join(words)


# Run the one-mask-at-a-time filler on the demo sentence and show before/after.
filled_sentence = fill_multiple_masks(sentence, fill_mask)
print(f"Original Sentence: {sentence}")
print(f"Filled Sentence: {filled_sentence}")

In [None]:
def clean_text(text):
    """
    Strip tokenizer artifacts from decoded text and normalize whitespace.

    Removes the literal markers "<s>", "</s>" and "<pad>", turns every "▁"
    word-boundary marker into a space, and collapses each run of whitespace
    into a single space with nothing leading or trailing.
    """
    for marker in ("<s>", "</s>", "<pad>"):
        text = text.replace(marker, "")
    # split() without arguments drops leading, trailing and repeated whitespace.
    words = text.replace("▁", " ").split()
    return " ".join(words)


def fill_multiple_masks_batch(batch_sentences):
    """
    Fill every mask token in a batch of sentences with the model's top prediction.

    All masks are resolved from a single forward pass: each masked position
    receives the arg-max token of its own logits, independently of the other
    masked positions (nothing is re-predicted after a mask has been filled).

    Uses the module-level ``tokenizer``, ``model``, ``device`` and
    ``clean_text``.

    Args:
        batch_sentences: List of strings that may contain the tokenizer's
            mask token.

    Returns:
        List of cleaned, decoded sentences, one per input, in input order.
        An empty input list yields an empty list.
    """
    if not batch_sentences:
        return []

    tokenized = tokenizer(
        batch_sentences, padding=True, truncation=True, return_tensors="pt"
    ).to(device)

    with torch.no_grad():
        logits = model(**tokenized).logits

    input_ids = tokenized["input_ids"]
    # Replace masked positions only; every other token keeps its input id.
    # Doing this on ids (instead of looping "while a mask string remains")
    # cannot hang when the top prediction for a position is the mask token.
    is_mask = input_ids == tokenizer.mask_token_id
    filled_ids = torch.where(is_mask, logits.argmax(dim=-1), input_ids)

    resolved_sentences = []
    for ids in filled_ids.tolist():
        decoded = tokenizer.decode(
            ids,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=True,
        )
        resolved_sentences.append(clean_text(decoded))

    return resolved_sentences


# Small hand-written batch used to sanity-check the batched mask filler.
example_sentences = [
    "People <mask> post add me on #Snapchat must be dehydrated. Cuz man. that's <mask>",
    "The weather is <mask> today.",
    "He bought a new <mask> for his birthday.",
    "<mask> is the capital of France.",
]

# Fill all masks of all sentences in one call.
filled_sentences = fill_multiple_masks_batch(example_sentences)

# Print each original next to its filled version, separated by a blank line.
for original, filled in zip(example_sentences, filled_sentences):
    print(f"Original: {original}")
    print(f"Filled: {filled}")
    print()

In [None]:
import os
import torch
import pandas as pd
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForMaskedLM

# Tokenizer, masked-LM and target device for filling the test split.
model_name = "twitter/twhin-bert-large"
device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForMaskedLM.from_pretrained(model_name).to(device)

dataset = "dataset"
output_file = os.path.join(dataset, "filled_test.csv")

# NOTE: despite its name, `df_train` holds the *test* split in this cell.
df_train = pd.read_csv(os.path.join(dataset, "test.csv"))

# Create the output CSV with only its header row, unless the file already exists.
if not os.path.exists(output_file):
    pd.DataFrame(columns=["text", "filled text"]).to_csv(output_file, index=False)


def clean_text(text):
    """
    Strip tokenizer artifacts from decoded text and normalize whitespace.

    Removes the literal markers "<s>", "</s>" and "<pad>", turns every "▁"
    word-boundary marker into a space, and collapses each run of whitespace
    into a single space with nothing leading or trailing.
    """
    for marker in ("<s>", "</s>", "<pad>"):
        text = text.replace(marker, "")
    # split() without arguments drops leading, trailing and repeated whitespace.
    words = text.replace("▁", " ").split()
    return " ".join(words)


def fill_multiple_masks_batch(batch_sentences):
    """
    Fill every mask token in a batch of sentences with the model's top prediction.

    All masks are resolved from a single forward pass: each masked position
    receives the arg-max token of its own logits, independently of the other
    masked positions (nothing is re-predicted after a mask has been filled).

    Uses the module-level ``tokenizer``, ``model``, ``device`` and
    ``clean_text``.

    Args:
        batch_sentences: List of strings that may contain the tokenizer's
            mask token.

    Returns:
        List of cleaned, decoded sentences, one per input, in input order.
        An empty input list yields an empty list.
    """
    if not batch_sentences:
        return []

    tokenized = tokenizer(
        batch_sentences, padding=True, truncation=True, return_tensors="pt"
    ).to(device)

    with torch.no_grad():
        logits = model(**tokenized).logits

    input_ids = tokenized["input_ids"]
    # Replace masked positions only; every other token keeps its input id.
    # Doing this on ids (instead of looping "while a mask string remains")
    # cannot hang when the top prediction for a position is the mask token.
    is_mask = input_ids == tokenizer.mask_token_id
    filled_ids = torch.where(is_mask, logits.argmax(dim=-1), input_ids)

    resolved_sentences = []
    for ids in filled_ids.tolist():
        decoded = tokenizer.decode(
            ids,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=True,
        )
        resolved_sentences.append(clean_text(decoded))

    return resolved_sentences


# Batch processing with tqdm: fill the masks of `batch_size` rows at a time and
# append each finished batch to the output CSV straight away.
batch_size = 100

for start_idx in tqdm(range(0, len(df_train), batch_size), desc="Processing batches"):
    # Slicing past the end is safe with iloc, so the last batch is simply shorter.
    batch = df_train.iloc[start_idx : start_idx + batch_size].copy()

    # Process each batch
    batch["filled text"] = fill_multiple_masks_batch(batch["text"].tolist())

    # Write exactly the two columns announced by the file's header row; dumping
    # every input column would misalign the rows against that header.
    # Rows stay in input order, so they can be matched back positionally.
    batch[["text", "filled text"]].to_csv(
        output_file, mode="a", index=False, header=False
    )
    torch.cuda.empty_cache()

# Note: The header is written only once at initialization. Because rows are
# appended, re-running this cell against an existing file duplicates its rows.

In [None]:
import os
import torch
import pandas as pd
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForMaskedLM

# Tokenizer, masked-LM and target device for filling the training split.
model_name = "twitter/twhin-bert-large"
device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForMaskedLM.from_pretrained(model_name).to(device)

dataset = "dataset"
output_file = os.path.join(dataset, "filled_train_emotion.csv")

# Training split; its emotion label is appended to each text as extra context.
df_train = pd.read_csv(os.path.join(dataset, "train.csv"))

# Fail early when the label column is missing.
if "emotion" not in df_train.columns:
    raise ValueError("Dataset must include an 'emotion' column.")

# Create the output CSV with only its header row, unless the file already exists.
if not os.path.exists(output_file):
    pd.DataFrame(columns=["Original", "Filled"]).to_csv(output_file, index=False)


def clean_text(text):
    """
    Strip tokenizer artifacts from decoded text and normalize whitespace.

    Removes the literal markers "<s>", "</s>" and "<pad>", turns every "▁"
    word-boundary marker into a space, and collapses each run of whitespace
    into a single space with nothing leading or trailing.
    """
    for marker in ("<s>", "</s>", "<pad>"):
        text = text.replace(marker, "")
    # split() without arguments drops leading, trailing and repeated whitespace.
    words = text.replace("▁", " ").split()
    return " ".join(words)


def fill_multiple_masks_batch(batch_sentences):
    """
    Fill every mask token in a batch of sentences with the model's top prediction.

    All masks are resolved from a single forward pass: each masked position
    receives the arg-max token of its own logits, independently of the other
    masked positions (nothing is re-predicted after a mask has been filled).

    Uses the module-level ``tokenizer``, ``model``, ``device`` and
    ``clean_text``.

    Args:
        batch_sentences: List of strings that may contain the tokenizer's
            mask token.

    Returns:
        List of cleaned, decoded sentences, one per input, in input order.
        An empty input list yields an empty list.
    """
    if not batch_sentences:
        return []

    tokenized = tokenizer(
        batch_sentences, padding=True, truncation=True, return_tensors="pt"
    ).to(device)

    with torch.no_grad():
        logits = model(**tokenized).logits

    input_ids = tokenized["input_ids"]
    # Replace masked positions only; every other token keeps its input id.
    # Doing this on ids (instead of looping "while a mask string remains")
    # cannot hang when the top prediction for a position is the mask token.
    is_mask = input_ids == tokenizer.mask_token_id
    filled_ids = torch.where(is_mask, logits.argmax(dim=-1), input_ids)

    resolved_sentences = []
    for ids in filled_ids.tolist():
        decoded = tokenizer.decode(
            ids,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=True,
        )
        resolved_sentences.append(clean_text(decoded))

    return resolved_sentences


def add_emotion_context(sentences, emotions):
    """
    Append each sentence's emotion label as a trailing "(Emotion: ...)" note.

    Sentences and emotions are paired positionally; when the two inputs differ
    in length, pairing stops at the shorter one.

    Returns:
        List of "<sentence> (Emotion: <emotion>)" strings.
    """
    tagged = []
    for sentence, emotion in zip(sentences, emotions):
        tagged.append(f"{sentence} (Emotion: {emotion})")
    return tagged


# Batch processing with tqdm: tag each text with its emotion, fill the masks,
# and append the (tagged original, filled) pairs to the output CSV.
batch_size = 100

for start_idx in tqdm(range(0, len(df_train), batch_size), desc="Processing batches"):
    # iloc tolerates a stop index past the end, so the last batch is just shorter.
    batch = df_train.iloc[start_idx : start_idx + batch_size]

    # Give the model the emotion label as extra context.
    contextualized_sentences = add_emotion_context(
        batch["text"].tolist(), batch["emotion"].tolist()
    )

    # Only the two columns announced by the file's header row are written.
    output_batch = pd.DataFrame(
        {
            "Original": contextualized_sentences,
            "Filled": fill_multiple_masks_batch(contextualized_sentences),
        }
    )

    # Append to CSV and clear GPU memory
    output_batch.to_csv(output_file, mode="a", index=False, header=False)
    torch.cuda.empty_cache()

In [None]:
import pandas as pd
import re


# Strip the "(emotion: <word>)" context note from a text.
def remove_emotion_tags(text):
    """
    Remove every "(emotion: <letters>)" tag (case-insensitive) from a string.

    Non-string values (for example NaN from an empty CSV cell) are returned
    unchanged. Only leading and trailing whitespace is trimmed afterwards;
    whitespace left inside the text by a removed tag is kept.
    """
    if not isinstance(text, str):
        return text
    tag = re.compile(r"\(emotion: [a-z]+\)", re.IGNORECASE)
    return tag.sub("", text).strip()


# Load the (tagged original, filled) pairs written by the mask-filling loop.
df_train = pd.read_csv("dataset/filled_train_emotion.csv")

# Derive the final text by removing the emotion context note from the filled text.
df_train["text"] = df_train["Filled"].apply(remove_emotion_tags)

In [None]:
# Keep only the cleaned "text" column. Rebinding the name (rather than dropping
# in place) together with errors="ignore" lets this cell be re-run without a
# KeyError once the columns are already gone.
df_train = df_train.drop(columns=["Original", "Filled"], errors="ignore")

In [None]:
# Persist the cleaned, filled texts. NOTE(review): "eilled" looks like a typo
# for "filled"; the cell that reads this file back uses the same spelling, so
# both must be renamed together.
df_train.to_csv("dataset/eilled_train.csv", index=False)

In [None]:
# Reload the original training split (replaces the filled-text frame bound to this name).
df_train = pd.read_csv("dataset/train.csv")

In [None]:
# Load the mask-filled texts saved earlier (file name spelled "eilled" on purpose
# to match the cell that wrote it).
df_filled = pd.read_csv("dataset/eilled_train.csv")

In [None]:
# Preview the original training rows.
df_train.head()

In [None]:
# Preview the mask-filled texts for comparison.
df_filled.head()

In [None]:
# Overwrite the training texts with their mask-filled versions. The assignment
# aligns on the index, so it assumes both frames list the same rows in the same
# order — TODO confirm the row counts match before relying on it.
df_train["text"] = df_filled["text"]

In [None]:
# Check that the text column now shows the filled sentences.
df_train.head()

In [None]:
# WARNING: this overwrites the original train.csv with the mask-filled texts;
# keep a backup of the raw file if the unfilled texts are still needed.
df_train.to_csv("dataset/train.csv", index=False)

In [None]:
# Token ids to inspect individually. NOTE(review): these ids are decoded with
# whichever `tokenizer` is currently loaded in the kernel — confirm it is the
# vocabulary the ids were produced with.
tokens = [
    0, 41552, 12105, 15698, 1649, 110, 2788, 3731, 328,
    18636, 15375, 50264, 849, 6968, 20042, 27740, 2,
]

# Print each id next to the text it decodes to.
for token in tokens:
    decoded_piece = tokenizer.decode(token)
    print(f"{token}: {decoded_piece}")

In [None]:
# Decode the whole id sequence at once to see the reconstructed text.
tokenizer.decode(tokens)

In [None]:
# Hand-made mapping from the eight emotion labels to a coarse sentiment.
emotion_sentiment = {
    "trust": "Positive",
    "surprise": "Neutral",  # Surprise can vary depending on context
    "anticipation": "Neutral",  # Anticipation can be positive or negative
    "sadness": "Negative",
    "fear": "Negative",
    "joy": "Positive",
    "anger": "Negative",
    "disgust": "Negative"
}

# Emotions missing from the mapping become NaN in the new column.
df_train["sentiment"] = df_train["emotion"].map(emotion_sentiment)

In [None]:
from sklearn import preprocessing

# Encode the three sentiment names as integers; the encoder is fitted on the
# fixed label list rather than on the data.
le = preprocessing.LabelEncoder()
le.fit(["Negative", "Neutral", "Positive"])
df_train["sentiment_label"] = le.transform(df_train["sentiment"])
df_train.head()

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from tqdm import tqdm
import pandas as pd  # Ensure pandas is imported for DataFrame handling

# Load the pretrained Twitter sentiment classifier and its tokenizer.
tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")
model = AutoModelForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
model.eval()

# Batch size
batch_size = 128  # Adjust based on your GPU memory capacity

# Tracking state
id_list = []  # tweet_ids whose predicted sentiment differs from the mapped one
count = 0  # rows where prediction and mapped sentiment agree
step = 0  # rows scored so far

# tqdm reads the number of batches from range() itself; a hand-computed
# `n // batch_size + 1` is one too many when n is a multiple of batch_size.
pbar = tqdm(range(0, df_train.shape[0], batch_size))

# Process data in batches
with torch.no_grad():
    for start_idx in pbar:
        batch = df_train.iloc[start_idx : start_idx + batch_size]

        # Tokenize the batch
        inputs = tokenizer(
            batch["text"].tolist(),
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512,
        )
        inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

        # Highest-scoring class per row.
        predicted_sentiments = model(**inputs).logits.argmax(dim=1).cpu().numpy()

        # Encode the whole batch of reference sentiments in one call.
        # NOTE(review): comparing these ids with the model's class ids assumes
        # the model orders its classes negative, neutral, positive — verify
        # against model.config.id2label.
        true_labels = le.transform(batch["sentiment"])
        tweet_ids = batch["tweet_id"].tolist()

        for pred, true, tweet_id in zip(predicted_sentiments, true_labels, tweet_ids):
            step += 1
            if pred != true:
                id_list.append(tweet_id)
            else:
                count += 1

        # Update progress bar
        pbar.set_description(f"Accuracy: {count / step:.2f}")

# Final accuracy (guarded so an empty frame does not divide by zero).
if step:
    print(f"Accuracy: {count / step:.2f}")
else:
    print("Accuracy: n/a (no rows scored)")


In [None]:
# Pager over the misclassified tweets: every execution of this cell shows the
# next `batch_size` entries of `id_list`. The position is kept in the kernel
# global `batch_index`, so the output depends on how often the cell has run.

# Define a batch size
batch_size = 5

# Initialize or retrieve the batch index
if 'batch_index' not in globals():
    batch_index = 0

# Calculate start and end indices for the current batch
start_index = batch_index * batch_size
end_index = start_index + batch_size

# Retrieve the tweet_ids for the current batch (empty once the list is exhausted)
current_batch = id_list[start_index:end_index]

# Display the rows for the current batch of tweet_ids
for tweet_id in current_batch:
    display(df_train[df_train["tweet_id"] == tweet_id])

# Update the batch index for the next run
batch_index += 1


In [None]:
import pandas as pd
import os
from tqdm import tqdm

# Directory holding the CSV files.
dataset = "dataset"

# Reload the training split from disk.
df_train = pd.read_csv(os.path.join(dataset, "train.csv"))

In [None]:
import torch


class EmotionClassifier(torch.nn.Module):
    """
    Wrapper that runs a wrapped model and applies dropout to its logits.

    Args:
        model: Module called with ``input_ids`` and ``attention_mask`` whose
            output exposes a ``logits`` attribute.
        num_emotions: Output width of the ``classifier`` layer.
    """

    def __init__(self, model, num_emotions=8):
        super().__init__()
        self.model = model
        self.dropout = torch.nn.Dropout(p=0.3)
        # Registered as a submodule but never called in forward().
        self.classifier = torch.nn.Linear(in_features=768, out_features=num_emotions)

    def forward(self, input_ids, attention_mask, token_type_ids=None):
        """Return the wrapped model's logits after dropout.

        ``token_type_ids`` is accepted but not passed on to the wrapped model.
        """
        wrapped_output = self.model(input_ids=input_ids, attention_mask=attention_mask)
        return self.dropout(wrapped_output.logits)

In [None]:
# Path of the saved checkpoint to evaluate.
load_model = "model/model_epoch_5R.ckpt"

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Restore the checkpoint onto the available device and load the matching tokenizer.
# torch.load unpickles the file, so only load checkpoints from a trusted source.
# NOTE(review): the result is used as a callable model below, so the file
# presumably holds a whole pickled module (not a state dict) — confirm; if so,
# its class must already be defined in the kernel, and PyTorch versions that
# default to weights_only=True will need that flag set explicitly.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = torch.load(load_model, map_location=device)
model = model.to(device)
tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-large")

In [None]:
batch_size = 128  # Define your batch size

model.eval()  # Set model to evaluation mode

# tweet_ids of the rows the loaded model gets wrong (initialised once).
id_list = []

# tqdm reads the number of batches from range() itself; a hand-computed
# `n // batch_size + 1` is one too many when n is a multiple of batch_size.
pbar = tqdm(range(0, df_train.shape[0], batch_size))

# Process data in batches
with torch.no_grad():
    for start_idx in pbar:
        batch = df_train.iloc[start_idx : start_idx + batch_size]

        # Tokenize the batch
        inputs = tokenizer(
            batch["text"].tolist(),
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512,
        )
        inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

        # The loaded model returns the logits tensor directly.
        logits = model(**inputs)
        predicted_sentiments = logits.argmax(dim=1).cpu().numpy()

        # Collect the ids of rows whose predicted class differs from "label".
        true_sentiments = batch["label"].tolist()
        tweet_ids = batch["tweet_id"].tolist()
        id_list.extend(
            tweet_id
            for pred, true, tweet_id in zip(predicted_sentiments, true_sentiments, tweet_ids)
            if pred != true
        )

In [None]:
# Reload the training split from disk before analysing the misclassified rows.
df_train = pd.read_csv(os.path.join(dataset, "train.csv"))

In [None]:
import matplotlib.pyplot as plt

# Rows of the training set whose tweet_id was recorded as misclassified.
is_misclassified = df_train['tweet_id'].isin(id_list)
misclassified_df = df_train[is_misclassified]

# Number of misclassified rows per distinct 'score' value.
data = misclassified_df['score'].value_counts()

fig, ax = plt.subplots(figsize=(20, 6))
ax.bar(data.index, data.values)
ax.set_title('Misclassified Scores')
ax.set_xlabel('Score')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Display the misclassified rows.
misclassified_df

In [None]:
# Display the training frame for comparison.
df_train

In [None]:
import re

def clean_text(text):
    """
    Normalize a raw tweet for modelling.

    Steps, applied in this order:
      1. every "http..." run of non-space characters becomes "[URL]";
      2. every "@..." run of non-space characters becomes "[USER]";
      3. repeated "<LH>" markers (and the whitespace after them) collapse
         into a single "<LH>";
      4. every run of whitespace becomes one space;
      5. a punctuation mark from "!?.,;:" repeated back-to-back is reduced
         to a single occurrence.

    Case is left untouched.
    """
    text = re.sub(r"http\S+", "[URL]", text)
    # Placeholder spelling corrected from the earlier misspelling "[UESR]".
    text = re.sub(r"@\S+", "[USER]", text)
    text = re.sub(r"(<LH>\s*)+", "<LH>", text)
    text = re.sub(r"\s+", " ", text)
    text = re.sub(r"([!?.,;:])\1+", r"\1", text)
    return text

In [None]:
# Quick check of whitespace and repeated-punctuation collapsing.
text = "Hello      world!!!!???"
print(clean_text(text))

In [None]:
# Random seed used when sampling rows for the embedding visualisation.
seed = 100

In [None]:
from transformers import RobertaModel, RobertaTokenizer
import torch
import pandas as pd
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from tqdm import tqdm

# RoBERTa encoder and tokenizer used to embed the texts, placed on the GPU
# when one is available.
model_name = "roberta-base"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = RobertaTokenizer.from_pretrained(model_name)
model = RobertaModel.from_pretrained(model_name).to(device)


# Embed texts in chunks and keep the first-position hidden state of each.
def get_cls_embeddings_batched(texts, batch_size=16):
    """
    Return one embedding row per text, taken at sequence position 0.

    The texts are processed in consecutive chunks of ``batch_size``. Each chunk
    is tokenized (padded, truncated to 128 tokens), moved to ``device`` and run
    through the module-level ``model`` without gradient tracking.

    Args:
        texts: List of strings to embed.
        batch_size: Number of texts per forward pass.

    Returns:
        Tensor of all chunk results concatenated along dimension 0. It is not
        moved off the device the model ran on.
    """
    collected = []
    chunk_starts = range(0, len(texts), batch_size)
    for start in tqdm(chunk_starts, desc="Processing Batches"):
        encoded = tokenizer(
            texts[start : start + batch_size],
            padding=True,
            truncation=True,
            max_length=128,
            return_tensors="pt",
        )
        encoded = {name: tensor.to(device) for name, tensor in encoded.items()}
        with torch.no_grad():
            hidden_states = model(**encoded).last_hidden_state
        # Position 0 of every sequence in the chunk.
        collected.append(hidden_states[:, 0, :])
    return torch.cat(collected, dim=0)


# Load data
dataset = "dataset"
df_train = pd.read_csv(f"{dataset}/train.csv")
df_test = pd.read_csv(f"{dataset}/test.csv")

# Sample a smaller subset for visualization: 5% of train and 10% of test,
# both drawn with the `seed` defined in an earlier cell.
train_sampled = df_train.sample(frac=0.05, random_state=seed)
test_sampled = df_test.sample(frac=0.1, random_state=seed)

train_texts = train_sampled["text"].tolist()
test_texts = test_sampled["text"].tolist()

# Compute embeddings for the sampled data
train_embeddings = get_cls_embeddings_batched(train_texts, batch_size=16)
test_embeddings = get_cls_embeddings_batched(test_texts, batch_size=16)

# Combine embeddings for PCA: train rows first, then test rows, as a NumPy array.
combined_embeddings = (
    torch.cat([train_embeddings, test_embeddings], dim=0).cpu().numpy()
)

In [None]:
# Project the stacked train+test embeddings onto their first two principal components.
pca = PCA(n_components=2)
reduced_embeddings = pca.fit_transform(combined_embeddings)

# Train rows come first in the stacked array, test rows after them.
n_train = len(train_texts)
splits = (
    ("Train", reduced_embeddings[:n_train]),
    ("Test", reduced_embeddings[n_train:]),
)

fig, ax = plt.subplots(figsize=(12, 9))
for split_label, points in splits:
    ax.scatter(
        points[:, 0],
        points[:, 1],
        label=split_label,
        alpha=0.5,
        s=10,  # Smaller dot size
    )
ax.legend()
ax.set_title("Train vs. Test Embedding Visualization")
ax.set_xlabel("PCA Component 1")
ax.set_ylabel("PCA Component 2")
plt.show()
torch.cuda.empty_cache()

In [None]:
from transformers import RobertaModel, RobertaTokenizer
import torch
import pandas as pd
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from tqdm import tqdm

# RoBERTa encoder and tokenizer used to embed the texts, placed on the GPU
# when one is available.
model_name = "roberta-base"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = RobertaTokenizer.from_pretrained(model_name)
model = RobertaModel.from_pretrained(model_name).to(device)


# Embed texts in chunks and keep the first-position hidden state of each.
def get_cls_embeddings_batched(texts, batch_size=16):
    """
    Return one embedding row per text, taken at sequence position 0.

    The texts are processed in consecutive chunks of ``batch_size``. Each chunk
    is tokenized (padded, truncated to 128 tokens), moved to ``device`` and run
    through the module-level ``model`` without gradient tracking.

    Args:
        texts: List of strings to embed.
        batch_size: Number of texts per forward pass.

    Returns:
        Tensor of all chunk results concatenated along dimension 0. It is not
        moved off the device the model ran on.
    """
    collected = []
    chunk_starts = range(0, len(texts), batch_size)
    for start in tqdm(chunk_starts, desc="Processing Batches"):
        encoded = tokenizer(
            texts[start : start + batch_size],
            padding=True,
            truncation=True,
            max_length=128,
            return_tensors="pt",
        )
        encoded = {name: tensor.to(device) for name, tensor in encoded.items()}
        with torch.no_grad():
            hidden_states = model(**encoded).last_hidden_state
        # Position 0 of every sequence in the chunk.
        collected.append(hidden_states[:, 0, :])
    return torch.cat(collected, dim=0)


# Load data
dataset = "dataset"
df_train = pd.read_csv(f"{dataset}/train.csv")
df_test = pd.read_csv(f"{dataset}/test.csv")

# Seed for reproducibility (rebinds the `seed` set in an earlier cell)
seed = 42

# Sample a smaller subset for visualization: 5% of train, 10% of test
train_sampled = df_train.sample(frac=0.05, random_state=seed)
test_sampled = df_test.sample(frac=0.1, random_state=seed)

train_texts = train_sampled["text"].tolist()
train_emotions = train_sampled["emotion"].tolist()  # Extract corresponding emotions
test_texts = test_sampled["text"].tolist()

# Compute embeddings for the sampled data (only the train sample is embedded here)
train_embeddings = get_cls_embeddings_batched(train_texts, batch_size=16)

In [None]:
# PCA on train embeddings
pca = PCA(n_components=2)
train_embeddings_pca = pca.fit_transform(train_embeddings.cpu().numpy())

# Plot train embeddings by emotion class.
# Emotions are sorted so that the colour given to each class and the legend
# order are the same on every run; iterating a bare set of strings is not
# stable across kernel restarts.
plt.figure(figsize=(10, 8))
unique_emotions = sorted(set(train_emotions))
for emotion in unique_emotions:
    # Row positions of the samples carrying this emotion.
    indices = [i for i, e in enumerate(train_emotions) if e == emotion]
    plt.scatter(
        train_embeddings_pca[indices, 0],
        train_embeddings_pca[indices, 1],
        label=emotion,
        alpha=0.7,
        s=10,  # Adjust marker size
    )

plt.legend(title="Emotion")
plt.title("Train Embeddings Visualization by Emotion")
plt.xlabel("PCA Component 1")
plt.ylabel("PCA Component 2")
plt.show()

In [None]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

# Apply t-SNE to train embeddings.
# The iteration count is left at the library default (1000): the keyword for it
# was renamed from `n_iter` to `max_iter` in newer scikit-learn releases, so
# passing neither keeps this cell working on old and new versions alike.
tsne = TSNE(n_components=2, perplexity=30, random_state=42)
train_embeddings_tsne = tsne.fit_transform(train_embeddings.cpu().numpy())

In [None]:
# Plot train embeddings by emotion class.
# Emotions are sorted so that the colour given to each class and the legend
# order are the same on every run; iterating a bare set of strings is not
# stable across kernel restarts.
plt.figure(figsize=(10, 8))
unique_emotions = sorted(set(train_emotions))

for emotion in unique_emotions:
    # "joy" is deliberately left out of this figure.
    # NOTE(review): presumably because it would dominate the plot — confirm.
    if emotion == "joy":
        continue
    # Row positions of the samples carrying this emotion.
    indices = [i for i, e in enumerate(train_emotions) if e == emotion]
    plt.scatter(
        train_embeddings_tsne[indices, 0],
        train_embeddings_tsne[indices, 1],
        label=emotion,
        alpha=0.3,
        s=3,  # Adjust marker size
    )

plt.legend(title="Emotion")
plt.title("Train Embeddings Visualization by Emotion (t-SNE)")
plt.xlabel("t-SNE Component 1")
plt.ylabel("t-SNE Component 2")
plt.show()

In [None]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

# Apply t-SNE with 3 components.
# The iteration count is left at the library default (1000): the keyword for it
# was renamed from `n_iter` to `max_iter` in newer scikit-learn releases, so
# passing neither keeps this cell working on old and new versions alike.
tsne = TSNE(n_components=3, perplexity=30, random_state=42)
train_embeddings_tsne_3d = tsne.fit_transform(train_embeddings.cpu().numpy())

In [None]:
# Plot train embeddings by emotion class in 3D from different angles
fig = plt.figure(figsize=(18, 6))  # Wide figure to accommodate multiple subplots

angles = [(30, 45), (30, 135), (30, 225)]  # Three angles: (elev, azim)
titles = ["Angle 1 (30, 45)", "Angle 2 (30, 135)", "Angle 3 (30, 225)"]

# Sorted so that colours and legend order are identical on every run; iterating
# a bare set of strings is not stable across kernel restarts.
unique_emotions = sorted(set(train_emotions))

# The row positions per emotion do not depend on the view angle, so they are
# computed once here instead of once per subplot.
emotion_indices = {
    emotion: [j for j, e in enumerate(train_emotions) if e == emotion]
    for emotion in unique_emotions
}

for i, angle in enumerate(angles):
    ax = fig.add_subplot(1, 3, i + 1, projection="3d")  # 3D subplot
    for emotion in unique_emotions:
        indices = emotion_indices[emotion]
        ax.scatter(
            train_embeddings_tsne_3d[indices, 0],
            train_embeddings_tsne_3d[indices, 1],
            train_embeddings_tsne_3d[indices, 2],
            label=emotion,
            alpha=0.6,
            s=3,
        )
    ax.set_title(titles[i])
    ax.set_xlabel("t-SNE Component 1")
    ax.set_ylabel("t-SNE Component 2")
    ax.set_zlabel("t-SNE Component 3")
    ax.view_init(elev=angle[0], azim=angle[1])  # Set elevation and azimuth angles

# Add legend to the last subplot only (to avoid repetition)
ax.legend(title="Emotion", bbox_to_anchor=(1.1, 0.5), loc="center left")

plt.tight_layout()
plt.show()

In [None]:
import pandas as pd

# NOTE(review): absolute, machine-specific path — this cell only runs on the
# machine that has this directory; consider a path relative to the project.
file = "/home/S113062628/project/Data Mining/DM-Autumn-2024-Lab-2/DM2024-Lab2-Homework/submission.csv"
df = pd.read_csv(file)

# WARNING: replaces every value in the "emotion" column with "joy" and writes
# the result back over the same file, discarding whatever it held before.
df["emotion"] = "joy"
df.to_csv(file, index=False)