In [None]:
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from transformers import (BertTokenizerFast,TFBertTokenizer,BertTokenizer,RobertaTokenizerFast,
                          DataCollatorWithPadding,TFRobertaForSequenceClassification,TFBertForSequenceClassification,
                          TFBertModel,create_optimizer)
from transformers import AutoTokenizer, AutoModel

In [None]:
# Load the raw tweet dump. Every non-empty line of the file is parsed as one
# JSON document. Blank lines are skipped so a stray or trailing newline cannot
# crash json.loads. No explicit close() is needed: the `with` block closes the
# file on exit. The encoding is stated explicitly so decoding does not depend
# on the platform default.
with open('/kaggle/input/dm-2024-isa-5810-lab-2-homework/tweets_DM.json', 'r', encoding='utf-8') as f:
    data = [json.loads(line) for line in f if line.strip()]

In [None]:
def _flatten_record(record):
    """Project one raw record onto the flat set of columns kept for modelling.

    Top-level metadata keys are read directly (a missing key raises KeyError);
    the tweet fields under record["_source"]["tweet"] are read with .get(), so
    a missing "hashtags" becomes [] and a missing id/text becomes None.
    """
    tweet = record["_source"]["tweet"]
    return {
        "_score": record["_score"],
        "_index": record["_index"],
        "_crawldate": record["_crawldate"],
        "_type": record["_type"],
        "hashtags": tweet.get("hashtags", []),
        "tweet_id": tweet.get("tweet_id"),
        "text": tweet.get("text"),
    }


# One flat dict per raw record, then a single DataFrame built from all of them.
processed_data = list(map(_flatten_record, data))
tweets = pd.DataFrame(processed_data)

print(tweets)

In [None]:
# Locations of the label table and the train/test identification table.
emotion_path = '/kaggle/input/dm-2024-isa-5810-lab-2-homework/emotion.csv'
data_identification_path = '/kaggle/input/dm-2024-isa-5810-lab-2-homework/data_identification.csv'

# Read both CSV files in the same order as their paths are listed.
emotion_df, data_identification_df = (
    pd.read_csv(csv_path) for csv_path in (emotion_path, data_identification_path)
)

# Quick size check of the three tables that get joined next.
for _frame in (emotion_df, data_identification_df, tweets):
    print(_frame.shape)

In [None]:
# --- Training frame -------------------------------------------------------
# Rows flagged 'train' are inner-joined with their emotion label and then with
# the tweet content, all on 'tweet_id'. Inner joins drop ids missing from
# either side.
_train_rows = data_identification_df[data_identification_df['identification'] == 'train']
merged_df = _train_rows.merge(emotion_df, on='tweet_id', how='inner')
train_df = merged_df.merge(tweets, on='tweet_id', how='inner')
columns_to_keep = ['tweet_id', 'emotion', '_score', 'hashtags', 'text']
# .copy() makes this an independent frame, so later column assignments on it
# are unambiguous (no chained-assignment warnings).
train_df = train_df[columns_to_keep].copy()
text = train_df['text']
emotion = train_df['emotion']

# --- Test frame -----------------------------------------------------------
# Rows flagged 'test' have no emotion label, so they are only joined with the
# tweet content.
_test_rows = data_identification_df[data_identification_df['identification'] == 'test']
test_df = _test_rows.merge(tweets, on='tweet_id', how='inner')
columns_to_keep = ['tweet_id', '_score', 'hashtags', 'text']
test_df = test_df[columns_to_keep].copy()

# Each frame is shown once (previously both were printed twice).
print(train_df)
print(test_df)

In [None]:
import pandas as pd

# Down-sample the training frame to roughly TRAIN_SAMPLE_SIZE rows, drawing the
# same fraction from every emotion so the class proportions are preserved.
TRAIN_SAMPLE_SIZE = 700_000
SAMPLE_SEED = 42  # fixed seed so the same rows are drawn on every run

# Clamp to 1.0: sampling without replacement raises for frac > 1, which would
# happen if the frame had fewer than TRAIN_SAMPLE_SIZE rows (for example when
# this cell is re-run on the already-sampled frame).
_sample_frac = min(1.0, TRAIN_SAMPLE_SIZE / len(train_df))

train_df = (
    train_df
    .groupby('emotion', group_keys=False)
    .sample(frac=_sample_frac, random_state=SAMPLE_SEED)
    .reset_index(drop=True)
)


In [None]:
# Labels and raw tweet text for the hold-out split.
y_train_data = train_df['emotion']
X_train_data = train_df['text']

# 80/20 train/validation split.
# - random_state pins the shuffle so results can be reproduced.
# - stratify keeps the emotion distribution the same in both parts.
x_train, x_val, y_train, y_val = train_test_split(
    X_train_data,
    y_train_data,
    test_size=0.2,
    random_state=42,
    stratify=y_train_data,
)

In [None]:
from transformers import BertTokenizer
import re
import emoji
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# Define cleaning function
def clean_tweet(text, emoji_dict):
    """Normalise one raw tweet into plain, whitespace-separated text.

    Steps, in order:
      1. Emojis listed in `emoji_dict` are replaced by their keyword, padded
         with spaces so the keyword never fuses with a neighbouring word.
         Longer emoji sequences are replaced first, so multi-code-point
         variants (e.g. skin-tone forms) match before their base emoji.
      2. Any remaining emoji is dropped.
      3. `<LH>` placeholders and `@mention` handles are removed.
      4. Every character that is neither a word character nor whitespace is
         removed. This also strips the square brackets around the keywords
         inserted in step 1, so '[joy]' ends up as 'joy'.
      5. Runs of whitespace are collapsed to one space and the ends trimmed.

    Args:
        text: the raw tweet. Anything that is not a `str` (None, NaN, ...) is
            treated as an empty tweet.
        emoji_dict: mapping of emoji string -> replacement keyword.

    Returns:
        The cleaned text as a `str` (possibly empty).
    """
    if not isinstance(text, str):
        return ''

    # Longest keys first; sorted() is stable, so equal-length keys keep the
    # dictionary's own order.
    for emj in sorted(emoji_dict, key=len, reverse=True):
        text = text.replace(emj, ' ' + emoji_dict[emj] + ' ')

    # Remove remaining emojis
    text = emoji.replace_emoji(text, replace='')
    # Remove <LH> tags
    text = re.sub(r'<LH>', '', text)
    # Remove characters starting with @ (e.g., @username)
    text = re.sub(r'@\w+', '', text)
    # Remove punctuation marks
    text = re.sub(r'[^\w\s]', '', text)
    # Collapse internal whitespace runs and trim both ends
    return re.sub(r'\s+', ' ', text).strip()

# Emoji -> keyword table, listed as ordered pairs. The order is kept as-is in
# the resulting dictionary. Several emojis intentionally share one keyword.
_EMOJI_KEYWORD_PAIRS = (
    ('😂', '[joy]'),
    ('❤️', '[love]'),
    ('😍', '[adoration]'),
    ('😭', '[cry]'),
    ('❤', '[care]'),
    ('😊', '[happy]'),
    ('🙏', '[pray]'),
    ('😘', '[kiss]'),
    ('💕', '[love_each_other]'),
    ('🔥', '[fire]'),
    ('😩', '[weary]'),
    ('🤔', '[think]'),
    ('💯', '[perfect]'),
    ('💙', '[loyalty]'),
    ('🙄', '[annoyed]'),
    ('😁', '[happy]'),
    ('🙌', '[celebrate]'),
    ('🙏🏾', '[pray]'),
    ('👍', '[approve]'),
    ('🙏🏽', '[pray]'),
)
emoji_dict = dict(_EMOJI_KEYWORD_PAIRS)

tokenizer = BertTokenizer.from_pretrained('bert-base-cased')


def _clean_all(texts):
    """Run clean_tweet with the shared emoji table over every text; returns a list."""
    return [clean_tweet(raw, emoji_dict) for raw in texts]


# Clean the text of the test frame and of both splits.
# Note: despite its name, 'tokenized_text' holds the cleaned strings returned
# by clean_tweet; no tokenizer has been applied to it here.
test_df['tokenized_text'] = _clean_all(test_df['text'])
x_train = _clean_all(x_train)
x_val = _clean_all(x_val)

# Encode the cleaned training and validation text with identical settings.
_encode_options = dict(truncation=True, padding=True, max_length=64)
x_train_encoding = tokenizer(x_train, **_encode_options)
x_val_encoding = tokenizer(x_val, **_encode_options)


In [None]:
from sklearn.preprocessing import LabelEncoder
# Learn the emotion -> integer mapping from the training labels ONLY, then
# reuse that mapping for the validation labels. Calling fit_transform on the
# validation labels would refit the encoder on them, and the refitted encoder
# is the one later used to decode predictions.
# transform() raises if the validation split contains a label that was never
# seen during fitting.
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_val_encoded = label_encoder.transform(y_val)

In [None]:
from torch.utils.data import Dataset
import torch
class NewsDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with integer labels.

    `encodings` must expose `.items()` yielding (field name, indexable
    sequence) pairs; `labels` must be indexable and support `len()`.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The dataset size is defined by the number of labels.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample `idx` as a dict of tensors: one entry per encoding
        field plus a 'labels' entry holding the label cast to int."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(int(self.labels[idx]))
        return sample

# Wrap both splits in NewsDataset objects.
# NOTE: `test_dataset` is built from the *validation* encodings and labels
# (x_val_encoding / y_val_encoded), not from the competition test set.
train_dataset, test_dataset = (
    NewsDataset(split_encodings, split_labels)
    for split_encodings, split_labels in (
        (x_train_encoding, y_train_encoded),
        (x_val_encoding, y_val_encoded),
    )
)


In [None]:
from torch.utils.data import DataLoader
from torch.optim import AdamW  # replaces the deprecated transformers.AdamW
from transformers import BertForSequenceClassification, get_linear_schedule_with_warmup

# Number of passes over the training data. This must equal the number of
# epochs run by the training loop (4), because the linear schedule below
# decays the learning rate to zero over exactly `total_steps` optimizer steps.
# Sizing it for a single epoch would leave later epochs training at LR 0.
NUM_EPOCHS = 4

# Seed torch so the randomly initialised classification head and the shuffling
# of the training loader are repeatable.
torch.manual_seed(42)

# num_labels=8 is presumably the number of distinct emotion classes;
# confirm against label_encoder.classes_.
model = BertForSequenceClassification.from_pretrained('bert-base-cased', num_labels=8)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
# Evaluation does not benefit from shuffling; a fixed order keeps it deterministic.
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# eps and weight_decay are set explicitly to the defaults of the deprecated
# transformers.AdamW, so the optimisation behaviour is unchanged by the switch
# to torch.optim.AdamW (whose own defaults differ).
optim = AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)
total_steps = len(train_loader) * NUM_EPOCHS
scheduler = get_linear_schedule_with_warmup(optim,
                                            num_warmup_steps = 0,
                                            num_training_steps = total_steps)

In [None]:
def flat_accuracy(preds, labels):
    """Fraction of samples whose highest-scoring class equals the true label.

    Args:
        preds: array of per-class scores; the arg-max is taken along axis 1.
        labels: array of true class indices; flattened to 1-D before comparing.

    Returns:
        Number of matching positions divided by the number of labels.
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    true_classes = labels.flatten()
    n_correct = (predicted_classes == true_classes).sum()
    return n_correct / len(true_classes)

In [None]:
from tqdm import tqdm  # Used to display progress bars

# Training function
def train():
    """Run one optimisation pass over `train_loader`.

    Relies on module-level state: `model`, `train_loader`, `optim`,
    `scheduler`, `device`, `train_losses`, and `epoch` (used only in the log
    messages). Appends the mean per-batch loss of this pass to `train_losses`.
    """
    model.train()
    n_batches = len(train_loader)
    running_loss = 0

    # tqdm renders the progress bar; enumerate supplies the 1-based step counter
    progress = tqdm(train_loader, desc="Training", total=n_batches)
    for step, batch in enumerate(progress, start=1):
        # Clear gradients left over from the previous step
        optim.zero_grad()

        # Forward pass; passing labels makes the model return the loss first
        outputs = model(
            batch['input_ids'].to(device),
            attention_mask=batch['attention_mask'].to(device),
            labels=batch['labels'].to(device),
        )
        loss = outputs[0]
        running_loss += loss.item()

        # Backward pass, with the gradient norm clipped to 1.0
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Update the weights, then advance the learning-rate schedule
        optim.step()
        scheduler.step()

        # Log every 100 steps
        if step % 100 == 0:
            print("Epoch: %d, Iteration: %d, Loss: %.4f, %.2f%%" % (epoch,
                            step, loss.item(), step/n_batches*100))

    avg_train_loss = running_loss / n_batches
    train_losses.append(avg_train_loss)  # kept for the loss plot
    print("Epoch: %d, Average training loss: %.4f" % (epoch, avg_train_loss))

# Validation function
def validation(dataloader=None):
    """Evaluate the model on a labelled loader and report accuracy and loss.

    Args:
        dataloader: loader to evaluate on. Defaults to the module-level
            `test_dataloader`, looked up at call time. Pass a loader
            explicitly to stay independent of later rebinding of that name.

    Accuracy is computed over samples (correct predictions / total samples),
    so a smaller final batch is not over-weighted. The loss stays a mean of
    per-batch losses, which keeps it comparable with the average training
    loss. The average loss is appended to the module-level `val_losses`.
    An empty loader yields NaN for both metrics instead of raising.
    """
    loader = test_dataloader if dataloader is None else dataloader
    model.eval()
    n_correct = 0.0
    n_samples = 0
    total_eval_loss = 0
    n_batches = len(loader)

    for batch in tqdm(loader, desc="Validating", total=n_batches):
        with torch.no_grad():
            # Forward pass
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)

        loss = outputs[0]
        logits = outputs[1]

        # Calculate total loss
        total_eval_loss += loss.item()
        logits = logits.detach().cpu().numpy()  # Move logits to CPU and convert to numpy array
        label_ids = labels.to('cpu').numpy()  # Move true labels to CPU and convert to numpy array

        # flat_accuracy returns a fraction; scale it back to a count so every
        # sample carries equal weight regardless of batch size.
        batch_size = len(label_ids)
        n_correct += flat_accuracy(logits, label_ids) * batch_size
        n_samples += batch_size

    avg_val_accuracy = n_correct / n_samples if n_samples else float('nan')
    avg_val_loss = total_eval_loss / n_batches if n_batches else float('nan')

    # Save validation loss
    val_losses.append(avg_val_loss)

    print("Validation Accuracy: %.4f" % (avg_val_accuracy))
    print("Average Validation Loss: %.4f" % (avg_val_loss))
    print("-------------------------------")

# Per-epoch loss histories, filled in by train() and validation()
train_losses, val_losses = [], []

# Alternate one training pass and one validation pass per epoch.
# `epoch` is deliberately a module-level name: train() reads it for logging.
for epoch in range(4):
    print("------------Epoch: %d ----------------" % epoch)
    train()
    validation()


import matplotlib.pyplot as plt

# Draw both loss curves on the same axes
for _curve, _curve_label in ((train_losses, "Train Loss"), (val_losses, "Validation Loss")):
    plt.plot(_curve, label=_curve_label)
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.title('Training and Validation Losses')
plt.show()


In [None]:
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm  # Import tqdm progress bar library

# Model prediction function
def predict():
    """Return the predicted class index for every sample in `test_dataloader`.

    Uses the module-level `model`, `device`, `test_dataloader` and `test_df`
    (the latter only to size the progress bar). Runs with gradients disabled
    and the model in eval mode. Predictions are collected in loader order.
    """
    model.eval()
    collected = []

    # Gradients are disabled for the whole loop; the progress bar counts
    # samples (not batches) against the number of rows in test_df.
    with torch.no_grad(), tqdm(total=len(test_df), desc="Predicting", ncols=100) as pbar:
        for batch in test_dataloader:
            # Forward pass without labels: the first output is the logits
            logits = model(
                batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
            )[0]

            # The predicted class is the index of the largest logit
            batch_predictions = logits.argmax(dim=-1).cpu().numpy()

            collected.extend(batch_predictions)
            pbar.update(len(batch_predictions))

    return collected

# Encode the cleaned test text with the same settings used for training
test_encodings = tokenizer(
    test_df['tokenized_text'].tolist(),
    truncation=True,
    padding=True,
    max_length=64,
)

# NewsDataset requires labels; the test set has none, so zeros act as placeholders
_placeholder_labels = [0] * len(test_df)
test_dataset = NewsDataset(test_encodings, _placeholder_labels)
# shuffle=False keeps predictions aligned with the row order of test_df
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# Predict class indices, then map them back to the original emotion names
predicted_labels = label_encoder.inverse_transform(predict())

# Attach the predictions to the test frame
test_df['predicted_labels'] = predicted_labels

# Build the two-column submission table and write it out as CSV
_submission_columns = {
    'tweet_id': 'id',
    'predicted_labels': 'emotion'
}
submission = test_df[list(_submission_columns)].rename(columns=_submission_columns)
submission.to_csv('/kaggle/working/submission.csv', index=False)


In [None]:
# Show the submission table (columns 'id' and 'emotion') as the cell output
submission