# In [None]:
# EI Assessment Tool: Data Preprocessing, Training, and Evaluation
# Requires: pandas, numpy, nltk, transformers, scikit-learn, torch, datasets, joblib
# Dataset: GoEmotions (place goemotions_1.csv, goemotions_2.csv in ./data/)

import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from transformers import BertTokenizer, BertForSequenceClassification, BertModel, Trainer, TrainingArguments
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score
import numpy as np
import torch
from datasets import Dataset

# Download NLTK resources
# 'stopwords' backs the stop-word set below; the Punkt data backs word_tokenize.
nltk.download('stopwords')
nltk.download('punkt')
# Newer NLTK releases make word_tokenize load its Punkt data from the separate
# 'punkt_tab' resource; with only 'punkt' present it raises LookupError there.
# Requesting it on an older release is harmless.
nltk.download('punkt_tab')
# A set gives O(1) membership tests when filtering tokens.
stop_words = set(stopwords.words('english'))

# Load dataset
# The first CSV is used as the training split, the second as the test split.
# NOTE(review): later cells read the 'text' and 'emotions' columns from both
# frames -- confirm the CSV files actually provide columns with those names.
train_path = 'data/goemotions_1.csv'
test_path = 'data/goemotions_2.csv'
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

# Text cleaning
# Compiled once at import time instead of being looked up on every row.
_URL_RE = re.compile(r'http\S+')
_PUNCT_RE = re.compile(r'[^\w\s]')


def clean_text(text):
    """Normalise one raw comment before it is fed to the models.

    Steps, in order: strip ``http...`` URLs, drop every character that is
    neither a word character nor whitespace, lowercase, tokenize with NLTK
    and discard English stop words.

    Args:
        text: Raw comment text. Non-string values (for example the NaN that
            pandas produces for an empty CSV cell, or None) are treated as
            empty text instead of raising ``TypeError``.

    Returns:
        The surviving tokens joined by single spaces; ``''`` when nothing
        is left or the input was not a string.
    """
    if not isinstance(text, str):
        return ''
    text = _URL_RE.sub('', text)  # Remove URLs
    # NOTE(review): punctuation is removed before the stop-word filter, so
    # contractions lose their apostrophe first -- confirm that is intended.
    text = _PUNCT_RE.sub('', text)  # Remove punctuation
    text = text.lower()
    return ' '.join(token for token in word_tokenize(text) if token not in stop_words)

# Add a cleaned copy of the raw 'text' column to both splits.
for split_df in (train_df, test_df):
    split_df['clean_text'] = split_df['text'].apply(clean_text)

# Map emotions to EI attributes
# Each EI attribute is listed with the emotion labels that count towards it;
# any label that does not appear here ends up as 'other' in map_to_ei.
_EI_ATTRIBUTE_EMOTIONS = {
    'empathy': ('caring', 'gratitude', 'approval', 'admiration'),
    'self_awareness': ('confusion', 'nervousness', 'realization'),
    'social_skills': ('amusement', 'curiosity', 'excitement'),
}
# Flat emotion -> EI attribute lookup used by map_to_ei.
emotion_map = {
    emotion: attribute
    for attribute, emotions in _EI_ATTRIBUTE_EMOTIONS.items()
    for emotion in emotions
}


def map_to_ei(emotions):
    """Translate a comma-separated emotion string into one EI attribute.

    The labels are checked in the order they appear in ``emotions`` and the
    first one found in ``emotion_map`` decides the result.

    Args:
        emotions: Comma-separated emotion labels, e.g. ``"anger, caring"``.
            Whitespace around each label and letter case are ignored.
            Non-string values (NaN, None) are treated as "no emotions".

    Returns:
        The EI attribute of the first mapped emotion, or ``'other'`` when no
        label is mapped.
    """
    if not isinstance(emotions, str):
        return 'other'
    for label in emotions.split(','):
        # Without the strip, "a, b" yields " b", which never matched a key.
        attribute = emotion_map.get(label.strip().lower())
        if attribute is not None:
            return attribute
    return 'other'

# Derive one EI label per row from that row's 'emotions' entry.
for split_df in (train_df, test_df):
    split_df['ei_label'] = split_df['emotions'].apply(map_to_ei)

# Tokenization for BERT
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# Both splits are encoded with the same settings: truncation and padding
# enabled, with a maximum length of 128.
encode_kwargs = {'truncation': True, 'padding': True, 'max_length': 128}
train_encodings = tokenizer(train_df['clean_text'].tolist(), **encode_kwargs)
test_encodings = tokenizer(test_df['clean_text'].tolist(), **encode_kwargs)

# Encode labels
# The string -> integer mapping is learned from the training labels only and
# then reused unchanged for the test labels.
le = LabelEncoder()
train_labels = train_df['ei_label']
test_labels = test_df['ei_label']
y_train = le.fit_transform(train_labels)
y_test = le.transform(test_labels)

# Prepare datasets for BERT
def _build_dataset(encodings, labels):
    """Bundle tokenizer output and integer labels into one ``Dataset``.

    The resulting columns are 'input_ids', 'attention_mask' and 'labels',
    in that order.
    """
    columns = {key: encodings[key] for key in ('input_ids', 'attention_mask')}
    columns['labels'] = labels
    return Dataset.from_dict(columns)


train_dataset = _build_dataset(train_encodings, y_train)
test_dataset = _build_dataset(test_encodings, y_test)

# Train BERT model
import inspect

# One output unit per EI class known to the label encoder.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(le.classes_))

# The per-epoch evaluation switch was renamed from `evaluation_strategy` to
# `eval_strategy`, and recent transformers releases reject the old keyword.
# Use whichever name the installed TrainingArguments accepts so the script
# runs on both old and new versions.
eval_strategy_key = (
    'eval_strategy'
    if 'eval_strategy' in inspect.signature(TrainingArguments).parameters
    else 'evaluation_strategy'
)
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    **{eval_strategy_key: 'epoch'},
)

# NOTE: eval_dataset is the test split, so the per-epoch evaluation numbers
# reported during training are computed on the test data.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)
trainer.train()

# Logistic Regression baseline
# Features come from a separately loaded 'bert-base-uncased' encoder, not from
# the classifier fine-tuned above.
bert_model = BertModel.from_pretrained('bert-base-uncased')


def get_bert_embeddings(texts, batch_size=32):
    """Embed texts with ``bert_model`` and return one vector per text.

    Args:
        texts: List of strings to embed.
        batch_size: Number of texts encoded per forward pass.

    Returns:
        A 2-D NumPy array with one row per input text, holding the last
        hidden state at sequence position 0 (where the BERT tokenizer puts
        its [CLS] token).
    """
    first_token_vectors = []
    for start in range(0, len(texts), batch_size):
        chunk = texts[start:start + batch_size]
        encoded = tokenizer(chunk, return_tensors='pt', max_length=128, truncation=True, padding=True)
        # Inference only: no gradients are needed for feature extraction.
        with torch.no_grad():
            hidden_states = bert_model(**encoded).last_hidden_state
        first_token_vectors.append(hidden_states[:, 0, :].numpy())
    return np.vstack(first_token_vectors)

# Embed the training split, then the test split, with the encoder above.
X_train, X_test = (
    get_bert_embeddings(split_df['clean_text'].tolist())
    for split_df in (train_df, test_df)
)
# NOTE(review): newer scikit-learn releases appear to deprecate the
# `multi_class` argument -- confirm against the installed version.
lr_model = LogisticRegression(max_iter=1000, multi_class='multinomial')
lr_model.fit(X_train, y_train)

# Evaluate models
# Run test-set inference once and reuse the output for both the hard
# predictions and the scores (a second predict() call repeats the whole pass).
bert_output = trainer.predict(test_dataset)
bert_logits = np.asarray(bert_output.predictions, dtype=np.float64)
bert_predictions = bert_logits.argmax(-1)
# roc_auc_score(..., multi_class='ovr') requires per-class probabilities that
# sum to 1 for every sample and rejects raw logits, so apply a softmax.
# Subtracting the row maximum first keeps np.exp from overflowing.
shifted_logits = bert_logits - bert_logits.max(axis=-1, keepdims=True)
exp_logits = np.exp(shifted_logits)
bert_scores = exp_logits / exp_logits.sum(axis=-1, keepdims=True)
lr_predictions = lr_model.predict(X_test)
lr_scores = lr_model.predict_proba(X_test)

def _print_report(title, predictions, scores):
    """Print accuracy, weighted precision/recall/F1 and OvR ROC-AUC.

    All metrics are computed against the module-level ``y_test`` labels.

    Args:
        title: Heading line printed before the metrics.
        predictions: Predicted class indices for the test split.
        scores: Per-class scores for the test split, passed to roc_auc_score.
    """
    print(title)
    print(f"Accuracy: {accuracy_score(y_test, predictions):.2f}")
    # One call yields all three weighted metrics (the fourth item is support).
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_test, predictions, average='weighted'
    )
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"F1 Score: {f1:.2f}")
    print(f"ROC-AUC: {roc_auc_score(y_test, scores, multi_class='ovr'):.2f}")


_print_report("BERT Model:", bert_predictions, bert_scores)
_print_report("\nLogistic Regression:", lr_predictions, lr_scores)

# Save the fine-tuned BERT classifier, its tokenizer and the LR baseline
bert_dir = 'models/bert_ei'
for artifact in (model, tokenizer):
    artifact.save_pretrained(bert_dir)
# NOTE(review): the label encoder `le` is not saved here, so the integer
# class ids of the stored models cannot be decoded from these files alone.
import joblib
lr_path = 'models/lr_ei.pkl'
joblib.dump(lr_model, lr_path)