In [4]:
!pip install -q datasets scikit-learn gensim sentence-transformers lazypredict numpy
from datasets import *
dset = load_dataset("dair-ai/emotion","unsplit")

In [5]:
# code from https://stackoverflow.com/a/76218276

# NOTE: this cell rebinds `dset`, so re-running it without re-running the load
# cell would split the already-reduced 'train' split again.
# A fixed seed makes the split reproducible across kernel restarts.
# Hold out 20% of the data; the remaining 80% is the training split.
train_testvalid = dset['train'].train_test_split(test_size=0.2, seed=42)
# Split the 20% hold-out in half: 10% test, 10% validation
test_valid = train_testvalid['test'].train_test_split(test_size=0.5, seed=42)
# gather everything into a single DatasetDict
dset = DatasetDict({
    'train': train_testvalid['train'],
    'test': test_valid['test'],
    'valid': test_valid['train']})

dset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 333447
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 41681
    })
    valid: Dataset({
        features: ['text', 'label'],
        num_rows: 41681
    })
})

## Cleaning function for the dataset text

In [6]:
import spacy
import pandas as pd
import re

# Load the small English spaCy pipeline, installing it first if it is missing.
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    import subprocess
    import sys

    print("Downloading the 'en_core_web_sm' model")
    # Run the download with the kernel's own interpreter so the model lands in
    # the environment this notebook imports from (`!python` is resolved via
    # PATH and may be a different Python). check_call raises if the download
    # fails instead of letting the second load fail with a confusing error.
    subprocess.check_call([sys.executable, "-m", "spacy", "download", "en_core_web_sm"])
    nlp = spacy.load("en_core_web_sm")

# Materialise each split of the DatasetDict as its own pandas DataFrame.
train_df, test_df, val_df = (
    pd.DataFrame(dset[split_name]) for split_name in ('train', 'test', 'valid')
)

def _clean_doc(doc):
    """Return the cleaned text for one processed spaCy ``Doc``.

    Stop words, punctuation and whitespace tokens are dropped. Every remaining
    token is replaced by its lemma with all non-word characters removed, and
    lemmas that end up empty are skipped.
    """
    cleaned_tokens = []
    for token in doc:
        if token.is_stop or token.is_punct or token.is_space:
            continue
        lemma = re.sub(r'\W', '', token.lemma_)
        if lemma:
            cleaned_tokens.append(lemma)
    return ' '.join(cleaned_tokens)


def cleaning(text):
    """Clean a single string.

    The text is lower-cased and stripped, run through the global ``nlp``
    pipeline and reduced to its surviving lemmas (see ``_clean_doc``).

    Args:
        text: Raw input string.

    Returns:
        The surviving lemmas joined by single spaces ('' if none survive).
    """
    return _clean_doc(nlp(text.lower().strip()))


def clean_texts(texts, batch_size=1000):
    """Clean many strings in one pass using ``nlp.pipe``.

    Calling ``nlp`` once per row through ``Series.apply`` is very slow on the
    hundreds of thousands of rows in this dataset, so the texts are streamed
    through the pipeline in batches instead. The parser and NER components are
    skipped because nothing in ``_clean_doc`` reads their output.
    NOTE(review): expected to give the same strings as ``cleaning``; spot-check
    a few rows after upgrading spaCy or the model.

    Args:
        texts: Iterable of raw strings (e.g. a DataFrame column).
        batch_size: Number of texts handed to the pipeline per batch.

    Returns:
        List with one cleaned string per input text, in input order.
    """
    normalised = (text.lower().strip() for text in texts)
    docs = nlp.pipe(normalised, batch_size=batch_size, disable=["parser", "ner"])
    return [_clean_doc(doc) for doc in docs]


# Apply the cleaning to the text column of every split
for split_df in (train_df, test_df, val_df):
    split_df["cleaned_text"] = clean_texts(split_df["text"])

train_df[['text', 'cleaned_text']].head()  # Display the original and cleaned text for verification


KeyboardInterrupt: 

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
def create_bag_of_words(dataframe, vectorizer=None, fit=True):
    """Build bag-of-words count features from ``dataframe['cleaned_text']``.

    Args:
        dataframe: DataFrame with a 'cleaned_text' column.
        vectorizer: Optional CountVectorizer to use. When omitted a fresh one
            is created, which is the original one-argument behaviour.
        fit: If True (default) the vectorizer learns its vocabulary from
            ``dataframe`` (``fit_transform``). Pass False together with an
            already-fitted vectorizer to only ``transform``, so the columns
            line up with the matrix the vectorizer was fitted on.

    Returns:
        Tuple ``(X_bow, feature_names)``: the document-term matrix and the
        vectorizer's feature names.

    Raises:
        ValueError: If ``fit`` is False but no vectorizer is supplied.
    """
    if vectorizer is None:
        if not fit:
            raise ValueError("fit=False requires an already-fitted vectorizer")
        vectorizer = CountVectorizer()
    texts = dataframe['cleaned_text']
    X_bow = vectorizer.fit_transform(texts) if fit else vectorizer.transform(texts)
    return X_bow, vectorizer.get_feature_names_out()

# Learn the vocabulary from the training split only and reuse it for the test
# split. Fitting a second vectorizer on the test data would produce a different
# vocabulary, so train and test columns would not correspond to the same words.
bow_vectorizer = CountVectorizer()
X_bow, features_bow = create_bag_of_words(train_df, bow_vectorizer)
X_test_bow, features_test_bow = create_bag_of_words(test_df, bow_vectorizer, fit=False)
print("Bag of Words features:", X_bow.shape)



In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
def create_tfidf_features(dataframe, vectorizer=None, fit=True):
    """Build TF-IDF features from ``dataframe['cleaned_text']``.

    Args:
        dataframe: DataFrame with a 'cleaned_text' column.
        vectorizer: Optional TfidfVectorizer to use. When omitted a fresh one
            is created, which is the original one-argument behaviour.
        fit: If True (default) the vectorizer learns vocabulary and IDF weights
            from ``dataframe`` (``fit_transform``). Pass False together with an
            already-fitted vectorizer to only ``transform``, so the same
            vocabulary and IDF weights are applied.

    Returns:
        Tuple ``(X_tfidf, feature_names)``: the document-term matrix and the
        vectorizer's feature names.

    Raises:
        ValueError: If ``fit`` is False but no vectorizer is supplied.
    """
    if vectorizer is None:
        if not fit:
            raise ValueError("fit=False requires an already-fitted vectorizer")
        vectorizer = TfidfVectorizer()
    texts = dataframe['cleaned_text']
    X_tfidf = vectorizer.fit_transform(texts) if fit else vectorizer.transform(texts)
    return X_tfidf, vectorizer.get_feature_names_out()

# Fit on the training split only, then transform the test split with the same
# vectorizer. Re-fitting on the test data would change both the column layout
# and the IDF weights, and would use test data to shape the features.
tfidf_vectorizer = TfidfVectorizer()
X_tfidf, features_tfidf = create_tfidf_features(train_df, tfidf_vectorizer)
X_test_tfidf, features_test_tfidf = create_tfidf_features(test_df, tfidf_vectorizer, fit=False)
print("TF-IDF features:", X_tfidf.shape)


In [None]:
from gensim.models import Word2Vec
import numpy as np

def _tokenize_cleaned(dataframe):
    """Split every ``cleaned_text`` entry on whitespace into a token list."""
    return [text.split() for text in dataframe['cleaned_text']]


def create_word2vec_embeddings(dataframe, model=None):
    """Embed each row of ``dataframe['cleaned_text']`` as the mean of its word vectors.

    Args:
        dataframe: DataFrame with a 'cleaned_text' column of space-separated
            tokens.
        model: Optional trained gensim Word2Vec model to look vectors up in.
            When omitted, a new model is trained on ``dataframe`` itself (the
            original one-argument behaviour).

    Returns:
        Array of shape ``(len(dataframe), vector_size)`` with one row per input
        row, in input order. Rows without any in-vocabulary token are all
        zeros; they are kept (rather than dropped) so the matrix stays aligned
        row-for-row with the label column.
    """
    sentences = _tokenize_cleaned(dataframe)
    if model is None:
        model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)
    word_vectors = model.wv

    X_w2v = np.zeros((len(sentences), word_vectors.vector_size), dtype=np.float32)
    for row, doc in enumerate(sentences):
        # Tokens unknown to the model are skipped; averaging an empty list
        # would yield NaN, so such rows simply keep their zero vector.
        vectors = [word_vectors[w] for w in doc if w in word_vectors]
        if vectors:
            X_w2v[row] = np.mean(vectors, axis=0)
    return X_w2v


# Train Word2Vec on the training split only and embed both splits with that
# one model, so train and test vectors live in the same embedding space.
w2v_model = Word2Vec(_tokenize_cleaned(train_df), vector_size=100, window=5, min_count=1, workers=4)
X_w2v = create_word2vec_embeddings(train_df, w2v_model)
X_test_w2v = create_word2vec_embeddings(test_df, w2v_model)
print("Word2Vec features shape:", X_w2v.shape)

In [None]:
from sentence_transformers import SentenceTransformer

def create_bert_embeddings(dataframe, model=None):
    """Encode ``dataframe['cleaned_text']`` with a SentenceTransformer model.

    Args:
        dataframe: DataFrame with a 'cleaned_text' column.
        model: Optional pre-loaded SentenceTransformer. When omitted,
            'all-MiniLM-L6-v2' is loaded inside the call (the original
            one-argument behaviour).

    Returns:
        The result of ``model.encode`` on the list of texts.
    """
    if model is None:
        model = SentenceTransformer('all-MiniLM-L6-v2')
    # Hand over a plain list of strings: integer indexing on a pandas Series is
    # label-based, so passing the Series itself only works while the index is
    # the default 0..n-1 range.
    texts = dataframe['cleaned_text'].tolist()
    X_bert = model.encode(texts, show_progress_bar=True)
    return X_bert

# Load the model once and share it between the splits instead of
# re-instantiating it on every call.
bert_model = SentenceTransformer('all-MiniLM-L6-v2')
X_bert = create_bert_embeddings(train_df, bert_model)
X_test_bert = create_bert_embeddings(test_df, bert_model)
print("BERT Embeddings shape:", X_bert.shape)

In [None]:
# Target vectors: the 'label' column of each split.
y_train, y_test, y_val = train_df['label'], test_df['label'], val_df['label']

In [None]:
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Random-forest baseline: fit on the training embeddings, then report
# per-class metrics on the test split.
clf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_bert, y_train)
y_pred_test = clf.predict(X_test_bert)
test_report = classification_report(y_test, y_pred_test)
print(test_report)