In [1]:
import os

if not os.path.exists("train.csv"):
    from datasets import *
    dset = load_dataset("dair-ai/emotion","unsplit")

    # code from https://stackoverflow.com/a/76218276
    train_testvalid = dset['train'].train_test_split(test_size=0.2)
    # Split the 10% test + valid in half test, half valid
    test_valid = train_testvalid['test'].train_test_split(test_size=0.5)
    # gather everyone if you want to have a single DatasetDict
    dset = DatasetDict({
        'train': train_testvalid['train'],
        'test': test_valid['test'],
        'valid': test_valid['train']})
    dset
else:
    print("file exists, skipping")

file exists, skipping


## Cleaning Function to clean the dataset text

In [2]:
import spacy
import pandas as pd
import numpy as np
import re
if not os.path.exists("test.csv"):
    # Check if the spaCy model is loaded, otherwise install it
    try:
        nlp = spacy.load("en_core_web_sm")
    except OSError:
        print("Downloading the 'en_core_web_sm' model")
        !python -m spacy download en_core_web_sm
        nlp = spacy.load("en_core_web_sm")

    train_df = pd.DataFrame(dset["train"])
    test_df = pd.DataFrame(dset['test'])
    val_df = pd.DataFrame(dset['valid'])
    i = 0
    def cleaning(text):
        doc = nlp(text.lower().strip())
        cleaned_tokens = []
        global i
        print(f"Cleaning: {i}")
        for token in doc:
            if not token.is_stop and not token.is_punct and not token.is_space:
                lemma = re.sub(r'\W', '', token.lemma_)
                if lemma:
                    cleaned_tokens.append(lemma)
        i += 1
        return ' '.join(cleaned_tokens)

    # Apply the cleaning function to the text column
    train_df["cleaned_text"] = train_df["text"].apply(cleaning)
    test_df["cleaned_text"] = test_df["text"].apply(cleaning)
    val_df["cleaned_text"] = val_df["text"].apply(cleaning)

    print(train_df[['text', 'cleaned_text']].head())  # Display the original and cleaned text for verification

    train_df.to_csv("train.csv")
    test_df.to_csv("test.csv")
    val_df.to_csv("val.csv")

else: 
    print("file exists, skipping")
    train_df = pd.read_csv("train.csv")
    test_df = pd.read_csv("test.csv")
    val_df = pd.read_csv("val.csv")    

file exists, skipping


In [3]:
train_df = pd.read_csv("train.csv")
train_df = train_df.dropna(subset=["cleaned_text"])
test_df = pd.read_csv("test.csv")
test_df = test_df.dropna(subset=["cleaned_text"])
val_df = pd.read_csv("val.csv")
val_df = val_df.dropna(subset=["cleaned_text"])

In [4]:
from sklearn.feature_extraction.text import CountVectorizer
def create_bag_of_words(dataframe):
    vectorizer = CountVectorizer()
    X_bow = vectorizer.fit_transform(dataframe['cleaned_text'])
    return X_bow, vectorizer.get_feature_names_out()

X_bow, features_bow = create_bag_of_words(train_df)
X_test_bow, features_test_bow = create_bag_of_words(test_df)
print("Bag of Words features:", X_bow.shape)

Bag of Words features: (333428, 55844)


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
def create_tfidf_features(dataframe):
    vectorizer = TfidfVectorizer()
    X_tfidf = vectorizer.fit_transform(dataframe['cleaned_text'])
    return X_tfidf, vectorizer.get_feature_names_out()

X_tfidf, features_tfidf = create_tfidf_features(train_df)
X_test_tfidf, features_test_tfidf = create_tfidf_features(test_df)
print("TF-IDF features:", X_tfidf.shape)


TF-IDF features: (333428, 55844)


In [6]:
from gensim.models import Word2Vec
import numpy as np

def create_word2vec_embeddings(dataframe):
    sentences = [text.split() for text in dataframe['cleaned_text']]
    model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)
    word_vectors = model.wv

    # Average Word Vectors for each text
    def document_vector(doc):
        return np.mean([word_vectors[w] for w in doc if w in word_vectors], axis=0)

    X_w2v = np.array([document_vector(text) for text in sentences if document_vector(text).shape != ()])
    return X_w2v


X_w2v = create_word2vec_embeddings(train_df)
X_test_w2v = create_word2vec_embeddings(test_df)
print("Word2Vec features shape:", X_w2v.shape)

Word2Vec features shape: (333428, 100)


In [7]:
from sentence_transformers import SentenceTransformer
if not os.path.exists('X_bert.npy'):
    def create_bert_embeddings(dataframe):
        model = SentenceTransformer('all-MiniLM-L6-v2')
        X_bert = model.encode(dataframe['cleaned_text'], show_progress_bar=True)
        return X_bert

    X_bert = create_bert_embeddings(train_df)
    X_test_bert = create_bert_embeddings(test_df)
    np.save('X_bert.npy', X_bert)
    np.save('X_test_bert.npy', X_test_bert)
    
else:    
    print("file exists, skipping")
    X_bert = np.load('X_bert.npy')
    X_test_bert = np.load('X_test_bert.npy') 

print("BERT Embeddings shape:", X_bert.shape)

file exists, skipping
BERT Embeddings shape: (333447, 384)


In [8]:
y_train = (train_df['label'])
y_test = (test_df['label'])
y_val = val_df['label']

In [9]:
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier

In [10]:
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_w2v, y_train)
y_pred_test = clf.predict(X_test_w2v)
print(classification_report(y_test, y_pred_test))

KeyboardInterrupt: 