You have been given a text dataset that contains tweets about COVID-19 (the Corona_NLP train/test CSV files).
Each tweet is labelled with one of 5 sentiment classes: Extremely Negative, Negative, Neutral,
Positive, Extremely Positive.
Your goal is to build a sentiment classification model.

1. Preprocess the data (clean and tokenize the text).
2. Use pre-trained embeddings (CBOW, Skip-Gram, GloVe, and ELMo).
3. Choose any model architecture you prefer and build transfer-learning
models.
4. Report the evaluation results for the four models, where each one
uses a different embedding but the same model architecture.

In [2]:
import pandas as pd
import tensorflow_hub as hub
import tensorflow as tf
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Load datasets from the Kaggle input directory
train_path = "/kaggle/input/coronaaaa/Corona_NLP_train.csv"
test_path = "/kaggle/input/coronaaaa/Corona_NLP_test.csv"

# Read the CSV files
train_df = pd.read_csv(train_path, encoding="latin1")
test_df = pd.read_csv(test_path, encoding="latin1")

# Display the first few rows of the train dataset
train_df.head()


Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,3799,48751,London,16-03-2020,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
1,3800,48752,UK,16-03-2020,advice Talk to your neighbours family to excha...,Positive
2,3801,48753,Vagabonds,16-03-2020,Coronavirus Australia: Woolworths to give elde...,Positive
3,3802,48754,,16-03-2020,My food stock is not the only one which is emp...,Positive
4,3803,48755,,16-03-2020,"Me, ready to go at supermarket during the #COV...",Extremely Negative


In [3]:
def safe_load_csv(path):
    try:
        df = pd.read_csv(path)
        print(f"Successfully loaded {path} with shape {df.shape}")
        return df
    except Exception as e:
        print(f"Error loading {path}: {str(e)}")
        return pd.DataFrame()  # Return empty DataFrame as fallback

train_df = safe_load_csv(train_path)
test_df = safe_load_csv(test_path)

Error loading /kaggle/input/coronaaaa/Corona_NLP_train.csv: 'utf-8' codec can't decode byte 0xc2 in position 7485: invalid continuation byte
Successfully loaded /kaggle/input/coronaaaa/Corona_NLP_test.csv with shape (3798, 6)


In [4]:
# Load datasets with proper encoding
train_df = pd.read_csv('/kaggle/input/coronaaaa/Corona_NLP_train.csv', encoding='latin1')
test_df = pd.read_csv('/kaggle/input/coronaaaa/Corona_NLP_test.csv', encoding='latin1')

# Verify data
print(f"Training data shape: {train_df.shape}")
print(f"Test data shape: {test_df.shape}")
print("\nTraining columns:", train_df.columns.tolist())
print("\nSample training data:")
print(train_df.head(2))

Training data shape: (41157, 6)
Test data shape: (3798, 6)

Training columns: ['UserName', 'ScreenName', 'Location', 'TweetAt', 'OriginalTweet', 'Sentiment']

Sample training data:
   UserName  ScreenName Location     TweetAt  \
0      3799       48751   London  16-03-2020   
1      3800       48752       UK  16-03-2020   

                                       OriginalTweet Sentiment  
0  @MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...   Neutral  
1  advice Talk to your neighbours family to excha...  Positive  


In [5]:
# Check current sentiment distribution
print("Original sentiment distribution:")
print(train_df['Sentiment'].value_counts())

# Standardize sentiment labels
sentiment_mapping = {
    'Extremely Negative': 0,
    'Negative': 1,
    'Neutral': 2,
    'Positive': 3,
    'Extremely Positive': 4
}

train_df['Sentiment'] = train_df['Sentiment'].map(sentiment_mapping)
test_df['Sentiment'] = test_df['Sentiment'].map(sentiment_mapping)

# Verify mapping
print("\nMapped sentiment distribution:")
print(train_df['Sentiment'].value_counts())

Original sentiment distribution:
Sentiment
Positive              11422
Negative               9917
Neutral                7713
Extremely Positive     6624
Extremely Negative     5481
Name: count, dtype: int64

Mapped sentiment distribution:
Sentiment
3    11422
1     9917
2     7713
4     6624
0     5481
Name: count, dtype: int64


In [6]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf

# Tokenizer configuration
max_words = 20000
max_len = 100

# ✅ Use "OriginalTweet" instead of "clean_text"
tokenizer = Tokenizer(num_words=max_words, oov_token='<OOV>')
tokenizer.fit_on_texts(train_df['OriginalTweet'])  

# Convert texts to sequences
X_train = pad_sequences(tokenizer.texts_to_sequences(train_df['OriginalTweet']), maxlen=max_len)
X_test = pad_sequences(tokenizer.texts_to_sequences(test_df['OriginalTweet']), maxlen=max_len)

# Prepare target variables
y_train = tf.keras.utils.to_categorical(train_df['Sentiment'], num_classes=5)
y_test = tf.keras.utils.to_categorical(test_df['Sentiment'], num_classes=5)

print(f"\nTraining sequences shape: {X_train.shape}")
print(f"Test sequences shape: {X_test.shape}")



Training sequences shape: (41157, 100)
Test sequences shape: (3798, 100)


In [7]:
from gensim.models import Word2Vec

# Prepare sentences for Word2Vec
sentences = [text.split() for text in train_df['OriginalTweet']]

# Train Word2Vec models
w2v_model = Word2Vec(sentences=sentences, 
                    vector_size=100,
                    window=5,
                    min_count=3,
                    workers=4,
                    sg=0)  # CBOW

skipgram_model = Word2Vec(sentences=sentences,
                         vector_size=100,
                         window=5,
                         min_count=3,
                         workers=4,
                         sg=1)  # Skip-Gram

print("Vocabulary size (CBOW):", len(w2v_model.wv))
print("Vocabulary size (Skip-Gram):", len(skipgram_model.wv))

Vocabulary size (CBOW): 25261
Vocabulary size (Skip-Gram): 25261


In [8]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, GlobalMaxPooling1D
import numpy as np
def build_model(embedding_matrix=None):
    model = Sequential()
    
    if embedding_matrix is not None:
        model.add(Embedding(input_dim=max_words,
                          output_dim=100,
                          weights=[embedding_matrix],
                          input_length=max_len,
                          trainable=False))
    else:
        model.add(Embedding(input_dim=max_words,
                          output_dim=100,
                          input_length=max_len))
    
    model.add(LSTM(128, return_sequences=True))
    model.add(GlobalMaxPooling1D())
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.3))
    model.add(Dense(5, activation='softmax'))
    
    model.compile(optimizer='adam',
                loss='categorical_crossentropy',
                metrics=['accuracy'])
    
    return model

# Create embedding matrix
def create_embedding_matrix(word_vectors, tokenizer, embedding_dim):
    embedding_matrix = np.zeros((max_words, embedding_dim))
    for word, i in tokenizer.word_index.items():
        if i < max_words:
            try:
                embedding_vector = word_vectors[word]
                embedding_matrix[i] = embedding_vector
            except KeyError:
                embedding_matrix[i] = np.random.normal(scale=0.6, size=(embedding_dim,))
    return embedding_matrix

# Build models with different embeddings
cbow_matrix = create_embedding_matrix(w2v_model.wv, tokenizer, 100)
skipgram_matrix = create_embedding_matrix(skipgram_model.wv, tokenizer, 100)

# Train and compare models
models = {
    'Random Embeddings': build_model(),
    'CBOW Embeddings': build_model(cbow_matrix),
    'Skip-Gram Embeddings': build_model(skipgram_matrix)
}

for name, model in models.items():
    print(f"\nTraining {name} model...")
    history = model.fit(X_train, y_train,
                      validation_data=(X_test, y_test),
                      epochs=10,
                      batch_size=128,
                      verbose=1)
    
    # Evaluate
    loss, accuracy = model.evaluate(X_test, y_test)
    print(f"{name} Test Accuracy: {accuracy:.4f}")




Training Random Embeddings model...
Epoch 1/10
[1m322/322[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 12ms/step - accuracy: 0.3487 - loss: 1.4379 - val_accuracy: 0.6661 - val_loss: 0.8562
Epoch 2/10
[1m322/322[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 10ms/step - accuracy: 0.7577 - loss: 0.6904 - val_accuracy: 0.7454 - val_loss: 0.7122
Epoch 3/10
[1m322/322[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 10ms/step - accuracy: 0.8395 - loss: 0.5024 - val_accuracy: 0.7517 - val_loss: 0.6778
Epoch 4/10
[1m322/322[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 10ms/step - accuracy: 0.8795 - loss: 0.3854 - val_accuracy: 0.7649 - val_loss: 0.6633
Epoch 5/10
[1m322/322[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 10ms/step - accuracy: 0.9054 - loss: 0.3035 - val_accuracy: 0.7694 - val_loss: 0.6921
Epoch 6/10
[1m322/322[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 10ms/step - accuracy: 0.9269 - loss: 0.2311 - val_accuracy: 0.7662 - va

In [9]:
import os
!pip install kagglehub




In [10]:
import os
import numpy as np
import tensorflow as tf
import kagglehub
import pandas as pd
import re
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

print("\n=== GLOVE EMBEDDINGS ===")

try:
    train_path = "/kaggle/input/coronaaaa/Corona_NLP_train.csv"
    test_path = "/kaggle/input/coronaaaa/Corona_NLP_test.csv"

    # Ensure 'clean_text' exists by preprocessing 'OriginalTweet'
    def clean_text(text):
        text = text.lower()  # Convert to lowercase
        text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)  # Remove URLs
        text = re.sub(r'\@\w+|\#', '', text)  # Remove mentions and hashtags
        text = re.sub(r"[^a-zA-Z0-9\s]", '', text)  # Remove special characters
        return text

    train_df['clean_text'] = train_df['OriginalTweet'].astype(str).apply(clean_text)
    test_df['clean_text'] = test_df['OriginalTweet'].astype(str).apply(clean_text)

    # Download GloVe via Kaggle Hub
    path = kagglehub.dataset_download("danielwillgeorge/glove6b100dtxt")
    glove_path = os.path.join(path, "glove.6B.100d.txt")

    # Load GloVe vectors
    glove_embeddings = {}
    with open(glove_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], dtype='float32')
            glove_embeddings[word] = vector

    # Tokenization
    tokenizer = Tokenizer(num_words=20000, oov_token='<OOV>')
    tokenizer.fit_on_texts(train_df['clean_text'])

    # Prepare sequences
    X_train = pad_sequences(tokenizer.texts_to_sequences(train_df['clean_text']), maxlen=100)
    X_test = pad_sequences(tokenizer.texts_to_sequences(test_df['clean_text']), maxlen=100)
    y_train = tf.keras.utils.to_categorical(train_df['Sentiment'].factorize()[0], 5)
    y_test = tf.keras.utils.to_categorical(test_df['Sentiment'].factorize()[0], 5)

    # Create embedding matrix
    glove_matrix = np.zeros((20000, 100))
    for word, i in tokenizer.word_index.items():
        if i < 20000:
            glove_matrix[i] = glove_embeddings.get(word, np.random.normal(scale=0.6, size=(100,)))

    # Build model
    glove_model = tf.keras.Sequential([
        tf.keras.layers.Embedding(20000, 100, weights=[glove_matrix], input_length=100, trainable=False),
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(5, activation='softmax')
    ])
    glove_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

    # Train
    print("Training GloVe model...")
    glove_model.fit(X_train, y_train, epochs=10, batch_size=128, validation_data=(X_test, y_test))
    glove_acc = glove_model.evaluate(X_test, y_test, verbose=0)[1]
    print(f"\nGloVe Test Accuracy: {glove_acc:.4f}")

except Exception as e:
    print(f"GloVe Error: {str(e)}")



=== GLOVE EMBEDDINGS ===
Training GloVe model...
Epoch 1/10




[1m322/322[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 12ms/step - accuracy: 0.3470 - loss: 1.4383 - val_accuracy: 0.2751 - val_loss: 2.1190
Epoch 2/10
[1m322/322[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 11ms/step - accuracy: 0.4842 - loss: 1.1920 - val_accuracy: 0.2449 - val_loss: 2.2904
Epoch 3/10
[1m322/322[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 11ms/step - accuracy: 0.5237 - loss: 1.1201 - val_accuracy: 0.2162 - val_loss: 2.6112
Epoch 4/10
[1m322/322[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 11ms/step - accuracy: 0.5560 - loss: 1.0511 - val_accuracy: 0.2557 - val_loss: 2.6427
Epoch 5/10
[1m322/322[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 11ms/step - accuracy: 0.5801 - loss: 1.0024 - val_accuracy: 0.2757 - val_loss: 2.8451
Epoch 6/10
[1m322/322[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 11ms/step - accuracy: 0.6078 - loss: 0.9483 - val_accuracy: 0.3107 - val_loss: 2.8239
Epoch 7/10
[1m322/322[0m [32m━

In [11]:
import tensorflow_hub as hub
import tensorflow as tf

# Download ELMo model
elmo_model = hub.load("https://tfhub.dev/google/elmo/3")
elmo_model_path = "elmo_model"  # Save directory

# Save the model using SavedModel format
tf.saved_model.save(elmo_model, elmo_model_path)

print("ELMo model downloaded and saved at:", elmo_model_path)


ELMo model downloaded and saved at: elmo_model


In [14]:

# Load ELMo model
elmo = hub.load("https://tfhub.dev/google/elmo/3")

# Function to get ELMo embeddings
def get_elmo_embedding(text):
    if isinstance(text, str):
        embeddings = elmo.signatures["default"](tf.constant([text]))["elmo"]
        sentence_vector = np.mean(embeddings[0], axis=0)  # Averaging word vectors
        return sentence_vector  # No .numpy() needed
    else:
        return np.zeros(1024)  # Placeholder for missing text



# Generate ELMo embeddings for training data
print("Generating ELMo embeddings for training data...")
train_df["ELMo_Embedding"] = [get_elmo_embedding(text) for text in tqdm(train_df["OriginalTweet"])]

# Convert list of arrays to a NumPy array
X = np.vstack(train_df["ELMo_Embedding"].values)
y = train_df["Sentiment"].values  # Assuming Sentiment is the target label

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train classifier
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# Predict and compute accuracy
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"Model Accuracy: {accuracy:.4f}")

# Save processed dataset
train_df.to_pickle("train_with_elmo.pkl")
print("ELMo embeddings generated, model trained, and accuracy calculated!")


Generating ELMo embeddings for training data...



  0%|          | 0/41157 [00:00<?, ?it/s][A
  0%|          | 1/41157 [00:00<3:52:59,  2.94it/s][A
  0%|          | 2/41157 [00:00<3:30:42,  3.26it/s][A
  0%|          | 3/41157 [00:00<2:48:45,  4.06it/s][A
  0%|          | 4/41157 [00:01<2:40:34,  4.27it/s][A
  0%|          | 6/41157 [00:01<2:08:03,  5.36it/s][A
  0%|          | 7/41157 [00:01<2:08:18,  5.35it/s][A
  0%|          | 8/41157 [00:01<2:02:10,  5.61it/s][A
  0%|          | 9/41157 [00:01<2:10:47,  5.24it/s][A
  0%|          | 10/41157 [00:02<2:19:47,  4.91it/s][A
  0%|          | 11/41157 [00:02<2:24:23,  4.75it/s][A
  0%|          | 13/41157 [00:02<2:09:37,  5.29it/s][A
  0%|          | 14/41157 [00:02<2:10:20,  5.26it/s][A
  0%|          | 15/41157 [00:03<2:07:42,  5.37it/s][A
  0%|          | 16/41157 [00:03<2:08:58,  5.32it/s][A
  0%|          | 17/41157 [00:03<2:10:19,  5.26it/s][A
  0%|          | 18/41157 [00:03<2:09:38,  5.29it/s][A
  0%|          | 20/41157 [00:03<1:54:42,  5.98it/s][A
  0%|     

Model Accuracy: 0.3927


  0%|          | 0/41157 [54:31<?, ?it/s]


ELMo embeddings generated, model trained, and accuracy calculated!
