In [1]:
import torch
from transformers import AutoModel, AutoTokenizer, AutoModelForSequenceClassification
import numpy as np
from tqdm import tqdm
import re
from torch import nn
import random

import pandas as pd


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModel
import random
import re
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from tqdm import tqdm


In [16]:
# Dataset locations; every file path is built from the shared data directory.
data_path = "data_intern/twitter-datasets/"
test_path = data_path + "test_data.txt"      # tweets to predict
trainP_path = data_path + "train_pos.txt"    # training tweets labeled positive
trainN_path = data_path + "train_neg.txt"    # training tweets labeled negative

# Load and clean tweets
def load_and_clean_test_tweets(file_path):
    """Read the test file and return one cleaned tweet string per line.

    For every line, a leading "<digits>," prefix (and any whitespace right
    after the comma) is removed, then surrounding whitespace is stripped.
    Lines without such a prefix are only stripped.

    Args:
        file_path: path of a UTF-8 text file with one tweet per line.

    Returns:
        list[str]: the cleaned tweets, in file order.
    """
    id_prefix = re.compile(r"^\d+,\s*")
    with open(file_path, 'r', encoding='utf-8') as handle:
        return [id_prefix.sub("", line).strip() for line in handle]

def load_tweets(file_path):
    """Return every line of a UTF-8 text file as a list of strings.

    Lines are returned unmodified, so each one still carries its trailing
    newline character (callers strip it themselves).
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        return list(handle)

# Load datasets
test_tweets = load_and_clean_test_tweets(test_path)
trainP_tweets = load_tweets(trainP_path)
trainN_tweets = load_tweets(trainN_path)

# Combine labeled data: label 1 for tweets from the positive file, 0 for the negative file
labeled_tweets = [(tweet.strip(), 1) for tweet in trainP_tweets] + [(tweet.strip(), 0) for tweet in trainN_tweets]

# Shuffle with a dedicated, seeded generator so the row order (and everything
# derived from it, e.g. the cached embedding file) is identical on every
# Restart & Run All, without touching the global `random` state.
SHUFFLE_SEED = 42
random.Random(SHUFFLE_SEED).shuffle(labeled_tweets)

# Smaller training
# labeled_tweets = labeled_tweets[:20000]

# Extract tweets and labels as two parallel lists
train_tweets = [tweet for tweet, label in labeled_tweets]
train_labels = [label for tweet, label in labeled_tweets]


<class 'list'>


In [17]:

# Choose the compute device first: "mps" when PyTorch reports that backend as
# available, otherwise the CPU.
if torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

# Load the BERTweet tokenizer (use_fast=False) and the encoder, and place the
# encoder on the chosen device.
tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base", use_fast=False)
base_model = AutoModel.from_pretrained("vinai/bertweet-base").to(device)

# Dataset class for tokenization
class TweetDataset(Dataset):
    """Dataset that tokenizes all tweets once and serves per-tweet tensors.

    The full list is tokenized in the constructor with truncation and padding
    enabled and ``return_tensors="pt"``; fetching an item is then a plain index
    lookup into the stored encodings.

    Args:
        tweets: list of tweet strings.
        tokenizer: callable applied to ``tweets`` to produce the encodings.
        max_length: truncation length passed to the tokenizer.
    """

    # Encoding fields exposed by __getitem__, in the order they are returned.
    _FIELDS = ("input_ids", "attention_mask")

    def __init__(self, tweets, tokenizer, max_length=128):
        tokenizer_options = dict(
            truncation=True,
            padding=True,
            max_length=max_length,
            return_tensors="pt",
        )
        self.encodings = tokenizer(tweets, **tokenizer_options)

    def __len__(self):
        input_ids = self.encodings["input_ids"]
        return len(input_ids)

    def __getitem__(self, idx):
        return {field: self.encodings[field][idx] for field in self._FIELDS}

# Function to extract embeddings
def extract_embeddings(tweets, batch_size=8):
    """Encode ``tweets`` with ``base_model`` and return one vector per tweet.

    Relies on the module-level ``tokenizer``, ``base_model`` and ``device``.
    For each tweet the vector kept is position 0 of the model's last hidden
    state (the CLS token). Batches are processed in input order
    (``shuffle=False``), so row ``i`` of the result belongs to ``tweets[i]``.

    Args:
        tweets: list of tweet strings.
        batch_size: number of tweets per forward pass.

    Returns:
        numpy.ndarray: the per-tweet vectors stacked along axis 0.
    """
    loader = DataLoader(TweetDataset(tweets, tokenizer), batch_size=batch_size, shuffle=False)
    collected = []

    base_model.eval()
    with torch.no_grad():
        for batch in tqdm(loader, desc="Extracting Embeddings"):
            model_inputs = {
                "input_ids": batch["input_ids"].to(device),
                "attention_mask": batch["attention_mask"].to(device),
            }
            hidden_states = base_model(**model_inputs).last_hidden_state
            # Keep only the first position and move it to the CPU right away so
            # the accumulated results do not occupy device memory.
            collected.append(hidden_states[:, 0, :].cpu())

    return torch.cat(collected, dim=0).numpy()

# Extract embeddings for the training tweets only; test embeddings are
# extracted later, in the test-prediction cells.
# NOTE(review): the recorded run of this cell took about 34 minutes
# (25000 batches); the following cells cache the result to CSV.
print("Extracting training embeddings...")
train_embeddings = extract_embeddings(train_tweets, batch_size=8)


emoji is not installed, thus not converting emoticons or emojis into text. Install emoji: pip3 install emoji==0.6.0


Extracting training embeddings...


Extracting Embeddings: 100%|██████████| 25000/25000 [33:59<00:00, 12.26it/s]


In [None]:
import numpy as np

# Combine embeddings and labels into one array, labels as the last column.
# np.column_stack turns the 1-D label sequence into a column by itself, so
# `train_labels` is left untouched: overwriting it with an (N, 1) array would
# hand a column vector to any later cell run without first reloading the CSV.
train_data = np.column_stack((train_embeddings, train_labels))

# Export to CSV
# NOTE(review): machine-specific absolute path; the loading cell reads the same
# location, so the two must be changed together.
np.savetxt(
    "/Users/martinmoureau/Documents/GitHub/DIGITALNOMADIE2/BERTweet_train_data.csv",
    train_data,
    delimiter=",",
    fmt="%.10f"  # Format for floats (you can adjust precision)
)
print("Embeddings and labels successfully saved to BERTweet_train_data.csv")

Embeddings and labels successfully saved to train_data.csv


In [20]:
# Read the cached matrix back from CSV: one row per tweet, with the label
# stored in the final column.
train_data = np.loadtxt(
    "/Users/martinmoureau/Documents/GitHub/DIGITALNOMADIE2/BERTweet_train_data.csv",
    delimiter=",",
)

# Separate the feature columns from the trailing label column
train_embeddings, train_labels = train_data[:, :-1], train_data[:, -1]

print("Embeddings and labels successfully loaded.")
print("Embeddings shape:", train_embeddings.shape)
print("Labels shape:", train_labels.shape)

Embeddings and labels successfully loaded.
Embeddings shape: (200000, 768)
Labels shape: (200000,)


In [24]:
# Sanity check: number of test tweets loaded.
print(len(test_tweets))

10000


## Logistic regression classifier

In [25]:
# Train a logistic regression classifier on the cached embeddings.
# max_iter=1000 sets an explicit cap on the number of solver iterations.
print("Training Logistic Regression...")
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(train_embeddings, train_labels)

Training Logistic Regression...


In [26]:
from sklearn.metrics import accuracy_score, f1_score, classification_report

# In-sample check: score the classifier on the same data it was fitted on
train_preds = log_reg.predict(train_embeddings)

# Re-code predictions and ground truth to {-1, 1}: 1 stays 1, anything else becomes -1
train_preds = list(map(lambda pred: 1 if pred == 1 else -1, train_preds))
train_labels_converted = list(map(lambda label: 1 if label == 1 else -1, train_labels))

# Accuracy and binary F1 with 1 as the positive class
train_accuracy = accuracy_score(train_labels_converted, train_preds)
train_f1 = f1_score(train_labels_converted, train_preds, average="binary", pos_label=1)

# Summary metrics, one per line
print(
    "Training Set Evaluation:",
    f"Accuracy: {train_accuracy:.4f}",
    f"F1-Score: {train_f1:.4f}",
    sep="\n",
)

# Per-class precision / recall / F1 breakdown
print(
    "\nClassification Report:",
    classification_report(train_labels_converted, train_preds, target_names=["negative (-1)", "positive (1)"]),
    sep="\n",
)


Training Set Evaluation:
Accuracy: 0.8725
F1-Score: 0.8748

Classification Report:
               precision    recall  f1-score   support

negative (-1)       0.89      0.85      0.87    100000
 positive (1)       0.86      0.89      0.87    100000

     accuracy                           0.87    200000
    macro avg       0.87      0.87      0.87    200000
 weighted avg       0.87      0.87      0.87    200000



In [8]:
from helpers import create_csv_submission

# Embed the test tweets with the BERTweet encoder
print("Extracting test embeddings...")
test_embeddings = extract_embeddings(test_tweets, batch_size=8)

# Predict on the test set
print("Predicting sentiments on test data...")
test_preds = log_reg.predict(test_embeddings)

# Convert predictions to the submission convention: 1 for positive, -1 for
# negative. One pass is enough; a second identical pass would be a no-op.
test_preds = [1 if pred == 1 else -1 for pred in test_preds]

# Create a list of tweet indices (1-based index), one per prediction.
# NOTE(review): assumes the ids stripped from the test file were 1..N in file
# order - TODO confirm.
tweet_indices = np.arange(1, len(test_preds) + 1)

# Save predictions using the provided function
create_csv_submission(tweet_indices, test_preds, "test_predictions.csv")
print('finish creating csv')


Extracting test embeddings...


Extracting Embeddings: 100%|██████████| 1250/1250 [01:03<00:00, 19.67it/s]

Predicting sentiments on test data...
finish creating csv





## MLP classifier

In [27]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from tqdm import tqdm

# Define the MLP Classifier
class MLPClassifier(nn.Module):
    """Single-hidden-layer perceptron: Linear -> ReLU -> Dropout(0.3) -> Linear.

    Args:
        input_size: width of each input vector.
        hidden_size: width of the hidden layer.
        num_classes: number of output scores per sample.
    """

    def __init__(self, input_size=768, hidden_size=256, num_classes=2):
        super().__init__()
        layers = [
            nn.Linear(input_size, hidden_size),   # input -> hidden projection
            nn.ReLU(),                            # non-linearity
            nn.Dropout(p=0.3),                    # regularization
            nn.Linear(hidden_size, num_classes),  # hidden -> class scores
        ]
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Return the unnormalized class scores for ``x``."""
        scores = self.model(x)
        return scores

# Wrap the cached embeddings (float32) and labels (int64 class indices) as tensors
train_embeddings_tensor = torch.tensor(train_embeddings, dtype=torch.float32)
train_labels_tensor = torch.tensor(train_labels, dtype=torch.long)

# Shuffled mini-batches over the training set
batch_size = 32
train_dataset = TensorDataset(train_embeddings_tensor, train_labels_tensor)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Model, loss and optimizer; the model is placed on "mps" when available, else on the CPU
device = "mps" if torch.backends.mps.is_available() else "cpu"
mlp = MLPClassifier(input_size=768, hidden_size=256, num_classes=2).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(mlp.parameters(), lr=1e-4)

# Training loop
print("Training MLP Classifier...")
num_epochs = 10

mlp.train()
for epoch in range(num_epochs):
    epoch_loss = 0.0
    progress = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}")
    for batch_embeddings, batch_labels in progress:
        batch_embeddings = batch_embeddings.to(device)
        batch_labels = batch_labels.to(device)

        # Clear gradients from the previous step, then forward / backward / update
        optimizer.zero_grad()
        outputs = mlp(batch_embeddings)
        loss = criterion(outputs, batch_labels)
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    # Report the mean per-batch loss for this epoch
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss/len(train_loader):.4f}")

print("MLP Classifier Training Complete!")

Training MLP Classifier...


Epoch 1/10: 100%|██████████| 6250/6250 [00:21<00:00, 286.12it/s]


Epoch [1/10], Loss: 0.4351


Epoch 2/10: 100%|██████████| 6250/6250 [00:21<00:00, 289.58it/s]


Epoch [2/10], Loss: 0.3563


Epoch 3/10: 100%|██████████| 6250/6250 [00:21<00:00, 296.68it/s]


Epoch [3/10], Loss: 0.3341


Epoch 4/10: 100%|██████████| 6250/6250 [00:21<00:00, 294.18it/s]


Epoch [4/10], Loss: 0.3228


Epoch 5/10: 100%|██████████| 6250/6250 [00:23<00:00, 266.43it/s]


Epoch [5/10], Loss: 0.3149


Epoch 6/10: 100%|██████████| 6250/6250 [00:25<00:00, 244.17it/s]


Epoch [6/10], Loss: 0.3098


Epoch 7/10: 100%|██████████| 6250/6250 [00:21<00:00, 295.74it/s]


Epoch [7/10], Loss: 0.3059


Epoch 8/10: 100%|██████████| 6250/6250 [00:21<00:00, 297.17it/s]


Epoch [8/10], Loss: 0.3024


Epoch 9/10: 100%|██████████| 6250/6250 [00:20<00:00, 297.65it/s]


Epoch [9/10], Loss: 0.3000


Epoch 10/10: 100%|██████████| 6250/6250 [00:21<00:00, 292.73it/s]

Epoch [10/10], Loss: 0.2973
MLP Classifier Training Complete!





In [28]:
from sklearn.metrics import accuracy_score, f1_score, classification_report

# Switch the MLP to evaluation mode before inference
mlp.eval()

# Score the whole training set in a single forward pass, without gradient tracking
with torch.no_grad():
    train_embeddings_tensor = torch.tensor(train_embeddings, dtype=torch.float32).to(device)
    outputs = mlp(train_embeddings_tensor)
    train_preds = outputs.argmax(dim=1).cpu().numpy()  # index of the highest score per row

# Re-code predictions and ground truth to {-1, 1}: 1 stays 1, anything else becomes -1
train_preds = list(map(lambda pred: 1 if pred == 1 else -1, train_preds))
train_labels_converted = list(map(lambda label: 1 if label == 1 else -1, train_labels))

# Accuracy and binary F1 with 1 as the positive class
train_accuracy = accuracy_score(train_labels_converted, train_preds)
train_f1 = f1_score(train_labels_converted, train_preds, average="binary", pos_label=1)

# Summary metrics, one per line
print(
    "Training Set Evaluation:",
    f"Accuracy: {train_accuracy:.4f}",
    f"F1-Score: {train_f1:.4f}",
    sep="\n",
)

# Per-class precision / recall / F1 breakdown
print(
    "\nClassification Report:",
    classification_report(train_labels_converted, train_preds, target_names=["negative (-1)", "positive (1)"]),
    sep="\n",
)

Training Set Evaluation:
Accuracy: 0.8761
F1-Score: 0.8807

Classification Report:
               precision    recall  f1-score   support

negative (-1)       0.91      0.84      0.87    100000
 positive (1)       0.85      0.91      0.88    100000

     accuracy                           0.88    200000
    macro avg       0.88      0.88      0.88    200000
 weighted avg       0.88      0.88      0.88    200000



In [11]:
from helpers import create_csv_submission
import numpy as np

# Embed the test tweets with the BERTweet encoder
print("Extracting test embeddings...")
test_embeddings = extract_embeddings(test_tweets, batch_size=8)

# Classify the test embeddings with the trained MLP
print("Predicting sentiments on test data...")
mlp.eval()  # evaluation mode for inference

with torch.no_grad():
    test_embeddings_tensor = torch.tensor(test_embeddings, dtype=torch.float32).to(device)
    test_outputs = mlp(test_embeddings_tensor)
    test_preds = test_outputs.argmax(dim=1).cpu().numpy()  # index of the highest score per row

# Submission convention: 1 for positive, -1 for negative
test_preds = list(map(lambda pred: 1 if pred == 1 else -1, test_preds))

# 1-based tweet ids, one per prediction
tweet_indices = np.arange(len(test_preds)) + 1

# Save predictions using the provided function.
# NOTE(review): the logistic-regression prediction cell writes to this same
# file name, so whichever of the two cells runs last determines its content.
create_csv_submission(tweet_indices, test_preds, "test_predictions.csv")
print("Finished creating 'test_predictions.csv'")

Extracting test embeddings...


Extracting Embeddings: 100%|██████████| 1250/1250 [01:03<00:00, 19.63it/s]


Predicting sentiments on test data...
Finished creating 'test_predictions.csv'


## MLP2 classifier (two hidden layers with batch normalization)

In [29]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from tqdm import tqdm

class MLPClassifier(nn.Module):
    """Two-hidden-layer perceptron with batch normalization.

    Each hidden stage is Linear -> BatchNorm1d -> ReLU -> Dropout. The second
    stage is half as wide as the first, and a final Linear layer produces
    ``num_classes`` scores.

    NOTE(review): this reuses the name of the MLPClassifier defined in the
    earlier "MLP classifier" cell and replaces it once this cell has run.

    Args:
        input_size: width of each input vector.
        hidden_size: width of the first hidden layer.
        num_classes: number of output scores per sample.
        dropout: dropout probability applied after each hidden stage.
    """

    def __init__(self, input_size=768, hidden_size=256, num_classes=2, dropout=0.5):
        super().__init__()
        half_hidden = hidden_size // 2
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.bn1 = nn.BatchNorm1d(hidden_size)
        self.fc2 = nn.Linear(hidden_size, half_hidden)
        self.bn2 = nn.BatchNorm1d(half_hidden)
        self.output = nn.Linear(half_hidden, num_classes)
        self.dropout = nn.Dropout(dropout)
        self.relu = nn.ReLU()

    def forward(self, x):
        # Both hidden stages share the same layout, so apply them in a loop.
        for linear, norm in ((self.fc1, self.bn1), (self.fc2, self.bn2)):
            x = self.dropout(self.relu(norm(linear(x))))
        return self.output(x)

# Wrap the cached embeddings (float32) and labels (int64 class indices) as tensors
train_embeddings_tensor = torch.tensor(train_embeddings, dtype=torch.float32)
train_labels_tensor = torch.tensor(train_labels, dtype=torch.long)

# Shuffled mini-batch loader over the training set
batch_size = 32
train_dataset = TensorDataset(train_embeddings_tensor, train_labels_tensor)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Model, loss and optimizer. There is no .to(device) here: neither the model
# nor the batches are moved to another device in this cell.
mlp2 = MLPClassifier(input_size=768, hidden_size=256, num_classes=2, dropout=0.3)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(mlp2.parameters(), lr=1e-4, weight_decay=1e-5)

num_epochs = 10
mlp2.train()
for epoch in range(num_epochs):
    epoch_loss = 0.0
    progress = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}")
    for batch_embeddings, batch_labels in progress:
        # Clear gradients from the previous step, then forward / backward / update
        optimizer.zero_grad()
        outputs = mlp2(batch_embeddings)
        loss = criterion(outputs, batch_labels)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()

    # Report the mean per-batch loss for this epoch
    print(f"Epoch [{epoch+1}/{num_epochs}] - Loss: {epoch_loss/len(train_loader):.4f}")

Epoch 1/10: 100%|██████████| 6250/6250 [00:08<00:00, 757.15it/s]


Epoch [1/10] - Loss: 0.3754


Epoch 2/10: 100%|██████████| 6250/6250 [00:06<00:00, 908.41it/s]


Epoch [2/10] - Loss: 0.3237


Epoch 3/10: 100%|██████████| 6250/6250 [00:06<00:00, 918.72it/s]


Epoch [3/10] - Loss: 0.3111


Epoch 4/10: 100%|██████████| 6250/6250 [00:06<00:00, 943.96it/s] 


Epoch [4/10] - Loss: 0.3034


Epoch 5/10: 100%|██████████| 6250/6250 [00:06<00:00, 907.65it/s] 


Epoch [5/10] - Loss: 0.2976


Epoch 6/10: 100%|██████████| 6250/6250 [00:06<00:00, 966.67it/s]


Epoch [6/10] - Loss: 0.2948


Epoch 7/10: 100%|██████████| 6250/6250 [00:06<00:00, 898.14it/s] 


Epoch [7/10] - Loss: 0.2901


Epoch 8/10: 100%|██████████| 6250/6250 [00:06<00:00, 926.22it/s]


Epoch [8/10] - Loss: 0.2874


Epoch 9/10: 100%|██████████| 6250/6250 [00:06<00:00, 913.34it/s]


Epoch [9/10] - Loss: 0.2855


Epoch 10/10: 100%|██████████| 6250/6250 [00:06<00:00, 898.30it/s] 

Epoch [10/10] - Loss: 0.2820





In [30]:
from sklearn.metrics import accuracy_score, f1_score, classification_report

# Switch mlp2 to evaluation mode before inference
mlp2.eval()

# Score the whole training set in a single forward pass, without gradient tracking
with torch.no_grad():
    train_embeddings_tensor = torch.tensor(train_embeddings, dtype=torch.float32)
    outputs = mlp2(train_embeddings_tensor)
    train_preds = outputs.argmax(dim=1).numpy()  # index of the highest score per row

# Re-code predictions and ground truth to {-1, 1}: 1 stays 1, anything else becomes -1
train_preds = list(map(lambda pred: 1 if pred == 1 else -1, train_preds))
train_labels_converted = list(map(lambda label: 1 if label == 1 else -1, train_labels))

# Accuracy and binary F1 with 1 as the positive class
train_accuracy = accuracy_score(train_labels_converted, train_preds)
train_f1 = f1_score(train_labels_converted, train_preds, average="binary", pos_label=1)

# Summary metrics, one per line
print(
    "Training Set Evaluation:",
    f"Accuracy: {train_accuracy:.4f}",
    f"F1-Score: {train_f1:.4f}",
    sep="\n",
)

# Per-class precision / recall / F1 breakdown
print(
    "\nClassification Report:",
    classification_report(train_labels_converted, train_preds, target_names=["negative (-1)", "positive (1)"]),
    sep="\n",
)

Training Set Evaluation:
Accuracy: 0.8958
F1-Score: 0.8961

Classification Report:
               precision    recall  f1-score   support

negative (-1)       0.90      0.89      0.90    100000
 positive (1)       0.89      0.90      0.90    100000

     accuracy                           0.90    200000
    macro avg       0.90      0.90      0.90    200000
 weighted avg       0.90      0.90      0.90    200000



In [14]:
from helpers import create_csv_submission
import numpy as np

# Embed the test tweets with the BERTweet encoder
print("Extracting test embeddings...")
test_embeddings = extract_embeddings(test_tweets, batch_size=8)

# Classify the test embeddings with mlp2
print("Predicting sentiments on test data...")
mlp2.eval()  # evaluation mode for inference

with torch.no_grad():
    test_embeddings_tensor = torch.tensor(test_embeddings, dtype=torch.float32)
    test_outputs = mlp2(test_embeddings_tensor)
    test_preds = test_outputs.argmax(dim=1).numpy()  # index of the highest score per row

# Submission convention: 1 for positive, -1 for negative
test_preds = list(map(lambda pred: 1 if pred == 1 else -1, test_preds))

# 1-based tweet ids, one per prediction
tweet_indices = np.arange(len(test_preds)) + 1

# Save predictions using the provided function
create_csv_submission(tweet_indices, test_preds, "test_predictions_mlp2.csv")
print("Finished creating 'test_predictions_mlp2.csv'")

Extracting test embeddings...


Extracting Embeddings: 100%|██████████| 1250/1250 [01:03<00:00, 19.79it/s]


Predicting sentiments on test data...
Finished creating 'test_predictions_mlp2.csv'


## SVM classifier

In [None]:
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score, classification_report

# Assumes train_embeddings and train_labels are already loaded by the earlier
# cells; train_labels_np is created here as a NumPy copy of the labels.

train_labels_np = np.array(train_labels)


# Step 1: Standardize the embeddings before PCA
print("Standardizing the embeddings...")
scaler = StandardScaler()
train_embeddings_scaled = scaler.fit_transform(train_embeddings)

# Step 2: Apply PCA to reduce dimensionality
print("Applying PCA to reduce dimensionality...")
pca = PCA(n_components=50)  # Retain the top 50 principal components
train_embeddings_reduced = pca.fit_transform(train_embeddings_scaled)

print(f"Original shape: {train_embeddings.shape}, Reduced shape: {train_embeddings_reduced.shape}")

# Step 3: Train SVM Classifier on the standardized, PCA-reduced features.
# Anything passed to svm_clf.predict later must go through the same
# scaler + pca transformation.
print("Training SVM Classifier...")
svm_clf = SVC(kernel='rbf', C=1.0, gamma='scale', verbose=True)  # RBF kernel, default hyperparameters
svm_clf.fit(train_embeddings_reduced, train_labels_np)


Standardizing the embeddings...
Applying PCA to reduce dimensionality...
Original shape: (200000, 768), Reduced shape: (200000, 50)
Training SVM Classifier...
[LibSVM].......................

In [35]:
RED = 2000  # only the first RED training rows are evaluated in this cell
eval_rows = slice(None, RED)

# Step 4: Evaluate on Training Set (subset)
print("Evaluating SVM on Training Set...")
train_preds = svm_clf.predict(train_embeddings_reduced[eval_rows])

# Re-code predictions and ground truth to {-1, 1}: 1 stays 1, anything else becomes -1
train_preds = list(map(lambda pred: 1 if pred == 1 else -1, train_preds))
train_labels_converted = list(map(lambda label: 1 if label == 1 else -1, train_labels_np[eval_rows]))

# Accuracy and binary F1 with 1 as the positive class
train_accuracy = accuracy_score(train_labels_converted, train_preds)
train_f1 = f1_score(train_labels_converted, train_preds, average="binary", pos_label=1)

# Summary metrics, one per line
print(
    "Training Set Evaluation:",
    f"Accuracy: {train_accuracy:.4f}",
    f"F1-Score: {train_f1:.4f}",
    sep="\n",
)

# Per-class precision / recall / F1 breakdown
print(
    "\nClassification Report:",
    classification_report(train_labels_converted, train_preds, target_names=["negative (-1)", "positive (1)"]),
    sep="\n",
)


Evaluating SVM on Training Set...
Training Set Evaluation:
Accuracy: 0.7290
F1-Score: 0.7617

Classification Report:
               precision    recall  f1-score   support

negative (-1)       0.79      0.61      0.69       974
 positive (1)       0.69      0.84      0.76      1026

     accuracy                           0.73      2000
    macro avg       0.74      0.73      0.72      2000
 weighted avg       0.74      0.73      0.72      2000



In [None]:
# Evaluate on the full training set
print("Evaluating SVM on Training Set...")
# The SVM was fitted on the standardized, PCA-reduced embeddings
# (train_embeddings_reduced), so predictions must use that same feature space.
# Passing the raw train_embeddings, which has a different number of columns,
# makes predict() fail.
train_preds = svm_clf.predict(train_embeddings_reduced)

# Convert predictions and labels to -1 and 1 (labels are the same array the SVM was fitted with)
train_preds = [1 if pred == 1 else -1 for pred in train_preds]
train_labels_converted = [1 if label == 1 else -1 for label in train_labels_np]

# Compute accuracy and F1-score (binary, with 1 as the positive class)
train_accuracy = accuracy_score(train_labels_converted, train_preds)
train_f1 = f1_score(train_labels_converted, train_preds, average="binary", pos_label=1)

# Print evaluation metrics
print("Training Set Evaluation:")
print(f"Accuracy: {train_accuracy:.4f}")
print(f"F1-Score: {train_f1:.4f}")

# Print detailed classification report
print("\nClassification Report:")
print(classification_report(train_labels_converted, train_preds, target_names=["negative (-1)", "positive (1)"]))