In [1]:
import fasttext
import fasttext.util
import numpy as np
from sklearn.model_selection import train_test_split
import pandas as pd

# Load the FastText model
fasttext.util.download_model('en', if_exists='ignore')  # Download English model
ft = fasttext.load_model('cc.en.300.bin')  # Load the model

# Load and shuffle the dataset
df = pd.read_csv('datasets/cleaned_SOLID9M_learner.tsv', sep="\t")
df = df.sample(frac=1, random_state=42)

df['labels'] = df['average'].apply(lambda x: 1 if x >= 0.8 else 0) # threshold the average values

sample_size = 30000
positive_ratio = 0.75

# Select the most confident positive values
tweets_pos_df = df[df['average'] > 0.8].sample(n=np.floor(sample_size*positive_ratio).astype(int), random_state=1)

# Select the most confident negative values
tweets_neg_df = df[df['average'] < 0.2].sample(n=np.floor(sample_size*(1-positive_ratio)).astype(int), random_state=1)

tweets_df = pd.concat([tweets_pos_df, tweets_neg_df])
tweets_df = tweets_df.sample(frac=1, random_state=42)

tweets = tweets_df['text'].values
labels = tweets_df['labels'].values

# tweets = df['tweet'].values
# labels = df['label'].values

# Split the dataset into training and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(tweets, labels, test_size=0.2, random_state=42)

# Tokenize and encode the training and validation texts using FastText
def embed_text(text, model):
    words = text.split()
    word_vectors = [model.get_word_vector(word) for word in words]
    return np.mean(word_vectors, axis=0)  # Average word vectors

train_encodings = np.array([embed_text(text, ft) for text in train_texts])
val_encodings = np.array([embed_text(text, ft) for text in val_texts])

# Check the vocabulary size (if needed)
vocab_size = len(ft.words)

del ft



In [2]:
import torch
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence

class TweetDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        # item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item = {'embedding': torch.tensor(self.encodings[idx])}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)
    
def collate_fn(batch):
    embeddings = [item['embedding'] for item in batch]
    labels = torch.tensor([item['labels'] for item in batch])
    lengths = torch.tensor([len(embedding) for embedding in embeddings])
    padded_embeddings = pad_sequence(embeddings, batch_first=True, padding_value=0.0)
    return {'embedding': padded_embeddings, 'labels': labels, 'lengths': lengths}

train_dataset = TweetDataset(train_encodings, train_labels)
val_dataset = TweetDataset(val_encodings, val_labels)

train_loader = DataLoader(train_dataset, batch_size=10, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=10, shuffle=False, collate_fn=collate_fn)

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
# LSTM Model
class LSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers):
        super(LSTMModel, self).__init__()
        self.hidden_size = hidden_size
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_size * 2, output_size)
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, embeddings):
        lstm_out, _ = self.lstm(embeddings)

        # lstm_out = torch.cat((lstm_out[:, -1, :self.hidden_size], lstm_out[:, 0, self.hidden_size:]), dim=1)
        out = self.fc(lstm_out)
        return self.sigmoid(out)

# Model setup
input_size = 300  # FastText embedding size
hidden_size = 512
output_size = 1
model = LSTMModel(input_size, hidden_size, output_size, num_layers=2)
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
# Training
num_epochs = 3
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch in train_loader:
        # input_ids = batch['input_ids'].to(device)
        embeddings = batch['embedding'].to(device)
        labels = batch['labels'].float().to(device)
        labels = labels.squeeze()
        # lengths = batch['lengths']
        # attention_mask = batch['attention_mask'].to(device)
        
        optimizer.zero_grad()
        outputs = model(embeddings)
        outputs = outputs.squeeze()

        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    average_loss = total_loss / len(train_loader)
    print(f"Epoch [{epoch+1}/{num_epochs}], Average Loss: {average_loss:.4f}")

  _torch_pytree._register_pytree_node(


Epoch [1/3], Average Loss: 0.0950
Epoch [2/3], Average Loss: 0.0430
Epoch [3/3], Average Loss: 0.0372


In [4]:
import numpy as np
import time
# Evaluation
model.eval()
# Perform evaluation on validation set and calculate metrics as needed
# Example: calculate accuracy
correct = 0
total = 0
i = 0
prediction_list = np.array([])

with torch.no_grad():
    test_start  = time.time()
    print('start')
    for batch in val_loader:
        # input_ids = batch['input_ids'].to(device)
        embeddings = batch['embedding'].to(device)

        labels = batch['labels'].float()
        # attention_mask = batch['attention_mask'].to(device)

        outputs = model(embeddings).detach().cpu()
        predictions = torch.round(outputs.squeeze())
        correct += ((outputs.squeeze() > 0.5) == labels).sum().item()

        total += labels.size(0)
        prediction_list = np.append(prediction_list, predictions.numpy())
    print('end')
    
    test_end = time.time()

accuracy = correct / total
print(f'Validation Accuracy: {accuracy}')


start
end
Validation Accuracy: 0.9781666666666666


In [5]:
torch.save(model.state_dict(), 'models/SOLID_bilstm.pth')

In [6]:
print(model)

LSTMModel(
  (lstm): LSTM(300, 512, num_layers=2, batch_first=True, bidirectional=True)
  (fc): Linear(in_features=1024, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)


In [7]:
from sklearn.metrics import classification_report

# Assuming you have the true labels in `val_labels` and the predicted labels in `prediction_list`
report = classification_report(val_labels, prediction_list)

print(report)

              precision    recall  f1-score   support

           0       0.99      0.92      0.95      1488
           1       0.97      1.00      0.99      4512

    accuracy                           0.98      6000
   macro avg       0.98      0.96      0.97      6000
weighted avg       0.98      0.98      0.98      6000



In [8]:
from sklearn.metrics import confusion_matrix

# Assuming you have the true labels in `val_labels` and the predicted labels in `prediction_list`
cm = confusion_matrix(val_labels, prediction_list)

# Extract TP, TN, FP, FN from the confusion matrix
TP = cm[1, 1]
TN = cm[0, 0]
FP = cm[0, 1]
FN = cm[1, 0]

print(f"True Positives (TP): {TP}")
print(f"True Negatives (TN): {TN}")
print(f"False Positives (FP): {FP}")
print(f"False Negatives (FN): {FN}")


True Positives (TP): 4499
True Negatives (TN): 1370
False Positives (FP): 118
False Negatives (FN): 13


In [9]:
# import pandas as pd

# # Create a DataFrame with the validation texts and labels
# validation_df = pd.DataFrame({'text': val_texts, 'label': val_labels})

# # Add the prediction list as a new column to the DataFrame
# validation_df['prediction'] = prediction_list

# # Save the DataFrame as a CSV file
# validation_df.to_csv('OLID_validation_with_predictions.csv', index=False)
