In [91]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import os
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from torch.utils.data import Dataset, DataLoader
from difftransformer import DifferentialTransformerClassifier, EmbeddingLayer

In [92]:
# Basic preprocessing function
stop_words = set(stopwords.words('english'))
def preprocess_text(text):
    # Lowercasing
    text = text.lower()
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # Tokenization
    words = text.split()
    # Remove stopwords
    words = [word for word in words if word not in stop_words]
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(words)

In [93]:
def set_seed(seed):
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# Set the seed
set_seed(42)

In [94]:
# Download some NLP models for processing, optional
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /users/eleves-a/2022/amine.chraibi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /users/eleves-a/2022/amine.chraibi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [95]:
# Read all training files and concatenate them into one dataframe
li = []
for filename in os.listdir("train_tweets"):
    df = pd.read_csv("train_tweets/" + filename)
    li.append(df)
df = pd.concat(li, ignore_index=True)

In [96]:
# Apply preprocessing to each tweet # 13 min
df['Tweet'] = df['Tweet'].apply(preprocess_text)

In [97]:
def get_tweet_embedding(tweet, embeddings_model, vector_size):
    """
    Convert a tweet into a sequence of embeddings.
    """
    tokens = tweet.lower().split()
    embeddings = []
    for token in tokens:
        embedding = embeddings_model.get(token)
        if embedding is not None:
            embeddings.append(embedding)
        else:
            embeddings.append(np.zeros(vector_size))  # Handle unknown words
    return embeddings

In [98]:
class SimpleTokenizer:
    def __init__(self):
        self.word2idx = {'<PAD>': 0, '<UNK>': 1}
        self.idx = 2
    
    def build_vocab(self, texts):
        for text in texts:
            words = text.lower().split()
            for word in words:
                if word not in self.word2idx:
                    self.word2idx[word] = self.idx
                    self.idx += 1
    
    def __call__(self, text):
        return [self.word2idx.get(word, self.word2idx['<UNK>']) for word in text.lower().split()]
    
    def vocab_size(self):
        return len(self.word2idx)

In [99]:
# Group by periodId and count the number of tweets per period
tweets_per_period = df.groupby('PeriodID')['Tweet'].count()

# Find the maximum number of tweets in any period
max_tweets_per_period = tweets_per_period.max()
min_tweets_per_period = tweets_per_period.min()

# Calculate the number of words in each tweet
df['word_count'] = df['Tweet'].apply(lambda x: len(str(x).split()))

# Find the maximum number of words in any tweet
max_words_per_tweet = df['word_count'].max()

# Display the results
print("Maximum number of tweets per period:", max_tweets_per_period)
print("Minimum number of tweets per period:", min_tweets_per_period)
print("Maximum number of words in a tweet:", max_words_per_tweet)

Maximum number of tweets per period: 48710
Minimum number of tweets per period: 5062
Maximum number of words in a tweet: 44


In [100]:
def train(model, dataloader, optimizer, criterion, device):
    model.train()
    total_loss = 0
    for tweets, labels in dataloader:
        tweets = tweets.to(device)
        labels = labels.to(device)
        
        optimizer.zero_grad()
        outputs = model(tweets)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    return total_loss / len(dataloader)

In [101]:
import random
import torch
from torch.utils.data import Dataset, DataLoader

# Tokenizer and padding setup
all_tweets = df['Tweet'].values
tokenizer = SimpleTokenizer()
tokenizer.build_vocab(all_tweets)

MAX_TWEET_LENGTH = 44  # Maximum tweet length in tokens
NUM_TWEETS_PER_PERIOD = 92  # Fixed number of tweets per period

grouped_tweets = df.groupby(['MatchID', 'PeriodID'])['Tweet'].apply(list).unstack(fill_value=[])
grouped_labels = df.groupby(['MatchID', 'PeriodID'])['EventType'].max().unstack(fill_value=0)

def pad_tweet(tokens, max_length=MAX_TWEET_LENGTH):
    """Pad or truncate tokens to max_length."""
    if len(tokens) < max_length:
        return tokens + [tokenizer.word2idx['<PAD>']] * (max_length - len(tokens))
    else:
        return tokens[:max_length]


def sample_tweets_or_pad(period):
    """Randomly select NUM_TWEETS_PER_PERIOD tweets from the period or pad if empty."""
    if len(period) == 0:  # If the period is empty
        return [[tokenizer.word2idx['<PAD>']] * MAX_TWEET_LENGTH] * NUM_TWEETS_PER_PERIOD

    # Randomly sample NUM_TWEETS_PER_PERIOD tweets or pad if fewer
    sampled_tweets = random.sample(period, min(len(period), NUM_TWEETS_PER_PERIOD))
    padded_tweets = [pad_tweet(tokenizer(tweet)) for tweet in sampled_tweets]

    # If fewer than NUM_TWEETS_PER_PERIOD, pad with <PAD> tweets
    while len(padded_tweets) < NUM_TWEETS_PER_PERIOD:
        padded_tweets.append([tokenizer.word2idx['<PAD>']] * MAX_TWEET_LENGTH)

    return padded_tweets


def tokenize_and_sample_grouped_tweets(grouped_tweets):
    """Tokenize tweets and ensure consistent NUM_TWEETS_PER_PERIOD for each period."""
    tokenized_matches = []
    for _, periods in grouped_tweets.iterrows():
        tokenized_match = [sample_tweets_or_pad(period) for period in periods]
        tokenized_matches.append(tokenized_match)
    return tokenized_matches


# Tokenize and sample tweets
tokenized_and_sampled_tweets = tokenize_and_sample_grouped_tweets(grouped_tweets)
labels = grouped_labels.fillna(0).values.tolist()  # Ensure labels are padded with 0 for missing periods


# Dataset class
class TweetDataset(Dataset):
    def __init__(self, features, labels):
        self.features = features
        self.labels = labels

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        match_features = torch.tensor(self.features[idx], dtype=torch.long)  # (num_periods, NUM_TWEETS_PER_PERIOD, MAX_TWEET_LENGTH)
        match_labels = torch.tensor(self.labels[idx], dtype=torch.bfloat16)  # (num_periods,)
        return match_features, match_labels


# Create Dataset and DataLoader
dataset = TweetDataset(tokenized_and_sampled_tweets, labels)
dataloader = DataLoader(dataset, batch_size=1, shuffle=True)



In [102]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
batch_size = 1
vocab_size = tokenizer.vocab_size()
depth = 2
batch_size = 1
n_embd = 162
n_head = 3
dropout = 0.2

In [103]:
import torch.nn as nn
from torch.nn import functional as F
torch.set_default_dtype(torch.bfloat16)
class EmbeddingLayer(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(EmbeddingLayer, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
    
    def forward(self, x):
        return self.embedding(x)

class Head(nn.Module):
    """ one head of self-attention """
    def __init__(self, embedding_dim, head_size):
        super().__init__()
        self.key_1 = nn.Linear(embedding_dim, head_size, bias=False)
        self.query_1 = nn.Linear(embedding_dim, head_size, bias=False)
        self.key_2 = nn.Linear(embedding_dim, head_size, bias=False)
        self.query_2 = nn.Linear(embedding_dim, head_size, bias=False)
        self.value = nn.Linear(embedding_dim, head_size, bias=False)

    def forward(self, x, lamb):
        B, T, C = x.shape
        k_1 = self.key_1(x)
        q_1 = self.query_1(x)
        k_2 = self.key_2(x)
        q_2 = self.query_2(x)
        wei_1 = q_1 @ k_1.transpose(-2,-1) * k_1.shape[-1]**-0.5
        wei_2 = q_2 @ k_2.transpose(-2,-1) * k_2.shape[-1]**-0.5
        wei_1 = F.softmax(wei_1, dim=-1)
        wei_2 = F.softmax(wei_2, dim=-1)
        v = self.value(x)
        wei = wei_1 - lamb * wei_2
        out = wei @ v
        return out

class MultiHeadDifferentialAttention(nn.Module):
    """ multiple heads of self-attention in parallel """
    def __init__(self, embedding_dim, num_heads, lambda_init=0.8):
        super().__init__()
        head_size = embedding_dim // num_heads
        self.heads = nn.ModuleList([Head(embedding_dim, head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(embedding_dim, embedding_dim)
        self.norm = nn.LayerNorm(embedding_dim)
        self.lamb = nn.Parameter(torch.tensor(lambda_init))

    def forward(self, x):
        out = torch.cat([h(x, self.lamb) for h in self.heads], dim=-1)
        out = self.norm(out)
        out = (1 - self.lamb) * out
        return out

class Block(nn.Module):
    """ Transformer block: communication followed by computation """
    def __init__(self, embedding_dim, num_heads):
        super().__init__()
        self.sa = MultiHeadDifferentialAttention(embedding_dim, num_heads)
        self.ffwd = FeedFoward(embedding_dim)
        self.ln1 = nn.LayerNorm(embedding_dim)
        self.ln2 = nn.LayerNorm(embedding_dim)

    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x

class FeedFoward(nn.Module):
    """ a simple linear layer followed by a non-linearity """
    def __init__(self, embedding_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(embedding_dim, 4 * embedding_dim),
            nn.ReLU(),
            nn.Linear(4 * embedding_dim, embedding_dim),
        )

    def forward(self, x):
        return self.net(x)


class Block(nn.Module):
    """ Transformer block: communication followed by computation """

    def __init__(self, embedding_dim, num_heads):
        # embedding_dim: embedding dimension, num_heads: the number of heads we'd like
        super().__init__()
        head_size = embedding_dim // num_heads
        self.sa = MultiHeadDifferentialAttention(embedding_dim, num_heads)
        self.ffwd = FeedFoward(embedding_dim)
        self.ln1 = nn.LayerNorm(embedding_dim)
        self.ln2 = nn.LayerNorm(embedding_dim)

    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x

class TweetEncoder(nn.Module):
    """Encodes a sequence of tokens into a single tweet embedding."""
    def __init__(self, embedding_dim, num_heads):
        super(TweetEncoder, self).__init__()
        self.attention = MultiHeadDifferentialAttention(embedding_dim, num_heads)
        self.fc = nn.Linear(embedding_dim, embedding_dim)

    def forward(self, x):
        attn_output = self.attention(x)
        tweet_embedding = attn_output.mean(dim=1)
        return self.fc(tweet_embedding)

# Define the period encoder
class PeriodEncoder(nn.Module):
    def __init__(self, embedding_dim, num_heads, depth):
        super(PeriodEncoder, self).__init__()
        self.layers = nn.ModuleList([Block(embedding_dim, num_heads) for _ in range(depth)])
    
    def forward(self, x):
        for layer in self.layers:
            x = layer(x)
        return x

# Define the classification head
class ClassificationHead(nn.Module):
    def __init__(self, embedding_dim):
        super(ClassificationHead, self).__init__()
        self.fc = nn.Linear(embedding_dim, 1)

    def forward(self, x):
        return torch.sigmoid(self.fc(x))
    

# Define the full model
class DifferentialTransformerClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, num_heads, depth):
        super(DifferentialTransformerClassifier, self).__init__()
        self.embedding = EmbeddingLayer(vocab_size, embedding_dim)
        self.tweet_encoder = TweetEncoder(embedding_dim, num_heads)
        self.period_encoder = PeriodEncoder(embedding_dim, num_heads, depth)
        self.classifier = ClassificationHead(embedding_dim)
        self.embedding_dim = embedding_dim
    
    def forward(self, features):
        """
        Args:
            features: Tensor of shape (batch_size, num_periods, num_tweets_per_period, tweet_length)
                        Represents the tokenized and padded tweets grouped by period.
        Returns:
            Output: Tensor of shape (batch_size,)
                    Binary predictions for each period in the batch.
        """
        batch_size, num_periods, num_tweets_per_period, tweet_length = features.shape

        # Flatten for embedding
        tweets_flat = features.view(-1, tweet_length)  # (batch_size * num_periods * num_tweets_per_period, tweet_length)
        x = self.embedding(tweets_flat)  # (batch_size * num_periods * num_tweets_per_period, tweet_length, embedding_dim)
        # Encode each tweet
        x = self.tweet_encoder(x)  # (batch_size * num_periods * num_tweets_per_period, embedding_dim)

        # Reshape back to periods
        x = x.view(batch_size * num_periods, num_tweets_per_period, self.embedding_dim)  # (batch_size * num_periods, num_tweets_per_period, embedding_dim)
        
        # Encode the periods
        x = self.period_encoder(x)  # (batch_size * num_periods, num_tweets_per_period, embedding_dim)

        # Pool over periods
        x = x.mean(dim=1)  # (batch_size * num_periods, embedding_dim)
                
        # Classification
        out = self.classifier(x)  # (batch_size, 1)
        
        out = out.view(batch_size, num_periods)  # (batch_size, num_periods)

        return out

In [104]:
model = DifferentialTransformerClassifier(
    vocab_size=vocab_size,
    embedding_dim=n_embd,  # Ensure this matches the dimension used in embeddings
    num_heads=n_head,
    depth=depth
).to(device)
model.to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
criterion = nn.BCELoss()


In [105]:
# Training loop
epochs = 10
for epoch in range(epochs):
    loss = train(model, dataloader, optimizer, criterion, device)
    print(f'Epoch {epoch+1}/{epochs}, Loss: {loss:.4f}')

Epoch 1/10, Loss: 0.6824


Epoch 2/10, Loss: 0.6310
Epoch 3/10, Loss: 0.5617
Epoch 4/10, Loss: 0.5312
Epoch 5/10, Loss: 0.5219
Epoch 6/10, Loss: 0.5147
Epoch 7/10, Loss: 0.5095
Epoch 8/10, Loss: 0.5095
Epoch 9/10, Loss: 0.5033
Epoch 10/10, Loss: 0.5024


In [112]:
def evaluate(model, dataloader, criterion, device):
    """
    Evaluate the model on the given dataloader.
    """
    model.eval()
    total_loss = 0
    all_labels = []
    all_predictions = []
    
    with torch.no_grad():
        for tweets, labels in dataloader:
            tweets = tweets.to(device)
            labels = labels.to(device)
            
            outputs = model(tweets)
            loss = criterion(outputs, labels)
            total_loss += loss.item()
            
            all_labels.extend(labels.cpu().float().numpy())
            all_predictions.extend((outputs.cpu().float().numpy() > 0.5).astype(int))
    
    
    avg_loss = total_loss / len(dataloader)
    return avg_loss, all_labels, all_predictions

In [107]:
# model = DifferentialTransformerClassifier(vocab_size, embedding_dim, n_heads, depth)
# model.load_state_dict(torch.load(model_save_path))
# model.to(device)

# Switch to evaluation mode
model.eval()

eval_li = []
for filename in os.listdir("val_tweets"):
    df_eval = pd.read_csv("val_tweets/" + filename)
    eval_li.append(df_eval)
df_eval = pd.concat(eval_li, ignore_index=True)

# Preprocess the evaluation data
df_eval['Tweet'] = df_eval['Tweet'].apply(preprocess_text)


In [108]:
df_eval.head()

Unnamed: 0,ID,MatchID,PeriodID,EventType,Timestamp,Tweet
0,18_0,18,0,0,1276869000000,usa stateside follower stand represent beautif...
1,18_0,18,0,0,1276869000000,lynz_ think ref might basil fawlty actually wo...
2,18_0,18,0,0,1276869000000,hoping usa win help ease pain last night loss ...
3,18_0,18,0,0,1276869000000,actually start worldcup
4,18_0,18,0,0,1276869000000,hanson roy proper pundit line worldcup


In [109]:
grouped_tweets = df_eval.groupby(['MatchID', 'PeriodID'])['Tweet'].apply(list).unstack(fill_value=[])
grouped_labels = df_eval.groupby(['MatchID', 'PeriodID'])['EventType'].max().unstack(fill_value=0)
tokenized_and_sampled_tweets = tokenize_and_sample_grouped_tweets(grouped_tweets)
labels = grouped_labels.fillna(0).values.tolist() 
# Create Dataset and DataLoader
dataset = TweetDataset(tokenized_and_sampled_tweets, labels)
dataloader = DataLoader(dataset, batch_size=1, shuffle=True)



NameError: name 'PeriodDataset' is not defined

In [113]:

# Evaluate the model
eval_loss, eval_labels, eval_predictions = evaluate(model, dataloader, criterion, device)

# Print evaluation results
from sklearn.metrics import classification_report, accuracy_score

print(f"Evaluation Loss: {eval_loss:.4f}")
print("Classification Report:")
print(classification_report(eval_labels, eval_predictions))
print(f"Accuracy: {accuracy_score(eval_labels, eval_predictions):.4f}")

Evaluation Loss: 0.7396
Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       0.00      0.00      0.00         0
           2       0.33      1.00      0.50         1
           3       0.67      1.00      0.80         2
           4       0.33      1.00      0.50         1
           5       0.67      1.00      0.80         2
           6       1.00      1.00      1.00         3
           7       1.00      1.00      1.00         3
           8       0.67      1.00      0.80         2
           9       0.67      1.00      0.80         2
          10       0.67      1.00      0.80         2
          11       0.67      1.00      0.80         2
          12       0.67      1.00      0.80         2
          13       0.33      1.00      0.50         1
          14       0.00      0.00      0.00         0
          15       0.00      0.00      0.00         0
          16       0.00      0.00 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
