In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import os
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from torch.utils.data import Dataset, DataLoader
from difftransformer import DifferentialTransformerClassifier, EmbeddingLayer
import utils

[nltk_data] Downloading package stopwords to
[nltk_data]     /users/eleves-a/2022/amine.chraibi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /users/eleves-a/2022/amine.chraibi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Preparing dataset
Data and tokenizer ready


In [2]:
# --- Runtime target -------------------------------------------------------
# Use the GPU when one is visible to torch, otherwise run on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# --- Model hyperparameters ------------------------------------------------
# NOTE(review): these presumably have to match the values used when the
# checkpoint was trained -- confirm against the training notebook.
n_embd, n_head, depth = 144, 4, 5
dropout = 0.014500254910782884
# Vocabulary size is read from the tokenizer exposed by the `utils` module.
vocab_size = utils.tokenizer.vocab_size()

# --- Inference settings ---------------------------------------------------
batch_size = 32

In [3]:
# Load every evaluation file and stack them into one DataFrame.
#
# - os.listdir() returns entries in arbitrary order, so the listing is sorted
#   to make the row order of `test_df` reproducible across runs and machines.
# - Entries that are not regular files (for example the `.ipynb_checkpoints`
#   directory Jupyter may create) are skipped instead of being handed to
#   pd.read_csv.
eval_dir = "eval_tweets"
eval_frames = []
for filename in sorted(os.listdir(eval_dir)):
    eval_path = os.path.join(eval_dir, filename)
    if not os.path.isfile(eval_path):
        continue
    eval_frames.append(pd.read_csv(eval_path))

# ignore_index=True gives the combined frame a fresh 0..n-1 index.
test_df = pd.concat(eval_frames, ignore_index=True)

In [5]:
# Run the project's text preprocessing over every tweet.
# NOTE: the 'Tweet' column is overwritten in place, so re-running this cell
# applies utils.preprocess_text a second time to already-processed text.
cleaned_tweets = test_df['Tweet'].apply(utils.preprocess_text)
test_df['Tweet'] = cleaned_tweets


In [6]:
import torch
import os
import pandas as pd

# Build the classifier with the hyperparameters from the config cell, then
# restore the trained weights from disk.
model = DifferentialTransformerClassifier(
    vocab_size=vocab_size,
    embedding_dim=n_embd,  # Ensure this matches the dimension used in embeddings
    num_heads=n_head,
    depth=depth,
    dropout=dropout,
)

# map_location remaps the stored tensors onto `device`; without it a
# checkpoint saved from a GPU run cannot be loaded when only the CPU is
# available (the fallback `device` explicitly allows for).
model.load_state_dict(torch.load("model_checkpoint_10.pth", map_location=device))
model.to(device)

# Switch to evaluation mode (disables dropout for inference).
model.eval()

# Prepare for predictions
def majority_vote(subperiod_predictions):
    """
    Return the most frequent label among the sub-period predictions.

    Args:
        subperiod_predictions: iterable of hashable labels (e.g. 0/1 ints).

    Returns:
        The label that occurs most often. Ties are resolved toward the
        smallest label, so for binary 0/1 votes a tie yields 0. This makes
        the result independent of set/hash iteration order.

    Raises:
        ValueError: if no predictions are supplied.
    """
    from collections import Counter

    # Single pass over the votes instead of one list.count() per label.
    counts = Counter(subperiod_predictions)
    if not counts:
        raise ValueError("majority_vote() requires at least one prediction")

    top_count = max(counts.values())
    # Explicit, deterministic tie-break: smallest label among the winners.
    return min(label for label, count in counts.items() if count == top_count)

# Prepare the evaluation tensor. utils.prepare_dataset returns the encoded
# samples plus a mapping from (match_id, period_id) to the indices of the
# samples that belong to that period.
# NOTE(review): the second argument (False) presumably selects inference
# mode / no labels -- confirm against utils.prepare_dataset.
test_dataset, period_to_subperiod_mapping = utils.prepare_dataset(test_df, False)
test_dataset = torch.tensor(test_dataset, device=device)

# Predict on subperiods, `batch_size` samples per forward pass instead of one
# sample at a time. Slicing along dim 0 keeps the original sample order, so
# the indices stored in period_to_subperiod_mapping remain valid.
subperiod_predictions = []
with torch.no_grad():
    for start in range(0, len(test_dataset), batch_size):
        batch = test_dataset[start:start + batch_size]
        outputs = model(batch)  # one score per sample in the batch
        # Flatten so the result is always a flat list of ints, then threshold
        # at 0.5 to obtain binary predictions.
        # NOTE(review): assumes the model emits probabilities -- confirm.
        scores = outputs.float().reshape(-1).cpu().numpy()
        preds = (scores > 0.5).astype(int).tolist()
        subperiod_predictions.extend(preds)

# Collapse the per-subperiod votes into one label per "<match_id>_<period_id>".
# Periods whose index list is empty are left out of the result entirely.
period_predictions = {
    f'{match_id}_{period_id}': majority_vote(
        [subperiod_predictions[idx] for idx in subperiod_indices]
    )
    for (match_id, period_id), subperiod_indices in period_to_subperiod_mapping.items()
    if subperiod_indices
}

# Write the period-level predictions to disk as a two-column CSV.
submission_ids = list(period_predictions)
submission_labels = [period_predictions[period_key] for period_key in submission_ids]

output_df = pd.DataFrame({'ID': submission_ids, 'Prediction': submission_labels})

# index=False keeps the DataFrame's row index out of the file.
output_df.to_csv("submission.csv", index=False)
print("Predictions saved to submission.csv")



  self.lamb = nn.Parameter(torch.tensor(lambda_init))


Predictions saved to submission.csv
