In [1]:
import os
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModel
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import torch
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn
from time import time
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression

  from .autonotebook import tqdm as notebook_tqdm


## Load Data

In [2]:
def load_data(path, verbose=False):
    """
    Load and concatenate CSV files from a specified directory.

    This function reads all CSV files in the given directory, concatenates them into a single DataFrame,
    and optionally prints the first few rows and the shape of the resulting DataFrame.

    Args:
      path (str): The directory path containing the CSV files to be loaded.
      verbose (bool, optional): If True, prints the first few rows and the shape of the concatenated DataFrame. Defaults to False.

    Returns:
      pd.DataFrame: A DataFrame containing the concatenated data from all CSV files in the specified directory.
    """
    li = []
    for filename in os.listdir(path):
        df = pd.read_csv(os.path.join(path, filename))
        li.append(df)
    output = pd.concat(li)
    if verbose:
        print(output.head())
        print(f'The shape of the data is: {output.shape}')
    return output

## Concatenation of tweets

In [3]:
def concat_tweets(df, MAX_SUBGROUP=150, event_type=True):
    """
    For each group of tweets with the same ID/MatchID/PeriodID/EventType,
    We create `MAX_SUBGROUP` subgroups of tweets by concatenating them.
    Therefore if `MAX_SUBGROUP` = 1, we have 1 tweet per ID/MatchID/PeriodID/EventType
    If `MAX_SUBGROUP` = +inf, we have all the tweets in different subgroups.
    """

    ### Grouping tweets who have same timestamp by concatenating them
    # Create an array of random integers in {0, ..., MAX_SUBGROUP} of size len(df_train)
    df["random_id"] = np.random.randint(0, MAX_SUBGROUP, len(df))
    if event_type:
        df_bis = df.groupby(['ID', "MatchID", "PeriodID", "EventType", "random_id"])['Tweet'].apply(lambda x: ' '.join(x)).reset_index().drop(columns='random_id')
    else:
        df_bis = df.groupby(['ID', "MatchID", "PeriodID", "random_id"])['Tweet'].apply(lambda x: ' '.join(x)).reset_index().drop(columns='random_id')
    df_bis = df_bis.sample(frac=1).reset_index(drop=True)
    return df_bis

## Embedding

In [4]:
# Bertweet model for tweet embeddings
model_name = "vinai/bertweet-large"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model_Bertweet = AutoModel.from_pretrained(model_name)

Some weights of RobertaModel were not initialized from the model checkpoint at vinai/bertweet-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
def get_embeddings_in_batches(model, tokenizer, tweets, batch_size=10, device="cpu"):
  """
  Extracts [CLS] token embeddings and mean-pooled token embeddings from a list of tweets in batches.

  Args:
    model (transformers.PreTrainedModel): The pre-trained transformer model to use for generating embeddings.
    tokenizer (transformers.PreTrainedTokenizer): The tokenizer corresponding to the pre-trained model.
    tweets (list of str): A list of tweets to process.
    batch_size (int, optional): The number of tweets to process in each batch. Default is 100.
    device (str, optional): The device to run the model on ('cpu' or 'cuda'). Default is 'cpu'.

  Returns:
    np.ndarray: A NumPy array containing the concatenated [CLS] and mean-pooled token embeddings for all tweets.
  """

  embeddings = []
  dataloader = DataLoader(tweets, batch_size=batch_size, shuffle=False)
  model = model.to(device)
  for batch in dataloader:
    inputs = tokenizer(batch, return_tensors="pt", truncation=True, padding=True).to(device)
    with torch.no_grad():
      outputs = model(**inputs)
    cls_embeddings = outputs.last_hidden_state[:, 0, :].to(device)
    token_embeddings = outputs.last_hidden_state[:, 1:, :]
    attention_mask = inputs["attention_mask"][:, 1:]
    mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size())
    sum_embeddings = torch.sum(token_embeddings * mask_expanded, dim=1)
    sum_mask = torch.sum(mask_expanded, dim=1)
    mean_pooling = sum_embeddings / sum_mask
    combined_embeddings = torch.cat((cls_embeddings, mean_pooling), dim=1)
    embeddings.append(combined_embeddings)

  return np.concatenate([emb.cpu().numpy() for emb in embeddings], axis=0)

## Embeddings for training

In [6]:
print("-"*50)
print("Loading data...")
t = time()
path_to_data = "../challenge_data/"
path_to_training_tweets = os.path.join(path_to_data, "train_tweets")
df_train = load_data(path_to_training_tweets)
print(f"Data loaded in {time()-t:.2f} seconds")
print("-"*50+"\n")

--------------------------------------------------
Loading data...
Data loaded in 0.03 seconds
--------------------------------------------------



In [7]:
Group_train = 1000
print("Grouping tweets...")
t = time()
df_train_bis = concat_tweets(df_train, MAX_SUBGROUP=Group_train, event_type=True)
print(f"Tweets grouped in {time()-t:.2f} seconds")
print("-"*50+"\n")

Grouping tweets...
Tweets grouped in 0.29 seconds
--------------------------------------------------



In [8]:
embeddings = get_embeddings_in_batches(
    model=model_Bertweet,
    tokenizer=tokenizer,
    tweets=df_train_bis["Tweet"].tolist(),
    batch_size=10,  # Taille du lot (à ajuster selon votre mémoire)
    device="cuda" if torch.cuda.is_available() else "cpu"  # GPU si disponible
)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


## Training with NN

In [9]:
class NNModel(nn.Module):
    def __init__(self, input_dim, hidden_dims, output_dim, dropouts):
        super().__init__()
        assert len(hidden_dims) == len(dropouts)
        self.layers = nn.ModuleList()
        in_dim = input_dim
        for hidden_dim, dropout in zip(hidden_dims, dropouts):
            self.layers.append(nn.Linear(in_dim, hidden_dim))
            self.layers.append(nn.ReLU())
            self.layers.append(nn.BatchNorm1d(hidden_dim))
            self.layers.append(nn.Dropout(dropout))
            in_dim = hidden_dim
        self.output_layer = nn.Linear(in_dim, output_dim)
    def forward(self, x):
        for layer in self.layers:
            x = layer(x)
        x = self.output_layer(x)
        return torch.sigmoid(x)

In [10]:
def train_model(model, train_loader, test_loader, optimizer, criterion, device, num_epochs=10, scheduler=None, weight_1_0=0.5):
    history = {'train_loss': [], 'val_loss': [], 'val_accuracy': []}
    for epoch in range(num_epochs):
        # Scheduler
        if scheduler:
            scheduler.step()
        # Training
        model.train()
        train_loss = 0.0
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
        # Validation
        model.eval()
        val_loss = 0.0
        correct = 0
        total = 0
        with torch.no_grad():
            for inputs, labels in test_loader:
                inputs, labels = inputs.to(device), labels.to(device)
                outputs = model(inputs)
                val_loss += criterion(outputs, labels).item()
                predicted = (outputs > weight_1_0).float()
                correct += (predicted == labels).sum().item()
                total += labels.size(0)
        val_accuracy = correct / total
        history['train_loss'].append(train_loss / len(train_loader))
        history['val_loss'].append(val_loss / len(test_loader))
        history['val_accuracy'].append(val_accuracy)

        print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss/len(train_loader):.4f}, "
              f"Val Loss: {val_loss/len(test_loader):.4f}, Val Accuracy: {val_accuracy:.4f}")

    return history

In [11]:
df_train_bis["tweets_embedded"] = embeddings.tolist()
train, test = train_test_split(df_train_bis, test_size=0.01, random_state=42)
X_train = train["tweets_embedded"]
X_test = test['tweets_embedded']
X_train = np.vstack(X_train.tolist())  # Convertir les listes en tableau 2D
X_test = np.vstack(X_test.tolist())   # Même opération pour test_X

In [12]:
def create_sets(train, test, train_X, test_X, batch_size = 64):
    # Scale the data
    scaler = StandardScaler()
    scaler.fit(train_X)
    train_X = scaler.transform(train_X)
    test_X = scaler.transform(test_X)
    # Labels
    train_y = np.array(train['EventType']).reshape(-1, 1).flatten()
    test_y = np.array(test['EventType']).reshape(-1, 1).flatten()
    train_y = torch.tensor(train_y, dtype=torch.float32).view(-1, 1)
    test_y = torch.tensor(test_y, dtype=torch.float32).view(-1, 1)

    trainset = TensorDataset(torch.tensor(train_X, dtype=torch.float32), train_y)
    testset = TensorDataset(torch.tensor(test_X, dtype=torch.float32), test_y)

    train_loader = DataLoader(trainset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(testset, batch_size=batch_size, shuffle=False)

    return train_loader, test_loader, scaler

train_loader, test_loader, scaler = create_sets(train, test, X_train, X_test)

In [13]:
# Hyperparameters
output_dim = 1
embedding_dim = 2048
input_dim = embedding_dim
weight_1_0 = 0.55
hidden_dims = [embedding_dim*3,embedding_dim//20, embedding_dim//10]
dropouts = [0.95] * len(hidden_dims)
epochs = 20
lr = 0.001
decay = 1e-5

In [14]:
# Model
model = NNModel(input_dim, hidden_dims, output_dim, dropouts)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model.to(device)
# Loss and optimizer
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=decay)
scheduler = None

In [15]:
# Training
print("Training the model...")
t = time()
print(train_model(model, train_loader, test_loader, optimizer, criterion, device, num_epochs=epochs, weight_1_0=weight_1_0, scheduler=scheduler))
print(f"Model trained in {time()-t:.2f} seconds")
print("-"*50+"\n")

Training the model...
Epoch 1/20, Train Loss: 0.7332, Val Loss: 0.6459, Val Accuracy: 0.6745
Epoch 2/20, Train Loss: 0.6430, Val Loss: 0.6449, Val Accuracy: 0.6745
Epoch 3/20, Train Loss: 0.6396, Val Loss: 0.6459, Val Accuracy: 0.6745
Epoch 4/20, Train Loss: 0.6383, Val Loss: 0.6471, Val Accuracy: 0.6745
Epoch 5/20, Train Loss: 0.6369, Val Loss: 0.6451, Val Accuracy: 0.6745
Epoch 6/20, Train Loss: 0.6337, Val Loss: 0.6466, Val Accuracy: 0.6745
Epoch 7/20, Train Loss: 0.6332, Val Loss: 0.6407, Val Accuracy: 0.6745
Epoch 8/20, Train Loss: 0.6299, Val Loss: 0.6335, Val Accuracy: 0.6745
Epoch 9/20, Train Loss: 0.6272, Val Loss: 0.6357, Val Accuracy: 0.6745
Epoch 10/20, Train Loss: 0.6251, Val Loss: 0.6265, Val Accuracy: 0.6745
Epoch 11/20, Train Loss: 0.6201, Val Loss: 0.6208, Val Accuracy: 0.6745
Epoch 12/20, Train Loss: 0.6175, Val Loss: 0.6159, Val Accuracy: 0.6745
Epoch 13/20, Train Loss: 0.6131, Val Loss: 0.6134, Val Accuracy: 0.6745
Epoch 14/20, Train Loss: 0.6104, Val Loss: 0.6088, 

## Training with other models (SVM, LG, RF, XGB)

In [16]:
Y_train = train['EventType']
Y_test = test['EventType']
# Regularized SVM
print("Testing on SVM with regularization:")
t = time()
svm = SVC(C=0.1, kernel='linear')  # Smaller C increases regularization
svm.fit(X_train, Y_train)
print("SVM Accuracy:", svm.score(X_test, Y_test))
print(f"Model trained in {time()-t:.2f} seconds")

# Regularized Logistic Regression
print("Testing on Logistic Regression with regularization:")
t = time()
lg = LogisticRegression(max_iter=1000, C=0.1, penalty='l2', solver='liblinear')  # Stronger L2 regularization
lg.fit(X_train, Y_train)
print("Logistic Regression Accuracy:", lg.score(X_test, Y_test))
print(f"Model trained in {time()-t:.2f} seconds")

# Regularized Random Forest
print("Testing on Random Forest with regularization:")
t = time()
rf = RandomForestClassifier(n_estimators=100, max_depth=10, min_samples_split=5, min_samples_leaf=3)  # Depth/leaf regularization
rf.fit(X_train, Y_train)
print("Random Forest Accuracy:", rf.score(X_test, Y_test))
print(f"Model trained in {time()-t:.2f} seconds")

# Regularized XGBoost
print("Testing on XGBoost with regularization:")
t = time()
xgb = XGBClassifier(eval_metric='logloss', reg_alpha=1, reg_lambda=1, max_depth=6, learning_rate=0.05, n_estimators=200)
xgb.fit(X_train, Y_train)
print("XGBoost Accuracy:", xgb.score(X_test, Y_test))
print(f"Model trained in {time()-t:.2f} seconds")

Testing on SVM with regularization:
SVM Accuracy: 0.718475073313783
Model trained in 606.96 seconds
Testing on Logistic Regression with regularization:
Logistic Regression Accuracy: 0.7155425219941349
Model trained in 18.36 seconds
Testing on Random Forest with regularization:
Random Forest Accuracy: 0.7126099706744868
Model trained in 52.15 seconds
Testing on XGBoost with regularization:
XGBoost Accuracy: 0.7272727272727273
Model trained in 29.62 seconds


## Embeddings for evaluation

In [17]:
print("-"*50)
print("Loading data...")
t = time()
path_to_data = "../challenge_data/"
path_to_eval_tweets = os.path.join(path_to_data, "eval_tweets")
df_eval = load_data(path_to_eval_tweets)
print(f"Data loaded in {time()-t:.2f} seconds")
print("-"*50+"\n")

--------------------------------------------------
Loading data...
Data loaded in 1.26 seconds
--------------------------------------------------



In [18]:
# Grouping tweets
Group_eval = 2000
print("Grouping tweets...")
t = time()
df_eval_bis = concat_tweets(df_eval, MAX_SUBGROUP=Group_eval, event_type=False)
print(f"Tweets grouped in {time()-t:.2f} seconds")
print("-"*50+"\n")

Grouping tweets...
Tweets grouped in 4.58 seconds
--------------------------------------------------



In [None]:
embeddings_eval = get_embeddings_in_batches(
    model=model_Bertweet,
    tokenizer=tokenizer,
    tweets=df_eval_bis["Tweet"].tolist(),
    batch_size=10,  # Taille du lot (à ajuster selon votre mémoire)
    device="cuda" if torch.cuda.is_available() else "cpu"  # GPU si disponible
)

## Evaluation for NN

In [None]:
df_eval_bis["tweets_embedded"] = embeddings_eval.tolist()
X_eval = df_eval_bis["tweets_embedded"]
X_eval = np.vstack(X_eval.tolist())

In [None]:
def create_sets_for_eval(eval_X, scaler, batch_size = 64):
    # Scale the data
    eval_X = scaler.transform(eval_X)

    evalset = TensorDataset(torch.tensor(eval_X, dtype=torch.float32))

    eval_loader = DataLoader(evalset, batch_size=batch_size, shuffle=False)

    return eval_loader

eval_loader = create_sets_for_eval(X_eval, scaler)

In [None]:
predictions_NN = []
model.eval()
with torch.no_grad():
    for batch in eval_loader:
        inputs = batch[0].to(device)
        outputs = model(inputs)
        predictions_NN.extend(outputs.cpu().numpy().flatten())

## Evaluation for other models (SVM, LG, RF, XGB)

In [None]:
pridictions_lg = lg.predict(X_eval)
pridictions_svm = svm.predict(X_eval)
pridictions_rf = rf.predict(X_eval)
pridictions_xgb = xgb.predict(X_eval)

## Grouping evaluations for Kaggle

In [None]:
# Group by ID/PeriodID/MatchID and average the predictions and sort by matchID and then PeriodID
predictions_eval = predictions_eval.groupby(['ID', 'MatchID', 'PeriodID']).mean().reset_index().sort_values(['MatchID', 'PeriodID'])
predictions_eval['Predicted_NN_three'] = (predictions_eval['Predicted_NN'] > weight_1_0).astype(int)
predictions_eval['Predicted_LG_three'] = (predictions_eval['Predicted_LG'] > 0.5).astype(int)
predictions_eval['Predicted_SVM_three'] = (predictions_eval['Predicted_SVM'] > 0.5).astype(int)
predictions_eval['Predicted_RF_three'] = (predictions_eval['Predicted_RF'] > 0.5).astype(int)
predictions_eval['Predicted_XGB_three'] = (predictions_eval['Predicted_XGB'] > 0.5).astype(int)

In [None]:
# Histogram of the predictions and the true values as before
plt.figure(figsize=(12, 12))
plt.subplot(3, 2, 1)
plt.hist(predictions_eval['Predicted_NN'], bins=20, alpha=0.5, label='NN')
plt.axvline(weight_1_0, color='r', linestyle='dotted', label='Threshold')
plt.legend()
plt.title(f'NN Predictions')

plt.subplot(3, 2, 2)
plt.hist(predictions_eval['Predicted_LG'], bins=20, alpha=0.5, label='LG')
plt.axvline(0.5, color='r', linestyle='dotted', label='Threshold')
plt.legend()
plt.title(f'LG Predictions')

plt.subplot(3, 2, 3)
plt.hist(predictions_eval['Predicted_SVM'], bins=20, alpha=0.5, label='SVM')
plt.axvline(0.5, color='r', linestyle='dotted', label='Threshold')
plt.legend()
plt.title(f'SVM Predictions')

plt.subplot(3, 2, 4)
plt.hist(predictions_eval['Predicted_RF'], bins=20, alpha=0.5, label='RF')
plt.axvline(0.5, color='r', linestyle='dotted', label='Threshold')
plt.legend()
plt.title(f'RF Predictions')

plt.subplot(3, 2, 5)
plt.hist(predictions_eval['Predicted_XGB'], bins=20, alpha=0.5, label='XGB')
plt.axvline(0.5, color='r', linestyle='dotted', label='Threshold')
plt.legend()
plt.title(f'XGB Predictions')