In [1]:
# Mount Google Drive inside the Colab runtime; the challenge data directory
# used by the following cell lives under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
%cd drive/MyDrive/'Travail 3A'/INF554/Kaggle/challenge_data

/content/drive/MyDrive/Travail 3A/INF554/Kaggle/challenge_data


In [3]:
import os
import re
import gensim.downloader as api
import nltk
import numpy as np
import pandas as pd
import torch
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [4]:
def set_seed(seed):
    """Make NumPy and PyTorch random number generation repeatable.

    Seeds the NumPy global RNG and the PyTorch RNG, seeds the CUDA generators
    when a GPU is available, and puts cuDNN in deterministic mode with
    benchmark auto-tuning switched off.

    Args:
        seed: Integer seed shared by every generator.
    """
    # The cuDNN flags are set whether or not a GPU is present.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

    torch.manual_seed(seed)
    np.random.seed(seed)

    if not torch.cuda.is_available():
        return
    # Seed the current GPU, then every visible GPU.
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)


# Fix the seed once for the whole notebook.
set_seed(42)

In [5]:
# Fetch the NLTK resources that preprocess_text relies on: the English
# stop-word list (stopwords.words('english')) and the WordNet corpus used by
# WordNetLemmatizer.
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [6]:
# Load the pre-trained "glove-twitter-200" vectors through gensim's downloader API.
# The original author timed this step at about 7 minutes.
embeddings_model = api.load("glove-twitter-200")  # 200-dimensional GloVe embeddings



In [7]:
# Resources shared by every preprocess_text call. They are built once here
# instead of once per tweet: re-reading the stop-word list and re-creating the
# lemmatizer on each call is loop-invariant work repeated for every row.
_PUNCTUATION_RE = re.compile(r'[^\w\s]')
_DIGITS_RE = re.compile(r'\d+')
_STOP_WORDS = frozenset(stopwords.words('english'))
_LEMMATIZER = WordNetLemmatizer()


def preprocess_text(text):
    """Normalise a raw tweet into a space-separated string of lemmas.

    Steps, in order: lowercase, remove every character that is neither a word
    character nor whitespace, remove digit runs, split on whitespace, drop
    English stop words, lemmatize each remaining token with WordNet.

    Args:
        text: Raw tweet text. Any non-string value (for example the NaN pandas
            produces for an empty CSV field) is treated as an empty tweet
            instead of raising AttributeError on `.lower()`.

    Returns:
        The cleaned tokens joined by single spaces; '' when nothing is left.
    """
    if not isinstance(text, str):
        return ''
    text = _PUNCTUATION_RE.sub('', text.lower())
    text = _DIGITS_RE.sub('', text)
    return ' '.join(
        _LEMMATIZER.lemmatize(word)
        for word in text.split()
        if word not in _STOP_WORDS
    )

In [8]:
# Read all training files and concatenate them into one dataframe.
# os.listdir returns entries in arbitrary order, so the names are sorted to
# make the row order (and every seeded split/sample derived from it)
# reproducible between runs and machines. Hidden entries and sub-directories
# (e.g. editor checkpoint folders) are skipped instead of being fed to read_csv.
train_dir = "train_tweets"
li = []
for filename in sorted(os.listdir(train_dir)):
    train_path = os.path.join(train_dir, filename)
    if filename.startswith('.') or not os.path.isfile(train_path):
        continue
    # A dedicated path variable is used so that `df` is only ever bound to the
    # final concatenated frame, never to a single per-file frame.
    li.append(pd.read_csv(train_path))
df = pd.concat(li, ignore_index=True)

# DO NOT RUN

In [None]:
# Clean every tweet with preprocess_text, overwriting the 'Tweet' column
# (the original author timed this at about 13 minutes).
df['Tweet'] = df['Tweet'].map(preprocess_text)

In [None]:
def get_avg_embedding(tweet, model, vector_size=200):
    """Average the word vectors of the in-vocabulary tokens of a tweet.

    Args:
        tweet: Text that is split on whitespace into tokens.
        model: Mapping-like object supporting `token in model` and `model[token]`.
        vector_size: Length of the zero vector returned when no token is known.

    Returns:
        The element-wise mean of the known tokens' vectors, or
        `np.zeros(vector_size)` when none of the tokens is in `model`.
    """
    known_vectors = []
    for token in tweet.split():
        if token in model:
            known_vectors.append(model[token])

    if len(known_vectors) == 0:
        # Nothing to average: fall back to an all-zero embedding.
        return np.zeros(vector_size)
    return np.mean(known_vectors, axis=0)

In [None]:
# Turn every (already cleaned) tweet into its average GloVe vector and stack
# the results into one row-per-tweet matrix.
vector_size = 200  # Adjust based on the chosen GloVe model
per_tweet_vectors = []
for tweet_text in df['Tweet']:
    per_tweet_vectors.append(get_avg_embedding(tweet_text, embeddings_model, vector_size))
tweet_vectors = np.vstack(per_tweet_vectors)
tweet_df = pd.DataFrame(tweet_vectors)

NameError: name 'np' is not defined

In [None]:
# Build one feature row per period:
#   1. put the embedding columns next to the original tweet columns,
#   2. drop the columns that are no longer needed,
#   3. average everything that remains within each (MatchID, PeriodID, ID) group.
period_features = (
    pd.concat([df, tweet_df], axis=1)
    .drop(columns=['Timestamp', 'Tweet'])
    .groupby(['MatchID', 'PeriodID', 'ID'])
    .mean()
    .reset_index()
)

# Labels of the training samples.
y = period_features['EventType'].values
# Feature matrix: the period frame without the label and the identifier columns.
X = period_features.drop(columns=['EventType', 'MatchID', 'PeriodID', 'ID']).values

In [None]:
###### Evaluating on a test set:

# Hold out 30% of the periods so the classifier can be assessed locally,
# without tuning against the validation set and without spending Kaggle submissions.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.3)

# Fit a basic classifier on the training part and report accuracy on the held-out part.
clf = LogisticRegression(max_iter=1000, random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print("Test set: ", accuracy_score(y_test, y_pred))

# Kaggle submission

In [None]:
# Refit on every available labelled period for the submission.
clf = LogisticRegression(max_iter=1000, random_state=42)
clf.fit(X, y)

# Majority-class baseline, kept as a sanity check.
dummy_clf = DummyClassifier(strategy="most_frequent")
dummy_clf.fit(X, y)

# Per-file prediction frames are accumulated in these lists.
predictions, dummy_predictions = [], []

In [None]:
# Each evaluation file is read separately, its tweets are preprocessed and
# embedded, and the classifiers predict one label per period. The per-file
# prediction frames are collected in lists that are concatenated and exported
# for the Kaggle submission afterwards.

# The accumulators are (re)created here so that re-running this cell does not
# append a second copy of every prediction.
predictions = []
dummy_predictions = []

eval_dir = "eval_tweets"
# Sorted for a reproducible file order (os.listdir order is arbitrary);
# hidden entries and sub-directories are skipped.
for fname in sorted(os.listdir(eval_dir)):
    eval_path = os.path.join(eval_dir, fname)
    if fname.startswith('.') or not os.path.isfile(eval_path):
        continue

    val_df = pd.read_csv(eval_path)
    val_df['Tweet'] = val_df['Tweet'].apply(preprocess_text)

    # Evaluation-specific names are used on purpose: the training-time globals
    # (X, tweet_vectors, tweet_df, period_features) must not be overwritten,
    # otherwise re-running the fitting cell would train on evaluation features.
    eval_vectors = np.vstack([get_avg_embedding(tweet, embeddings_model, vector_size) for tweet in val_df['Tweet']])
    eval_features = pd.concat([val_df, pd.DataFrame(eval_vectors)], axis=1)
    eval_features = eval_features.drop(columns=['Timestamp', 'Tweet'])
    eval_features = eval_features.groupby(['MatchID', 'PeriodID', 'ID']).mean().reset_index()
    X_eval = eval_features.drop(columns=['MatchID', 'PeriodID', 'ID']).values

    eval_features['EventType'] = clf.predict(X_eval)
    eval_features['DummyEventType'] = dummy_clf.predict(X_eval)

    predictions.append(eval_features[['ID', 'EventType']])
    dummy_predictions.append(eval_features[['ID', 'DummyEventType']])

In [None]:
# Write one submission file per classifier: the logistic-regression
# predictions first, then the dummy-baseline predictions.
for frames, out_path in (
    (predictions, 'logistic_predictions.csv'),
    (dummy_predictions, 'dummy_predictions.csv'),
):
    pred_df = pd.concat(frames)
    pred_df.to_csv(out_path, index=False)

# Tweet-wise LSTM + voting system

In [None]:
# Show the size and columns of the full training frame, then draw a random
# subsample of row labels (without replacement) and keep only those rows.
# Presumably the subsample is there to prototype the tweet-level model quickly.
print(df.shape)
print(df.columns)

# Number of tweets kept in the subsample.
sub_size = 1000
selected_indexes = np.random.choice(df.index, size=sub_size, replace=False)
sub_df = df.loc[selected_indexes]

print(sub_df.shape)

(5056050, 6)
Index(['ID', 'MatchID', 'PeriodID', 'EventType', 'Timestamp', 'Tweet'], dtype='object')
(1000, 6)


In [None]:
# Drop every attribute except the tweet text (kept in sub_X) and the
# EventType column (kept separately as the target sub_y).
sub_y = sub_df['EventType']
sub_X = sub_df.drop(columns=['EventType', 'ID' ,'MatchID', 'PeriodID', 'Timestamp'])

In [None]:
def get_embedding(tweet, model=embeddings_model, vector_size=200):
    """Return the word vectors of a tweet as a list, one entry per known token.

    Args:
        tweet: Text that is split on whitespace into tokens.
        model: Mapping-like embedding lookup (`token in model`, `model[token]`).
            The default is bound to `embeddings_model` when this cell is run.
        vector_size: Length of the placeholder vector used when no token is known.

    Returns:
        A list with the vector of every in-vocabulary token, in tweet order.
        If no token is known, a one-element list holding `np.zeros(vector_size)`.
    """
    vectors = []
    for token in tweet.split():
        if token in model:
            vectors.append(model[token])

    if len(vectors) == 0:
        # Keep the sequence non-empty so downstream code always has one step.
        vectors = [np.zeros(vector_size)]
    return vectors

In [None]:
# Embed every sampled tweet with get_embedding (one list of word vectors per
# tweet) and convert the resulting Series to a NumPy array.
tweet_embeddings = sub_X['Tweet'].apply(get_embedding).to_numpy()

In [None]:
# Sanity check on the first ten sampled tweets: print how many word vectors
# each one produced, followed by the tweet itself.
for row in range(10):
    print(len(tweet_embeddings[row]))
    print(sub_X['Tweet'].iloc[row:row + 1])

5
434409    rt mictwizzle argentina better crush germany
Name: Tweet, dtype: object
4
736785    niallsuitandtie germany bc holland lost argent...
Name: Tweet, dtype: object
4
3843594    rt iquotecomedy brazil defender like httptcoua...
Name: Tweet, dtype: object
5
2632297    rt zjahr jeez germany playing terrible
Name: Tweet, dtype: object
9
830738    rt fifaworldcup social see whats said worldcup...
Name: Tweet, dtype: object
6
1845505    goooooal fra pogba nod french front
Name: Tweet, dtype: object
10
650416    jesus germany shape defense please like enough...
Name: Tweet, dtype: object
11
3183820    already know germany got one ill watching port...
Name: Tweet, dtype: object
7
2215962    rt fifaworldcup photo matshummels effort goal ...
Name: Tweet, dtype: object
13
895933    rt khatuberry argentina coach super hyper migh...
Name: Tweet, dtype: object


In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import numpy as np
from tqdm import tqdm

class TextDataset(Dataset):
    """Dataset that encodes each text on access and pairs it with its label.

    Args:
        texts: Indexable collection of texts.
        labels: Indexable collection of integer class labels, aligned with `texts`.
        embedding_model: Object exposing `encode(text)`; its result is converted
            to a float32 tensor.
    """

    def __init__(self, texts, labels, embedding_model):
        self.embedding_model = embedding_model
        self.labels = labels
        self.texts = texts

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        # Encoding happens here, on every access, rather than up front.
        encoded = self.embedding_model.encode(self.texts[idx])
        return (
            torch.tensor(encoded, dtype=torch.float32),
            torch.tensor(self.labels[idx], dtype=torch.long),
        )

def collate_fn(batch):
    """Collate (embeddings, label) pairs into a padded batch.

    Args:
        batch: Sequence of `(embeddings, label)` tensor pairs.

    Returns:
        `(padded_embeddings, labels)`: the embeddings zero-padded to the longest
        sequence in the batch with the batch dimension first, and the labels
        stacked into one tensor.
    """
    # Transpose the list of pairs into one list of sequences and one of labels.
    sequences, targets = (list(column) for column in zip(*batch))
    return pad_sequence(sequences, batch_first=True), torch.stack(targets)

class LSTMClassifier(nn.Module):
    """Batch-first LSTM encoder with a dropout + linear classification head.

    Args:
        embedding_dim: Size of each input vector.
        hidden_dim: Size of the LSTM hidden state.
        num_classes: Number of output logits.
        num_layers: Number of stacked LSTM layers.
        dropout: Dropout probability for the head, and between LSTM layers
            when `num_layers > 1`.
    """

    def __init__(self, embedding_dim, hidden_dim, num_classes, num_layers=1, dropout=0.1):
        super().__init__()
        # Dropout inside the LSTM is only requested for stacked layers.
        inter_layer_dropout = dropout if num_layers > 1 else 0
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            dropout=inter_layer_dropout,
        )
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        """Map `x` of shape (batch_size, seq_len, embedding_dim) to class logits."""
        _, (final_hidden, _) = self.lstm(x)
        # final_hidden[-1] is the final hidden state of the top LSTM layer.
        top_layer_state = final_hidden[-1]
        return self.fc(self.dropout(top_layer_state))

def _run_epoch(model, loader, criterion, device, optimizer=None, desc=None):
    """Run one pass over `loader` and return (summed batch loss, n_correct, n_seen).

    With an `optimizer` the pass trains the model (train mode, gradients on,
    one optimizer step per batch). Without one it only evaluates (eval mode,
    gradients off). `desc`, when given, is shown on a tqdm progress bar.
    """
    is_training = optimizer is not None
    model.train(is_training)
    batches = tqdm(loader, desc=desc) if desc is not None else loader

    loss_sum, n_correct, n_seen = 0.0, 0, 0
    with torch.set_grad_enabled(is_training):
        for embeddings, labels in batches:
            embeddings, labels = embeddings.to(device), labels.to(device)

            if is_training:
                optimizer.zero_grad()
            outputs = model(embeddings)
            loss = criterion(outputs, labels)
            if is_training:
                loss.backward()
                optimizer.step()

            loss_sum += loss.item()
            _, predicted = outputs.max(1)
            n_seen += labels.size(0)
            n_correct += predicted.eq(labels).sum().item()

    return loss_sum, n_correct, n_seen


def train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs, device):
    """Train `model` for `num_epochs`, validating and checkpointing after each epoch.

    Args:
        model: Module mapping a batch of embeddings to class logits.
        train_loader: Iterable of `(embeddings, labels)` training batches.
        val_loader: Iterable of `(embeddings, labels)` validation batches.
        criterion: Loss called as `criterion(outputs, labels)`.
        optimizer: Optimizer stepping the model's parameters.
        num_epochs: Number of passes over `train_loader`.
        device: Device every batch is moved to.

    Side effects:
        Prints per-epoch loss/accuracy and writes the model's state_dict to
        'best_model.pt' whenever the summed validation loss improves.
    """
    best_val_loss = float('inf')

    for epoch in range(num_epochs):
        train_loss, train_correct, train_total = _run_epoch(
            model, train_loader, criterion, device,
            optimizer=optimizer, desc=f'Epoch {epoch+1}/{num_epochs}',
        )
        val_loss, val_correct, val_total = _run_epoch(model, val_loader, criterion, device)

        # max(..., 1) keeps the report from dividing by zero on an empty loader.
        print(f'Epoch {epoch+1}:')
        print(f'Train Loss: {train_loss/max(len(train_loader), 1):.4f}, Acc: {100.*train_correct/max(train_total, 1):.2f}%')
        print(f'Val Loss: {val_loss/max(len(val_loader), 1):.4f}, Acc: {100.*val_correct/max(val_total, 1):.2f}%')

        # Save best model
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), 'best_model.pt')

# Example usage:
"""
# Assuming you have:
# - df: DataFrame with 'text' and 'label' columns
# - embedding_model: your embedding model
# - train_df, val_df: train/val splits of your data

# Initialize dataset and dataloader
train_dataset = TextDataset(train_df['text'].values, train_df['label'].values, embedding_model)
val_dataset = TextDataset(val_df['text'].values, val_df['label'].values, embedding_model)

train_loader = DataLoader(
    train_dataset,
    batch_size=32,
    shuffle=True,
    collate_fn=collate_fn
)
val_loader = DataLoader(
    val_dataset,
    batch_size=32,
    shuffle=False,
    collate_fn=collate_fn
)

# Initialize model and training components
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = LSTMClassifier(
    embedding_dim=768,  # Adjust based on your embedding model
    hidden_dim=128,
    num_classes=num_classes,  # Number of classes in your task
    num_layers=2,
    dropout=0.1
).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Train the model
train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs=10, device=device)
"""

# START

In [9]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import numpy as np
from tqdm import tqdm
from torch import optim

# Use the GPU when PyTorch reports CUDA as available, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [10]:
def get_embedding(tweet, model=embeddings_model, vector_size=200):
    """Look up the embedding of every in-vocabulary token of a tweet.

    Args:
        tweet: Text, tokenised by splitting on whitespace.
        model: Embedding lookup supporting `token in model` and `model[token]`;
            the default is bound to `embeddings_model` when this cell is run.
        vector_size: Length of the zero vector used when no token is known.

    Returns:
        A list of vectors, one per known token in tweet order; a one-element
        list containing `np.zeros(vector_size)` if no token is known.
    """
    known = [model[token] for token in tweet.split() if token in model]
    # An empty list would give a zero-length sequence, so substitute one zero vector.
    return known if known else [np.zeros(vector_size)]

In [11]:
# Shape and column names of the training frame, followed by the mean of the
# EventType column (the share of rows labelled 1, assuming labels are 0/1 — TODO confirm).
print(df.shape)
print(df.columns)
print(df['EventType'].sum() / df.shape[0])

(5056050, 6)
Index(['ID', 'MatchID', 'PeriodID', 'EventType', 'Timestamp', 'Tweet'], dtype='object')
0.5646836957704137


In [12]:
# For now, keep only the tweet text (feature) and EventType (label).
# errors='ignore' keeps this cell re-runnable: `df` is overwritten here, so on
# a second execution the identifier columns are already gone and a plain
# drop(columns=...) would raise KeyError.
df = df.drop(columns=['ID', 'MatchID', 'PeriodID', 'Timestamp'], errors='ignore')
y = df['EventType']
X = df.drop(columns=['EventType'])

In [73]:
# Only 5% of the tweets go to the training split (test_size=0.95); the
# validation split is then 1% of the remaining 95% (test_size=0.99).
# Presumably the splits are kept this small to make tweet-level training tractable.
# X_test_2 / y_test_2 receive everything that is left over.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.95, random_state=42)
X_val, X_test_2, y_val, y_test_2 = train_test_split(X_test, y_test, test_size=0.99, random_state=42)

In [74]:
# Row counts of the validation and training splits (a single 'Tweet' column each).
print(X_val.shape)
print(X_train.shape)

(48032, 1)
(252802, 1)


In [75]:
class LSTMClassifier(nn.Module):
    """Sequence-first LSTM followed by a linear classification head.

    Args:
        input_size: Size of each input vector.
        hidden_size: Size of the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
        num_classes: Number of output classes.
        dropout: Dropout probability between LSTM layers (only used when
            `num_layers > 1`).
    """

    def __init__(self, input_size, hidden_size, num_layers, num_classes, dropout=0.2):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        inter_layer_dropout = dropout if num_layers > 1 else 0
        # batch_first=False: inputs are laid out as (seq_length, batch_size, input_size).
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=False,
            dropout=inter_layer_dropout,
        )

        self.fc = nn.Linear(hidden_size, num_classes)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x, apply_softmax=False):
        """Classify a batch of sequences.

        Args:
            x: Tensor of shape (seq_length, batch_size, input_size).
            apply_softmax: When True, return class probabilities instead of logits.

        Returns:
            Tensor of shape (batch_size, num_classes).
        """
        # Explicit all-zero initial hidden and cell states on the input's device.
        state_shape = (self.num_layers, x.size(1), self.hidden_size)
        h0 = torch.zeros(state_shape, device=x.device)
        c0 = torch.zeros(state_shape, device=x.device)

        sequence_out, _ = self.lstm(x, (h0, c0))
        # The output of the last time step feeds the head. NOTE: the index is
        # the last position of the whole batch, so any padded steps at the end
        # of a shorter sequence are included.
        logits = self.fc(sequence_out[-1])

        return self.softmax(logits) if apply_softmax else logits

In [76]:
# Three stacked LSTM layers over 200-dimensional word vectors, two output
# classes; the model is placed on the same device the batches are moved to.
model = LSTMClassifier(
    input_size=200,
    hidden_size=64,
    num_layers=3,
    num_classes=2,
    dropout=0.3,
)
model = model.to(device)

In [77]:
# Training hyper-parameters passed to train_LSTM.
# Number of passes over the training split.
epochs = 50
# 4096 tweets per optimisation step.
batch_size = 1024 * 2 * 2
# Learning rate handed to the Adam optimizer.
lr = 0.006

In [78]:
def preprocess_df(init_df, indices=None):
    """Turn the tweets of a dataframe into one zero-padded batch tensor.

    Each selected tweet is cleaned with `preprocess_text`, embedded with
    `get_embedding`, and right-padded with zero vectors up to the longest
    sequence of the batch.

    Args:
        init_df: DataFrame with a 'Tweet' column of raw text.
        indices: Optional positional row indices (tensor, array or list).
            When None, every row of `init_df` is used.

    Returns:
        float32 tensor on `device` with shape (max_len, n_tweets, embedding_dim),
        i.e. sequence-first.

    Raises:
        ValueError: If no row is selected.
    """
    if indices is None:
        rows = init_df
    else:
        # Plain Python ints work with .iloc whatever container the caller used.
        positions = indices.tolist() if hasattr(indices, 'tolist') else list(indices)
        rows = init_df.iloc[positions]

    sequences = [get_embedding(preprocess_text(tweet)) for tweet in rows['Tweet']]
    if not sequences:
        raise ValueError("preprocess_df needs at least one tweet to build a batch")

    max_len = max(len(seq) for seq in sequences)
    # The width is read from the data instead of being hard-coded to 200, so a
    # different embedding model works unchanged. get_embedding never returns
    # an empty list, so sequences[0][0] always exists.
    embedding_dim = len(sequences[0][0])

    # Pre-allocated zeros double as the padding; only the real steps are copied in.
    padded = np.zeros((len(sequences), max_len, embedding_dim), dtype=np.float32)
    for row, seq in enumerate(sequences):
        padded[row, :len(seq)] = np.asarray(seq, dtype=np.float32)

    batch = torch.from_numpy(padded).to(device)
    # (n_tweets, max_len, dim) -> (max_len, n_tweets, dim)
    return batch.transpose(0, 1)

In [79]:
def eval_perf(df, y_gt, label='val'):
    """Print the cross-entropy loss and accuracy of the global `model` on (df, y_gt).

    The whole frame is sent through `preprocess_df` and scored as one batch.

    Args:
        df: DataFrame with a 'Tweet' column.
        y_gt: pandas Series of integer class labels aligned with `df`.
        label: Tag inserted in the printed messages.

    Side effects:
        Prints two lines. The model is put in eval mode for the duration of
        the call and returned to whatever mode it was in before.
    """
    was_training = model.training
    model.eval()
    try:
        targets = torch.tensor(y_gt.values).to(device)

        # Without no_grad the forward pass over the full evaluation set would
        # record an autograd graph that is never used.
        with torch.no_grad():
            logits = model(preprocess_df(df))
            loss = nn.CrossEntropyLoss()(logits, targets)

        print('Loss ' + label + ' : ', loss.item())
        # Tensors are moved to the CPU before being handed to scikit-learn.
        print('Accuracy ' + label + ' : ', accuracy_score(targets.cpu(), logits.argmax(dim=1).cpu()), '\n')
    finally:
        # Restore the caller's mode even if evaluation raised.
        model.train(was_training)

In [80]:
def train_LSTM(model, X_train, y_train, X_test, y_test, epochs, batch_size, lr, freq_train=1, freq_val=1):
    """Train `model` on raw tweets with Adam and cross-entropy loss.

    Tweets are cleaned, embedded and padded batch by batch through
    `preprocess_df`, so the full embedded dataset is never held in memory.

    Args:
        model: Classifier taking a (seq_len, batch, dim) tensor and returning logits.
        X_train: DataFrame with a 'Tweet' column.
        y_train: Series of integer labels aligned with `X_train`.
        X_test, y_test: Accepted but not used by this function.
            NOTE(review): validation runs on the notebook-level `X_val` /
            `y_val` through `eval_perf`, which also scores the notebook-level
            `model` rather than the `model` argument.
        epochs: Number of passes over `X_train`.
        batch_size: Number of tweets per optimisation step.
        lr: Adam learning rate.
        freq_train: Print the mean training loss every `freq_train` epochs.
        freq_val: Run `eval_perf` every `freq_val` epochs.
    """
    # The DataLoader is defined over row positions only: it shuffles and
    # batches indices, and the rows themselves are processed per batch.
    train_loader = DataLoader(torch.arange(len(X_train)), batch_size=batch_size, shuffle=True)
    optimizer = optim.Adam(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()

    # Make sure dropout is active whatever mode the model was left in.
    model.train()

    for i in range(epochs):
        epoch_loss = 0.

        for batch_indices in train_loader:
            batch_labels = torch.tensor(y_train.iloc[batch_indices].values).to(device)
            # Process the data: clean, embed and pad this batch (shared helper,
            # also used for evaluation).
            batch = preprocess_df(X_train, batch_indices)

            optimizer.zero_grad()
            loss = criterion(model(batch), batch_labels)
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()

        report_train = i % freq_train == 0
        report_val = i % freq_val == 0
        if report_train or report_val:
            print('Epoch : ', i , '\n')

            if report_train:
                # max(..., 1) avoids a division by zero on an empty training set.
                print('Loss train : ', epoch_loss / max(len(train_loader), 1), '\n')
            if report_val:
                eval_perf(X_val, y_val)

In [None]:
# Launch training: the mean training loss is printed every epoch, validation
# metrics every 10 epochs. X_test / y_test are passed to match the signature,
# but train_LSTM validates on the notebook-level X_val / y_val.
train_LSTM(model, X_train, y_train, X_test, y_test, epochs, batch_size, lr, freq_train=1, freq_val=10)

Epoch :  0 

Loss train :  0.6760094348461397 

Loss val :  0.6590946912765503
Accuracy val :  0.5876915389740173 

Epoch :  1 

Loss train :  0.6520076980513911 

Epoch :  2 

Loss train :  0.6395111295484728 

Epoch :  3 

Loss train :  0.6302119753053111 

Epoch :  4 

Loss train :  0.6217824326407525 

Epoch :  5 

Loss train :  0.6141334137608928 

Epoch :  6 

Loss train :  0.6066555284684704 

