In [1]:
import os
import re
import torch
import numpy as np
from transformers import BertTokenizer, BertModel, AdamW
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
import torch.nn as nn
from tqdm import tqdm
import utils
import pickle
import lightgbm as lgb
import random
import optuna
from transformers import AlbertTokenizer, AlbertModel
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize 
# nltk.download('stopwords')
# nltk.download('punkt')
# nltk.download('punkt_tab')
# Hyperparameters
batch_size = 128
learning_rate = 0.01
num_epochs = 100
max_length = 44
MAX_TWEETS= 650
PAD_TWEET = "[PAD]" 

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package stopwords to
[nltk_data]     /users/eleves-a/2022/amine.chraibi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /users/eleves-a/2022/amine.chraibi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Preparing dataset


In [2]:
# count number of characters 
def count_chars(text):
    return len(text)

# count number of words 
def count_words(text):
    return len(text.split())

# count number of capital characters
def count_capital_chars(text):
    count=0
    for i in text:
        if i.isupper():
            count+=1
    return count

# count number of capital words
def count_capital_words(text):
    return sum(map(str.isupper,text.split()))

# count number of punctuations
def count_punctuations(text):
    punctuations='!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
    d=dict()
    for i in punctuations:
        d[str(i)+' count']=text.count(i)
    return d

# count number of words in quotes
def count_words_in_quotes(text):
    x = re.findall("\'.\'|\".\"", text)
    count=0
    if x is None:
        return 0
    else:
        for i in x:
            t=i[1:-1]
            count+=count_words(t)
        return count
    
# count number of sentences
def count_sent(text):
    return len(nltk.sent_tokenize(text))

# calculate average word length
def avg_word_len(char_cnt,word_cnt):
    return char_cnt/word_cnt

# calculate average sentence length
def avg_sent_len(word_cnt,sent_cnt):
    return word_cnt/sent_cnt

# count number of unique words 
def count_unique_words(text):
    return len(set(text.split()))
            
# words vs unique feature
def words_vs_unique(words,unique):
    return unique/words

# count of hashtags
def count_htags(text):
    x = re.findall(r'(\#\w[A-Za-z0-9]*)', text)
    return len(x)

# count of mentions
def count_mentions(text):
    x = re.findall(r'(\@\w[A-Za-z0-9]*)', text)
    return len(x)

stop_words = set(stopwords.words('english'))  

# count of stopwords
def count_stopwords(text):
    word_tokens = word_tokenize(text)
    stopwords_x = [w for w in word_tokens if w in stop_words]
    return len(stopwords_x)

# stopwords vs words
def stopwords_vs_words(stopwords_cnt,text):
    return stopwords_cnt/len(word_tokenize(text))

In [3]:
# Read all training files and concatenate them into one dataframe
li = []
for filename in os.listdir("train_tweets"):
    df = pd.read_csv("train_tweets/" + filename)
    li.append(df)
df = pd.concat(li, ignore_index=True)

In [4]:
df['char_count'] = df["Tweet"].apply(lambda x:count_chars(x))
df['word_count'] = df["Tweet"].apply(lambda x:count_words(x))
df['sent_count'] = df["Tweet"].apply(lambda x:count_sent(x))
df['capital_char_count'] = df["Tweet"].apply(lambda x:count_capital_chars(x))
df['capital_word_count'] = df["Tweet"].apply(lambda x:count_capital_words(x))
df['quoted_word_count'] = df["Tweet"].apply(lambda x:count_words_in_quotes(x))
df['stopword_count'] = df["Tweet"].apply(lambda x:count_stopwords(x))
df['unique_word_count'] = df["Tweet"].apply(lambda x:count_unique_words(x))
df['htag_count'] = df["Tweet"].apply(lambda x:count_htags(x))
df['mention_count'] = df["Tweet"].apply(lambda x:count_mentions(x))
df['punct_count'] = df["Tweet"].apply(lambda x:count_punctuations(x))
df['avg_wordlength']=df['char_count']/df['word_count']
df['avg_sentlength']=df['word_count']/df['sent_count']
df['unique_vs_words']=df['unique_word_count']/df['word_count']
df['stopwords_vs_words']=df['stopword_count']/df['word_count']
df['Tweet'] = df['Tweet'].apply(utils.preprocess_text)
df['Tweet'] = df['Tweet'].astype('string')
df['Tweet'].fillna('', inplace = True)
df_punct = pd.DataFrame(list(df.punct_count))

df=pd.merge(df,df_punct,left_index=True, right_index=True)

df.drop(columns=['punct_count'],inplace=True)

with open("preprocessed_data2.pkl", 'wb') as f:
    pickle.dump(df, f)

In [5]:
import pickle

with open('preprocessed_data2.pkl', 'rb') as file:
    df = pickle.load(file)


In [6]:
vectorizer =  TfidfVectorizer(max_features=500)
tf_idf_features =  vectorizer.fit_transform(df['Tweet'])
tf_idf          = pd.DataFrame(tf_idf_features)
stacked_df = pd.concat([df.reset_index(drop=True), tf_idf.reset_index(drop=True)], axis=1)

In [9]:
group_cols = ['MatchID', 'PeriodID']
stacked_df = stacked_df.drop(columns=['Timestamp'])
stacked_df['TweetCount'] = stacked_df.groupby(group_cols)['Tweet'].transform('count')

In [11]:
import pandas as pd
## For efficient memory management
## This cell looks like it doesn't work as it prints an error but it has the intended effect (you can check stacked_df.info() before
# and after this cell)
def optimize_dataframe(df):
    """
    Optimize the data types of a DataFrame to reduce memory usage.
    """
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type == 'int64':
            if df[col].min() >= 0:
                df[col] = pd.to_numeric(df[col], downcast='unsigned')
            else:
                df[col] = pd.to_numeric(df[col], downcast='signed')
        
        elif col_type == 'float64':
            df[col] = pd.to_numeric(df[col], downcast='float')
        
        elif col_type == 'object':
            num_unique = df[col].nunique()
            num_total = len(df[col])
            if num_unique / num_total < 0.5:
                df[col] = df[col].astype('category')
    
    return df

stacked_df = optimize_dataframe(stacked_df)

TypeError: unhashable type: 'csr_matrix'

In [12]:
stacked_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5056050 entries, 0 to 5056049
Data columns (total 53 columns):
 #   Column              Dtype   
---  ------              -----   
 0   ID                  category
 1   MatchID             uint8   
 2   PeriodID            uint8   
 3   EventType           uint8   
 4   Tweet               string  
 5   char_count          uint8   
 6   word_count          uint8   
 7   sent_count          uint8   
 8   capital_char_count  uint8   
 9   capital_word_count  uint8   
 10  quoted_word_count   uint8   
 11  stopword_count      uint8   
 12  unique_word_count   uint8   
 13  htag_count          uint8   
 14  mention_count       uint8   
 15  avg_wordlength      float32 
 16  avg_sentlength      float32 
 17  unique_vs_words     float32 
 18  stopwords_vs_words  float32 
 19  ! count             uint8   
 20  " count             uint8   
 21  # count             uint8   
 22  $ count             uint8   
 23  % count             uint8   
 24

In [14]:
group_cols = ['MatchID', 'PeriodID']
agg_dict = {
    col: 'first' if col == 'EventType' else 'count'
    for col in stacked_df.columns
    if col not in group_cols
}

agg_dict['Tweet'] = lambda x: list(x)
agg_dict['TweetCount'] = 'first'

agg_df = stacked_df.groupby(group_cols).agg(agg_dict).reset_index()

grouped_labels = stacked_df.groupby(['MatchID', 'PeriodID'])['EventType'].max().unstack(fill_value=0)

In [19]:
import os
import pickle
import torch

def prepare_features_and_labels(grouped_tweets, grouped_labels, bert_model, tokenizer, output_file="features_labels.pkl"):
    """
The function `prepare_features_and_labels` is preparing features and labels using BERT embeddings and saving them to a file. 
It takes as input parameters a DataFrame `grouped_tweets` with columns MatchID, PeriodID, and OriginalTweets, 
another DataFrame `grouped_labels` with labels (EventType) by MatchID and PeriodID, a pre-trained BERT model `bert_model`,
a BERT tokenizer `tokenizer`, and an optional output file name `output_file` (default is "features_labels.pkl").
    """
    if os.path.exists(output_file):
        print(f"Chargement des features et labels depuis {output_file}...")
        with open(output_file, "rb") as f:
            data = pickle.load(f)
        return data["grouped_tweets"], data["labels"]

    labels = []
    embeddings = []

    print("Calcul des embeddings et labels...")
    for idx, row in tqdm(grouped_tweets.iterrows()):
        tweets = row['Tweet']
        if not isinstance(tweets, list) or len(tweets) == 0:
            embeddings.append(torch.zeros(bert_model.config.hidden_size))  
            continue

        tokenized = tokenizer(
            tweets,
            max_length=max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        with torch.no_grad():
            embed = bert_model.embeddings.word_embeddings(tokenized['input_ids'])
        
        features_mean = embed.mean(dim=1).mean(dim=0)  # Average across tokens, then across tweets
        embeddings.append(features_mean)

        match_id, period_id = row['MatchID'], row['PeriodID']
        labels.append(grouped_labels.loc[match_id, period_id])

    grouped_tweets['Embedding'] = embeddings

    print(f"Sauvegarde des features et labels dans {output_file}...")
    grouped_tweets = grouped_tweets.drop(columns=['Tweet'])
    with open(output_file, "wb") as f:
        pickle.dump({"grouped_tweets": grouped_tweets, "labels": labels}, f)
    return grouped_tweets, labels

In [20]:
bert_model = BertModel.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

features, labels = prepare_features_and_labels(
    grouped_tweets=agg_df,
    grouped_labels=grouped_labels,
    bert_model=bert_model,
    tokenizer=tokenizer,
    output_file="features_labels_lgbm_spacy_new_features.pkl"
)

Calcul des embeddings et labels...


2137it [1:06:35,  1.87s/it]


Sauvegarde des features et labels dans features_labels_lgbm_spacy_new_features.pkl...


In [23]:
import numpy as np
import lightgbm as lgb
from sklearn.metrics import accuracy_score, classification_report
import joblib

# =========================
# Step 1: Extract Embeddings
# =========================

# Function to convert torch tensors to numpy arrays
def convert_embedding(embedding_tensor):
    return embedding_tensor.detach().cpu().numpy()

# Extract and convert embeddings for training set
embedding = np.vstack(features['Embedding'].apply(convert_embedding).values)

columns_to_drop = ['EventType', 'MatchID', 'PeriodID', 'Embedding'] # We drop the embeddings here because we hstack with embedding later

X_features = features.drop(columns=columns_to_drop).values


# Which is why we dropped 'Embedding' earlier
X = np.hstack([X_features, embedding])
y = features['EventType'].values

# Best parameters obtained with Optuna

params = {
    'learning_rate': 0.020559564255188685,
    'num_leaves': 175,
    'max_depth': 38,
    'min_data_in_leaf': 24,
    'feature_fraction': 0.8206274379058552,
    'bagging_fraction': 0.5008300119237329,
    'bagging_freq': 9,
    'lambda_l1': 3.0495576636476773,
    'lambda_l2': 0.05257346531813992,
    'min_gain_to_split': 0.26065506527165566,
    'n_estimators': 759,
    'verbose': -1
}

# Initialize the LightGBM classifier
model = lgb.LGBMClassifier(**params)

# Train the model on the training data
print("Training the LightGBM model...")
model.fit(X, y)


# Save the trained model for future use
model_filename = 'lightgbm_model.pkl'
joblib.dump(model, model_filename)
print(f"Trained model saved as {model_filename}.")


Training the LightGBM model...
Trained model saved as lightgbm_model.pkl.


In [25]:
# Read all training files and concatenate them into one dataframe
li = []
for filename in os.listdir("eval_tweets"):
    df = pd.read_csv("eval_tweets/" + filename)
    li.append(df)
df_eval = pd.concat(li, ignore_index=True)

In [26]:
df_eval['char_count'] = df_eval["Tweet"].apply(lambda x:count_chars(x))
df_eval['word_count'] = df_eval["Tweet"].apply(lambda x:count_words(x))
df_eval['sent_count'] = df_eval["Tweet"].apply(lambda x:count_sent(x))
df_eval['capital_char_count'] = df_eval["Tweet"].apply(lambda x:count_capital_chars(x))
df_eval['capital_word_count'] = df_eval["Tweet"].apply(lambda x:count_capital_words(x))
df_eval['quoted_word_count'] = df_eval["Tweet"].apply(lambda x:count_words_in_quotes(x))
df_eval['stopword_count'] = df_eval["Tweet"].apply(lambda x:count_stopwords(x))
df_eval['unique_word_count'] = df_eval["Tweet"].apply(lambda x:count_unique_words(x))
df_eval['htag_count'] = df_eval["Tweet"].apply(lambda x:count_htags(x))
df_eval['mention_count'] = df_eval["Tweet"].apply(lambda x:count_mentions(x))
df_eval['punct_count'] = df_eval["Tweet"].apply(lambda x:count_punctuations(x))
df_eval['avg_wordlength']=df_eval['char_count']/df_eval['word_count']
df_eval['avg_sentlength']=df_eval['word_count']/df_eval['sent_count']
df_eval['unique_vs_words']=df_eval['unique_word_count']/df_eval['word_count']
df_eval['stopwords_vs_words']=df_eval['stopword_count']/df_eval['word_count']
df_eval['Tweet'] = df_eval['Tweet'].apply(utils.preprocess_text)
df_eval['Tweet'] = df_eval['Tweet'].astype('string')
df_eval['Tweet'].fillna('', inplace = True)
df_eval_punct = pd.DataFrame(list(df_eval.punct_count))

df_eval=pd.merge(df_eval,df_eval_punct,left_index=True, right_index=True)
df_eval.drop(columns=['punct_count'],inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_eval['Tweet'].fillna('', inplace = True)


In [28]:
def preprocess_text(text):
    # Lowercasing
    text = text.lower()
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # Tokenization
    words = text.split()
    # Remove stopwords
    words = [word for word in words if word not in stop_words]
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(words)

In [29]:
df_eval['Tweet'] = df_eval['Tweet'].apply(preprocess_text)

In [31]:
eval_tf_idf_features  =  vectorizer.transform(df_eval['Tweet'])
eval_tf_idf  = pd.DataFrame(eval_tf_idf_features)
df_eval = pd.concat([df_eval.reset_index(drop=True), eval_tf_idf.reset_index(drop=True)], axis=1)

In [32]:
df_eval = df_eval.drop(columns=['Timestamp'])
df_eval['TweetCount'] = df_eval.groupby(group_cols)['Tweet'].transform('count')

In [33]:
df_eval = optimize_dataframe(df_eval)

TypeError: unhashable type: 'csr_matrix'

In [35]:
group_cols = ['MatchID', 'PeriodID']
agg_dict = {
    col : 'count'
    for col in df_eval.columns
    if col not in group_cols
}

agg_dict['Tweet'] = lambda x: list(x)
agg_dict['TweetCount'] = 'first'

agg_df_eval = df_eval.groupby(group_cols).agg(agg_dict).reset_index()

In [39]:
agg_df_eval.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 516 entries, 0 to 515
Data columns (total 52 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   MatchID             516 non-null    uint8 
 1   PeriodID            516 non-null    uint8 
 2   ID                  516 non-null    int64 
 3   Tweet               516 non-null    object
 4   char_count          516 non-null    int64 
 5   word_count          516 non-null    int64 
 6   sent_count          516 non-null    int64 
 7   capital_char_count  516 non-null    int64 
 8   capital_word_count  516 non-null    int64 
 9   quoted_word_count   516 non-null    int64 
 10  stopword_count      516 non-null    int64 
 11  unique_word_count   516 non-null    int64 
 12  htag_count          516 non-null    int64 
 13  mention_count       516 non-null    int64 
 14  avg_wordlength      516 non-null    int64 
 15  avg_sentlength      516 non-null    int64 
 16  unique_vs_words     516 no

In [37]:
def prepare_features_eval(grouped_tweets, bert_model, tokenizer, output_file):
    """
    Prépare les features_eval et les labels en utilisant les embeddings BERT, et les sauvegarde dans un fichier.
    
    :param grouped_tweets: Grouped tweets par MatchID et PeriodID.
    :param grouped_labels: Grouped labels par MatchID et PeriodID.
    :param bert_model: Modèle BERT pré-entraîné.
    :param tokenizer: Tokenizer BERT.
    :param output_file: Nom du fichier pour sauvegarder les features_eval et labels.
    :return: Tuple (features_eval, labels)
    """
    if os.path.exists(output_file):
        print(f"Chargement des features_eval et labels depuis {output_file}...")
        with open(output_file, "rb") as f:
            data = pickle.load(f)
        return data["grouped_tweets"], data["labels"]

    embeddings = []

    print("Calcul des embeddings et labels...")
    for idx, row in tqdm(grouped_tweets.iterrows()):
        tweets = row['Tweet']
        if not isinstance(tweets, list) or len(tweets) == 0:
            embeddings.append(torch.zeros(bert_model.config.hidden_size))  
            continue

        tokenized = tokenizer(
            tweets,
            max_length=max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        with torch.no_grad():
            embed = bert_model.embeddings.word_embeddings(tokenized['input_ids'])
        
        features_eval_mean = embed.mean(dim=1).mean(dim=0)  # Average across tokens, then across tweets
        embeddings.append(features_eval_mean)

        match_id, period_id = row['MatchID'], row['PeriodID']

    grouped_tweets['Embedding'] = embeddings

    print(f"Sauvegarde des features_eval et labels dans {output_file}...")
    grouped_tweets = grouped_tweets.drop(columns=['Tweet'])
    print(f"Sauvegarde des features_eval et labels dans {output_file}...")
    with open(output_file, "wb") as f:
        pickle.dump({"features_eval": features_eval_mean}, f)
    return grouped_tweets

In [40]:
features_eval = prepare_features_eval(
    grouped_tweets=agg_df_eval,
    bert_model=bert_model,
    tokenizer=tokenizer,
    output_file="features_eval_labels_eval_spacy.pkl"
)

Calcul des embeddings et labels...


516it [12:01,  1.40s/it]

Sauvegarde des features_eval et labels dans features_eval_labels_eval_spacy.pkl...
Sauvegarde des features_eval et labels dans features_eval_labels_eval_spacy.pkl...





In [41]:
embedding_eval= np.vstack(features_eval['Embedding'].apply(convert_embedding).values)


columns_to_drop = ['MatchID', 'PeriodID', 'Embedding']

X_features_eval = features_eval.drop(columns=columns_to_drop).values
X_eval = np.hstack([X_features_eval, embedding_eval])

pred_eval = model.predict(X_eval)  

submission = pd.read_csv('submission.csv')

output_df = pd.DataFrame({
    'ID': submission['ID'],  #
    'Prediction': pred_eval
})

# Sauvegarder les prédictions
output_df.to_csv("submission_ensemble.csv", index=False)
print("Predictions saved to submission_ensemble.csv")

Predictions saved to submission_ensemble.csv
