In [2]:
import os
import re
import gensim.downloader as api
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Understanding `NLTK`

### 1. Download

In [3]:
# Download the NLTK corpora used by the cells below: the stopword lists
# (read through `stopwords.words`) and WordNet (used by `WordNetLemmatizer`).
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /users/eleves-b/2022/axel.delaval/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /users/eleves-b/2022/axel.delaval/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

### 2. Stopwords 

*Les stopwords sont des mots communs (comme "the", "is", etc.) que l'on peut enlever dans des tâches de NLP pour simplifier le texte.*



In [4]:
# Example: removing stopwords from a sentence
sample_text = "This is an example sentence to demonstrate the removal of stopwords."
stop_words = set(stopwords.words('english'))
# Lowercase, split on whitespace, then keep only the tokens that are not stopwords
lowercase_tokens = sample_text.lower().split()
filtered_words = list(filter(lambda token: token not in stop_words, lowercase_tokens))
print("Text with stopwords removed:", ' '.join(filtered_words))


Text with stopwords removed: example sentence demonstrate removal stopwords.


### 3. Lemmatisation
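*Réduit les mots à leur forme canonique (lemme). Sans étiquette grammaticale, `WordNetLemmatizer` traite chaque mot comme un nom : c'est pourquoi « running » reste inchangé dans l'exemple ci-dessous.*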

In [7]:
# Lemmatize a few sample words; each word is passed on its own, without a part-of-speech tag
lemmatizer = WordNetLemmatizer()
words = ["wolves", "phenomena", "running", "jumps", "easily", "fairly"]
lemmatized_words = list(map(lemmatizer.lemmatize, words))
print("Lemmatized words:", lemmatized_words)


Lemmatized words: ['wolf', 'phenomenon', 'running', 'jump', 'easily', 'fairly']


# Understanding `GloVe`

*GloVe, qui signifie Global Vectors for Word Representation, est un modèle de vecteurs de mots développé par l'équipe de Stanford. Son objectif est de capturer les relations sémantiques entre les mots en créant des représentations vectorielles où des mots ayant des significations similaires sont proches dans l'espace vectoriel.*

### 1. Load the model 

In [8]:
# Load the pre-trained GloVe model (200 dimensions) through the gensim downloader
embeddings_model = api.load("glove-twitter-200")

### 2. Look at the similarity between words

In [13]:
# Similarity between pairs of words
word1, word2, word3 = "football", "soccer", "basketball"
word4, word5, word6 = 'cat', 'dog', 'puppy'

# Pairs to compare, in the order in which they are reported
word_pairs = [
    (word1, word2),
    (word1, word3),
    (word1, word4),
    (word4, word5),
    (word4, word6),
    (word5, word6),
]

# Compute every similarity first, then report them
(similarity_12, similarity_13, similarity_14,
 similarity_45, similarity_46, similarity_56) = [
    embeddings_model.similarity(left, right) for left, right in word_pairs
]
all_similarities = [similarity_12, similarity_13, similarity_14,
                    similarity_45, similarity_46, similarity_56]

for (left, right), score in zip(word_pairs, all_similarities):
    print(f"Similarity between '{left}' and '{right}':", score)

Similarity between 'football' and 'soccer': 0.84885186
Similarity between 'football' and 'basketball': 0.79156566
Similarity between 'football' and 'cat': 0.33143294
Similarity between 'cat' and 'dog': 0.83243024
Similarity between 'cat' and 'puppy': 0.7023193
Similarity between 'dog' and 'puppy': 0.7890411


### 4. Show a word vector

In [21]:
def print_only_few_elements(vector, n=3):
    """
    Build a short textual preview of a vector: its first and last `n` elements.

    Despite its name, this function does not print anything; it returns the string.

    Parameters:
        vector (sequence): Any sliceable sequence with a length (list, np.ndarray, ...)
        n (int): Number of elements to show at each end

    Returns:
        str: '[a, b, c, ... , x, y, z]' when elements are omitted,
             '[a, b, c, d]' when the vector has at most 2 * n elements,
             '[...]' when n <= 0 and the vector is not empty
    """
    size = len(vector)
    # Head and tail would touch or overlap: show every element once, with no ellipsis
    if size <= 2 * n:
        return '[' + ', '.join(str(x) for x in vector) + ']'
    # Guard n <= 0 explicitly: vector[-0:] is the WHOLE vector, not an empty slice
    if n <= 0:
        return '[...]'
    head = ', '.join(str(x) for x in vector[:n])
    tail = ', '.join(str(x) for x in vector[size - n:])
    return '[' + head + ', ... , ' + tail + ']'

# Example with a single word
word = "football"
if word not in embeddings_model:
    print(f"'{word}' not found in the model vocabulary.")
else:
    vector = embeddings_model[word]
    # Show only the first 3 and last 3 components rather than the full vector
    vector_preview = print_only_few_elements(vector)
    print(f"Vector for '{word}':", vector_preview)
    print("Vector dimension:", len(vector))


Vector for 'football': [-0.30796, 0.42961, 0.063245, ... , -0.05676, -0.3919, 0.65645]
Vector dimension: 200


### 5. Test `get_avg_embedding`

In [24]:
# Average word vector of a tweet
def get_avg_embedding(tweet, model, vector_size=200):
    """
    Average the embedding vectors of the words of a tweet.

    The tweet is split on whitespace; words that are not in `model` are ignored.

    Parameters:
        tweet (str): The tweet text
        model: Word-embedding lookup supporting `word in model` and `model[word]`
            (here, the object returned by `api.load`)
        vector_size (int): Length of the zero vector returned when no word is known

    Returns:
        np.ndarray: Element-wise mean of the known word vectors, or a zero vector
        of length `vector_size` if none of the words is in the vocabulary
    """
    known_vectors = []
    for token in tweet.split():
        if token in model:
            known_vectors.append(model[token])

    # No known word at all: fall back to a zero vector
    if len(known_vectors) == 0:
        return np.zeros(vector_size)
    return np.mean(known_vectors, axis=0)

# Try the helper on a sample sentence and show a short preview of the resulting vector
sentence = "I love watching football games"
avg_vector = get_avg_embedding(sentence, embeddings_model, vector_size=200)
print(f"Average embedding vector for sentence '{sentence}' :", print_only_few_elements(avg_vector))

Average embedding vector for sentence 'I love watching football games' : [-0.015309997, 0.23268776, -0.05199425, ... , 0.020067502, -0.17745501, 0.31927276]


# Understanding pre-processing

In [27]:
# Basic preprocessing function
# Cache for the default NLTK resources, filled on first use (see _get_preprocess_defaults)
_PREPROCESS_DEFAULTS = {}


def _get_preprocess_defaults():
    """Return the default (stop-word set, lemmatizer) pair, building it only once."""
    if not _PREPROCESS_DEFAULTS:
        _PREPROCESS_DEFAULTS['stop_words'] = frozenset(stopwords.words('english'))
        _PREPROCESS_DEFAULTS['lemmatizer'] = WordNetLemmatizer()
    return _PREPROCESS_DEFAULTS['stop_words'], _PREPROCESS_DEFAULTS['lemmatizer']


def preprocess_text(text, stop_words=None, lemmatizer=None):
    """
    Normalize a tweet: lowercase, strip punctuation and digits, drop stopwords, lemmatize.

    The English stop-word set and the WordNet lemmatizer are built once and reused
    across calls instead of being rebuilt for every tweet, which matters when the
    function is applied to a whole dataframe column.

    Parameters:
        text (str): The raw text
        stop_words (collection of str, optional): Words to remove; defaults to the
            NLTK English stopwords
        lemmatizer (optional): Object with a `lemmatize(word)` method; defaults to
            a shared `WordNetLemmatizer`

    Returns:
        str: The remaining lemmatized words joined by single spaces
    """
    if stop_words is None or lemmatizer is None:
        default_stop_words, default_lemmatizer = _get_preprocess_defaults()
        if stop_words is None:
            stop_words = default_stop_words
        if lemmatizer is None:
            lemmatizer = default_lemmatizer

    # Lowercasing
    text = text.lower()
    # Remove punctuation (anything that is neither a word character nor whitespace);
    # note that this glues URLs and emoticons into single tokens or removes them
    text = re.sub(r'[^\w\s]', '', text)
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # Tokenization on whitespace, stopword removal, then lemmatization
    words = [lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words]
    return ' '.join(words)

# Try the preprocessing function on a sample tweet and compare before/after
text = "I love watching football games! #Sport :) 🏈"
clean_text = preprocess_text(text)
print(f"Text before preprocessing: '{text}'")
print(f"Text after preprocessing: '{clean_text}'")

Text before preprocessing: 'I love watching football games! #Sport :) 🏈'
Text after preprocessing: 'love watching football game sport'


### 0. Initialisation

*paths* 

In [44]:
# Root folder of the challenge data, relative to this notebook
path_to_data = "../../challenge_data/"

# Sub-folders holding the training and evaluation tweet files
path_to_training_tweets = os.path.join(path_to_data, "train_tweets")
path_to_eval_tweets = os.path.join(path_to_data, "eval_tweets")

*Import the training data into a pandas DataFrame*

In [36]:
# Read all training files and concatenate them into one dataframe.
# os.listdir returns entries in arbitrary order, so they are sorted to get the same
# row order on every run. Hidden entries and sub-directories (e.g. ".ipynb_checkpoints")
# are skipped because they are not tweet files.
training_files = [
    os.path.join(path_to_training_tweets, filename)
    for filename in sorted(os.listdir(path_to_training_tweets))
    if not filename.startswith(".")
]
li = [pd.read_csv(path) for path in training_files if os.path.isfile(path)]
df = pd.concat(li, ignore_index=True)

# li is the list of dataframes, df is the concatenated dataframe
print(df)

             ID  MatchID  PeriodID  EventType      Timestamp  \
0           2_0        2         0          0  1403538600000   
1           2_0        2         0          0  1403538600000   
2           2_0        2         0          0  1403538600000   
3           2_0        2         0          0  1403538600000   
4           2_0        2         0          0  1403538600000   
...         ...      ...       ...        ...            ...   
5056045  17_129       17       129          1  1403805600000   
5056046  17_129       17       129          1  1403805600000   
5056047  17_129       17       129          1  1403805600000   
5056048  17_129       17       129          1  1403805600000   
5056049  17_129       17       129          1  1403805600000   

                                                     Tweet  
0        RT @soccerdotcom: If #ESP beats #AUS we'll giv...  
1        Visit the #SITEP official web site here http:/...  
2        RT @soccerdotcom: If #ESP beats #AUS we

*Preprocess by eliminating punctuation, numbers, stopwords (e.g. prepositions), etc.*

In [37]:
# Apply preprocessing to each tweet.
# NOTE: this overwrites the raw 'Tweet' column of df in place, so re-running this cell
# preprocesses text that is already preprocessed; re-run the loading cell first to
# start again from the raw tweets.
df['Tweet'] = df['Tweet'].apply(preprocess_text)

print(df)

             ID  MatchID  PeriodID  EventType      Timestamp  \
0           2_0        2         0          0  1403538600000   
1           2_0        2         0          0  1403538600000   
2           2_0        2         0          0  1403538600000   
3           2_0        2         0          0  1403538600000   
4           2_0        2         0          0  1403538600000   
...         ...      ...       ...        ...            ...   
5056045  17_129       17       129          1  1403805600000   
5056046  17_129       17       129          1  1403805600000   
5056047  17_129       17       129          1  1403805600000   
5056048  17_129       17       129          1  1403805600000   
5056049  17_129       17       129          1  1403805600000   

                                                     Tweet  
0        rt soccerdotcom esp beat au well give away spa...  
1        visit sitep official web site httptcoehzkslan ...  
2        rt soccerdotcom esp beat au well give a

*Transform words into vectors*

In [38]:
# Turn each (already preprocessed) tweet into its average GloVe vector.
# The embedding dimension is read from the loaded model so that it cannot drift
# out of sync with the chosen GloVe variant.
vector_size = embeddings_model.vector_size
tweet_vectors = np.vstack([get_avg_embedding(tweet, embeddings_model, vector_size) for tweet in df['Tweet']])
# One row per tweet, one column per embedding dimension
tweet_df = pd.DataFrame(tweet_vectors)

*Now build X and y: we are looking for a function f such that f(X) = y*

In [39]:
# Build one feature row per period:
#   1. attach the tweet vectors to the original dataframe (column-wise),
#   2. drop the columns that are not useful anymore,
#   3. average everything per (MatchID, PeriodID, ID), which yields an average
#      embedding vector for each period.
period_features = (
    pd.concat([df, tweet_df], axis=1)
    .drop(columns=['Timestamp', 'Tweet'])
    .groupby(['MatchID', 'PeriodID', 'ID'])
    .mean()
    .reset_index()
)

# Features: only the embedding values of each period (label and identifiers removed)
identifier_and_label_columns = ['EventType', 'MatchID', 'PeriodID', 'ID']
X = period_features.drop(columns=identifier_and_label_columns).to_numpy()
# Labels of the training samples
y = period_features['EventType'].to_numpy()

### 1. Logistic regression 

*Splitting the data between training and test*

In [40]:
# Hold out 30% of the periods as a local test set, so the classifier can be assessed
# without tuning on the evaluation data and without submitting too many times to Kaggle.
# random_state is fixed so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

*Predicting with Logistic Regression*

In [41]:
# Basic classifier: fit on the training split, then measure accuracy on the held-out split
clf = LogisticRegression(random_state=42, max_iter=1000)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print("Test set: ", test_accuracy)

Test set:  0.7320872274143302


### 2. Predicting the evaluation data

*Re-training the predictor on the whole dataset*

*By the way, we define a dummy classifier, which constantly predicts the most frequent label that
appears in the training set.*

In [42]:
# Refit the classifier, this time on every labelled period available to us
clf = LogisticRegression(random_state=42, max_iter=1000)
clf.fit(X, y)
# Baseline for sanity purposes: always predicts the most frequent label seen during fit
dummy_clf = DummyClassifier(strategy="most_frequent")
dummy_clf.fit(X, y)

*Now apply these models on the evaluation dataset*

In [45]:
predictions = []
dummy_predictions = []
# We read each file separately, we preprocess the tweets and then use the classifiers to predict the labels.
# Finally, we collect all predictions into lists that will eventually be concatenated and exported
# to be submitted on Kaggle.
# Files are visited in sorted order (os.listdir order is arbitrary) so the exported rows come out
# in the same order on every run. All intermediate results use eval-specific names so that the
# training matrix X and the training dataframes are NOT overwritten by this cell.
for fname in sorted(os.listdir(path_to_eval_tweets)):
    eval_path = os.path.join(path_to_eval_tweets, fname)
    # Skip hidden entries and sub-directories (e.g. ".ipynb_checkpoints"): they are not tweet files
    if fname.startswith(".") or not os.path.isfile(eval_path):
        continue

    val_df = pd.read_csv(eval_path)
    val_df['Tweet'] = val_df['Tweet'].apply(preprocess_text) # Preprocess the tweets to have the same format as the training data
    eval_vectors = np.vstack([get_avg_embedding(tweet, embeddings_model, vector_size) for tweet in val_df['Tweet']])
    eval_vector_df = pd.DataFrame(eval_vectors)

    # Same feature construction as for training: one averaged embedding per period
    eval_features = pd.concat([val_df, eval_vector_df], axis=1)
    eval_features = eval_features.drop(columns=['Timestamp', 'Tweet'])
    eval_features = eval_features.groupby(['MatchID', 'PeriodID', 'ID']).mean().reset_index()
    X_eval = eval_features.drop(columns=['MatchID', 'PeriodID', 'ID']).values

    preds = clf.predict(X_eval)
    dummy_preds = dummy_clf.predict(X_eval)

    eval_features['EventType'] = preds
    eval_features['DummyEventType'] = dummy_preds

    predictions.append(eval_features[['ID', 'EventType']])
    dummy_predictions.append(eval_features[['ID', 'DummyEventType']])

*Now save them into csv files*

In [46]:
# Write one CSV per classifier: the per-file prediction frames are concatenated, then exported
export_jobs = (
    (predictions, 'logistic_predictions.csv'),
    (dummy_predictions, 'dummy_predictions.csv'),
)
for prediction_frames, csv_name in export_jobs:
    pred_df = pd.concat(prediction_frames)
    pred_df.to_csv(csv_name, index=False)