In [29]:
import os
import polars as pl
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.feature_extraction.text import CountVectorizer
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from sklearn.utils import resample
from sklearn.utils import shuffle
from sklearn.metrics import confusion_matrix,classification_report
import re

# Loading

In [30]:
# Load data
def load_data(path, verbose=False):
    li = []
    for filename in os.listdir(path):
        df = pd.read_csv(os.path.join(path, filename))
        li.append(df)
    output = pd.concat(li)
    if verbose:
        print(output.head())
        print(f'The shape of the data is: {output.shape}')
    return output

# Define paths
path_to_data = "../../challenge_data/"
path_to_training_tweets = os.path.join(path_to_data, "train_tweets")
path_to_eval_tweets = os.path.join(path_to_data, "eval_tweets")
output_path = "evaluation_predictions.csv"

In [31]:
# Load and preprocess training data
df_train = load_data(path_to_training_tweets)

print(df_train)

            ID  MatchID  PeriodID  EventType      Timestamp  \
0          2_0        2         0          0  1403538600000   
1          2_0        2         0          0  1403538600000   
2          2_0        2         0          0  1403538600000   
3          2_0        2         0          0  1403538600000   
4          2_0        2         0          0  1403538600000   
...        ...      ...       ...        ...            ...   
256440  17_129       17       129          1  1403805600000   
256441  17_129       17       129          1  1403805600000   
256442  17_129       17       129          1  1403805600000   
256443  17_129       17       129          1  1403805600000   
256444  17_129       17       129          1  1403805600000   

                                                    Tweet  
0       RT @soccerdotcom: If #ESP beats #AUS we'll giv...  
1       Visit the #SITEP official web site here http:/...  
2       RT @soccerdotcom: If #ESP beats #AUS we'll giv...  
3  

In [32]:
MAX_SUBGROUP = 30

# Create an array of random integers in {0, ..., MAX_SUBGROUP} of size len(df_train)
df_train["random_id"] = np.random.randint(0, MAX_SUBGROUP, len(df_train))

print(df_train)

            ID  MatchID  PeriodID  EventType      Timestamp  \
0          2_0        2         0          0  1403538600000   
1          2_0        2         0          0  1403538600000   
2          2_0        2         0          0  1403538600000   
3          2_0        2         0          0  1403538600000   
4          2_0        2         0          0  1403538600000   
...        ...      ...       ...        ...            ...   
256440  17_129       17       129          1  1403805600000   
256441  17_129       17       129          1  1403805600000   
256442  17_129       17       129          1  1403805600000   
256443  17_129       17       129          1  1403805600000   
256444  17_129       17       129          1  1403805600000   

                                                    Tweet  random_id  
0       RT @soccerdotcom: If #ESP beats #AUS we'll giv...         20  
1       Visit the #SITEP official web site here http:/...         29  
2       RT @soccerdotcom: If #

In [33]:
# Concatenate tweets
df_train_bis = (
    df_train
    .groupby(["ID", "MatchID", "PeriodID", "random_id"], as_index=False)
    .agg({
        "Tweet": lambda x: " ".join(x),
        "EventType": "first"
    })
    .sort_values(["ID", "MatchID", "PeriodID", "random_id"])
)

# Drop the "random_id" column
df_train_bis = df_train_bis.drop(columns=["random_id"])

print(df_train_bis)

         ID  MatchID  PeriodID  \
0       0_0        0         0   
1       0_0        0         0   
2       0_0        0         0   
3       0_0        0         0   
4       0_0        0         0   
...     ...      ...       ...   
64097  8_99        8        99   
64098  8_99        8        99   
64099  8_99        8        99   
64100  8_99        8        99   
64101  8_99        8        99   

                                                   Tweet  EventType  
0      RT @trueSCRlife: If #Shaqiri scores vs #HON we...          0  
1      RT @rogerfederer: Playing tomorrow on Center C...          0  
2      The more goals France scores today the less go...          0  
3      World Cup games at 4 pm ET: France-Ecuador on ...          0  
4      RT @steveaoki: HONDURAS HERE WE COME! Line-up:...          0  
...                                                  ...        ...  
64097  RT @FIFAWorldCup: GOAL: #CMR 1-3 #BRA @fredgol...          1  
64098  Vamos #BRA ready for #Wo

# Preprocessing

In [None]:
df_train_bis["Tweet"] = df_train_bis["Tweet"].str.cat(sep=" ")
print(df_train_bis)


In [28]:
df_train_bis = df_train_bis.with_columns(
    pl.col("Concatenated_Tweets").apply(lambda x: re.sub(r'[^a-zA-Z0-9\s]', '', x)).alias("Concatenated_Tweets")
)

AttributeError: 'Expr' object has no attribute 'apply'