**NOTE**: Make sure to select a GPU runtime. Otherwise, the model can take quite some time to create the document embeddings!

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# we start with installing bertopic from pypi before preparing the data

!pip install bertopic[all]

In [None]:
!pip install flair

In [None]:
import pandas as pd
from bertopic import BERTopic
from flair.embeddings import TransformerDocumentEmbeddings
from gensim.models.coherencemodel import CoherenceModel
import gensim.corpora as corpora
from gensim.models import LdaMulticore
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF


In [None]:
# add your data path

data=  pd.read_csv("/content/drive/MyDrive/sport_data.csv")
data.drop([data.columns[0]],axis=1,inplace=True)
data.head()

Unnamed: 0,text
0,حقق فريق تشيلسي فوزا ثمينا على مانشستر سيتي في...
1,لا يعتزم اللاعبون المسلمون في المنتخب الفرنسي ...
2,كشفت صحيفة دير شبيغل الألمانية الجمعة أن فوز ا...
3,كشفت تقارير إخبارية أن البرازيلي نيمار نصح زمي...
4,في ما يلي البرنامج الكامل لنهائيات كأس اوروبا ...


In [None]:
data.shape

(46522, 1)

In [None]:
df=data.dropna()
df.shape

(43675, 1)

In [None]:
documents = df['text'].values
len(documents)

43675

#Embedding model
BERTopic has two default embedding models: "distilbert-base-nli-stsb-mean-tokens'' for the English language and "xlm-r-bert-base-nli-stsb-meantokens" for any language other than English, where XLM-R models support 50+ languages.

In [None]:
arabert = TransformerDocumentEmbeddings('aubmindlab/bert-base-arabertv02')

tokenizer_config.json:   0%|          | 0.00/381 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/384 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/825k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.64M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/543M [00:00<?, ?B/s]

# **Create Topics**


For BERTopic you do not need to define the number of topics in advance, however, if you want to do so simply pass the number of topics to BERTopic with nr_topics paramete.

In [None]:
topic_model = BERTopic(language="arabic", low_memory=True ,calculate_probabilities=False,
                     embedding_model=arabert,verbose=True)

NOTE: Calculating probabilities can slow down BERTopic significantly at large amounts of data (>100_000 documents). It is advised to turn this off if you want to speed up the model.

In [None]:
topics, probs = topic_model.fit_transform(documents)

In [None]:
#extract most frequent topics

topic_model.get_topic_freq().head(5)

Unnamed: 0,Topic,Count
1,-1,17601
51,0,1629
313,1,1504
104,2,1241
93,3,706


-1 refers to all outliers and should typically be ignored. Next, let's take a look at the most frequent topic that was generated:

In [None]:
#show the top 10 words in topic 1

topic_model.get_topic(10)[:10]

[('الأندية', 0.010563022539130006),
 ('الجامعة', 0.009015484795396838),
 ('المتقي', 0.008334894740953813),
 ('الإله', 0.007446383309789071),
 ('لقجع', 0.007189516237009373),
 ('جامعة', 0.007131754850249184),
 ('المصادر', 0.006799272982967916),
 ('الأساسي', 0.0065358526890783855),
 ('الجمع', 0.006463224847901155),
 ('نفسها', 0.0060128217012677745)]

# Evaluation
To evaluate the model topics coherence we use [Gensim](https://radimrehurek.com/gensim/models/coherencemodel.html) implementation of the Normalized
Pointwise Mutual Information (NPMI).

In [None]:
texts = [[word for word in str(document).split()] for document in documents]
id2word = corpora.Dictionary(texts)
corpus = [id2word.doc2bow(text) for text in texts]

In [None]:
topics=[]
for i in topic_model.get_topics():
  row=[]
  topic= topic_model.get_topic(i)
  for word in topic:
     row.append(word[0])
  topics.append(row)

In [None]:
# compute Coherence Score

cm = CoherenceModel(topics=topics, texts=texts, corpus=corpus, dictionary=id2word, coherence='c_npmi')
coherence = cm.get_coherence()
print('\nCoherence Score: ', coherence)


Coherence Score:  0.1011377830572811


# **Visualize Topics**
After having trained our `BERTopic` model, we can iteratively go through perhaps a hundred topic to get a good
understanding of the topics that were extract. However, that takes quite some time and lacks a global representation.
Instead, we can visualize the topics that were generated in a way very similar to
[LDAvis](https://github.com/cpsievert/LDAvis):

In [None]:
topic_model.visualize_topics()

# **Model serialization**
The model and its internal settings can easily be saved. Note that the documents and embeddings will not be saved. However, UMAP and HDBSCAN will be saved.

In [None]:
# Save model
topic_model.save("my_model")



In [None]:
# Load model
my_model = BERTopic.load("my_model")

# LDA

We use the [ parallelized Latent Dirichlet Allocation (LDA)](https://radimrehurek.com/gensim/models/ldamulticore.html) from Gensim.

Note: for LDA you have to define topics number in advance.

In [None]:
#chang the number of topics here
no_topics = 1

# run LDA
lda = LdaMulticore(corpus, id2word=id2word, num_topics=no_topics)

In [None]:
#compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda, texts=texts, dictionary=id2word, coherence='c_npmi')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Coherence Score:  -0.022347810015432682


In [None]:
lda.print_topics(num_topics=-1,num_words=10)

[(0,
  '0.034*"في" + 0.022*"من" + 0.015*"أن" + 0.015*"إلى" + 0.014*"على" + 0.007*"الفريق" + 0.007*"التي" + 0.007*"الذي" + 0.006*"عن" + 0.005*"مع"')]

In [None]:
import pandas as pd
from gensim import corpora
from gensim.models import LdaMulticore
from gensim.models.coherencemodel import CoherenceModel
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.isri import ISRIStemmer
import nltk

In [None]:
nltk.download('all')

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_ru.zip.
[nltk_data]    | Downloading package basque_grammars to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping grammars/basque_grammars.zip.
[nltk_data]    | Downloading package bcp47 to /root/nltk_data...
[nltk_data]    | Downloading package biocreative_ppi to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   U

True

In [None]:
def preprocess(text):
    stop_words = set(stopwords.words('arabic'))
    stemmer = ISRIStemmer()
    words = word_tokenize(text)
    words = [w for w in words if w.isalpha() and w not in stop_words]
    words = [stemmer.stem(w) for w in words]
    return words

In [None]:
# Apply preprocessing
df['processed'] = df['text'].apply(preprocess)


In [None]:
texts = df['processed'].tolist()
id2word = corpora.Dictionary(texts)
corpus = [id2word.doc2bow(text) for text in texts]

In [None]:
# Train LDA model
num_topics = 5
lda_model = LdaMulticore(corpus, id2word=id2word, num_topics=num_topics, passes=10, workers=2, random_state=20)


os.fork() was called. os.fork() is incompatible with multithreaded code, and JAX is multithreaded, so this will likely lead to a deadlock.



In [None]:
# Compute Coherence Score to check model quality
coherence_model_lda = CoherenceModel(model=lda_model, texts=texts, dictionary=id2word, coherence='c_npmi')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Coherence Score:  0.01619988193155986


In [None]:
# Inspect topics to identify the football topic
topics = lda_model.print_topics(num_words=10, num_topics=-1)
for idx, topic in topics:
    print(f"Topic {idx}: {topic}")

Topic 0: 0.036*"فرق" + 0.034*"لعب" + 0.020*"درب" + 0.010*"عقد" + 0.010*"وسم" + 0.007*"ريض" + 0.007*"قبل" + 0.006*"رجء" + 0.006*"جدد" + 0.006*"انه"
Topic 1: 0.019*"لعب" + 0.012*"دور" + 0.011*"رنس" + 0.010*"فرق" + 0.009*"هدف" + 0.009*"قدم" + 0.008*"ورب" + 0.008*"اسب" + 0.007*"ندي" + 0.007*"علم"
Topic 2: 0.029*"جمع" + 0.016*"ريض" + 0.013*"رئس" + 0.010*"قدم" + 0.010*"فرق" + 0.009*"قرر" + 0.009*"لعب" + 0.008*"ندي" + 0.008*"كتب" + 0.007*"لجن"
Topic 3: 0.035*"برا" + 0.027*"فرق" + 0.025*"هدف" + 0.019*"نقط" + 0.016*"ثني" + 0.016*"لعب" + 0.014*"دور" + 0.012*"بطل" + 0.011*"جمع" + 0.011*"فوز"
Topic 4: 0.043*"نخب" + 0.023*"غرب" + 0.022*"وطن" + 0.019*"جمع" + 0.012*"فرق" + 0.012*"علم" + 0.012*"كأس" + 0.011*"برا" + 0.011*"لعب" + 0.011*"قبل"


Based on the analysis, Topic 3 seems to be the most relevant to football as it includes specific football-related keywords

In [None]:
football_topic_idx = 3

# Assign a relevance score to each document based on the football topic
def get_relevance_score(document):
    bow = id2word.doc2bow(document)
    topics = lda_model.get_document_topics(bow)
    for topic_num, prob in topics:
        if topic_num == football_topic_idx:
            return prob*10
    return 0

In [None]:
df['relevance_score'] = df['processed'].apply(get_relevance_score)


In [None]:
df.drop(['processed'],axis=1)

Unnamed: 0,text,relevance_score
0,حقق فريق تشيلسي فوزا ثمينا على مانشستر سيتي في...,3.650022
1,لا يعتزم اللاعبون المسلمون في المنتخب الفرنسي ...,0.000000
2,كشفت صحيفة دير شبيغل الألمانية الجمعة أن فوز ا...,0.000000
3,كشفت تقارير إخبارية أن البرازيلي نيمار نصح زمي...,0.000000
4,في ما يلي البرنامج الكامل لنهائيات كأس اوروبا ...,1.713582
...,...,...
46517,اللاعب تأخر في العودة إلى التداريب والمدرب غاض...,0.964698
46518,المشرف العام لحسنية أكادير قال إنه سيغادر الفر...,0.000000
46519,نسب إليه نتائج الوداد وصحوة الرجاء وآخر صيحاته...,0.000000
46520,ستحتضن الرباط في الفترة مابين يوليوز المقبل دو...,0.000000


In [None]:
max_index = df['relevance_score'].idxmax()

print(df['text'][max_index])
print('________________________')
print(df['relevance_score'][max_index])


شباب المسيرة يضرب بقوة ووداد فاس يرغم الخميسات على التعادل حافظ شباب قصبة تادلة على صدارة القسم الثاني بتعادله أمام مضيفه أولمبيك مراكش دون أهداف السبت الماضي لحساب الدورة من منافسات بطولة القسم الثاني لكرة القدم واستغل الفريق التادلاوي تعادل جمعية سلا بميدانه للبقاء وحيدا في المقدمة برصيد نقطة جمعها من سبعة انتصارات وأربعة تعادلات فيما يحتل أولمبيك مراكش المركز الرابع نقطة ولم يستغل جمعية سلا المطارد المباشر عامل الاستقبال بميدانه وأرغمه أولمبيك الدشيرة على التعادل بهدف لمثله ورغم تعادل جمعية سلا إلا أنه يواصل تشديد الخناق على المتصدر قصبة تادلة إذ لم تفصله عنه سوى نقطة واحدة في الوقت الذي واصل أولمبيك الدشيرة نتائجه الإيجابية بحلوله في المركز الثالث نقطة من جهته ضرب شباب المسيرة بقوة وهو يفوز على الرسينغ البيضاوي بأربعة أهداف لصفر ليضع حدا لهزائمه المتتالية ويرتقي إلى المركز الثامن نقطة فيما حافظ الراسينغ البيضاوي على المركز الخامس نقطة إلى جانب أولمبيك مراكش وفشل اتحاد الخميسات في تحسين موقعه بعد تعادله أمام وداد فاس بهدفين لمثلهما في المباراة التي جمعتهما بملعب نونبر في الخميسات إذ

In [None]:
df['relevance_score'].value_counts()

relevance_score
0.000000    18943
1.003368        4
1.529186        2
2.699403        2
1.112502        2
            ...  
0.441897        1
3.745031        1
0.257102        1
0.588821        1
0.372773        1
Name: count, Length: 24719, dtype: int64

In [None]:
df = df[df['relevance_score'] > 1]

# Reset the index of the DataFrame after removing rows
df.reset_index(drop=True, inplace=True)

df['relevance_score'].value_counts()

relevance_score
1.003368    4
9.951023    2
9.925513    2
2.699403    2
1.529186    2
           ..
8.232177    1
1.382262    1
4.162155    1
2.162482    1
9.957753    1
Name: count, Length: 17114, dtype: int64

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
df.to_csv('data_with_score.csv')

NameError: name 'df' is not defined

In [1]:
import pandas as pd

df = pd.read_csv('/content/drive/MyDrive/data_with_score.csv')
df = df.head(5000)

In [2]:
df.drop(['Unnamed: 0'],axis = 1,inplace=True)

In [3]:
df

Unnamed: 0,text,processed,relevance_score
0,حقق فريق تشيلسي فوزا ثمينا على مانشستر سيتي في...,"['حقق', 'فرق', 'شيلس', 'فوز', 'ثمي', 'مانشستر'...",3.650022
1,في ما يلي البرنامج الكامل لنهائيات كأس اوروبا ...,"['يلي', 'رنمج', 'كمل', 'نهئ', 'كأس', 'ورب', 'ل...",1.713582
2,تأهلت بلجيكا لنهائيات كأس العالم لكرة القدم قب...,"['أهل', 'بلج', 'نهئ', 'كأس', 'علم', 'لكر', 'قد...",3.284560
3,واصل النجم البرتغالي كريستيانو رونالدو تألقه م...,"['وصل', 'نجم', 'رتغال', 'كريستيانو', 'رونالدو'...",3.360700
4,استهل باريس سان جيرمان مبارياته في الدوري الفر...,"['سهل', 'ارس', 'سان', 'جيرم', 'مبارياته', 'دور...",3.755725
...,...,...,...
4995,فاز المنتخب الوطني المغربي على نظيره البوركينا...,"['فاز', 'نخب', 'وطن', 'غرب', 'نظر', 'بوركينابي...",2.664632
4996,عبر لاعبو الكوكب المراكشي عن غضبهم من المكتب ا...,"['عبر', 'عبو', 'كوكب', 'ركش', 'غضب', 'كتب', 'س...",2.444358
4997,صنفت صحيفة فرانس فوتبول الفرنسية يونس بلهندة ا...,"['صنف', 'صحف', 'رنس', 'تبل', 'رنس', 'ونس', 'هن...",1.107268
4998,تمك نادي الوداد الرياضي البيضاوي من التأهل إلى...,"['تمك', 'ندي', 'ودد', 'ريض', 'يضو', 'أهل', 'دو...",6.697766


In [4]:
df['processed'] = df['processed'].apply(lambda x: ' '.join(x))

In [5]:
!pip install tensorflow



In [7]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout, SpatialDropout1D
from keras.optimizers import Adam
from sklearn.model_selection import train_test_split

In [8]:
# Tokenize the text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['processed'])
sequences = tokenizer.texts_to_sequences(df['processed'])

In [19]:
# Pad the sequences
max_sequence_length = 500
X = pad_sequences(sequences, maxlen=max_sequence_length)

In [20]:
max_sequence_length

500

In [21]:
y = df['relevance_score'].values

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=20)

In [23]:
X_train

array([[ 0,  0,  0, ...,  6,  2,  1],
       [ 1, 23,  9, ..., 19,  2,  1],
       [ 1,  1,  8, ..., 20,  3,  1],
       ...,
       [ 1,  1, 32, ..., 10,  9,  1],
       [ 1,  1, 10, ..., 17, 11,  1],
       [ 1,  8, 10, ...,  2, 10,  1]], dtype=int32)

In [24]:
# Define the RNN model
embedding_dim = 100
vocab_size = len(tokenizer.word_index) + 1

model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_sequence_length))
model.add(LSTM(64, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))



In [25]:
model.compile(optimizer=Adam(learning_rate=0.001), loss='mean_squared_error')

# Print model summary
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 500, 100)          13700     
                                                                 
 lstm_1 (LSTM)               (None, 64)                42240     
                                                                 
 dense_1 (Dense)             (None, 1)                 65        
                                                                 
Total params: 56005 (218.77 KB)
Trainable params: 56005 (218.77 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [27]:
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test),verbose=1)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [29]:
from sklearn.metrics import mean_squared_error

mean_squared_error(y_test, model.predict(X_test))



14.070557364310147