# Libraries

In [None]:
#Preprocessing
import pandas as pd
import re
import nltk
import torch
import numpy as np
from sklearn.preprocessing import LabelEncoder
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, SnowballStemmer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

# Fetch the NLTK data packages this notebook relies on: the 'punkt' tokenizer
# models and the 'stopwords' corpus.
for nltk_resource in ('punkt', 'stopwords'):
    nltk.download(nltk_resource)

#Word representation/embedding
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
from transformers import BertTokenizer, BertModel
from gensim.models.fasttext import FastText

#Models
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Bidirectional, Input, Embedding, Dropout
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.tree import DecisionTreeClassifier

#Needed for models like optimizers
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from keras.layers import Reshape

#Evaluation metrics
from sklearn.metrics import accuracy_score, classification_report

# Load data: read the train and test workbooks, in that order.
# pd.read_excel needs the optional 'openpyxl' package for .xlsx files.
df_train, df_test = (
    pd.read_excel(workbook) for workbook in ('News_train.xlsx', 'News_test.xlsx')
)

# Initialize Arabic stopwords as a set for fast membership tests
arabic_stop_words = {*stopwords.words('arabic')}

# Other preprocessing steps and model implementation will go here
from gensim.models import KeyedVectors
from tensorflow.keras.preprocessing.text import Tokenizer
import tensorflow as tf
from nltk.stem import SnowballStemmer
from transformers import AutoTokenizer, AutoModelForSequenceClassification

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


ImportError: Missing optional dependency 'openpyxl'.  Use pip or conda to install openpyxl.

# Code-1 (TF-IDF and Naive Bayes/DT)

In [None]:
# Re-read both workbooks so this cell works on freshly loaded frames.
df_train, df_test = map(pd.read_excel, ('News_train.xlsx', 'News_test.xlsx'))

def preprocess(text):
    """Normalise one raw Arabic news string.

    Steps, in order: drop every character that is neither a word character
    nor whitespace, trim both ends, squeeze any run of one repeated character
    down to two, fold the alef variants into bare alef, map alef maqsura to
    ya, waw-hamza to waw and ta marbuta to ha, then delete ASCII letters and
    digits. Trimming happens before the ASCII removal, so whitespace that
    surrounded removed ASCII text can remain.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    cleaned = re.sub(r'[^\w\s]', '', text).strip()
    # Applied sequentially: every rule sees the output of the previous one.
    substitutions = (
        (r'(.)\1+', r"\1\1"),
        ("[إأٱآا]", "ا"),
        ("ى", "ي"),
        ("ؤ", "و"),
        ("ة", "ه"),
        (r'[A-Za-z0-9]', r''),
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned

# Clean the raw article text of both splits.
df_train['preprocessed_news'] = df_train['News'].apply(preprocess)
df_test['preprocessed_news'] = df_test['News'].apply(preprocess)

# Word-level tokens for every cleaned article.
train_token = [nltk.word_tokenize(text) for text in df_train['preprocessed_news']]
test_token = [nltk.word_tokenize(text) for text in df_test['preprocessed_news']]

def remove_stop_words(tokens):
    """Return the tokens that are not members of ``arabic_stop_words``."""
    return [token for token in tokens if token not in arabic_stop_words]

train_token = [remove_stop_words(tokens) for tokens in train_token]
test_token = [remove_stop_words(tokens) for tokens in test_token]

def snowball_stemmer(tokens):
    """Stem every token with NLTK's Arabic Snowball stemmer."""
    stemmer = SnowballStemmer('arabic')
    return [stemmer.stem(token) for token in tokens]

df_train['snowball_stemmed'] = [" ".join(snowball_stemmer(tokens)) for tokens in train_token]
df_test['snowball_stemmed'] = [" ".join(snowball_stemmer(tokens)) for tokens in test_token]

# NOTE(review): the TF-IDF features below are built from 'preprocessed_news',
# so the stop-word removal and stemming above do not reach the classifier.
# Confirm whether 'snowball_stemmed' was meant to be used instead.

label_encoder = LabelEncoder()
df_train['Type'] = label_encoder.fit_transform(df_train['Type'])
# Only transform the test labels: re-fitting on the test set could assign
# different integer codes than the ones the classifier is trained on.
df_test['Type'] = label_encoder.transform(df_test['Type'])

# Hold out 20% of the training articles for validation. The seed matches the
# other experiments in this notebook so results are compared on the same split.
X_train, X_val, y_train, y_val = train_test_split(df_train['preprocessed_news'], df_train['Type'], test_size=0.2, random_state=42)

# Fit the vocabulary/IDF weights on the training split only, then reuse them.
tfidf_vectorizer = TfidfVectorizer()
tfidf_features_train = tfidf_vectorizer.fit_transform(X_train)
tfidf_features_val = tfidf_vectorizer.transform(X_val)
tfidf_features_test = tfidf_vectorizer.transform(df_test['preprocessed_news'])

nb_classifier = MultinomialNB()
nb_classifier.fit(tfidf_features_train, y_train)

y_pred = nb_classifier.predict(tfidf_features_val)

accuracy = accuracy_score(y_val, y_pred)

report = classification_report(y_val, y_pred, target_names=label_encoder.classes_)
print("The classification Report: \n", report)

Classification Report:
               precision    recall  f1-score   support

    economic       0.96      0.42      0.58       176
    politics       0.81      0.99      0.90       647
       sport       0.98      0.80      0.88       163
        tech       0.00      0.00      0.00        14

    accuracy                           0.85      1000
   macro avg       0.69      0.55      0.59      1000
weighted avg       0.86      0.85      0.83      1000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Decision tree on the TF-IDF features produced by the previous cell.
# A fixed random_state makes the fitted tree, and therefore the reported
# metrics, identical on every Restart & Run All.
DT_model = DecisionTreeClassifier(random_state=42)

DT_model.fit(tfidf_features_train, y_train)

y_pred_dt = DT_model.predict(tfidf_features_val)

# NOTE: despite its name, acc_train is the accuracy on the validation split.
acc_train = accuracy_score(y_val, y_pred_dt)
print(f"Validation's accuracy: {acc_train}")

y_pred_test = DT_model.predict(tfidf_features_test)

acc_test = accuracy_score(df_test['Type'], y_pred_test)
print(f"Test accuracy: {acc_test}")

report_dt = classification_report(df_test['Type'], y_pred_test)
print("The Classification Report (Decision Tree): \n", report_dt)

Validation Accuracy (Dt): 0.848
Test Accuracy (Decision Tree): 0.779
Classification Report (Decision Tree):
               precision    recall  f1-score   support

           0       0.64      0.61      0.63       200
           1       0.77      0.88      0.82       512
           2       0.94      0.91      0.92       200
           3       0.77      0.26      0.39        88

    accuracy                           0.78      1000
   macro avg       0.78      0.67      0.69      1000
weighted avg       0.78      0.78      0.77      1000



## Without preprocessing

In [None]:
# Baseline: the same TF-IDF + Naive Bayes pipeline on the raw 'News' text.
df_train = pd.read_excel('News_train.xlsx')
df_test = pd.read_excel('News_test.xlsx')

label_encoder = LabelEncoder()
df_train['Type'] = label_encoder.fit_transform(df_train['Type'])
# Only transform the test labels: re-fitting on the test set could assign
# different integer codes than the ones the classifier is trained on.
df_test['Type'] = label_encoder.transform(df_test['Type'])

# Hold out 20% of the training articles for validation.
X_train, X_val, y_train, y_val = train_test_split(df_train['News'], df_train['Type'], test_size=0.2, random_state=42)

# Fit the vocabulary/IDF weights on the training split only, then reuse them.
tfidf_vectorizer = TfidfVectorizer()
tfidf_features_train = tfidf_vectorizer.fit_transform(X_train)
tfidf_features_val = tfidf_vectorizer.transform(X_val)
tfidf_features_test = tfidf_vectorizer.transform(df_test['News'])

nb_classifier = MultinomialNB()
nb_classifier.fit(tfidf_features_train, y_train)

y_pred = nb_classifier.predict(tfidf_features_val)

accuracy = accuracy_score(y_val, y_pred)

report = classification_report(y_val, y_pred, target_names=label_encoder.classes_)
print("Classification Report:\n", report)

Classification Report:
               precision    recall  f1-score   support

    economic       0.96      0.40      0.57       176
    politics       0.81      0.99      0.89       647
       sport       0.98      0.79      0.87       163
        tech       0.00      0.00      0.00        14

    accuracy                           0.84      1000
   macro avg       0.69      0.55      0.58      1000
weighted avg       0.85      0.84      0.82      1000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Code-2 (Word2vec (Wiki 100/300) and Naive Bayes/DT)

In [None]:
!wget "https://archive.org/download/aravec2.0/wiki_cbow_100.zip"
!unzip "wiki_cbow_100.zip"

--2024-06-11 12:23:17--  https://archive.org/download/aravec2.0/wiki_cbow_100.zip
Resolving archive.org (archive.org)... 207.241.224.2
Connecting to archive.org (archive.org)|207.241.224.2|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://ia803107.us.archive.org/0/items/aravec2.0/wiki_cbow_100.zip [following]
--2024-06-11 12:23:18--  https://ia803107.us.archive.org/0/items/aravec2.0/wiki_cbow_100.zip
Resolving ia803107.us.archive.org (ia803107.us.archive.org)... 207.241.232.157
Connecting to ia803107.us.archive.org (ia803107.us.archive.org)|207.241.232.157|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 124043077 (118M) [application/zip]
Saving to: ‘wiki_cbow_100.zip’


2024-06-11 12:26:33 (622 KB/s) - ‘wiki_cbow_100.zip’ saved [124043077/124043077]

Archive:  wiki_cbow_100.zip
  inflating: wikipedia_cbow_100      
  inflating: wikipedia_cbow_100.trainables.syn1neg.npy  
  inflating: wikipedia_cbow_100.wv.vectors.npy  


In [None]:
!wget "https://archive.org/download/aravec2.0/wiki_cbow_300.zip"
!unzip "wiki_cbow_300.zip"

--2024-06-09 16:37:26--  https://archive.org/download/aravec2.0/wiki_cbow_300.zip
Resolving archive.org (archive.org)... 207.241.224.2
Connecting to archive.org (archive.org)|207.241.224.2|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://ia903107.us.archive.org/0/items/aravec2.0/wiki_cbow_300.zip [following]
--2024-06-09 16:37:27--  https://ia903107.us.archive.org/0/items/aravec2.0/wiki_cbow_300.zip
Resolving ia903107.us.archive.org (ia903107.us.archive.org)... 207.241.232.147
Connecting to ia903107.us.archive.org (ia903107.us.archive.org)|207.241.232.147|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 364888893 (348M) [application/zip]
Saving to: ‘wiki_cbow_300.zip’


2024-06-09 16:46:54 (629 KB/s) - ‘wiki_cbow_300.zip’ saved [364888893/364888893]

Archive:  wiki_cbow_300.zip
  inflating: wikipedia_cbow_300      
  inflating: wikipedia_cbow_300.trainables.syn1neg.npy  
  inflating: wikipedia_cbow_300.wv.vectors.npy  


In [None]:
# Re-read both workbooks so this cell works on freshly loaded frames.
df_train, df_test = (
    pd.read_excel(workbook) for workbook in ('News_train.xlsx', 'News_test.xlsx')
)

def preprocess(text):
    """Normalise one raw Arabic news string.

    Steps, in order: drop every character that is neither a word character
    nor whitespace, trim both ends, squeeze any run of one repeated character
    down to two, fold the alef variants into bare alef, map alef maqsura to
    ya, waw-hamza to waw and ta marbuta to ha, then delete ASCII letters and
    digits. Trimming happens before the ASCII removal, so whitespace that
    surrounded removed ASCII text can remain.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    cleaned = re.sub(r'[^\w\s]', '', text).strip()
    # Applied sequentially: every rule sees the output of the previous one.
    substitutions = (
        (r'(.)\1+', r"\1\1"),
        ("[إأٱآا]", "ا"),
        ("ى", "ي"),
        ("ؤ", "و"),
        ("ة", "ه"),
        (r'[A-Za-z0-9]', r''),
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned

# Clean the raw article text of both splits.
df_train['preprocessed_news'] = df_train['News'].apply(preprocess)
df_test['preprocessed_news'] = df_test['News'].apply(preprocess)

# Word-level tokens for every cleaned article.
train_token = list(map(nltk.word_tokenize, df_train['preprocessed_news']))
test_token = list(map(nltk.word_tokenize, df_test['preprocessed_news']))

def remove_stop_words(tokens):
    """Return the tokens that are not members of ``arabic_stop_words``."""
    kept = []
    for token in tokens:
        if token in arabic_stop_words:
            continue
        kept.append(token)
    return kept

train_token = list(map(remove_stop_words, train_token))
test_token = list(map(remove_stop_words, test_token))

# Pretrained Word2Vec model (100-dimensional Wikipedia CBOW file).
t_model = Word2Vec.load('/content/wikipedia_cbow_100')

def word2vec_transform(tokens):
    """Return the Word2Vec vector of every token known to ``t_model``.

    Tokens missing from the model's vocabulary are skipped, so the result can
    be shorter than ``tokens`` or empty.
    """
    keyed_vectors = t_model.wv
    return [keyed_vectors[token] for token in tokens if token in keyed_vectors]

# One list of word vectors per article.
word2vec_train = list(map(word2vec_transform, train_token))
word2vec_test = list(map(word2vec_transform, test_token))

def vec_avg(word_vectors):
    """Average a document's word vectors into a single vector.

    Args:
        word_vectors: Sequence of equally sized word vectors.

    Returns:
        The element-wise mean over the sequence, or a zero vector of length
        ``t_model.vector_size`` when the sequence is empty.
    """
    if len(word_vectors) != 0:
        return np.mean(word_vectors, axis=0)
    # No known words in this document: fall back to an all-zero vector.
    return np.zeros(t_model.vector_size)

# Hold out 20% of the training articles for validation.
X_train_avg, X_val_avg, y_train, y_val = train_test_split(
    word2vec_train, df_train['Type'], test_size=0.2, random_state=42
)

# Collapse every article to one averaged vector and stack them into 2-D arrays.
X_train_avg = np.array([vec_avg(vectors) for vectors in X_train_avg])
X_val_avg = np.array([vec_avg(vectors) for vectors in X_val_avg])
X_test_avg = np.array([vec_avg(vectors) for vectors in word2vec_test])

# Gaussian Naive Bayes on the dense document vectors.
nb_classifier = GaussianNB()
nb_classifier.fit(X_train_avg, y_train)

# Validation split.
y_pred = nb_classifier.predict(X_val_avg)
accuracy = accuracy_score(y_val, y_pred)
print(f"Validation accuracy: {accuracy}")

# Held-out test workbook.
y_pred_test = nb_classifier.predict(X_test_avg)
accuracy_test = accuracy_score(df_test['Type'], y_pred_test)
print(f"Test accuracy: {accuracy_test}")

report = classification_report(df_test['Type'], y_pred_test)
print("The classification Report:\n", report)

Validation Accuracy: 0.859
Test Accuracy: 0.841
Classification Report:
               precision    recall  f1-score   support

    economic       0.68      0.73      0.71       200
    politics       0.88      0.87      0.87       512
       sport       0.97      0.94      0.95       200
        tech       0.75      0.73      0.74        88

    accuracy                           0.84      1000
   macro avg       0.82      0.82      0.82      1000
weighted avg       0.84      0.84      0.84      1000



In [None]:
# Decision tree on the averaged Word2Vec vectors from the previous cell.
# A fixed random_state makes the fitted tree, and therefore the reported
# metrics, identical on every Restart & Run All.
DT_model = DecisionTreeClassifier(random_state=42)

DT_model.fit(X_train_avg, y_train)

y_pred_dt = DT_model.predict(X_val_avg)

# NOTE: despite its name, train_acc is the accuracy on the validation split.
train_acc = accuracy_score(y_val, y_pred_dt)
print(f"Validation Accuracy: {train_acc}")
y_pred_test_dt = DT_model.predict(X_test_avg)

test_acc = accuracy_score(df_test['Type'], y_pred_test_dt)
print(f"Test Accuracy: {test_acc}")

report = classification_report(df_test['Type'], y_pred_test_dt)
# Print the report computed on the line above; the previous code printed
# `report_dt`, a stale variable left over from an earlier cell.
print("The classification report : \n", report)

Validation Accuracy (Decision Tree): 0.779
Test Accuracy (Decision Tree): 0.744
Classification Report (Decision Tree):
               precision    recall  f1-score   support

    economic       0.57      0.61      0.59       200
    politics       0.78      0.83      0.80       512
       sport       0.84      0.85      0.84       200
        tech       0.69      0.31      0.43        88

    accuracy                           0.74      1000
   macro avg       0.72      0.65      0.67      1000
weighted avg       0.74      0.74      0.74      1000



In [None]:
# Re-read both workbooks so this cell works on freshly loaded frames.
df_train, df_test = map(pd.read_excel, ('News_train.xlsx', 'News_test.xlsx'))

def preprocess(text):
    """Normalise one raw Arabic news string.

    Steps, in order: drop every character that is neither a word character
    nor whitespace, trim both ends, squeeze any run of one repeated character
    down to two, fold the alef variants into bare alef, map alef maqsura to
    ya, waw-hamza to waw and ta marbuta to ha, then delete ASCII letters and
    digits. Trimming happens before the ASCII removal, so whitespace that
    surrounded removed ASCII text can remain.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    cleaned = re.sub(r'[^\w\s]', '', text).strip()
    # Applied sequentially: every rule sees the output of the previous one.
    substitutions = (
        (r'(.)\1+', r"\1\1"),
        ("[إأٱآا]", "ا"),
        ("ى", "ي"),
        ("ؤ", "و"),
        ("ة", "ه"),
        (r'[A-Za-z0-9]', r''),
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned

# Clean the raw article text of both splits.
df_train['preprocessed_news'] = df_train['News'].apply(preprocess)
df_test['preprocessed_news'] = df_test['News'].apply(preprocess)

# Word-level tokens for every cleaned article.
train_token = list(map(nltk.word_tokenize, df_train['preprocessed_news']))
test_token = list(map(nltk.word_tokenize, df_test['preprocessed_news']))

def remove_stop_words(tokens):
    """Return the tokens that are not members of ``arabic_stop_words``."""
    kept = []
    for token in tokens:
        if token in arabic_stop_words:
            continue
        kept.append(token)
    return kept

train_token = list(map(remove_stop_words, train_token))
test_token = list(map(remove_stop_words, test_token))

# Pretrained Word2Vec model (300-dimensional Wikipedia CBOW file).
t_model = Word2Vec.load('/content/wikipedia_cbow_300')

def word2vec_transform(tokens):
    """Return the Word2Vec vector of every token known to ``t_model``.

    Tokens missing from the model's vocabulary are skipped, so the result can
    be shorter than ``tokens`` or empty.
    """
    keyed_vectors = t_model.wv
    return [keyed_vectors[token] for token in tokens if token in keyed_vectors]

# One list of word vectors per article.
word2vec_train = list(map(word2vec_transform, train_token))
word2vec_test = list(map(word2vec_transform, test_token))

def vec_avg(word_vectors):
    """Average a document's word vectors into a single vector.

    Args:
        word_vectors: Sequence of equally sized word vectors.

    Returns:
        The element-wise mean over the sequence, or a zero vector of length
        ``t_model.vector_size`` when the sequence is empty.
    """
    if len(word_vectors) != 0:
        return np.mean(word_vectors, axis=0)
    # No known words in this document: fall back to an all-zero vector.
    return np.zeros(t_model.vector_size)

# Hold out 20% of the training articles for validation.
X_train_avg, X_val_avg, y_train, y_val = train_test_split(
    word2vec_train, df_train['Type'], test_size=0.2, random_state=42
)

# Collapse every article to one averaged vector and stack them into 2-D arrays.
X_train_avg = np.array([vec_avg(vectors) for vectors in X_train_avg])
X_val_avg = np.array([vec_avg(vectors) for vectors in X_val_avg])
X_test_avg = np.array([vec_avg(vectors) for vectors in word2vec_test])

# Gaussian Naive Bayes on the dense document vectors.
nb_classifier = GaussianNB()
nb_classifier.fit(X_train_avg, y_train)

# Validation split.
y_pred = nb_classifier.predict(X_val_avg)
accuracy = accuracy_score(y_val, y_pred)
print(f"Validation Accuracy: {accuracy}")

# Held-out test workbook.
y_pred_test = nb_classifier.predict(X_test_avg)
test_acc = accuracy_score(df_test['Type'], y_pred_test)
print(f"Test Accuracy: {test_acc}")

report = classification_report(df_test['Type'], y_pred_test)
print("The classification Report: \n", report)

Validation Accuracy: 0.859
Test Accuracy: 0.841
Classification Report:
               precision    recall  f1-score   support

    economic       0.69      0.74      0.72       200
    politics       0.88      0.86      0.87       512
       sport       0.96      0.94      0.95       200
        tech       0.73      0.75      0.74        88

    accuracy                           0.84      1000
   macro avg       0.81      0.82      0.82      1000
weighted avg       0.85      0.84      0.84      1000



In [None]:
# Decision tree on the averaged Word2Vec vectors from the previous cell.
# A fixed random_state makes the fitted tree, and therefore the reported
# metrics, identical on every Restart & Run All.
DT_model = DecisionTreeClassifier(random_state=42)

DT_model.fit(X_train_avg, y_train)

# Validation split.
y_pred = DT_model.predict(X_val_avg)

accuracy_dt = accuracy_score(y_val, y_pred)
print(f"Validation Accuracy: {accuracy_dt}")

# Held-out test workbook.
test_y_pred = DT_model.predict(X_test_avg)

accuracy_test_dt = accuracy_score(df_test['Type'], test_y_pred)
print(f"Test Accuracy: {accuracy_test_dt}")

report_dt = classification_report(df_test['Type'], test_y_pred)
print("The classification report: \n", report_dt)

Validation Accuracy (Decision Tree): 0.779
Test Accuracy (Decision Tree): 0.713
Classification Report (Decision Tree):
               precision    recall  f1-score   support

    economic       0.50      0.53      0.51       200
    politics       0.77      0.82      0.80       512
       sport       0.80      0.87      0.83       200
        tech       0.43      0.14      0.21        88

    accuracy                           0.71      1000
   macro avg       0.63      0.59      0.59      1000
weighted avg       0.69      0.71      0.70      1000



# Code-3 (BERT and Naive Bayes)

In [None]:
# Re-read both workbooks so this cell works on freshly loaded frames.
df_train, df_test = (
    pd.read_excel(workbook) for workbook in ('News_train.xlsx', 'News_test.xlsx')
)

# Tokenizer and encoder are loaded from the same multilingual BERT checkpoint.
tokenizer, model = (
    loader.from_pretrained('bert-base-multilingual-cased')
    for loader in (BertTokenizer, BertModel)
)

def preprocess(text):
    """Normalise one raw Arabic news string.

    Steps, in order: drop every character that is neither a word character
    nor whitespace, trim both ends, squeeze any run of one repeated character
    down to two, fold the alef variants into bare alef, map alef maqsura to
    ya, waw-hamza to waw and ta marbuta to ha, then delete ASCII letters and
    digits. Trimming happens before the ASCII removal, so whitespace that
    surrounded removed ASCII text can remain.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    cleaned = re.sub(r'[^\w\s]', '', text).strip()
    # Applied sequentially: every rule sees the output of the previous one.
    substitutions = (
        (r'(.)\1+', r"\1\1"),
        ("[إأٱآا]", "ا"),
        ("ى", "ي"),
        ("ؤ", "و"),
        ("ة", "ه"),
        (r'[A-Za-z0-9]', r''),
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned

# Clean the raw article text of both splits.
df_train['preprocessed_news'] = df_train['News'].apply(preprocess)
df_test['preprocessed_news'] = df_test['News'].apply(preprocess)

def BERT_embeddings(text):
    """Embed one text as the mean of BERT's last-layer token states.

    Uses the module-level ``tokenizer`` and ``model``; the forward pass runs
    without gradient tracking.

    Args:
        text: The string to embed.

    Returns:
        A NumPy array holding the token-averaged last hidden state.
    """
    tokens = tokenizer(text, return_tensors='pt', padding=True, truncation=True)
    with torch.no_grad():
        outputs = model(**tokens)
    # Average over the token axis, then drop the singleton batch axis.
    embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
    return embeddings

#bf = bert features
train_bf = np.array([BERT_embeddings(text) for text in df_train['preprocessed_news']])
test_bf = np.array([BERT_embeddings(text) for text in df_test['preprocessed_news']])

label_encoder = LabelEncoder()
df_train['Type'] = label_encoder.fit_transform(df_train['Type'])
# Only transform the test labels: re-fitting on the test set could assign
# different integer codes than the ones the classifier is trained on.
df_test['Type'] = label_encoder.transform(df_test['Type'])

# Hold out 20% of the training embeddings for validation.
X_train, X_val, y_train, y_val = train_test_split(train_bf, df_train['Type'], test_size=0.2, random_state=42)

nb_classifier = GaussianNB()
nb_classifier.fit(X_train, y_train)

y_pred = nb_classifier.predict(X_val)

accuracy = accuracy_score(y_val, y_pred)
print(f"Validation Accuracy: {accuracy}")

report = classification_report(y_val, y_pred, target_names=label_encoder.classes_)
print("The classification report:\n", report)

Validation Accuracy: 0.807
Classification Report:
               precision    recall  f1-score   support

    economic       0.70      0.74      0.72       176
    politics       0.95      0.82      0.88       647
       sport       0.92      0.83      0.87       163
        tech       0.09      0.64      0.15        14

    accuracy                           0.81      1000
   macro avg       0.66      0.76      0.66      1000
weighted avg       0.89      0.81      0.84      1000



# Code-4 (TF-IDF and Bi-LSTM, and Bi-LSTM alone)

In [None]:
# Re-read both workbooks so this cell works on freshly loaded frames.
df_train, df_test = map(pd.read_excel, ('News_train.xlsx', 'News_test.xlsx'))

def preprocess(text):
    """Normalise one raw Arabic news string.

    Steps, in order: drop every character that is neither a word character
    nor whitespace, trim both ends, squeeze any run of one repeated character
    down to two, fold the alef variants into bare alef, map alef maqsura to
    ya, waw-hamza to waw and ta marbuta to ha, then delete ASCII letters and
    digits. Trimming happens before the ASCII removal, so whitespace that
    surrounded removed ASCII text can remain.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    cleaned = re.sub(r'[^\w\s]', '', text).strip()
    # Applied sequentially: every rule sees the output of the previous one.
    substitutions = (
        (r'(.)\1+', r"\1\1"),
        ("[إأٱآا]", "ا"),
        ("ى", "ي"),
        ("ؤ", "و"),
        ("ة", "ه"),
        (r'[A-Za-z0-9]', r''),
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned

# Clean the raw article text of both splits.
df_train['preprocessed_news'] = df_train['News'].apply(preprocess)
df_test['preprocessed_news'] = df_test['News'].apply(preprocess)

# Word-level tokens for every cleaned article.
train_token = [nltk.word_tokenize(text) for text in df_train['preprocessed_news']]
test_token = [nltk.word_tokenize(text) for text in df_test['preprocessed_news']]

def remove_stop_words(tokens):
    """Return the tokens that are not members of ``arabic_stop_words``."""
    return [token for token in tokens if token not in arabic_stop_words]

train_token = [remove_stop_words(tokens) for tokens in train_token]
test_token = [remove_stop_words(tokens) for tokens in test_token]

def snowball_stemmer(tokens):
    """Stem every token with NLTK's Arabic Snowball stemmer."""
    stemmer = SnowballStemmer('arabic')
    return [stemmer.stem(token) for token in tokens]

df_train['snowball_stemmed'] = [" ".join(snowball_stemmer(tokens)) for tokens in train_token]
df_test['snowball_stemmed'] = [" ".join(snowball_stemmer(tokens)) for tokens in test_token]

label_encoder = LabelEncoder()
df_train['Type'] = label_encoder.fit_transform(df_train['Type'])
# Only transform the test labels: re-fitting on the test set could assign
# different integer codes than the ones the network is trained on.
df_test['Type'] = label_encoder.transform(df_test['Type'])

# Hold out 20% of the training articles for validation.
X_train, X_val, y_train, y_val = train_test_split(df_train['snowball_stemmed'], df_train['Type'], test_size=0.2, random_state=42)

# Fit the vocabulary/IDF weights on the training split only, then reuse them.
tfidf_vectorizer = TfidfVectorizer()
tfidf_features_train = tfidf_vectorizer.fit_transform(X_train)
tfidf_features_val = tfidf_vectorizer.transform(X_val)
tfidf_features_test = tfidf_vectorizer.transform(df_test['snowball_stemmed'])

# Dense float32 copies for Keras (cast while still sparse to save memory).
X_train_dense = tfidf_features_train.astype('float32').toarray()
X_val_dense = tfidf_features_val.astype('float32').toarray()
X_test_dense = tfidf_features_test.astype('float32').toarray()

# TF-IDF rows are real-valued feature vectors, not sequences of integer token
# ids. Passing them through pad_sequences cuts each row to a handful of
# columns and casts the weights to int32, and an Embedding layer then treats
# what is left as vocabulary indices - the network ends up seeing almost
# constant input. Feed the full vectors to the network instead. The *_padded
# names are kept because the prediction cell reads X_test_padded.
X_train_padded = X_train_dense
X_val_padded = X_val_dense
X_test_padded = X_test_dense

n_features = X_train_padded.shape[1]

model = Sequential()
model.add(Input(shape=(n_features,)))
# Project the wide TF-IDF vector down before the recurrent layer.
model.add(Dense(300, activation='relu'))
# The LSTM expects (timesteps, features); present each article as one step.
model.add(Reshape((1, 300)))
model.add(Bidirectional(LSTM(300)))
model.add(Dense(len(label_encoder.classes_), activation='softmax'))

# NOTE(review): learning_rate=1e-5 for 5 epochs is a very small training
# budget; confirm it is intentional.
adam = Adam(learning_rate=1e-5)
model.compile(loss='sparse_categorical_crossentropy', optimizer=adam, metrics=['accuracy'])

print(model.summary())

history = model.fit(X_train_padded, y_train, epochs=5, validation_data=(X_val_padded, y_val), batch_size=16, verbose=1)

loss, accuracy = model.evaluate(X_train_padded, y_train)
print(f'Training Accuracy: {accuracy:.2f}')

Model: "sequential_14"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_14 (Embedding)    (None, 26, 300)           7800      
                                                                 
 bidirectional_17 (Bidirect  (None, 600)               1442400   
 ional)                                                          
                                                                 
 dense_17 (Dense)            (None, 4)                 2404      
                                                                 
Total params: 1452604 (5.54 MB)
Trainable params: 1452604 (5.54 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Training Accuracy: 0.61


In [None]:
# Most probable class index per test article (argmax over the softmax outputs).
y_pred_classes = np.argmax(model.predict(X_test_padded), axis=-1)

from sklearn.metrics import classification_report
y_true, class_names = df_test['Type'], label_encoder.classes_
y_pred = y_pred_classes

# Per-class precision/recall/F1 on the test workbook.
report = classification_report(y_true, y_pred, target_names=class_names)
print(report)

              precision    recall  f1-score   support

    economic       0.00      0.00      0.00       200
    politics       0.51      1.00      0.68       512
       sport       0.00      0.00      0.00       200
        tech       0.00      0.00      0.00        88

    accuracy                           0.51      1000
   macro avg       0.13      0.25      0.17      1000
weighted avg       0.26      0.51      0.35      1000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Re-read both workbooks so this cell works on freshly loaded frames.
df_train, df_test = (
    pd.read_excel(workbook) for workbook in ('News_train.xlsx', 'News_test.xlsx')
)

def preprocess(text):
    """Normalise one raw Arabic news string.

    Steps, in order: drop every character that is neither a word character
    nor whitespace, trim both ends, squeeze any run of one repeated character
    down to two, fold the alef variants into bare alef, map alef maqsura to
    ya, waw-hamza to waw and ta marbuta to ha, then delete ASCII letters and
    digits. Trimming happens before the ASCII removal, so whitespace that
    surrounded removed ASCII text can remain.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    cleaned = re.sub(r'[^\w\s]', '', text).strip()
    # Applied sequentially: every rule sees the output of the previous one.
    substitutions = (
        (r'(.)\1+', r"\1\1"),
        ("[إأٱآا]", "ا"),
        ("ى", "ي"),
        ("ؤ", "و"),
        ("ة", "ه"),
        (r'[A-Za-z0-9]', r''),
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned

# Clean the raw article text of both splits.
df_train['processed_text'] = df_train['News'].apply(preprocess)
df_test['processed_text'] = df_test['News'].apply(preprocess)

# The articles are normalised by preprocess() before stop words are removed,
# so the stop-word list needs the same normalisation; otherwise entries
# containing hamza/alef variants, alef maqsura or ta marbuta can never match.
# The raw forms are kept as well, so the set only grows.
raw_stop_words = set(stopwords.words('arabic'))
arabic_stop_words = raw_stop_words | {preprocess(word) for word in raw_stop_words}

tokenized_train = [nltk.word_tokenize(text) for text in df_train['processed_text']]
tokenized_test = [nltk.word_tokenize(text) for text in df_test['processed_text']]

def remove_stop_words(tokens):
    """Return the tokens that are not members of ``arabic_stop_words``."""
    return [token for token in tokens if token not in arabic_stop_words]

tokenized_train = [remove_stop_words(tokens) for tokens in tokenized_train]
tokenized_test = [remove_stop_words(tokens) for tokens in tokenized_test]

def snowball_stemmer(tokens):
    """Stem every token with NLTK's Arabic Snowball stemmer."""
    stemmer = SnowballStemmer('arabic')
    return [stemmer.stem(token) for token in tokens]

df_train['snowball_stemmed'] = [" ".join(snowball_stemmer(tokens)) for tokens in tokenized_train]
df_test['snowball_stemmed'] = [" ".join(snowball_stemmer(tokens)) for tokens in tokenized_test]

label_encoder = LabelEncoder()
df_train['Type'] = label_encoder.fit_transform(df_train['Type'])
df_test['Type'] = label_encoder.transform(df_test['Type'])

# Split BEFORE fitting the tokenizer so the vocabulary is learned from the
# training rows only and the validation set really is held-out training data
# (it was previously built from the test workbook and then discarded).
train_texts, val_texts, y_train, y_val = train_test_split(
    df_train['snowball_stemmed'], df_train['Type'], test_size=0.2, random_state=42
)

tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_texts)
X_train_sequences = tokenizer.texts_to_sequences(train_texts)
X_val_sequences = tokenizer.texts_to_sequences(val_texts)
X_test_sequences = tokenizer.texts_to_sequences(df_test['snowball_stemmed'])

# NOTE(review): every article is cut/padded to 26 token ids; confirm this
# length suits full news articles.
max_len = 26
X_train_padded = pad_sequences(X_train_sequences, maxlen=max_len, padding='post')
X_val_padded = pad_sequences(X_val_sequences, maxlen=max_len, padding='post')
X_test_padded = pad_sequences(X_test_sequences, maxlen=max_len, padding='post')

vocab_size = len(tokenizer.word_index) + 1  # +1: index 0 is reserved for padding

model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=100, input_length=max_len))
model.add(Bidirectional(LSTM(64, return_sequences=True)))
model.add(Dropout(0.5))
model.add(Bidirectional(LSTM(32)))
model.add(Dropout(0.5))
model.add(Dense(32, activation='relu'))
model.add(Dense(len(label_encoder.classes_), activation='softmax'))

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model (X_train / X_val keep their historical names)
X_train, X_val = X_train_padded, X_val_padded
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_val, y_val))

# Evaluate the model
loss, accuracy = model.evaluate(X_test_padded, df_test['Type'])
print(f"Test Accuracy: {accuracy:.4f}")


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Accuracy: 0.8380


In [None]:
# Most probable class index per test article (argmax over the softmax outputs).
y_pred_classes = np.argmax(model.predict(X_test_padded), axis=-1)

from sklearn.metrics import classification_report
y_true, class_names = df_test['Type'], label_encoder.classes_
y_pred = y_pred_classes

# Per-class precision/recall/F1 on the test workbook.
report = classification_report(y_true, y_pred, target_names=class_names)
print(report)

              precision    recall  f1-score   support

    economic       0.78      0.67      0.72       200
    politics       0.82      0.93      0.87       512
       sport       0.97      0.94      0.95       200
        tech       0.75      0.47      0.57        88

    accuracy                           0.84      1000
   macro avg       0.83      0.75      0.78      1000
weighted avg       0.84      0.84      0.83      1000



# Code-5 (Word2vec and Bi-LSTM)

In [None]:
# Re-read both workbooks so this cell works on freshly loaded frames.
df_train, df_test = map(pd.read_excel, ('News_train.xlsx', 'News_test.xlsx'))

def preprocess(text):
    """Normalise one raw Arabic news string.

    Steps, in order: drop every character that is neither a word character
    nor whitespace, trim both ends, squeeze any run of one repeated character
    down to two, fold the alef variants into bare alef, map alef maqsura to
    ya, waw-hamza to waw and ta marbuta to ha, then delete ASCII letters and
    digits. Trimming happens before the ASCII removal, so whitespace that
    surrounded removed ASCII text can remain.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    cleaned = re.sub(r'[^\w\s]', '', text).strip()
    # Applied sequentially: every rule sees the output of the previous one.
    substitutions = (
        (r'(.)\1+', r"\1\1"),
        ("[إأٱآا]", "ا"),
        ("ى", "ي"),
        ("ؤ", "و"),
        ("ة", "ه"),
        (r'[A-Za-z0-9]', r''),
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned

# Clean the raw article text of both splits.
df_train['preprocessed_news'] = df_train['News'].apply(preprocess)
df_test['preprocessed_news'] = df_test['News'].apply(preprocess)

# Word-level tokens for every cleaned article.
train_token = [nltk.word_tokenize(text) for text in df_train['preprocessed_news']]
tokenized_test = [nltk.word_tokenize(text) for text in df_test['preprocessed_news']]

def remove_stop_words(tokens):
    """Return the tokens that are not members of ``arabic_stop_words``."""
    return [token for token in tokens if token not in arabic_stop_words]

train_token = [remove_stop_words(tokens) for tokens in train_token]
tokenized_test = [remove_stop_words(tokens) for tokens in tokenized_test]

def snowball_stemmer(tokens):
    """Stem every token with NLTK's Arabic Snowball stemmer."""
    stemmer = SnowballStemmer('arabic')
    return [stemmer.stem(token) for token in tokens]

df_train['snowball_stemmed'] = [" ".join(snowball_stemmer(tokens)) for tokens in train_token]
df_test['snowball_stemmed'] = [" ".join(snowball_stemmer(tokens)) for tokens in tokenized_test]

label_encoder = LabelEncoder()
df_train['Type'] = label_encoder.fit_transform(df_train['Type'])
df_test['Type'] = label_encoder.transform(df_test['Type'])

# Hold out 20% of the training articles for validation.
X_train, X_val, y_train, y_val = train_test_split(df_train['snowball_stemmed'], df_train['Type'], test_size=0.2, random_state=42)

t_model = Word2Vec.load('/content/wikipedia_cbow_100')

# Vocabulary is learned from the training split only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

vocab_size = len(tokenizer.word_index) + 1  # +1: index 0 is reserved for padding

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_val_seq = tokenizer.texts_to_sequences(X_val)
X_test_seq = tokenizer.texts_to_sequences(df_test['snowball_stemmed'])

max_length = 26
X_train_pad = pad_sequences(X_train_seq, maxlen=max_length, padding='post')
X_val_pad = pad_sequences(X_val_seq, maxlen=max_length, padding='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=max_length, padding='post')

# Take the width from the loaded model instead of hard-coding 100, so loading
# a different pretrained file cannot produce a shape mismatch.
TOTAL_EMBEDDING_DIM = t_model.vector_size

# Row i holds the pretrained vector of the word with tokenizer index i; words
# missing from the Word2Vec vocabulary keep an all-zero row.
# NOTE(review): the looked-up words are stemmed; presumably the pretrained
# vocabulary holds unstemmed forms, which would leave many rows at zero.
# Worth measuring the hit rate.
embedding_matrix = np.zeros((vocab_size, TOTAL_EMBEDDING_DIM))
for word, i in tokenizer.word_index.items():
    if word in t_model.wv:
        embedding_matrix[i] = t_model.wv[word]

embedding_layer = tf.keras.layers.Embedding(vocab_size, TOTAL_EMBEDDING_DIM,
                                            weights=[embedding_matrix],
                                            input_length=max_length,
                                            trainable=False)

model = tf.keras.Sequential([
    embedding_layer,
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(units=128, dropout=0.2, recurrent_dropout=0.2)),
    # One output unit per encoded class, matching the other models here.
    tf.keras.layers.Dense(len(label_encoder.classes_), activation='softmax')
])

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])


history = model.fit(X_train_pad, y_train, epochs=10, batch_size=32, validation_data=(X_val_pad, y_val))

loss, accuracy = model.evaluate(X_val_pad, y_val)
print("Validation Accuracy:", accuracy)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Validation Accuracy: 0.9079999923706055


In [None]:
# Most probable class index per test article (argmax over the softmax outputs).
y_pred_classes = np.argmax(model.predict(X_test_pad), axis=-1)

y_true, class_names = df_test['Type'], label_encoder.classes_
y_pred = y_pred_classes

# Per-class precision/recall/F1 on the test workbook.
report = classification_report(y_true, y_pred, target_names=class_names)
print(report)

              precision    recall  f1-score   support

    economic       0.73      0.65      0.69       200
    politics       0.80      0.91      0.85       512
       sport       0.86      0.90      0.88       200
        tech       0.96      0.30      0.45        88

    accuracy                           0.80      1000
   macro avg       0.84      0.69      0.72      1000
weighted avg       0.81      0.80      0.79      1000



#Code-6 (BERT and BI-LSTM)

In [None]:
# Re-read both spreadsheets so this section starts from the raw frames.
df_train, df_test = (pd.read_excel(path) for path in ('News_train.xlsx', 'News_test.xlsx'))

# Multilingual cased BERT: the tokenizer and the encoder must come from the same checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
model = BertModel.from_pretrained('bert-base-multilingual-cased')

def preprocess(text):
    """Normalise one Arabic news article before it is embedded.

    Steps, in order:
      1. drop every character that is neither a word character nor whitespace;
      2. squeeze any run of a repeated character down to two occurrences;
      3. unify alef variants to bare alef, alef maqsura to ya, waw-hamza to
         waw and ta marbuta to ha;
      4. delete Latin letters and ASCII digits;
      5. trim surrounding whitespace.

    Args:
        text: Raw article text. Non-string values (for example the NaN pandas
            produces for an empty spreadsheet cell) are treated as empty text.

    Returns:
        The cleaned string, possibly empty.
    """
    # Empty Excel cells arrive as float NaN; re.sub would raise TypeError on them.
    if not isinstance(text, str):
        return ''
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'(.)\1+', r"\1\1", text)
    text = re.sub("[إأٱآا]", "ا", text)
    text = re.sub("ى", "ي", text)
    text = re.sub("ؤ", "و", text)
    text = re.sub("ة", "ه", text)
    text = re.sub(r'[A-Za-z0-9]', r'', text)
    # Trim last: deleting Latin letters/digits above can expose new leading or
    # trailing whitespace that an earlier strip() would miss.
    return text.strip()

# Store the cleaned text in a new column so the raw 'News' column stays intact.
df_train['preprocessed_news'] = df_train['News'].map(preprocess)
df_test['preprocessed_news'] = df_test['News'].map(preprocess)

def get_bert_embeddings(text):
    """Embed `text` as the mean of BERT's final-layer hidden states.

    Uses the module-level `tokenizer` and `model`. The input is truncated to
    at most 512 tokens and the forward pass runs without gradient tracking.

    Args:
        text: The text to embed.

    Returns:
        A NumPy array: `last_hidden_state` averaged over dim 1, with
        size-one dimensions squeezed out.
    """
    encoded = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state
    pooled = hidden_states.mean(dim=1)
    return pooled.squeeze().numpy()

# Embed every article once up front. Each call to get_bert_embeddings runs a
# BERT forward pass, so this is the slow part of the cell.
train_bf = np.array([get_bert_embeddings(text) for text in df_train['preprocessed_news']])
test_bf = np.array([get_bert_embeddings(text) for text in df_test['preprocessed_news']])

# Fit the encoder on the training labels only and reuse that mapping for the
# test labels, so both splits share the same integer ids.
label_encoder = LabelEncoder()
df_train['Type'] = label_encoder.fit_transform(df_train['Type'])
df_test['Type'] = label_encoder.transform(df_test['Type'])

# Hold out 20% of the training features for validation.
X_train, X_val, y_train, y_val = train_test_split(train_bf, df_train['Type'], test_size=0.2, random_state=42)

# NOTE: from here on `model` refers to the Keras classifier, not the BertModel
# loaded earlier in this cell. get_bert_embeddings() reads the global `model`,
# so it must not be called again after this point without reloading BERT.
model = Sequential()
model.add(Input(shape=(train_bf.shape[1],)))
model.add(Dense(300, activation='relu'))
model.add(Reshape((1, 300)))  # Reshaping to add timestep dimension (Necessary)
model.add(Bidirectional(LSTM(128, return_sequences=False)))
model.add(Dense(len(label_encoder.classes_), activation='softmax'))

adam = Adam(learning_rate=1e-5)
model.compile(loss='sparse_categorical_crossentropy', optimizer=adam, metrics=['accuracy'])

# summary() prints the table itself and returns None; wrapping it in print()
# only added a stray "None" line to the output.
model.summary()

# Stop once validation loss has not improved for 3 epochs and roll back to the best weights.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
history = model.fit(X_train, y_train, epochs=10, batch_size=16, verbose=1, validation_data=(X_val, y_val), callbacks=[early_stopping])

# Final score on the held-out test set.
loss, accuracy = model.evaluate(test_bf, df_test['Type'])
print(f'Test Accuracy: {accuracy:.2f}')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]



config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Model: "sequential_31"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_33 (Dense)            (None, 300)               230700    
                                                                 
 reshape (Reshape)           (None, 1, 300)            0         
                                                                 
 bidirectional_33 (Bidirect  (None, 256)               439296    
 ional)                                                          
                                                                 
 dense_34 (Dense)            (None, 4)                 1028      
                                                                 
Total params: 671024 (2.56 MB)
Trainable params: 671024 (2.56 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
E

In [None]:
# Predict on the BERT test features built in the previous cell (`test_bf`).
# The earlier `bert_features_test` name is not defined anywhere in this section.
y_pred = model.predict(test_bf)
y_pred_classes = y_pred.argmax(axis=-1)

# Take the display names from this section's own label encoder instead of a
# `class_names` variable left over from an earlier section.
# zero_division=0 reports 0.0 for a class that is never predicted without
# emitting an undefined-metric warning for it.
report = classification_report(
    df_test['Type'],
    y_pred_classes,
    target_names=label_encoder.classes_,
    zero_division=0,
)
print(report)

              precision    recall  f1-score   support

    economic       0.71      0.65      0.68       200
    politics       0.78      0.94      0.85       512
       sport       0.92      0.95      0.93       200
        tech       0.00      0.00      0.00        88

    accuracy                           0.80      1000
   macro avg       0.60      0.63      0.62      1000
weighted avg       0.73      0.80      0.76      1000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


#Code-7 (BERT Model)

In [None]:
# Run on the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)

# Seed torch (CPU and all GPUs) and NumPy so runs are repeatable.
torch.manual_seed(42)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(42)
np.random.seed(42)

#AraBERT
model_name = "aubmindlab/bert-base-arabert"
tokenizer = AutoTokenizer.from_pretrained(model_name)

df = pd.read_excel('News_train.xlsx')
label_encoder = LabelEncoder()
X = df['News']
y = df['Type']

# Map the category strings to integer class ids.
y = label_encoder.fit_transform(y)

numberOfClasses = len(label_encoder.classes_)

X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=32, shuffle=True)

# Tokenise each split in one call. max_length is stated explicitly (512, the
# same cap used for the BERT tokenizer earlier in this notebook) so truncation
# does not depend on what the checkpoint's tokenizer config declares.
encoded_train = tokenizer(X_train.tolist(), truncation=True, padding=True, max_length=512, return_tensors='pt')
val_encodings = tokenizer(X_val.tolist(), truncation=True, padding=True, max_length=512, return_tensors='pt')

X_train = encoded_train['input_ids'].to(device)
attention_mask_train = encoded_train['attention_mask'].to(device)
y_train = torch.tensor(y_train).to(device)

X_val = val_encodings['input_ids'].to(device)
attention_mask_val = val_encodings['attention_mask'].to(device)
y_val = torch.tensor(y_val).to(device)

model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=numberOfClasses)
model.to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

num_epochs = 10
batch_size = 16

for epoch in range(num_epochs):
    print(f"Epoch {epoch+1}/{num_epochs}")
    model.train()
    # Walk the training tensors in consecutive slices of batch_size rows.
    for i in range(0, len(X_train), batch_size):
        optimizer.zero_grad()
        batch_X = X_train[i:i+batch_size]
        batch_attention_mask = attention_mask_train[i:i+batch_size]
        batch_y = y_train[i:i+batch_size]

        # Passing labels makes the model return the loss alongside the logits.
        outputs = model(input_ids=batch_X, attention_mask=batch_attention_mask, labels=batch_y)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        if (i // batch_size) % 10 == 0:
            print(f"Batch {i // batch_size}/{len(X_train) // batch_size}, Loss: {loss.item()}")

# Evaluate in mini-batches: pushing the whole validation set through the model
# in a single forward pass can exhaust device memory.
model.eval()
logit_chunks = []
with torch.no_grad():
    for i in range(0, len(X_val), batch_size):
        outputs = model(input_ids=X_val[i:i+batch_size],
                        attention_mask=attention_mask_val[i:i+batch_size])
        # Move each chunk to the CPU right away so device memory is released.
        logit_chunks.append(outputs.logits.cpu())
logits = torch.cat(logit_chunks)
predictions = np.argmax(logits.numpy(), axis=1)

accuracy = accuracy_score(y_val.cpu().tolist(), predictions)
print("Accuracy:", accuracy)


Using device: cuda


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/10
Batch 0/234, Loss: 1.4063516855239868
Batch 10/234, Loss: 0.7090661525726318
Batch 20/234, Loss: 1.14046049118042
Batch 30/234, Loss: 0.8325386047363281
Batch 40/234, Loss: 0.6311667561531067
Batch 50/234, Loss: 0.30206412076950073
Batch 60/234, Loss: 0.33958324790000916
Batch 70/234, Loss: 0.32452672719955444
Batch 80/234, Loss: 0.2762323021888733
Batch 90/234, Loss: 0.3387797474861145
Batch 100/234, Loss: 0.3021828830242157
Batch 110/234, Loss: 0.4020525813102722
Batch 120/234, Loss: 0.15596021711826324
Batch 130/234, Loss: 0.09277378022670746
Batch 140/234, Loss: 0.03592954948544502
Batch 150/234, Loss: 0.05495041236281395
Batch 160/234, Loss: 0.040009453892707825
Batch 170/234, Loss: 0.4894813001155853
Batch 180/234, Loss: 0.24068833887577057
Batch 190/234, Loss: 0.2073265016078949
Batch 200/234, Loss: 0.043692443519830704
Batch 210/234, Loss: 0.05658339709043503
Batch 220/234, Loss: 0.13382934033870697
Batch 230/234, Loss: 0.11148389428853989
Epoch 2/10
Batch 0/234, Los

In [None]:
# Per-class precision/recall/F1 on the validation split, labelled with the
# original category names held by the label encoder.
target_names = label_encoder.classes_
report = classification_report(
    y_true=y_val.cpu().tolist(),
    y_pred=predictions,
    target_names=target_names,
)
print("Classification Report:\n", report)

Classification Report:
               precision    recall  f1-score   support

    economic       0.85      0.85      0.85       219
    politics       0.95      0.96      0.95       753
       sport       0.97      0.98      0.98       248
        tech       0.95      0.60      0.73        30

    accuracy                           0.93      1250
   macro avg       0.93      0.85      0.88      1250
weighted avg       0.93      0.93      0.93      1250



In [None]:
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Set device to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)

# Set the random seed for reproducibility
torch.manual_seed(42)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(42)
np.random.seed(42)

model_name = "aubmindlab/bert-base-arabert"
tokenizer = AutoTokenizer.from_pretrained(model_name)

df = pd.read_excel('News_train.xlsx')
df_test = pd.read_excel('News_test.xlsx')
label_encoder = LabelEncoder()
X = df['News']
y = df['Type']

# Fit the encoder on the training labels and reuse the same mapping for the test labels.
y_encoded = label_encoder.fit_transform(y)
test_y_encoded = label_encoder.transform(df_test['Type'])

num_classes = len(label_encoder.classes_)

X_train, X_val, y_train, y_val = train_test_split(X, y_encoded, random_state=0, shuffle=True)

# Tokenise each split in one call. max_length is stated explicitly (512, the
# same cap used for the BERT tokenizer earlier in this notebook) so truncation
# does not depend on what the checkpoint's tokenizer config declares.
train_encodings = tokenizer(X_train.tolist(), truncation=True, padding=True, max_length=512, return_tensors='pt')
val_encodings = tokenizer(X_val.tolist(), truncation=True, padding=True, max_length=512, return_tensors='pt')
test_encodings = tokenizer(df_test['News'].tolist(), truncation=True, padding=True, max_length=512, return_tensors='pt')

X_train = train_encodings['input_ids'].to(device)
attention_mask_train = train_encodings['attention_mask'].to(device)
y_train = torch.tensor(y_train).to(device)

# The validation tensors are prepared here but this cell only scores the test set.
X_val = val_encodings['input_ids'].to(device)
attention_mask_val = val_encodings['attention_mask'].to(device)
y_val = torch.tensor(y_val).to(device)

X_test = test_encodings['input_ids'].to(device)
attention_mask_test = test_encodings['attention_mask'].to(device)
y_test = torch.tensor(test_y_encoded).to(device)

model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_classes)
model.to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

num_epochs = 1
batch_size = 16

for epoch in range(num_epochs):
    print(f"Epoch {epoch+1}/{num_epochs}")
    model.train()
    # Walk the training tensors in consecutive slices of batch_size rows.
    for i in range(0, len(X_train), batch_size):
        optimizer.zero_grad()
        batch_X = X_train[i:i+batch_size]
        batch_attention_mask = attention_mask_train[i:i+batch_size]
        batch_y = y_train[i:i+batch_size]

        # Passing labels makes the model return the loss alongside the logits.
        outputs = model(input_ids=batch_X, attention_mask=batch_attention_mask, labels=batch_y)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        if (i // batch_size) % 10 == 0:  # Print loss every 10 batches
            print(f"Batch {i // batch_size}/{len(X_train) // batch_size}, Loss: {loss.item()}")

# Score the test set in mini-batches: pushing every test article through the
# model in a single forward pass can exhaust memory.
model.eval()
logit_chunks = []
with torch.no_grad():
    for i in range(0, len(X_test), batch_size):
        outputs = model(input_ids=X_test[i:i+batch_size],
                        attention_mask=attention_mask_test[i:i+batch_size])
        # Move each chunk to the CPU right away so device memory is released.
        logit_chunks.append(outputs.logits.cpu())
logits = torch.cat(logit_chunks)
predictions = np.argmax(logits.numpy(), axis=1)

accuracy = accuracy_score(y_test.cpu().tolist(), predictions)
print("Test Accuracy:", accuracy)

report = classification_report(y_test.cpu().tolist(), predictions, target_names=label_encoder.classes_)
print("Classification Report:\n", report)


Using device: cpu


model.safetensors:  25%|##5       | 136M/543M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/1
Batch 0/234, Loss: 1.3820428848266602
Batch 10/234, Loss: 0.7073324918746948
Batch 20/234, Loss: 0.858999490737915
Batch 30/234, Loss: 0.6104422807693481
Batch 40/234, Loss: 0.7617501616477966
Batch 50/234, Loss: 0.5383960604667664
Batch 60/234, Loss: 0.45408374071121216
Batch 70/234, Loss: 0.23851841688156128
Batch 80/234, Loss: 0.42813345789909363
Batch 90/234, Loss: 0.36323094367980957
Batch 100/234, Loss: 0.2301499992609024
Batch 110/234, Loss: 0.8705095052719116
Batch 120/234, Loss: 0.2173657864332199
Batch 130/234, Loss: 0.38814786076545715
Batch 140/234, Loss: 0.21315163373947144
Batch 150/234, Loss: 0.0882560983300209
Batch 160/234, Loss: 0.047658488154411316
Batch 170/234, Loss: 0.3001159429550171
Batch 180/234, Loss: 0.4809530973434448
Batch 190/234, Loss: 0.2580084502696991
Batch 200/234, Loss: 0.03158828616142273
Batch 210/234, Loss: 0.12465456873178482
Batch 220/234, Loss: 0.12454082816839218
Batch 230/234, Loss: 0.04337485507130623
Test Accuracy: 0.877
Classific