In [9]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

# **Part1 Classification Regression:**

In [11]:
# List of Arabic websites concerning the following topic : Can 2025
urls = [
    'https://m.elbotola.com/article/2024-05-23-09-52-91.html',
    'https://www.dzair-tube.dz/%D8%A7%D9%84%D9%83%D8%A7%D9%81-%D9%81%D9%8A-%D9%85%D8%A3%D8%B2%D9%82-%D9%88%D9%86%D8%AD%D9%88-%D8%A5%D9%84%D8%BA%D8%A7%D8%A1-%D9%83%D8%A7%D9%86-2025-%D8%A8%D8%A7%D9%84%D9%85%D8%BA%D8%B1%D8%A8/',
    'https://ar.telquel.ma/%D8%A8%D8%B9%D8%AF-%D8%AC%D8%AF%D9%84-%D8%AA%D8%BA%D9%8A%D9%8A%D8%B1-%D8%AA%D9%88%D8%A7%D8%B1%D9%8A%D8%AE%D9%87-%D8%AC%D8%A7%D9%85%D8%B9%D8%A9-%D8%A7%D9%84%D9%83%D8%B1%D8%A9-%D8%AA%D9%83%D8%B4%D9%81/',
    'https://madar21.com/236536.html',
    'https://www.akhbarona.com/sport/worldfoot/385149.html',
    'https://m.alayam24.com/articles-487854.html',
    'https://al3omk.com/917426.html',
    'https://madar21.com/219573.html'
]

In [12]:
def scrape_articles(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    paragraphs = soup.find_all('p')  # Adjust this based on website structure

    texts = []
    for paragraph in paragraphs:
        sentences = paragraph.get_text(strip=True).split('.')
        for sentence in sentences:
                sentence = sentence.strip()
                if sentence:
                    # Keep only Arabic characters and numbers
                    filtered_sentence = re.sub(r'[^\u0600-\u06FF0-9\s]', '', sentence)
                    if filtered_sentence:
                        texts.append(filtered_sentence)
    
    return texts

In [None]:
# Collect data
all_texts = []
for url in urls:
    all_texts.extend(scrape_articles(url))

In [14]:
all_texts

['من المرتقب أن يعقد الاتحاد الأفريقي لكرة القدم، خلال الأسابيع المقبلة، اجتماعا من أجل تحديد موعد انطلاق بطولة كأس أمم أفريقيا 2025، المقررة بالمملكة المغربية',
 'وحسب الصحفي في موقع الكاف  ، فإن اللجنة التنفيذية للاتحاد الأفريقي، ستعقد اجتماعا رفقة الفيفا والسلطات المغربية خلال الأسابيع القادمة، من أجل اتخاذ قرار نهائي للكشف عن التاريخ الرسمي لانطلاق البطولة القارية وموعد اختتامها',
 'وأكد المصدر ذاته، أن المسابقة القارية ستقام بين دجنبر 2025 ويناير 2026، في انتظار تحديد الموعد الرسمي',
 'وكان موتسيبي قد أكد في تصريح خلال كونغريس الفيفا، أن موعد تنظيم كان 2025 لم يحسم في أمره بعد، مورداً أن الاتحاد الدولي للعبة هو من في يده القرار النهائي، بقولهالفيفا من سيقرر متى سيتم تنظيم التظاهرة كان 2025',
 'مدرب المنتخب الكونغولي يستدعي 28 لاعبا لمواجهتي النيجر والمغرب في تصفيات المونديال',
 'بطولة العالم للبارالمبية العداء المغربي أيوب سادني يتوج بالميدالية الذهبية لسباق 400م لفئة تاء 47',
 'سفيان المسرار مرشح لجائزة أفضل لاعب لكرة القدم داخل القاعة في العالم',
 'كأس أفريقيا لـمبتوري الأطراف ا

In [16]:
scores=[9,8,4,7,7,3,0,0,3,2,3,0,2,0,0,9,2,4,8,7,0,0,0,0,7,5,7,6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8,8,8,10,8,8,6,0,0,0,0,0,0,0,0,10,10,3,3,2,7,2,2,1,10,5,0,0,6,10,0,0,0,0,0,0,0,2,1,0,0,1,0,0,0,8,6,6,4,8,6,10,0,0,0,0,0,0,0,0,10,8,7,6,5,9,9,6,8,6,4,6,7,9,4,9,5,8,5,7,0,0,0,0,0,0,0]
len(scores)

153

In [17]:
# Remove empty elements
all_texts = [item for item in all_texts if item.strip()]

len(all_texts)

153

In [18]:
# Keywords related to CAN 2025
keywords = [
    "تصفيات","الاتحاد الأفريقي لكرة القدم","الكاف","بطولة كأس أمم إفريقيا 2025", "بطولة الأمم الإفريقية 2025", "كأس الأمم الإفريقية 2025",  "المباريات", "المنتخبات","الكان"
]

In [20]:
# Create DataFrame
df = pd.DataFrame({'Text': all_texts, 'Score': scores})

# Save to CSV
df.to_csv('Can2025_dataset.csv', index=False)

In [21]:
# Load the dataset
data = pd.read_csv('Can2025_dataset.csv')


In [22]:
data

Unnamed: 0,Text,Score
0,من المرتقب أن يعقد الاتحاد الأفريقي لكرة القدم...,9
1,وحسب الصحفي في موقع الكاف ، فإن اللجنة التنفي...,8
2,وأكد المصدر ذاته، أن المسابقة القارية ستقام بي...,4
3,وكان موتسيبي قد أكد في تصريح خلال كونغريس الفي...,7
4,مدرب المنتخب الكونغولي يستدعي 28 لاعبا لمواجهت...,7
...,...,...
148,التعليق,0
149,الاسم,0
150,البريد الإلكتروني,0
151,احفظ اسمي، بريدي الإلكتروني، والموقع الإلكترون...,0


In [26]:
pip install qalsadi

Collecting qalsadi
  Downloading qalsadi-0.5-py3-none-any.whl.metadata (12 kB)
Collecting Arabic-Stopwords>=0.4.2 (from qalsadi)
  Downloading Arabic_Stopwords-0.4.3-py3-none-any.whl.metadata (8.9 kB)
Collecting alyahmor>=0.2 (from qalsadi)
  Downloading alyahmor-0.2-py3-none-any.whl.metadata (11 kB)
Collecting arramooz-pysqlite>=0.4.2 (from qalsadi)
  Downloading arramooz_pysqlite-0.4.2-py3-none-any.whl.metadata (4.0 kB)
Collecting codernitydb3 (from qalsadi)
  Downloading codernitydb3-0.6.0.tar.gz (46 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.1/46.1 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting libqutrub>=1.2.3 (from qalsadi)
  Downloading libqutrub-1.2.4.1-py3-none-any.whl.metadata (7.5 kB)
Collecting mysam-tagmanager>=0.3.3 (from qalsadi)
  Downloading mysam_tagmanager-0.4-py3-none-any.whl.metadata (10 kB)
Collecting naftawayh>=0.3 (from qalsadi)
  Downloading Naftawayh-0.4-py3-none-a

In [29]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.isri import ISRIStemmer
import pandas as pd
import spacy
import qalsadi.lemmatizer

In [24]:
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt') 
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [30]:
# Initialize stopwords, stemmer, and lemmatizer
stop_words = set(stopwords.words('arabic'))
stemmer = ISRIStemmer()
lemmatizer = qalsadi.lemmatizer.Lemmatizer()

In [31]:
def preprocess_text(text):
    # Tokenization
    words = word_tokenize(text)
    
    # Remove stop words
    words = [word for word in words if word not in stop_words]
    
    # Stemming
    words = [stemmer.stem(word) for word in words]
    
    # Lemmatization
    words = [lemmatizer.lemmatize(word) for word in words]
    
    return ' '.join(words)

In [32]:
data['processed_text'] = data['Text'].apply(preprocess_text)
data

Unnamed: 0,Text,Score,processed_text
0,من المرتقب أن يعقد الاتحاد الأفريقي لكرة القدم...,9,رقب عقد تحدي فرق كر قدم خلل بعا مقبل جمع حدد ع...
1,وحسب الصحفي في موقع الكاف ، فإن اللجنة التنفي...,8,حسب صحف وقع كاف ، جن نفذ تحدي ريق عقد جمع رفق ...
2,وأكد المصدر ذاته، أن المسابقة القارية ستقام بي...,4,كد صدر ذات سبق قر أقام دجنبر 2025 نير 2026 نظر...
3,وكان موتسيبي قد أكد في تصريح خلال كونغريس الفي...,7,كان موتسيب كدى صرح خلل كونغريس يف عد نظم 2025 ...
4,مدرب المنتخب الكونغولي يستدعي 28 لاعبا لمواجهت...,7,درب نخب كونغول سدع 28 عب مواجه جرى غرب أصفى مو...
...,...,...,...
148,التعليق,0,علق
149,الاسم,0,اسم
150,البريد الإلكتروني,0,برد تروي
151,احفظ اسمي، بريدي الإلكتروني، والموقع الإلكترون...,0,حفظ سم برد الكتروني وقع تروي تصفح استخدام مرة ...


In [33]:
data.to_csv('processed_Can2025dataset.csv', index=False)

In [34]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, GRU, SimpleRNN, Dense
from tensorflow.keras.layers import Bidirectional

2024-05-26 11:54:31.660475: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-26 11:54:31.660606: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-26 11:54:31.782289: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [36]:
# Load processed dataset
datap = pd.read_csv('processed_Can2025dataset.csv')

In [37]:
# Prepare the data
X = datap['processed_text'].values
y = datap['Score'].values

In [64]:
# Tokenize the text
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X)
X_seq = tokenizer.texts_to_sequences(X)
X_pad = pad_sequences(X_seq, maxlen=100)

In [39]:
# Split data
X_train, X_test, y_train, y_test = train_test_split(X_pad, y, test_size=0.2, random_state=42)

In [40]:
# Function to build RNN model
def build_rnn_model():
    model = Sequential()
    model.add(Embedding(input_dim=5000, output_dim=128, input_length=100))
    model.add(SimpleRNN(128))
    model.add(Dense(1, activation='linear'))
    model.compile(optimizer='adam', loss='mse', metrics=['mae'])
    return model

In [41]:
# Function to build GRU model
def build_gru_model():
    model = Sequential()
    model.add(Embedding(input_dim=5000, output_dim=128, input_length=100))
    model.add(GRU(128))
    model.add(Dense(1, activation='linear'))
    model.compile(optimizer='adam', loss='mse', metrics=['mae'])
    return model

In [42]:
# Function to build LSTM model
def build_lstm_model():
    model = Sequential()
    model.add(Embedding(input_dim=5000, output_dim=128, input_length=100))
    model.add(LSTM(128))
    model.add(Dense(1, activation='linear'))
    model.compile(optimizer='adam', loss='mse', metrics=['mae'])
    return model

In [43]:
# Function to build Bidirectional RNN model
def build_bidirectional_rnn_model():
    model = Sequential()
    model.add(Embedding(input_dim=5000, output_dim=128, input_length=100))
    model.add(Bidirectional(SimpleRNN(128)))
    model.add(Dense(1, activation='linear'))
    model.compile(optimizer='adam', loss='mse', metrics=['mae'])
    return model

In [44]:
# Train models
rnn_model = build_rnn_model()
rnn_model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

gru_model = build_gru_model()
gru_model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

lstm_model = build_lstm_model()
lstm_model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

# Train Bidirectional RNN model
bidirectional_rnn_model = build_bidirectional_rnn_model()
bidirectional_rnn_model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

Epoch 1/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 127ms/step - loss: 15.9625 - mae: 2.7090 - val_loss: 16.1207 - val_mae: 3.3059
Epoch 2/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step - loss: 11.3666 - mae: 2.9183 - val_loss: 13.6258 - val_mae: 3.3365
Epoch 3/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step - loss: 9.8317 - mae: 2.8688 - val_loss: 14.4244 - val_mae: 3.2421
Epoch 4/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step - loss: 9.9857 - mae: 2.7688 - val_loss: 11.6729 - val_mae: 2.9401
Epoch 5/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step - loss: 7.7815 - mae: 2.3990 - val_loss: 9.4870 - val_mae: 2.2534
Epoch 6/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step - loss: 5.6870 - mae: 1.7825 - val_loss: 8.4401 - val_mae: 2.0556
Epoch 7/10
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step - loss: 4.7613 -

<keras.src.callbacks.history.History at 0x7c714035e9b0>

In [45]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
from nltk.translate.bleu_score import sentence_bleu

In [46]:
# Evaluate the models
def evaluate_model(model, X_test, y_test):
    predictions = model.predict(X_test)
    mse = mean_squared_error(y_test, predictions)
    mae = mean_absolute_error(y_test, predictions)
    return mse, mae

In [47]:
rnn_mse, rnn_mae = evaluate_model(rnn_model, X_test, y_test)
gru_mse, gru_mae = evaluate_model(gru_model, X_test, y_test)
lstm_mse, lstm_mae = evaluate_model(lstm_model, X_test, y_test)
bidirectional_rnn_mse, bidirectional_rnn_mae = evaluate_model(bidirectional_rnn_model, X_test, y_test)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 152ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 217ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 190ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 252ms/step


In [48]:
print("RNN:")
print("MSE:", rnn_mse)
print("MAE:", rnn_mae)
print("GRU:")
print("MSE:", gru_mse)
print("MAE:", gru_mae)
print("LSTM:")
print("MSE:", lstm_mse)
print("MAE:", lstm_mae)
print("Bidirectional RNN:")
print("MSE:", bidirectional_rnn_mse)
print("MAE:", bidirectional_rnn_mae)

RNN:
MSE: 7.422595392921256
MAE: 2.2024388613720096
GRU:
MSE: 7.6924502755849575
MAE: 2.0396484801125143
LSTM:
MSE: 6.779514753666441
MAE: 1.8107436018725556
Bidirectional RNN:
MSE: 5.982862276383602
MAE: 1.5492257579199729


# **Part 2 Transformer (Text generation):**

In [7]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import numpy as np
from transformers import AdamW, get_linear_schedule_with_warmup

In [3]:
# Prepare the Dataset
def prepare_dataset(csv_file, tokenizer):
    df = pd.read_csv(csv_file)
    jokes = df['Joke'].tolist()
    return jokes

In [4]:
# Fine-tune GPT-2
def fine_tune_gpt2(train_dataset, tokenizer, model_name_or_path='gpt2', output_dir='./finetuned_model', num_train_epochs=3, batch_size=8, max_length=128):
    model = GPT2LMHeadModel.from_pretrained(model_name_or_path)
    model.resize_token_embeddings(len(tokenizer))

    optimizer = AdamW(model.parameters(), lr=5e-5)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_dataset)*num_train_epochs)

    model.train()

    for epoch in range(num_train_epochs):
        for i in range(0, len(train_dataset), batch_size):
            batch = train_dataset[i:i+batch_size]
            inputs = tokenizer(batch, return_tensors='pt', padding=True, truncation=True, max_length=max_length)

            outputs = model(**inputs, labels=inputs.input_ids)
            loss = outputs.loss

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            scheduler.step()

            if i % 100 == 0:
                print(f"Epoch {epoch}, Step {i}, Loss: {loss.item()}")

    # Save the fine-tuned model
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)


In [5]:
# Generate Paragraphs
def generate_paragraph(prompt_text, tokenizer, model_path='./finetuned_model'):
    model = GPT2LMHeadModel.from_pretrained(model_path)
    model.eval()

    inputs = tokenizer(prompt_text, return_tensors='pt', padding=True, truncation=True, max_length=128)
    outputs = model.generate(input_ids=inputs.input_ids, max_length=300, num_return_sequences=1, temperature=0.7)

    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return generated_text

In [12]:
tokenizerrr = GPT2Tokenizer.from_pretrained('gpt2')
tokenizerrr.pad_token = tokenizerrr.eos_token
jokes_dataset = prepare_dataset('/kaggle/input/shortj/shortjokes.csv', tokenizerrr)
fine_tune_gpt2(jokes_dataset, tokenizerrr)



Epoch 0, Step 0, Loss: 5.5245537757873535
Epoch 0, Step 200, Loss: 3.5892562866210938
Epoch 0, Step 400, Loss: 3.2280051708221436
Epoch 0, Step 600, Loss: 2.623577117919922
Epoch 0, Step 800, Loss: 2.9633688926696777
Epoch 0, Step 1000, Loss: 2.606614112854004
Epoch 0, Step 1200, Loss: 2.1425745487213135
Epoch 0, Step 1400, Loss: 2.6543514728546143
Epoch 0, Step 1600, Loss: 2.504044771194458
Epoch 0, Step 1800, Loss: 3.060067653656006
Epoch 0, Step 2000, Loss: 2.954200506210327
Epoch 0, Step 2200, Loss: 2.661029100418091
Epoch 0, Step 2400, Loss: 2.474323272705078
Epoch 0, Step 2600, Loss: 2.293729305267334
Epoch 0, Step 2800, Loss: 2.2833001613616943
Epoch 1, Step 0, Loss: 2.6425533294677734
Epoch 1, Step 200, Loss: 2.9163622856140137
Epoch 1, Step 400, Loss: 2.685316324234009
Epoch 1, Step 600, Loss: 2.2827208042144775
Epoch 1, Step 800, Loss: 2.6123154163360596
Epoch 1, Step 1000, Loss: 2.23970365524292
Epoch 1, Step 1200, Loss: 1.8903477191925049
Epoch 1, Step 1400, Loss: 2.4102034

In [14]:
prompt_text = "Why did the chicken cross the road?"
generated_paragraph = generate_paragraph(prompt_text, tokenizerrr)
print("Generated Paragraph:", generated_paragraph)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated Paragraph: Why did the chicken cross the road? Because he was too lazy to drive.


In [15]:
prompt_text2 = "Two guys walk into a bar"
generated_paragraph2 = generate_paragraph(prompt_text2, tokenizerrr)
print("Generated Paragraph:", generated_paragraph2)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated Paragraph: Two guys walk into a bar... The bartender says, "Hey, guys, what's up?" The guy says, "I'm just trying to get some beers."


# **Part 3 BERT:**

In [53]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, TensorDataset
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report
import numpy as np


In [33]:
with open('/kaggle/input/amazonfashion/AMAZON_FASHION_5.json', 'r') as f:
    for line in f:
        data = json.loads(line)

In [34]:
# Load the data
def load_data(file_path):
    data = pd.read_json(file_path, lines=True)
    data = data[['reviewText', 'overall']]
    data = data.dropna()
    data['label'] = data['overall'].apply(lambda x: 1 if x > 3 else 0)
    return data

In [54]:
# Custom Dataset class (Added)
class ReviewDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [56]:
# Tokenize data
def tokenize_data(texts, tokenizer):
    return tokenizer(texts, truncation=True, padding=True, max_length=512)


In [57]:
# Prepare datasets (Modified)
def prepare_datasets(train_texts, train_labels, test_texts, test_labels, tokenizer):
    train_encodings = tokenize_data(train_texts, tokenizer)
    test_encodings = tokenize_data(test_texts, tokenizer)
    
    train_dataset = ReviewDataset(train_encodings, train_labels)
    test_dataset = ReviewDataset(test_encodings, test_labels)
    return train_dataset, test_dataset

In [58]:
# Compute metrics
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    acc = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds)
    return {'accuracy': acc, 'f1': f1}


In [59]:
file_path = '/kaggle/input/amazonfashion/AMAZON_FASHION_5.json'
data = load_data(file_path)

In [60]:
train_texts, test_texts, train_labels, test_labels = train_test_split(
        data['reviewText'].tolist(), 
        data['label'].tolist(), 
        test_size=0.2
    )

In [62]:
tokenizerb = BertTokenizer.from_pretrained('bert-base-uncased')

In [63]:
train_dataset, test_dataset = prepare_datasets(train_texts, train_labels, test_texts, test_labels, tokenizerb)

In [64]:
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [65]:
data_collator = DataCollatorWithPadding(tokenizerb)

In [66]:
training_args = TrainingArguments(
        output_dir='./results',          
        num_train_epochs=3,              
        per_device_train_batch_size=8,  
        per_device_eval_batch_size=8,   
        warmup_steps=500,                
        weight_decay=0.01,              
        logging_dir='./logs',            
        logging_steps=10,
        evaluation_strategy="epoch"
    )

In [67]:
trainer = Trainer(
        model=model,                        
        args=training_args,                  
        train_dataset=train_dataset,         
        eval_dataset=test_dataset,           
        compute_metrics=compute_metrics,
        data_collator=data_collator  # Added
    )

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [68]:
trainer.train()
results = trainer.evaluate()



Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.1635,0.088574,0.981013,0.988806
2,0.0055,0.023949,0.993671,0.996234
3,0.039,0.021084,0.995253,0.997178




In [69]:
print(f"Accuracy: {results['eval_accuracy']}")
print(f"F1 Score: {results['eval_f1']}")
print(f"Loss: {results['eval_loss']}")

Accuracy: 0.995253164556962
F1 Score: 0.9971777986829727
Loss: 0.02108418382704258


In [70]:
predictions = trainer.predict(test_dataset)
print(classification_report(test_labels, np.argmax(predictions.predictions, axis=1)))




              precision    recall  f1-score   support

           0       0.98      0.99      0.99       100
           1       1.00      1.00      1.00       532

    accuracy                           1.00       632
   macro avg       0.99      0.99      0.99       632
weighted avg       1.00      1.00      1.00       632



**Conclusion :**

Using a pre-trained BERT model for sentiment analysis of Amazon reviews is highly effective. BERT's contextual understanding of language enables it to accurately capture the nuances of review texts. Fine-tuning the pre-trained model on the specific dataset allows for domain adaptation, improving performance on this task. The results show good accuracy(0.995253164556962) and F1 scores(0.995253164556962), making BERT a strong choice for natural language processing tasks involving sentiment analysis. Additionally, leveraging pre-trained models significantly reduces the time and resources needed compared to training from scratch.