In [1]:
import os
os.getcwd()

'D:\\AhmedCoding'

In [4]:
# 2.1 تشغيل المكتبات

import re
import random
import math
from collections import Counter

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# ثبات النتائج (اختياري)
random.seed(42)
torch.manual_seed(42)

device = "cuda" if torch.cuda.is_available() else "cpu"
device


'cpu'

In [3]:
# 3) تجهيز بيانات تدريب صغيرة (للتعلم)


data = [
    ("i love this movie", 1),
    ("this film is amazing", 1),
    ("what a great experience", 1),
    ("i enjoyed the story", 1),
    ("fantastic acting and plot", 1),
    ("this is wonderful", 1),

    ("i hate this movie", 0),
    ("this film is terrible", 0),
    ("what a bad experience", 0),
    ("i disliked the story", 0),
    ("awful acting and plot", 0),
    ("this is horrible", 0),
]

random.shuffle(data)

split = int(0.8 * len(data))
train_data = data[:split]
val_data = data[split:]

len(train_data), len(val_data), train_data[:2], val_data[:2]


(9,
 3,
 [('this film is terrible', 0), ('this is wonderful', 1)],
 [('i love this movie', 1), ('this film is amazing', 1)])

In [5]:
# 4) Tokenization (تقطيع النص)
# نكتب Tokenizer بسيط:
# نحول لحروف صغيرة
# نشيل الرموز
# نقسم بكلمات


def tokenize(text: str):
    text = text.lower()
    text = re.sub(r"[^a-z0-9\s]", "", text)
    tokens = text.split()
    return tokens

tokenize("Fantastic acting!!! and plot.")  # اختبار


['fantastic', 'acting', 'and', 'plot']

In [6]:
# بناء Vocabulary + كلمات خاصة
# نحتاج كلمات خاصة:
# <PAD> للحشو
# <UNK> للكلمات غير المعروفة


PAD_TOKEN = "<PAD>"
UNK_TOKEN = "<UNK>"

def build_vocab(dataset, min_freq=1):
    counter = Counter()
    for text, _ in dataset:
        counter.update(tokenize(text))
    words = [w for w, c in counter.items() if c >= min_freq]
    vocab = {PAD_TOKEN: 0, UNK_TOKEN: 1}
    for w in sorted(words):
        if w not in vocab:
            vocab[w] = len(vocab)
    return vocab

vocab = build_vocab(train_data, min_freq=1)
vocab_size = len(vocab)
vocab_size, list(vocab.items())[:10]


(24,
 [('<PAD>', 0),
  ('<UNK>', 1),
  ('a', 2),
  ('acting', 3),
  ('and', 4),
  ('bad', 5),
  ('disliked', 6),
  ('enjoyed', 7),
  ('experience', 8),
  ('fantastic', 9)])

In [7]:
# 6) Encoding + Padding + Truncation

# Encoding: كل كلمة تتحول إلى رقم

# Padding: نخلي كل الجمل نفس الطول max_len

# Truncation: نقص الجمل الطويلة


def encode(tokens, vocab):
    return [vocab.get(t, vocab[UNK_TOKEN]) for t in tokens]

def pad_sequence(ids, max_len, pad_id=0):
    if len(ids) < max_len:
        return ids + [pad_id] * (max_len - len(ids))
    return ids[:max_len]

MAX_LEN = 6  # للتبسيط

# مثال
example = "i love this amazing film"
ids = encode(tokenize(example), vocab)
ids, pad_sequence(ids, MAX_LEN, vocab[PAD_TOKEN])


([14, 1, 21, 1, 10], [14, 1, 21, 1, 10, 0])

In [8]:
# Dataset و DataLoader
# سنُرجع:
# input_ids: شكلها [max_len]
# label: 0/1



class TextDataset(Dataset):
    def __init__(self, data, vocab, max_len):
        self.data = data
        self.vocab = vocab
        self.max_len = max_len
        self.pad_id = vocab[PAD_TOKEN]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        text, label = self.data[idx]
        tokens = tokenize(text)
        ids = encode(tokens, self.vocab)
        ids = pad_sequence(ids, self.max_len, self.pad_id)
        return torch.tensor(ids, dtype=torch.long), torch.tensor(label, dtype=torch.long)

train_ds = TextDataset(train_data, vocab, MAX_LEN)
val_ds = TextDataset(val_data, vocab, MAX_LEN)

train_loader = DataLoader(train_ds, batch_size=4, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=4, shuffle=False)

batch = next(iter(train_loader))
batch[0].shape, batch[1]


(torch.Size([4, 6]), tensor([1, 0, 1, 0]))

In [9]:
# 9) بناء نموذج LSTM للتصنيف
# الفكرة:

# Embedding → LSTM → نأخذ آخر تمثيل (أو hidden state) → Linear → logits

# ثم CrossEntropyLoss



class LSTMSentiment(nn.Module):
    def __init__(self, vocab_size, embed_dim=64, hidden_dim=64, num_layers=1, pad_idx=0):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=pad_idx)
        self.lstm = nn.LSTM(
            input_size=embed_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=False
        )
        self.fc = nn.Linear(hidden_dim, 2)  # 2 classes: negative/positive

    def forward(self, input_ids):
        # input_ids: [B, T]
        x = self.embedding(input_ids)      # [B, T, E]
        out, (h, c) = self.lstm(x)         # h: [num_layers, B, H]
        last_h = h[-1]                     # [B, H]
        logits = self.fc(last_h)           # [B, 2]
        return logits

model = LSTMSentiment(vocab_size=vocab_size, embed_dim=64, hidden_dim=64, pad_idx=vocab[PAD_TOKEN]).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

model


LSTMSentiment(
  (embedding): Embedding(24, 64, padding_idx=0)
  (lstm): LSTM(64, 64, batch_first=True)
  (fc): Linear(in_features=64, out_features=2, bias=True)
)

In [10]:
# 10) التدريب خطوة بخطوة
# 10.1 دوال مساعدة: Accuracy + تمريرة Epoch


def accuracy_from_logits(logits, y):
    preds = logits.argmax(dim=1)
    return (preds == y).float().mean().item()

def run_epoch(model, loader, train=True):
    model.train() if train else model.eval()
    total_loss = 0.0
    total_acc = 0.0
    n_batches = 0

    for input_ids, labels in loader:
        input_ids = input_ids.to(device)
        labels = labels.to(device)

        if train:
            optimizer.zero_grad()

        with torch.set_grad_enabled(train):
            logits = model(input_ids)
            loss = criterion(logits, labels)
            acc = accuracy_from_logits(logits, labels)

            if train:
                loss.backward()
                optimizer.step()

        total_loss += loss.item()
        total_acc += acc
        n_batches += 1

    return total_loss / n_batches, total_acc / n_batches


In [11]:
# 10.2 تدريب عدة Epochs/



EPOCHS = 20

for epoch in range(1, EPOCHS + 1):
    train_loss, train_acc = run_epoch(model, train_loader, train=True)
    val_loss, val_acc = run_epoch(model, val_loader, train=False)

    if epoch % 5 == 0 or epoch == 1:
        print(f"Epoch {epoch:02d} | "
              f"Train loss {train_loss:.4f} acc {train_acc:.3f} | "
              f"Val loss {val_loss:.4f} acc {val_acc:.3f}")


Epoch 01 | Train loss 0.7025 acc 0.333 | Val loss 0.6989 acc 0.333
Epoch 05 | Train loss 0.6609 acc 0.833 | Val loss 0.7492 acc 0.000
Epoch 10 | Train loss 0.5931 acc 0.750 | Val loss 0.9007 acc 0.000
Epoch 15 | Train loss 0.4018 acc 0.917 | Val loss 1.2979 acc 0.000
Epoch 20 | Train loss 0.1580 acc 1.000 | Val loss 2.1850 acc 0.000


In [12]:
# 11) تجربة النموذج على جمل جديدة (Inference)

# سنكتب دالة:
# tokenize → encode → pad → model → softmax → قرار


@torch.no_grad()
def predict_sentiment(text: str):
    model.eval()
    tokens = tokenize(text)
    ids = encode(tokens, vocab)
    ids = pad_sequence(ids, MAX_LEN, vocab[PAD_TOKEN])
    x = torch.tensor([ids], dtype=torch.long).to(device)  # [1, T]
    logits = model(x)
    probs = torch.softmax(logits, dim=1).cpu().numpy()[0]
    pred = int(probs.argmax())
    label = "POSITIVE" if pred == 1 else "NEGATIVE"
    return label, probs

tests = [
    "i love this film",
    "this movie is horrible",
    "great acting",
    "bad plot",
    "i enjoyed this",
    "awful movie"
]

for t in tests:
    label, probs = predict_sentiment(t)
    print(f"{t:25s} -> {label} | probs={probs}")


i love this film          -> NEGATIVE | probs=[0.87378013 0.1262199 ]
this movie is horrible    -> NEGATIVE | probs=[0.9786391  0.02136091]
great acting              -> POSITIVE | probs=[0.22850707 0.77149296]
bad plot                  -> POSITIVE | probs=[0.30520442 0.6947956 ]
i enjoyed this            -> POSITIVE | probs=[0.47996175 0.5200383 ]
awful movie               -> NEGATIVE | probs=[0.8067603  0.19323973]


In [13]:
# 12) حفظ النموذج (اختياري لكنه مهم للبورتفوليو)



save_path = "lstm_sentiment.pt"
torch.save({
    "model_state": model.state_dict(),
    "vocab": vocab,
    "max_len": MAX_LEN
}, save_path)

save_path


'lstm_sentiment.pt'