# reading data

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [2]:
complaints = pd.read_csv("/kaggle/input/complaints/complaints.csv")

# Understanding data

In [3]:
complaints.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5788 entries, 0 to 5787
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Ticket_ID       5788 non-null   object 
 1   Student_ID      5788 non-null   object 
 2   Category        5788 non-null   object 
 3   Complaint_Text  5788 non-null   object 
 4   Priority        5787 non-null   object 
 5   Status          5787 non-null   object 
 6   Date_Submitted  5771 non-null   object 
 7   Unnamed: 0.1    3286 non-null   float64
 8   Unnamed: 0      991 non-null    float64
dtypes: float64(2), object(7)
memory usage: 407.1+ KB


In [4]:
complaints.describe(include="all")

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0.2,Ticket_ID,Student_ID,Category,Complaint_Text,Priority,Status,Date_Submitted,Unnamed: 0.1,Unnamed: 0
count,5788,5788,5788,5788,5787,5787,5771,3286.0,991.0
unique,1300,2778,4,5788,7,19,820,,
top,TCKT0090,S11234,Academic Support and Resources,"Dear Team, My university email rejects attachm...",Medium,Open,2025-08-31,,
freq,10,15,1483,1,2892,3469,72,,
mean,,,,,,,,1642.5,514.889001
std,,,,,,,,948.730819,294.264098
min,,,,,,,,0.0,0.0
25%,,,,,,,,821.25,272.5
50%,,,,,,,,1642.5,520.0
75%,,,,,,,,2463.75,767.5


In [5]:
complaints=complaints[["Complaint_Text","Category"]]

In [6]:
complaints.columns

Index(['Complaint_Text', 'Category'], dtype='object')

In [7]:
import torch
device = torch.device("cuda")

In [8]:
!pip install transformers



# making label encoder on category column

In [9]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
complaints["Category"] = encoder.fit_transform(complaints["Category"])
sentences = complaints.Complaint_Text.values
labels = complaints.Category.values

In [10]:
np.unique(labels)


array([0, 1, 2, 3])

# having tokenizer of bert

In [11]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-uncased")


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/872k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.72M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

In [12]:
max_len = 0

# For every sentence...
for sent in sentences:

    # Tokenize the text and add `[CLS]` and `[SEP]` tokens.
    input_ids = tokenizer.encode(sent, add_special_tokens=True)

    # Update the maximum sentence length.
    max_len = max(max_len, len(input_ids))

print('Max sentence length: ', max_len)


Max sentence length:  62


In [13]:
input_ids = []
attention_masks = []

# For every sentence...
for sent in sentences:
    encoded_dict = tokenizer.encode_plus(
                        sent,                      
                        add_special_tokens = True,  # Add '[CLS]' and '[SEP]'
                        max_length = 64,            # Pad & truncate all sentences
                        padding = 'max_length',     # <-- بدل pad_to_max_length
                        truncation = True,          # لازم تحطها علشان يقطع لو أطول
                        return_attention_mask = True,
                        return_tensors = 'pt',     
                   )
    
    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])

# Convert lists to tensors
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(labels)

print('Original: ', sentences[0])
print('Token IDs:', input_ids[0])


Original:  المصاريف دي عاملاني مشاكل كبيرة، والردود مش موجودة أبداً.
Token IDs: tensor([  101, 24177, 10720, 36139, 19455, 35476, 59673, 23860,   476, 80958,
        34003,   446,   479, 13259, 91296, 10727,   476, 11691, 29606, 10400,
        19767, 14363,   119,   102,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0])


# making dataset divided into train valiation test

In [14]:
from torch.utils.data import TensorDataset, random_split

# Combine the training inputs into a TensorDataset.
dataset = TensorDataset(input_ids, attention_masks, labels)

# Calculate sizes
total_size = len(dataset)
train_size = int(0.7 * total_size)
val_size = int(0.2 * total_size)
test_size = total_size - train_size - val_size  # ensures all samples are used

# Split dataset
train_dataset, val_dataset, test_dataset = random_split(dataset, [train_size, val_size, test_size])

print(f"{train_size:,} training samples")
print(f"{val_size:,} validation samples")
print(f"{test_size:,} test samples")


4,051 training samples
1,157 validation samples
580 test samples


In [15]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

# The DataLoader needs to know our batch size for training, so we specify it 
# here. For fine-tuning BERT on a specific task, the authors recommend a batch 
# size of 16 or 32.
batch_size = 32

# Create the DataLoaders for our training and validation sets.
# We'll take training samples in random order. 
train_dataloader = DataLoader(
            train_dataset,  # The training samples.
            sampler = RandomSampler(train_dataset), # Select batches randomly
            batch_size = batch_size # Trains with this batch size.
        )

# For validation the order doesn't matter, so we'll just read them sequentially.
validation_dataloader = DataLoader(
            val_dataset, # The validation samples.
            sampler = SequentialSampler(val_dataset), # Pull out batches sequentially.
            batch_size = batch_size # Evaluate with this batch size.
        )

# using the pretrained classifier Bert 

In [16]:
from transformers import BertForSequenceClassification
from torch.optim import AdamW
import torch

# Load multilingual uncased BERT for classification
model = BertForSequenceClassification.from_pretrained(
    "bert-base-multilingual-uncased",
    num_labels=4,              # adjust to your dataset (4 classes here)
    output_attentions=False,
    output_hidden_states=False,
)

# Move model to GPU (if available)
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)


2025-09-09 19:40:54.311931: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1757446854.496587      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1757446854.561035      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


model.safetensors:   0%|          | 0.00/672M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(105879, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1

In [17]:
optimizer = AdamW(model.parameters(),
                  lr = 2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5
                  eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
                )


In [18]:
from transformers import get_linear_schedule_with_warmup

# Number of training epochs. The BERT authors recommend between 2 and 4. 
# We chose to run for 4, but we'll see later that this may be over-fitting the
# training data.
epochs = 4

# Total number of training steps is [number of batches] x [number of epochs]. 
# (Note that this is not the same as the number of training samples).
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_steps)


In [19]:
pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.5-py3-none-any.whl.metadata (9.5 kB)
Collecting fsspec>=2021.05.0 (from fsspec[http]>=2021.05.0->evaluate)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading evaluate-0.4.5-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec, evaluate
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2025.5.1
    Uninstalling fsspec-2025.5.1:
      Successfully uninstalled fsspec-2025.5.1
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
bigframes 2.8.0 requires google-clou

# Accuracy computation

In [20]:
import numpy as np

# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [21]:
import time
import datetime

def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))


In [22]:
id2label = dict(enumerate(encoder.classes_))
id2label

{0: 'Academic Support and Resources',
 1: 'Financial Support',
 2: 'IT',
 3: 'Student Affairs'}

F1-Score Function

In [23]:
def f1_score_from_scratch(y_true, y_pred):
    """
    Compute F1 score (macro average) from scratch.
    """
    classes = np.unique(y_true)
    f1_scores = []
    for cls in classes:
        tp = np.sum((y_pred == cls) & (y_true == cls))
        fp = np.sum((y_pred == cls) & (y_true != cls))
        fn = np.sum((y_pred != cls) & (y_true == cls))

        precision = tp / (tp + fp) if (tp + fp) > 0 else 0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0
        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
        f1_scores.append(f1)

    return np.mean(f1_scores)  # macro average


# Training the classifier on data

In [24]:
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

training_stats = []

# Measure total training time
total_t0 = time.time()

for epoch_i in range(0, epochs):
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    t0 = time.time()
    total_train_loss = 0
    total_train_accuracy = 0

    model.train()

    for step, batch in enumerate(train_dataloader):
        if step % 40 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(
                step, len(train_dataloader), elapsed))

        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        model.zero_grad()        

        outputs = model(
            b_input_ids, 
            token_type_ids=None, 
            attention_mask=b_input_mask, 
            labels=b_labels
        )
        
        loss = outputs.loss
        logits = outputs.logits

        total_train_loss += loss.item()

        # Training accuracy
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        total_train_accuracy += flat_accuracy(logits, label_ids)

        # Backprop
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

    avg_train_loss = total_train_loss / len(train_dataloader)            
    avg_train_accuracy = total_train_accuracy / len(train_dataloader)  
    training_time = format_time(time.time() - t0)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training Accuracy: {0:.2f}".format(avg_train_accuracy))  
    print("  Training epoch took: {:}".format(training_time))

    # ===== Validation =====
from sklearn.metrics import classification_report, confusion_matrix, f1_score

# ===== Validation =====
print("")
print("Running Validation...")

t0 = time.time()
model.eval()

total_eval_accuracy = 0
total_eval_loss = 0

all_preds = []
all_labels = []

for batch in validation_dataloader:
    b_input_ids = batch[0].to(device)
    b_input_mask = batch[1].to(device)
    b_labels = batch[2].to(device)

    with torch.no_grad():        
        outputs = model(
            b_input_ids, 
            token_type_ids=None, 
            attention_mask=b_input_mask, 
            labels=b_labels
        )
        
        loss = outputs.loss
        logits = outputs.logits
            
    total_eval_loss += loss.item()

    preds = np.argmax(logits.detach().cpu().numpy(), axis=1)
    labels = b_labels.to('cpu').numpy()

    all_preds.extend(preds)
    all_labels.extend(labels)

    total_eval_accuracy += np.sum(preds == labels) / len(labels)

avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
avg_val_loss = total_eval_loss / len(validation_dataloader)
validation_time = format_time(time.time() - t0)

print("  Validation Accuracy: {0:.2f}".format(avg_val_accuracy))
print("  Validation Loss: {0:.2f}".format(avg_val_loss))
print("  Validation took: {:}".format(validation_time))

# === NEW: Classification Report + Confusion Matrix + F1 ===
print("\nClassification Report:")
print(classification_report(all_labels, all_preds, target_names=list(id2label.values())))

print("Confusion Matrix:")
print(confusion_matrix(all_labels, all_preds))

# F1 with sklearn
print("F1 Score (macro): {:.4f}".format(f1_score(all_labels, all_preds, average='macro')))
print("F1 Score (micro): {:.4f}".format(f1_score(all_labels, all_preds, average='micro')))
print("F1 Score (weighted): {:.4f}".format(f1_score(all_labels, all_preds, average='weighted')))

# F1 from scratch
print("F1 Score (macro, from scratch): {:.4f}".format(f1_score_from_scratch(np.array(all_labels), np.array(all_preds))))



Training...
  Batch    40  of    127.    Elapsed: 0:00:09.
  Batch    80  of    127.    Elapsed: 0:00:17.
  Batch   120  of    127.    Elapsed: 0:00:25.

  Average training loss: 0.67
  Training Accuracy: 0.73
  Training epoch took: 0:00:26

Training...
  Batch    40  of    127.    Elapsed: 0:00:08.
  Batch    80  of    127.    Elapsed: 0:00:16.
  Batch   120  of    127.    Elapsed: 0:00:24.

  Average training loss: 0.18
  Training Accuracy: 0.94
  Training epoch took: 0:00:26

Training...
  Batch    40  of    127.    Elapsed: 0:00:08.
  Batch    80  of    127.    Elapsed: 0:00:16.
  Batch   120  of    127.    Elapsed: 0:00:24.

  Average training loss: 0.10
  Training Accuracy: 0.97
  Training epoch took: 0:00:26

Training...
  Batch    40  of    127.    Elapsed: 0:00:08.
  Batch    80  of    127.    Elapsed: 0:00:16.
  Batch   120  of    127.    Elapsed: 0:00:24.

  Average training loss: 0.06
  Training Accuracy: 0.98
  Training epoch took: 0:00:26

Running Validation...
  Validat

# predict function to test data on

In [25]:
from transformers import BertTokenizer
import torch
import numpy as np

# Load the same tokenizer used for training (multilingual)
tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-uncased")

def predict_sentence(sentence, model, device):
    # Tokenize the input (works with Arabic, English, etc.)
    encoded_dict = tokenizer.encode_plus(
        sentence,                      
        add_special_tokens=True,       # [CLS] and [SEP]
        max_length=64,                 # Pad & truncate
        padding='max_length',
        truncation=True,
        return_attention_mask=True,    
        return_tensors='pt',           
    )
    
    # Move tensors to device
    input_ids = encoded_dict['input_ids'].to(device)
    attention_mask = encoded_dict['attention_mask'].to(device)
    
    # Eval mode
    model.eval()
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits

    # Convert logits to prediction
    probs = torch.nn.functional.softmax(logits, dim=1)
    predicted_class = torch.argmax(probs, dim=1).cpu().item()
    
    return predicted_class, probs.cpu().numpy()


 Testing loop on test data before predicting on new data 

In [26]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from sklearn.metrics import classification_report, confusion_matrix
import torch
import numpy as np

# For testing we don’t need shuffling (use SequentialSampler)
batch_size = 32  # adjust depending on GPU memory

test_dataloader = DataLoader(
    test_dataset,  # <-- the dataset you got from random_split
    sampler=SequentialSampler(test_dataset),
    batch_size=batch_size
)

print("")
print("Running Test Evaluation...")

model.eval()
total_test_accuracy = 0
total_test_loss = 0

all_preds = []
all_labels = []

for batch in test_dataloader:
    b_input_ids = batch[0].to(device)
    b_input_mask = batch[1].to(device)
    b_labels = batch[2].to(device)

    with torch.no_grad():
        outputs = model(
            b_input_ids,
            token_type_ids=None,
            attention_mask=b_input_mask,
            labels=b_labels
        )
        loss = outputs.loss
        logits = outputs.logits

    total_test_loss += loss.item()
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()

    # Accuracy
    total_test_accuracy += flat_accuracy(logits, label_ids)

    # Predictions
    preds = np.argmax(logits, axis=1).flatten()
    all_preds.extend(preds)
    all_labels.extend(label_ids)

avg_test_accuracy = total_test_accuracy / len(test_dataloader)
avg_test_loss = total_test_loss / len(test_dataloader)

print("  Test Accuracy: {0:.2f}".format(avg_test_accuracy))
print("  Test Loss: {0:.2f}".format(avg_test_loss))

# ---- Extra evaluation ----
print("\nClassification Report:")
print(classification_report(all_labels, all_preds, target_names=encoder.classes_))

print("Confusion Matrix:")
print(confusion_matrix(all_labels, all_preds))



Running Test Evaluation...
  Test Accuracy: 0.96
  Test Loss: 0.15

Classification Report:
                                precision    recall  f1-score   support

Academic Support and Resources       0.95      0.93      0.94       148
             Financial Support       0.97      0.98      0.98       150
                            IT       0.94      0.98      0.96       135
               Student Affairs       0.99      0.96      0.97       147

                      accuracy                           0.96       580
                     macro avg       0.96      0.96      0.96       580
                  weighted avg       0.96      0.96      0.96       580

Confusion Matrix:
[[138   3   7   0]
 [  2 147   0   1]
 [  2   0 132   1]
 [  3   1   2 141]]


predicting on totally new data made by chatgpt different from the one from test dataset

In [27]:
import torch
import numpy as np
from transformers import BertTokenizer

# Load the same multilingual tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-uncased")

# Define your label mapping (order must match your training)
categories = ["Academic Support", "Financial Support", "IT", "Student Affairs"]

def predict_sentence(sentence, model, device):
    encoded_dict = tokenizer.encode_plus(
        sentence,
        add_special_tokens=True,
        max_length=64,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )

    input_ids = encoded_dict['input_ids'].to(device)
    attention_mask = encoded_dict['attention_mask'].to(device)

    model.eval()
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits

    probs = torch.nn.functional.softmax(logits, dim=1)
    predicted_class = torch.argmax(probs, dim=1).cpu().item()

    return categories[predicted_class], probs.cpu().numpy()

# Example test sentences (Arabic + English)
test_sentences = [
    "I need help understanding the course material.",
    "My scholarship payment has not arrived yet.",
    "The university Wi-Fi is not working.",
    "I want to apply for a student club.",
    "أحتاج إلى مساعدة في فهم محاضراتي.",
    "لم أتسلم المنحة المالية حتى الآن.",
    "الإنترنت في الجامعة بطيء جدًا.",
    "أرغب في التسجيل في نشاط طلابي."
]

# Run predictions
for s in test_sentences:
    pred_class, probs = predict_sentence(s, model, device)
    print(f"Sentence: {s}")
    print(f"Predicted Category: {pred_class}\n")
# Sentence: I need help understanding the course material.
# Predicted Category: Academic Support

# Sentence: My scholarship payment has not arrived yet.
# Predicted Category: Financial Support

# Sentence: The university Wi-Fi is not working.
# Predicted Category: IT

# Sentence: I want to apply for a student club.
# Predicted Category: Student Affairs

# Sentence: أحتاج إلى مساعدة في فهم محاضراتي.
# Predicted Category: Academic Support

# Sentence: لم أتسلم المنحة المالية حتى الآن.
# Predicted Category: Financial Support

# Sentence: الإنترنت في الجامعة بطيء جدًا.
# Predicted Category: IT

# Sentence: أرغب في التسجيل في نشاط طلابي.
# Predicted Category: Student Affairs



Sentence: I need help understanding the course material.
Predicted Category: Academic Support

Sentence: My scholarship payment has not arrived yet.
Predicted Category: Financial Support

Sentence: The university Wi-Fi is not working.
Predicted Category: IT

Sentence: I want to apply for a student club.
Predicted Category: Student Affairs

Sentence: أحتاج إلى مساعدة في فهم محاضراتي.
Predicted Category: Academic Support

Sentence: لم أتسلم المنحة المالية حتى الآن.
Predicted Category: Financial Support

Sentence: الإنترنت في الجامعة بطيء جدًا.
Predicted Category: IT

Sentence: أرغب في التسجيل في نشاط طلابي.
Predicted Category: IT



In [28]:
print(encoder.classes_)
id2label = dict(enumerate(encoder.classes_))
print(id2label)


['Academic Support and Resources' 'Financial Support' 'IT'
 'Student Affairs']
{0: 'Academic Support and Resources', 1: 'Financial Support', 2: 'IT', 3: 'Student Affairs'}


In [29]:
categ = ['Financial Support',  'Student Affairs',  'Academic Support and Resources',  'IT',  'Financial Support']
complaint = ['المصاريف دي عاملاني مشاكل كبيرة، والردود مش موجودة أبداً.',  'السكن الجامعي غير ملائم بتاتاً، والنظافة في أسوأ حال.', 'أحتاج تفاصيل أكثر عن طريقة استخدام المختبر لأني محتار.', 'خدمة الإنترنت في المكتبة غير ثابتة، وهذا يعيق تحضير الأبحاث.',  'تتغير الفواتير دون إبلاغ، وهذا غير مقبول ويجب تصحيحه فوراً.']
test={}
for i in range (len(categ)):
    test[complaint[i]]=categ[i]
test

{'المصاريف دي عاملاني مشاكل كبيرة، والردود مش موجودة أبداً.': 'Financial Support',
 'السكن الجامعي غير ملائم بتاتاً، والنظافة في أسوأ حال.': 'Student Affairs',
 'أحتاج تفاصيل أكثر عن طريقة استخدام المختبر لأني محتار.': 'Academic Support and Resources',
 'خدمة الإنترنت في المكتبة غير ثابتة، وهذا يعيق تحضير الأبحاث.': 'IT',
 'تتغير الفواتير دون إبلاغ، وهذا غير مقبول ويجب تصحيحه فوراً.': 'Financial Support'}

In [30]:
for s in test.keys():
    pred_class, probs = predict_sentence(s, model, device)
    print(f"Sentence: {s}")
    print(f"Predicted Category: {pred_class}\n")

Sentence: المصاريف دي عاملاني مشاكل كبيرة، والردود مش موجودة أبداً.
Predicted Category: Financial Support

Sentence: السكن الجامعي غير ملائم بتاتاً، والنظافة في أسوأ حال.
Predicted Category: Student Affairs

Sentence: أحتاج تفاصيل أكثر عن طريقة استخدام المختبر لأني محتار.
Predicted Category: Academic Support

Sentence: خدمة الإنترنت في المكتبة غير ثابتة، وهذا يعيق تحضير الأبحاث.
Predicted Category: IT

Sentence: تتغير الفواتير دون إبلاغ، وهذا غير مقبول ويجب تصحيحه فوراً.
Predicted Category: Financial Support



In [31]:
complaint = ['المحاضرات غير منظمة وتحتاج ترتيب.', 'المساعدات لا تصل للطلاب المحتاجين.',  'الواجهة الإلكترونية للجامعة مش واضحة ومحبطة', 'سكن الطلاب عامل زي ملعب مصارعة مصري، دايمًا فيه مصارعين جدد',  'عدم انتظام مواعيد النقل يسبب مشكلات كبيرة للطلاب']
categ = [ 'Academic Support and Resources', 'Financial Support', 'Academic Support and Resources', 'Student Affairs','Student Affairs']
test={}
for i in range (len(categ)):
    test[complaint[i]]=categ[i]
test

{'المحاضرات غير منظمة وتحتاج ترتيب.': 'Academic Support and Resources',
 'المساعدات لا تصل للطلاب المحتاجين.': 'Financial Support',
 'الواجهة الإلكترونية للجامعة مش واضحة ومحبطة': 'Academic Support and Resources',
 'سكن الطلاب عامل زي ملعب مصارعة مصري، دايمًا فيه مصارعين جدد': 'Student Affairs',
 'عدم انتظام مواعيد النقل يسبب مشكلات كبيرة للطلاب': 'Student Affairs'}

In [32]:
for s in test.keys():
    pred_class, probs = predict_sentence(s, model, device)
    print(f"Sentence: {s}")
    print(f"Predicted Category: {pred_class}\n")

Sentence: المحاضرات غير منظمة وتحتاج ترتيب.
Predicted Category: Academic Support

Sentence: المساعدات لا تصل للطلاب المحتاجين.
Predicted Category: Financial Support

Sentence: الواجهة الإلكترونية للجامعة مش واضحة ومحبطة
Predicted Category: Academic Support

Sentence: سكن الطلاب عامل زي ملعب مصارعة مصري، دايمًا فيه مصارعين جدد
Predicted Category: Student Affairs

Sentence: عدم انتظام مواعيد النقل يسبب مشكلات كبيرة للطلاب
Predicted Category: Student Affairs

