# **Fares Ahmed Moustafa**
### ***F.ahmed2270@nu.edu.eg***

## ***Imports***

In [24]:
import string
import ast
import torch
import json
import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from gensim.models import Word2Vec
from transformers import AutoTokenizer, AutoModel

In [2]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

nltk.download('punkt')
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))
punctuation = set(string.punctuation)

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## ***Datasets Uploading***

In [31]:
medical_train = pd.read_csv("/kaggle/input/my-datasets/train_mapped.csv")
medical_test = pd.read_csv("/kaggle/input/my-datasets/test_mapped.csv")
medical_df = pd.concat([medical_train, medical_test], ignore_index=True)

financial_df = pd.read_csv("/kaggle/input/my-datasets/financial_phrasebank_mapped.csv")

In [32]:
medical_df.head()

Unnamed: 0,condition_label,medical_abstract
0,general pathological conditions,Tissue changes around loose prostheses. A cani...
1,neoplasms,Neuropeptide Y and neuron-specific enolase lev...
2,digestive system diseases,"Sexually transmitted diseases of the colon, re..."
3,neoplasms,Lipolytic factors associated with murine and h...
4,nervous system diseases,Does carotid restenosis predict an increased r...


In [33]:
financial_df.head()

Unnamed: 0,sentence,label
0,"According to Gran , the company has no plans t...",neutral
1,With the new production plant the company woul...,positive
2,"For the last quarter of 2010 , Componenta 's n...",positive
3,"In the third quarter of 2010 , net sales incre...",positive
4,Operating profit rose to EUR 13.1 mn from EUR ...,positive


## ***Datasets Cleaning***

In [34]:
def preprocess_text(text):
    if pd.isna(text):
        return []
    text = text.lower()
    tokens = word_tokenize(text)
    tokens = [t for t in tokens if t not in stop_words and t not in punctuation]
    return tokens

medical_df = medical_df.dropna(subset=['medical_abstract'])

medical_df['tokens'] = medical_df['medical_abstract'].apply(preprocess_text)

print("Medical dataset sample:")
print(medical_df[['medical_abstract', 'tokens']].head())


financial_df = financial_df.dropna(subset=['sentence'])

financial_df['tokens'] = financial_df['sentence'].apply(preprocess_text)

print("\nFinancial dataset sample:")
print(financial_df[['sentence', 'tokens']].head())

Medical dataset sample:
                                    medical_abstract  \
0  Tissue changes around loose prostheses. A cani...   
1  Neuropeptide Y and neuron-specific enolase lev...   
2  Sexually transmitted diseases of the colon, re...   
3  Lipolytic factors associated with murine and h...   
4  Does carotid restenosis predict an increased r...   

                                              tokens  
0  [tissue, changes, around, loose, prostheses, c...  
1  [neuropeptide, neuron-specific, enolase, level...  
2  [sexually, transmitted, diseases, colon, rectu...  
3  [lipolytic, factors, associated, murine, human...  
4  [carotid, restenosis, predict, increased, risk...  

Financial dataset sample:
                                            sentence  \
0  According to Gran , the company has no plans t...   
1  With the new production plant the company woul...   
2  For the last quarter of 2010 , Componenta 's n...   
3  In the third quarter of 2010 , net sales incre...   
4 

In [35]:

med_train, med_temp = train_test_split(
    medical_df,
    test_size=0.3,
    random_state=42,
    stratify=medical_df['condition_label']
)
med_val, med_test = train_test_split(
    med_temp,
    test_size=0.5,
    random_state=42,
    stratify=med_temp['condition_label']
)

print("Medical dataset splits:")
print("Train:", med_train.shape)
print("Validation:", med_val.shape)
print("Test:", med_test.shape)


fin_train, fin_temp = train_test_split(
    financial_df,
    test_size=0.3,
    random_state=42,
    stratify=financial_df['label']
)
fin_val, fin_test = train_test_split(
    fin_temp,
    test_size=0.5,
    random_state=42,
    stratify=fin_temp['label']
)

print("Financial dataset splits:")
print("Train:", fin_train.shape)
print("Validation:", fin_val.shape)
print("Test:", fin_test.shape)

Medical dataset splits:
Train: (10106, 3)
Validation: (2166, 3)
Test: (2166, 3)
Financial dataset splits:
Train: (2417, 3)
Validation: (518, 3)
Test: (518, 3)


In [36]:
med_train_texts = med_train['tokens'].apply(lambda x: ' '.join(x))
med_val_texts   = med_val['tokens'].apply(lambda x: ' '.join(x))
med_test_texts  = med_test['tokens'].apply(lambda x: ' '.join(x))

fin_train_texts = fin_train['tokens'].apply(lambda x: ' '.join(x))
fin_val_texts   = fin_val['tokens'].apply(lambda x: ' '.join(x))
fin_test_texts  = fin_test['tokens'].apply(lambda x: ' '.join(x))

## ***Pipeline A: TF-IDF + Classifier (Baseline)***

In [37]:
tfidf_med = TfidfVectorizer(max_features=5000)
X_med_train = tfidf_med.fit_transform(med_train_texts)
X_med_val = tfidf_med.transform(med_val_texts)
X_med_test = tfidf_med.transform(med_test_texts)

tfidf_fin = TfidfVectorizer(max_features=5000)
X_fin_train = tfidf_fin.fit_transform(fin_train_texts)
X_fin_val = tfidf_fin.transform(fin_val_texts)
X_fin_test = tfidf_fin.transform(fin_test_texts)

In [38]:
y_med_train = med_train['condition_label']
y_med_test = med_test['condition_label']

clf_med = LogisticRegression(max_iter=1000)
clf_med.fit(X_med_train, y_med_train)
y_med_pred = clf_med.predict(X_med_test)

y_fin_train = fin_train['label']
y_fin_test = fin_test['label']

clf_fin = LogisticRegression(max_iter=1000)
clf_fin.fit(X_fin_train, y_fin_train)
y_fin_pred = clf_fin.predict(X_fin_test)

In [39]:
def calculate_metrics(y_true, y_pred):
    accuracy = accuracy_score(y_true, y_pred)
    precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='weighted')
    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1_score': f1
    }

metrics_med = calculate_metrics(y_med_test, y_med_pred)
print("Medical TF-IDF + LR metrics:", metrics_med)

metrics_fin = calculate_metrics(y_fin_test, y_fin_pred)
print("Financial TF-IDF + LR metrics:", metrics_fin)

with open("pipelineA_metrics.json", "w") as f:
    json.dump({'medical': metrics_med, 'financial': metrics_fin}, f)

print("Pipeline A metrics saved.")

Medical TF-IDF + LR metrics: {'accuracy': 0.5891043397968606, 'precision': 0.5960949421923115, 'recall': 0.5891043397968606, 'f1_score': 0.5884020797012846}
Financial TF-IDF + LR metrics: {'accuracy': 0.7876447876447876, 'precision': 0.791115487346643, 'recall': 0.7876447876447876, 'f1_score': 0.767358784359363}
Pipeline A metrics saved.


## **Pipeline B: Word2Vec + Classifier (Semantic Baseline)**

In [40]:
w2v_med = Word2Vec(
    sentences=list(med_train['tokens']),
    vector_size=300,
    window=5,
    min_count=2,
    workers=4,
    sg=1
)
w2v_med.save("w2v_medical.model")
print("Medical Word2Vec model saved.")

w2v_fin = Word2Vec(
    sentences=list(fin_train['tokens']),
    vector_size=300,
    window=5,
    min_count=2,
    workers=4,
    sg=1
)
w2v_fin.save("w2v_financial.model")
print("Financial Word2Vec model saved.")


Medical Word2Vec model saved.
Financial Word2Vec model saved.


In [41]:
def document_vector(tokens, model):
    vecs = [model.wv[word] for word in tokens if word in model.wv]
    if len(vecs) == 0:
        return np.zeros(model.vector_size)
    return np.mean(vecs, axis=0)

X_med_train = np.vstack(med_train['tokens'].apply(lambda x: document_vector(x, w2v_med)))
X_med_val   = np.vstack(med_val['tokens'].apply(lambda x: document_vector(x, w2v_med)))
X_med_test  = np.vstack(med_test['tokens'].apply(lambda x: document_vector(x, w2v_med)))
y_med_train = med_train['condition_label']
y_med_test  = med_test['condition_label']

X_fin_train = np.vstack(fin_train['tokens'].apply(lambda x: document_vector(x, w2v_fin)))
X_fin_val   = np.vstack(fin_val['tokens'].apply(lambda x: document_vector(x, w2v_fin)))
X_fin_test  = np.vstack(fin_test['tokens'].apply(lambda x: document_vector(x, w2v_fin)))
y_fin_train = fin_train['label']
y_fin_test  = fin_test['label']

In [42]:
def calculate_metrics(y_true, y_pred):
    acc = accuracy_score(y_true, y_pred)
    precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='weighted')
    return {'accuracy': acc, 'precision': precision, 'recall': recall, 'f1_score': f1}

clf_med = LogisticRegression(max_iter=1000)
clf_med.fit(X_med_train, y_med_train)
y_med_pred = clf_med.predict(X_med_test)
metrics_med = calculate_metrics(y_med_test, y_med_pred)
print("Medical Word2Vec + Logistic Regression metrics:", metrics_med)

clf_fin = LogisticRegression(max_iter=1000)
clf_fin.fit(X_fin_train, y_fin_train)
y_fin_pred = clf_fin.predict(X_fin_test)
metrics_fin = calculate_metrics(y_fin_test, y_fin_pred)
print("Financial Word2Vec + Logistic Regression metrics:", metrics_fin)

with open("pipelineB_metrics.json", "w") as f:
    json.dump({'medical': metrics_med, 'financial': metrics_fin}, f)

print("Pipeline B metrics saved.")

Medical Word2Vec + Logistic Regression metrics: {'accuracy': 0.5914127423822715, 'precision': 0.6007172980840416, 'recall': 0.5914127423822715, 'f1_score': 0.5908181861854078}
Financial Word2Vec + Logistic Regression metrics: {'accuracy': 0.6737451737451737, 'precision': 0.614547890386263, 'recall': 0.6737451737451737, 'f1_score': 0.618183220580481}
Pipeline B metrics saved.


## ***Pipeline C: Pre-trained Transformer Embedder + Classifier (State-of-the-Art)***


In [16]:
# --- Medical: BioBERT ---
med_model_name = "dmis-lab/biobert-base-cased-v1.1"
med_tokenizer = AutoTokenizer.from_pretrained(med_model_name)
med_model     = AutoModel.from_pretrained(med_model_name)

# --- Financial: FinBERT ---
fin_model_name = "yiyanghkust/finbert-tone"
fin_tokenizer = AutoTokenizer.from_pretrained(fin_model_name)
fin_model     = AutoModel.from_pretrained(fin_model_name)

# Move to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
med_model.to(device)
fin_model.to(device)

config.json:   0%|          | 0.00/313 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

2025-11-17 08:14:08.424685: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1763367248.650799      48 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1763367248.714528      48 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/533 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/439M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/439M [00:00<?, ?B/s]

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30873, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [17]:
def get_cls_embeddings(text_list, tokenizer, model, device, max_len=128, batch_size=16):
    embeddings = []
    model.eval()
    with torch.no_grad():
        for i in range(0, len(text_list), batch_size):
            batch_texts = text_list[i:i+batch_size]
            enc = tokenizer(batch_texts, padding=True, truncation=True, max_length=max_len, return_tensors="pt")
            input_ids = enc['input_ids'].to(device)
            attention_mask = enc['attention_mask'].to(device)
            output = model(input_ids=input_ids, attention_mask=attention_mask)
            cls_emb = output.last_hidden_state[:,0,:]  # [CLS] token
            embeddings.append(cls_emb.cpu().numpy())
    return np.vstack(embeddings)

In [19]:
med_train_texts = med_train['tokens'].apply(lambda x: ' '.join(x)).tolist()
med_val_texts   = med_val['tokens'].apply(lambda x: ' '.join(x)).tolist()
med_test_texts  = med_test['tokens'].apply(lambda x: ' '.join(x)).tolist()

fin_train_texts = fin_train['tokens'].apply(lambda x: ' '.join(x)).tolist()
fin_val_texts   = fin_val['tokens'].apply(lambda x: ' '.join(x)).tolist()
fin_test_texts  = fin_test['tokens'].apply(lambda x: ' '.join(x)).tolist()


In [20]:
X_med_train = get_cls_embeddings(med_train_texts, med_tokenizer, med_model, device)
X_med_val   = get_cls_embeddings(med_val_texts, med_tokenizer, med_model, device)
X_med_test  = get_cls_embeddings(med_test_texts, med_tokenizer, med_model, device)
y_med_train = med_train['condition_label']
y_med_test  = med_test['condition_label']

X_fin_train = get_cls_embeddings(fin_train_texts, fin_tokenizer, fin_model, device)
X_fin_val   = get_cls_embeddings(fin_val_texts, fin_tokenizer, fin_model, device)
X_fin_test  = get_cls_embeddings(fin_test_texts, fin_tokenizer, fin_model, device)
y_fin_train = fin_train['label']
y_fin_test  = fin_test['label']


In [27]:
def calculate_metrics(y_true, y_pred):
    acc = accuracy_score(y_true, y_pred)
    precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='weighted')
    return {'accuracy': acc, 'precision': precision, 'recall': recall, 'f1_score': f1}

clf_med = LogisticRegression(max_iter=1000)
clf_med.fit(X_med_train, y_med_train)
y_med_pred = clf_med.predict(X_med_test)
metrics_med = calculate_metrics(y_med_test, y_med_pred)
print("Medical Transformer + LR metrics:", metrics_med)

clf_fin = LogisticRegression(max_iter=1000)
clf_fin.fit(X_fin_train, y_fin_train)
y_fin_pred = clf_fin.predict(X_fin_test)
metrics_fin = calculate_metrics(y_fin_test, y_fin_pred)
print("Financial Transformer + LR metrics:", metrics_fin)

with open("pipelineC_metrics.json", "w") as f:
    json.dump({'medical': metrics_med, 'financial': metrics_fin}, f)

print("Pipeline C metrics saved.")

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Medical Transformer + LR metrics: {'accuracy': 0.561865189289012, 'precision': 0.5630233867127992, 'recall': 0.561865189289012, 'f1_score': 0.5611715377203867}
Financial Transformer + LR metrics: {'accuracy': 0.8455598455598455, 'precision': 0.8434275184275184, 'recall': 0.8455598455598455, 'f1_score': 0.844108362804018}
Pipeline C metrics saved.


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## ***Phase 3: Analysis and Reporting***

In [48]:
with open("/kaggle/working/pipelineA_metrics.json", "r") as f:
    metrics_A = json.load(f)

with open("/kaggle/working/pipelineB_metrics.json", "r") as f:
    metrics_B = json.load(f)

with open("/kaggle/working/pipelineC_metrics.json", "r") as f:
    metrics_C = json.load(f)

In [49]:
records = []

for domain in ['medical', 'financial']:
    for pipeline_name, metrics in zip(['Pipeline A', 'Pipeline B', 'Pipeline C'], 
                                      [metrics_A, metrics_B, metrics_C]):
        record = {
            'Pipeline': pipeline_name,
            'Domain': domain,
            'Accuracy': metrics[domain]['accuracy'],
            'Precision (weighted)': metrics[domain]['precision'],
            'Recall (weighted)': metrics[domain]['recall'],
            'F1 (weighted)': metrics[domain]['f1_score']
        }
        records.append(record)

metrics_df = pd.DataFrame(records)
metrics_df

Unnamed: 0,Pipeline,Domain,Accuracy,Precision (weighted),Recall (weighted),F1 (weighted)
0,Pipeline A,medical,0.589104,0.596095,0.589104,0.588402
1,Pipeline B,medical,0.591413,0.600717,0.591413,0.590818
2,Pipeline C,medical,0.561865,0.563023,0.561865,0.561172
3,Pipeline A,financial,0.787645,0.791115,0.787645,0.767359
4,Pipeline B,financial,0.673745,0.614548,0.673745,0.618183
5,Pipeline C,financial,0.84556,0.843428,0.84556,0.844108
