<a href="https://colab.research.google.com/github/AhlamBashiti1/MedCUI_ML_Project/blob/main/Text_Model_All.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install gensim


Collecting gensim
  Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)
Collecting numpy<2.0,>=1.18.5 (from gensim)
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scipy<1.14.0,>=1.7.0 (from gensim)
  Downloading scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.7/26.7 MB[0m [31m21.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
[2K   [90m━━━━━━━━━━━

In [None]:
import pandas as pd
import numpy as np
from scipy.sparse import hstack, csr_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec

# Read the three pre-split caption tables (train / validation / test), in that order.
train_df, val_df, test_df = map(
    pd.read_csv,
    (
        '/content/Train_with_clusters.csv',
        '/content/Val_with_clusters.csv',
        '/content/Test_with_clusters.csv',
    ),
)

# The four target columns are named by their string indices; each split yields
# a label matrix with one column per entry of `label_cols`.
label_cols = ['0', '1', '2', '3']
y_train, y_val, y_test = (
    split_df[label_cols].to_numpy() for split_df in (train_df, val_df, test_df)
)

# Training and validation captions stacked into one Series (original row
# indices are kept, so index values may repeat).
all_captions = pd.concat([train_df['Caption'], val_df['Caption']])


In [None]:
# TF-IDF
# Fit the vocabulary and IDF weights on the TRAINING captions only. Fitting on
# train + validation text lets validation data shape the very features that are
# later scored on that validation split, which makes those scores optimistic.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
tfidf_vectorizer.fit(train_df['Caption'])

# Every split is projected with the same, train-fitted vectorizer.
X_train_tfidf = tfidf_vectorizer.transform(train_df['Caption'])
X_val_tfidf = tfidf_vectorizer.transform(val_df['Caption'])
X_test_tfidf = tfidf_vectorizer.transform(test_df['Caption'])

# Word2Vec
# Same rule as above: embeddings are learned from training captions only.
# Tokenisation is lowercase + whitespace split; get_w2v_features skips any word
# that is missing from the trained vocabulary, so unseen val/test words are
# simply ignored when averaging.
sentences = [cap.lower().split() for cap in train_df['Caption']]
w2v_model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4, epochs=20)

def get_w2v_features(captions, model):
    """Embed each caption as the mean of its in-vocabulary word vectors.

    Args:
        captions: Iterable of caption texts. Entries that are not strings
            (for example ``None`` or the float NaN that pandas uses for a
            missing cell) are treated as empty captions instead of raising
            ``AttributeError`` on ``.lower()``.
        model: Object exposing ``wv`` (supporting ``word in wv`` and
            ``wv[word]``) and ``vector_size``, e.g. a trained gensim
            ``Word2Vec`` model.

    Returns:
        A 2-D NumPy array with one row per caption and ``model.vector_size``
        columns. A caption with no in-vocabulary word maps to an all-zero row.
        Empty input gives an array of shape ``(0, model.vector_size)``.
    """
    dim = model.vector_size
    features = []
    for cap in captions:
        # Lowercase + whitespace tokenisation; non-text entries have no tokens.
        words = cap.lower().split() if isinstance(cap, str) else []
        vecs = [model.wv[word] for word in words if word in model.wv]
        features.append(np.mean(vecs, axis=0) if vecs else np.zeros(dim))
    if not features:
        # np.array([]) would be 1-D; keep the result 2-D so it can still be
        # column-stacked with other feature matrices.
        return np.zeros((0, dim))
    return np.array(features)

# Mean-pooled Word2Vec features for each split, in train / val / test order.
X_train_w2v, X_val_w2v, X_test_w2v = (
    get_w2v_features(split_df['Caption'], w2v_model)
    for split_df in (train_df, val_df, test_df)
)

# Combine TF-IDF + Word2Vec
# The dense embedding block is wrapped as a sparse matrix and appended to the
# right of the TF-IDF columns of the same split.
X_train_comb, X_val_comb, X_test_comb = (
    hstack([tfidf_part, csr_matrix(w2v_part)])
    for tfidf_part, w2v_part in (
        (X_train_tfidf, X_train_w2v),
        (X_val_tfidf, X_val_w2v),
        (X_test_tfidf, X_test_w2v),
    )
)


In [None]:
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm


In [None]:
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm

# Load the tokenizer and encoder weights for the same pretrained checkpoint.
tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
# .eval() switches the module to inference mode and returns the module itself,
# so it can be chained onto the load.
bert_model = AutoModel.from_pretrained("emilyalsentzer/Bio_ClinicalBERT").eval()

def get_bert_embeddings(texts, tokenizer, model, max_len=64, batch_size=32):
    """Return the first-token ([CLS] position) embedding of every text.

    Texts are encoded in batches rather than one forward pass per caption,
    which is what made the original per-text loop impractically slow. Every
    sequence is still padded/truncated to exactly ``max_len`` tokens, so each
    row sees the same token ids as before; values may differ only by
    floating-point noise from batched computation.

    Args:
        texts: Iterable of strings (e.g. a pandas Series of captions).
        tokenizer: Hugging Face tokenizer matching ``model``.
        model: Hugging Face encoder whose output exposes ``last_hidden_state``.
            Inputs are moved to ``model.device`` so the function also works
            when the model has been placed on a GPU.
        max_len: Fixed sequence length used for padding and truncation.
        batch_size: Number of texts encoded per forward pass.

    Returns:
        NumPy array with one row per text and one column per hidden unit.
        For empty input an empty array is returned, as before.
    """
    texts = list(texts)  # allow positional slicing regardless of the input container
    device = model.device
    chunks = []
    with torch.no_grad():
        # The progress bar now counts batches, not individual texts.
        for start in tqdm(range(0, len(texts), batch_size)):
            batch = texts[start:start + batch_size]
            inputs = tokenizer(batch, return_tensors='pt', truncation=True, padding='max_length', max_length=max_len)
            inputs = {name: tensor.to(device) for name, tensor in inputs.items()}
            outputs = model(**inputs)
            # Position 0 of every sequence; keep the batch axis (no squeeze) so
            # a final batch of size 1 is still 2-D.
            chunks.append(outputs.last_hidden_state[:, 0, :].cpu().numpy())
    if not chunks:
        return np.array([])
    return np.concatenate(chunks, axis=0)

# ClinicalBERT embeddings for each split, computed in train / val / test order.
X_train_bert, X_val_bert, X_test_bert = (
    get_bert_embeddings(split_df['Caption'], tokenizer, bert_model)
    for split_df in (train_df, val_df, test_df)
)


KeyboardInterrupt: 

In [None]:
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.multioutput import MultiOutputClassifier


In [None]:
# Candidate multi-label classifiers, keyed by the name that is also used to look
# up the search grid in `params` and to label rows in the results table.
# Only the uncommented entries are run; the others are kept as toggles.
models = {
    #'LinearSVC': OneVsRestClassifier(LinearSVC(max_iter=2000)),
    # random_state pins the forest's bootstrap and feature sampling so repeated
    # runs of the grid search produce the same scores and the same best params.
    'RandomForest': MultiOutputClassifier(RandomForestClassifier(random_state=42)),  # very slow
    #'AdaBoost': MultiOutputClassifier(AdaBoostClassifier()),#next slow
    #'LogisticRegression': OneVsRestClassifier(LogisticRegression(solver='liblinear'))
}

# Hyper-parameter grids, one per entry in `models`. The `estimator__` prefix
# routes each parameter to the base estimator inside the multi-label wrapper.
params = {
    #'LinearSVC': {'estimator__C': [0.1, 1, 10]}
    'RandomForest': {
        'estimator__n_estimators': [100, 200],
        'estimator__max_depth': [10, 20, None]
    },
    #'AdaBoost': {'estimator__n_estimators': [50, 100]},
    #'LogisticRegression': {'estimator__C': [0.1, 1, 10]}
}

In [None]:
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from sklearn.model_selection import GridSearchCV
from tqdm import tqdm  #

def evaluate(model, X, y_true):
    """Score a fitted model's predictions on ``X`` against ``y_true``.

    Args:
        model: Fitted estimator exposing ``predict``.
        X: Feature matrix passed to ``model.predict``.
        y_true: Ground-truth label matrix for the same rows.

    Returns:
        Dict with keys ``f1_micro``, ``precision_micro``, ``recall_micro``
        (all micro-averaged) and ``accuracy``.
    """
    predictions = model.predict(X)
    micro = {'average': 'micro'}

    report = {}
    report["f1_micro"] = f1_score(y_true, predictions, **micro)
    # Only precision sets zero_division: it yields 0 when nothing is predicted
    # positive.
    report["precision_micro"] = precision_score(y_true, predictions, zero_division=0, **micro)
    report["recall_micro"] = recall_score(y_true, predictions, **micro)
    report["accuracy"] = accuracy_score(y_true, predictions)
    return report

# (train, validation) feature matrices for each text representation under test.
feature_sets = {
    'TFIDF': (X_train_tfidf, X_val_tfidf),
    'W2V': (X_train_w2v, X_val_w2v),
    'TFIDF+W2V': (X_train_comb, X_val_comb),
    # 'ClinicalBERT': (X_train_bert, X_val_bert)
}

# One score dict is appended per (feature set, model) pair.
results = []

# Outer progress bar: feature sets. Inner bar (cleared when finished): models.
for feat_name, (X_tr, X_va) in tqdm(feature_sets.items(), desc="Feature Sets"):
    for model_name, base_model in tqdm(models.items(), desc=f"Models for {feat_name}", leave=False):
        print(f"🔍 Running GridSearchCV for {model_name} with {feat_name}")

        # 2-fold search over this model's grid, ranked by micro-averaged F1.
        grid = GridSearchCV(
            estimator=base_model,
            param_grid=params[model_name],
            scoring='f1_micro',
            cv=2,
            n_jobs=-1,
            verbose=1,
        )
        grid.fit(X_tr, y_train)

        # Validation metrics of the best configuration, tagged with its origin.
        scores = {
            **evaluate(grid.best_estimator_, X_va, y_val),
            "model": model_name,
            "features": feat_name,
            "best_params": grid.best_params_,
        }
        results.append(scores)


Feature Sets:   0%|          | 0/3 [00:00<?, ?it/s]
Models for TFIDF:   0%|          | 0/1 [00:00<?, ?it/s][A

🔍 Running GridSearchCV for LogisticRegression with TFIDF
Fitting 2 folds for each of 3 candidates, totalling 6 fits



Models for TFIDF: 100%|██████████| 1/1 [00:09<00:00,  9.92s/it][A
Feature Sets:  33%|███▎      | 1/3 [00:09<00:19,  9.93s/it]
Models for W2V:   0%|          | 0/1 [00:00<?, ?it/s][A

🔍 Running GridSearchCV for LogisticRegression with W2V
Fitting 2 folds for each of 3 candidates, totalling 6 fits



Models for W2V: 100%|██████████| 1/1 [01:13<00:00, 73.34s/it][A
Feature Sets:  67%|██████▋   | 2/3 [01:23<00:47, 47.24s/it]
Models for TFIDF+W2V:   0%|          | 0/1 [00:00<?, ?it/s][A

🔍 Running GridSearchCV for LogisticRegression with TFIDF+W2V
Fitting 2 folds for each of 3 candidates, totalling 6 fits



Models for TFIDF+W2V: 100%|██████████| 1/1 [05:08<00:00, 308.10s/it][A
Feature Sets: 100%|██████████| 3/3 [06:31<00:00, 130.46s/it]


In [None]:
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from sklearn.model_selection import GridSearchCV
from tqdm import tqdm

# ✅ Define evaluation function
def evaluate(model, X, y_true):
    """Score a fitted model's predictions on ``X`` against ``y_true``.

    Args:
        model: Fitted estimator exposing ``predict``.
        X: Feature matrix passed to ``model.predict``.
        y_true: Ground-truth label matrix for the same rows.

    Returns:
        Dict with keys ``f1_micro``, ``precision_micro``, ``recall_micro``
        (all micro-averaged) and ``accuracy``.
    """
    predictions = model.predict(X)
    micro = {'average': 'micro'}

    report = {}
    report["f1_micro"] = f1_score(y_true, predictions, **micro)
    # Only precision sets zero_division: it yields 0 when nothing is predicted
    # positive.
    report["precision_micro"] = precision_score(y_true, predictions, zero_division=0, **micro)
    report["recall_micro"] = recall_score(y_true, predictions, **micro)
    report["accuracy"] = accuracy_score(y_true, predictions)
    return report

# Feature matrices per text representation, ordered (train, val, test).
feature_sets = {
    'TFIDF': (X_train_tfidf, X_val_tfidf, X_test_tfidf),
    'W2V': (X_train_w2v, X_val_w2v, X_test_w2v),
    #'TFIDF+W2V': (X_train_comb, X_val_comb, X_test_comb),
    # 'ClinicalBERT': (X_train_bert, X_val_bert, X_test_bert)
}

# Two score dicts are appended per (feature set, model) pair: "val" then "test".
results = []

# Outer progress bar: feature sets. Inner bar (cleared when finished): models.
for feat_name, (X_tr, X_va, X_te) in tqdm(feature_sets.items(), desc="🔁 Feature Sets"):
    for model_name, base_model in tqdm(models.items(), desc=f"⚙️ Models for {feat_name}", leave=False):
        print(f"\n🔍 Running GridSearchCV for {model_name} with {feat_name}")

        # 2-fold search over this model's grid, ranked by micro-averaged F1.
        grid = GridSearchCV(
            estimator=base_model,
            param_grid=params[model_name],
            scoring='f1_micro',
            cv=2,
            n_jobs=-1,
            verbose=1,
        )
        grid.fit(X_tr, y_train)

        # Validation metrics of the selected configuration.
        val_scores = {
            **evaluate(grid.best_estimator_, X_va, y_val),
            "model": model_name,
            "features": feat_name,
            "split": "val",
            "best_params": grid.best_params_,
        }
        results.append(val_scores)

        # Test metrics of the same fitted estimator; nothing is retrained here.
        test_scores = {
            **evaluate(grid.best_estimator_, X_te, y_test),
            "model": model_name,
            "features": feat_name,
            "split": "test",
            "best_params": grid.best_params_,
        }
        results.append(test_scores)

        print(f"✅ Completed: {model_name} with {feat_name}")
        print(f"📊 Validation F1: {val_scores['f1_micro']:.4f} | Test F1: {test_scores['f1_micro']:.4f}")


🔁 Feature Sets:   0%|          | 0/2 [00:00<?, ?it/s]
⚙️ Models for TFIDF:   0%|          | 0/1 [00:00<?, ?it/s][A


🔍 Running GridSearchCV for RandomForest with TFIDF
Fitting 2 folds for each of 6 candidates, totalling 12 fits



⚙️ Models for TFIDF: 100%|██████████| 1/1 [33:32<00:00, 2012.19s/it][A
🔁 Feature Sets:  50%|█████     | 1/2 [33:32<33:32, 2012.21s/it]

✅ Completed: RandomForest with TFIDF
📊 Validation F1: 0.8834 | Test F1: 0.8875



⚙️ Models for W2V:   0%|          | 0/1 [00:00<?, ?it/s][A


🔍 Running GridSearchCV for RandomForest with W2V
Fitting 2 folds for each of 6 candidates, totalling 12 fits



⚙️ Models for W2V: 100%|██████████| 1/1 [1:01:35<00:00, 3695.13s/it][A
🔁 Feature Sets: 100%|██████████| 2/2 [1:35:07<00:00, 2853.68s/it]

✅ Completed: RandomForest with W2V
📊 Validation F1: 0.8367 | Test F1: 0.8408





In [None]:
import pandas as pd

# One table row per score dict collected in `results`.
results_df = pd.DataFrame(data=results)
# Written to the current working directory, without the row index.
results_df.to_csv(path_or_buf="Random Forest.csv", index=False)
# Bare last expression so the notebook renders the table.
results_df


Unnamed: 0,f1_micro,precision_micro,recall_micro,accuracy,model,features,split,best_params
0,0.88337,0.890059,0.876782,0.569669,RandomForest,TFIDF,val,"{'estimator__max_depth': None, 'estimator__n_e..."
1,0.887531,0.89265,0.882472,0.579128,RandomForest,TFIDF,test,"{'estimator__max_depth': None, 'estimator__n_e..."
2,0.836677,0.83594,0.837416,0.435884,RandomForest,W2V,val,"{'estimator__max_depth': 20, 'estimator__n_est..."
3,0.840774,0.839121,0.842433,0.445855,RandomForest,W2V,test,"{'estimator__max_depth': 20, 'estimator__n_est..."
