In [None]:
import pandas as pd

# Parquet shards of the OHSUMED dataset, keyed by split name.
splits = {
    'train': 'ohsumed/train-00000-of-00001.parquet',
    'test': 'ohsumed/test-00000-of-00001.parquet',
}
# Both shards sit under the same hf:// repository prefix; read each one into
# its own DataFrame.
repo_prefix = "hf://datasets/community-datasets/ohsumed/"
training, test = (
    pd.read_parquet(repo_prefix + splits[split_name])
    for split_name in ('train', 'test')
)

In [None]:
# Build one free-text field per document: title, a single space, then abstract.
for frame in (training, test):
    frame['title_abstract'] = frame['title'] + ' ' + frame['abstract']

In [None]:
# Only the last expression of a cell is rendered automatically, so the training
# preview has to be displayed explicitly; as a bare expression in the middle of
# the cell its result would be silently discarded.
display(training.head())
test.head()

Unnamed: 0,seq_id,medline_ui,mesh_terms,title,publication_type,abstract,author,source,title_abstract
0,54711,88000001,Acetaldehyde/*ME; Buffers; Catalysis; HEPES/PD...,The binding of acetaldehyde to the active site...,JOURNAL ARTICLE.,"Ribonuclease A was reacted with [1-13C,1,2-14C...",Mauch TJ; Tuma DJ; Sorrell MF.,Alcohol Alcohol 8801; 22(2):103-12,The binding of acetaldehyde to the active site...
1,54711,88000002,"Adult; Alcohol, Ethyl/*AN; Breath Tests/*; Hum...",Reductions in breath ethanol readings in norma...,JOURNAL ARTICLE.,Blood ethanol concentrations were measured seq...,Gaylarde PM; Stambuk D; Morgan MY.,Alcohol Alcohol 8801; 22(2):113-6,Reductions in breath ethanol readings in norma...
2,54711,88000003,Alcoholism/*PP; Animal; Diprenorphine/PD; Fema...,Does the blockade of opioid receptors influenc...,JOURNAL ARTICLE.,We have tested whether the opioid antagonists ...,Kotlinska J; Langwinski R.,Alcohol Alcohol 8801; 22(2):117-9,Does the blockade of opioid receptors influenc...
3,54711,88000006,Adult; Alcohol Drinking/*PH; Alcoholism/*BL/CO...,Drinkwatchers--description of subjects and eva...,JOURNAL ARTICLE.,Clinical examination and measurement of MCV an...,Barrison IG; Ruzek J; Murray-Lyon IM.,Alcohol Alcohol 8801; 22(2):147-54,Drinkwatchers--description of subjects and eva...
4,54711,88000007,Adult; Alcoholism/*BL; Blood Platelets/*ME; Er...,Platelet affinity for serotonin is increased i...,JOURNAL ARTICLE.,The kinetics of 3H serotonin platelet uptake w...,Boismare F; Lhuintre JP; Daoust M; Moore N; Sa...,Alcohol Alcohol 8801; 22(2):155-9,Platelet affinity for serotonin is increased i...


In [None]:
# Number of missing values in each column of the training split.
training.isna().sum(axis=0)

Unnamed: 0,0
seq_id,0
medline_ui,0
mesh_terms,0
title,0
publication_type,0
abstract,0
author,0
source,0
title_abstract,0


In [None]:
# Number of missing values in each column of the test split.
test.isna().sum(axis=0)

Unnamed: 0,0
seq_id,0
medline_ui,0
mesh_terms,0
title,0
publication_type,0
abstract,0
author,0
source,0
title_abstract,0


In [None]:
# (rows, columns) of the training split.
training.shape

(54709, 9)

In [None]:
# (rows, columns) of the test split.
test.shape

(293855, 9)

In [None]:
import pandas as pd

def is_sensitive(mesh_terms, keywords=('urogenital', 'pregnancy complications')):
    """Flag a document as sensitive based on its MeSH terms string.

    Args:
        mesh_terms: MeSH terms of one document as a single string. Any
            non-string value (e.g. ``None`` or NaN) is treated as not sensitive.
        keywords: Substrings that mark a document as sensitive. Matching is
            case-insensitive. Defaults to the two topics used in this notebook.

    Returns:
        1 if any keyword occurs in ``mesh_terms``, otherwise 0.
    """
    if not isinstance(mesh_terms, str):
        return 0
    # Lower-case the terms once instead of once per keyword.
    lowered = mesh_terms.lower()
    return int(any(keyword.lower() in lowered for keyword in keywords))

# Label every document in both splits first, then report the positives per split.
training['sensitivity'] = training['mesh_terms'].map(is_sensitive)
test['sensitivity'] = test['mesh_terms'].map(is_sensitive)

# Labels are 0/1, so the column sum is the number of sensitive documents.
sensitivity_count = training['sensitivity'].sum()
print(f"Numero di documenti sensibili in training (sensitivity = 1): {sensitivity_count}")

sensitivity_count = test['sensitivity'].sum()
print(f"Numero di documenti sensibili in test (sensitivity = 1): {sensitivity_count}")

Numero di documenti sensibili in training (sensitivity = 1): 605
Numero di documenti sensibili in test (sensitivity = 1): 2945


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 15% of the labelled training data for validation. Sensitive documents
# are rare (roughly 1% of the rows), so stratify on the label to keep the same
# positive rate in both parts instead of leaving it to chance.
train_data, val_data = train_test_split(
    training,
    test_size=0.15,
    random_state=23,
    stratify=training['sensitivity'],
)

print(train_data.shape)
print(val_data.shape)
print(test.shape)

(46502, 10)
(8207, 10)
(293855, 10)


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit on the training portion only. `val_data` was held out of `training` by the
# earlier split and must stay unseen by the model; fitting on the full
# `training` frame would leak those rows into any validation score.
X_train = train_data['title_abstract']
y_train = train_data['sensitivity']
X_test = test['title_abstract']
y_test = test['sensitivity']

# The vectorizer is fitted on the training text only; other splits must go
# through `vectorizer.transform` so they share the same fitted vocabulary.
vectorizer = TfidfVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)

model = LogisticRegression()
model.fit(X_train_vec, y_train)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Vectorize the test text with the already-fitted vectorizer, then predict.
X_test_vec = vectorizer.transform(X_test)
y_pred = model.predict(X_test_vec)

# Each metric is called as metric(y_true, y_pred), so compute all four in one pass.
accuracy, precision, recall, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print(
    f"Accuracy: {accuracy:.4f}",
    f"Precision: {precision:.4f}",
    f"Recall: {recall:.4f}",
    f"F1-score: {f1:.4f}",
    sep="\n",
)

Accuracy: 0.9901
Precision: 0.5478
Recall: 0.0934
F1-score: 0.1596
