In [1]:
import pandas as pd
import fasttext
from sklearn.metrics import accuracy_score, f1_score
from datasets import Dataset
import re


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Locations of the two dataset files (placeholders: point each one at its split).
train_dataset_path = 'YOUR-PATH-HERE'
test_dataset_path = 'YOUR-PATH-HERE'

# Open each split from disk with Dataset.from_file.
train_dataset = Dataset.from_file(train_dataset_path)
test_dataset = Dataset.from_file(test_dataset_path)

In [3]:
# Columns kept from both splits. The list is declared once so the training and
# test frames are guaranteed to share the same schema.
qa_columns = ["doctor_title", "doctor_speciality", "question_content", "question_answer"]

# Restrict each split to those columns and convert it to a pandas DataFrame.
df_train = train_dataset.select_columns(qa_columns).to_pandas()
df_test = test_dataset.select_columns(qa_columns).to_pandas()

In [4]:
# Sanity check: report the (rows, columns) shape of both frames, one per line.
print(
    f"Training shape: {df_train.shape}",
    f"Test shape: {df_test.shape}",
    sep="\n",
)

Training shape: (150105, 4)
Test shape: (37527, 4)


In [5]:
def clean_text(text):
    """Normalize a raw text field into lowercase, punctuation-free tokens.

    Args:
        text: The value to clean. Anything that ``pd.isna`` reports as missing
            (``None``, ``NaN``, ...) is treated as empty; other values are
            converted with ``str``.

    Returns:
        The lowercased text with every character that is neither a word
        character nor whitespace replaced by a space, runs of whitespace
        collapsed to a single space, and leading/trailing whitespace removed.
        Missing input yields ``""``.
    """
    if pd.isna(text):
        return ""
    lowered = str(text).lower()
    # "İ" (U+0130) lowercases to "i" followed by the combining dot U+0307.
    # That mark is not matched by \w, so the punctuation pass below would turn
    # it into a space and split the word ("İstanbul" -> "i stanbul"). Fold the
    # pair back into a plain "i" first.
    lowered = lowered.replace("i\u0307", "i")
    without_punctuation = re.sub(r'[^\w\s]', ' ', lowered)
    collapsed = re.sub(r'\s+', ' ', without_punctuation)
    return collapsed.strip()


In [6]:
def _qa_text(question, answer):
    """Join a cleaned question and answer into the single model input string."""
    return f"soru: {clean_text(question)} cevap: {clean_text(answer)}"


# Build the 'text' column identically for both splits. Iterating the two source
# columns in lockstep avoids constructing a Series per row, which is what
# DataFrame.apply(axis=1) does, and keeps one definition of the format.
for frame in (df_train, df_test):
    frame['text'] = [
        _qa_text(question, answer)
        for question, answer in zip(frame['question_content'], frame['question_answer'])
    ]

In [7]:
# Build the fastText training lines ("__label__<speciality> <text>") for both
# splits with a single definition of the format.
for frame in (df_train, df_test):
    # str() formatting matches what an f-string would produce for each label.
    labels = frame['doctor_speciality'].astype(str)

    # fastText tokenizes on whitespace, so a label containing whitespace would be
    # cut at the first gap and the remainder would be read as input words,
    # leaking the target into the features. Fail loudly instead.
    offending = labels[labels.str.contains(r'\s', regex=True)]
    if not offending.empty:
        raise ValueError(
            "doctor_speciality labels must not contain whitespace for fastText; "
            f"examples: {offending.unique()[:5].tolist()}"
        )

    frame['fasttext_line'] = [
        f"__label__{label} {text}"
        for label, text in zip(labels, frame['text'])
    ]

In [8]:
train_file = 'YOUR-PATH-HERE'
test_file = 'YOUR-PATH-HERE'

# NOTE(review): both placeholders above are identical as written; they must be
# replaced with two different paths, otherwise the second pass overwrites the
# training file with the test split.

# Write one example per line, training split first and then the test split.
# Each file is opened in 'w' mode, so existing content is truncated.
for out_path, examples in (
    (train_file, df_train['fasttext_line']),
    (test_file, df_test['fasttext_line']),
):
    with open(out_path, 'w', encoding='utf-8') as sink:
        sink.writelines(example + '\n' for example in examples)

In [None]:
# Hyperparameters for the supervised fastText run, gathered in one place so
# they are easy to review and tune.
training_options = dict(
    lr=0.3,           # learning rate
    epoch=128,        # number of passes over the training file
    wordNgrams=2,     # maximum length of word n-grams
    dim=100,          # size of the word vectors
    minn=2,           # minimum length of character n-grams
    maxn=5,           # maximum length of character n-grams
    loss='softmax',   # loss function
    verbose=2,        # verbosity level of the training log
    thread=4,         # number of worker threads
)

# Train the classifier on the file written in the previous step.
model = fasttext.train_supervised(input=train_file, **training_options)




Read 3M words
Number of words:  207437
Number of labels: 108
Progress: 100.0% words/sec/thread:  212720 lr:  0.000000 avg.loss:  0.468381 ETA:   0h 0m 0s

ValueError: /home/alicantanyeri/Belgeler/doctor_speciality_classification/Distilbert-HealtcareAssistant/fasttext_healthcare.bin cannot be opened for saving!

In [14]:
import os

save_path = 'YOUR-PATH-HERE'

# The model file is written inside save_path, so save_path must be a directory.
if not os.path.isdir(save_path):
    if os.path.exists(save_path):
        # Something that is not a directory already uses this name: remove it
        # (it could be moved aside with shutil.move() instead).
        print("Aynı isimde dosya var, kaldırılıyor...")
        os.remove(save_path)
    # Create the directory.
    os.makedirs(save_path)

# With the directory in place, the model can be saved.
model_file = os.path.join(save_path, "fasttext_healthcare.bin")
model.save_model(model_file)



In [10]:

# Detailed evaluation: predict the top label for every test example and score
# the predictions against the true specialities.
label_prefix = '__label__'

eval_texts = df_test['text'].tolist()
# Labels were written to the training file through str() formatting, so compare
# on that same string representation.
y_true = df_test['doctor_speciality'].astype(str).tolist()

# predict() accepts a list of lines and returns one sequence of labels per line,
# so the whole test set is scored in a single call instead of once per row.
predicted_labels, _ = model.predict(eval_texts, k=1)

# Strip only the leading fastText marker; a plain replace() would also remove
# the marker if it happened to occur inside a label.
y_pred = [
    labels[0][len(label_prefix):] if labels[0].startswith(label_prefix) else labels[0]
    for labels in predicted_labels
]

accuracy = accuracy_score(y_true, y_pred)
# Weighted F1: per-class F1 averaged by the number of true instances per class.
f1 = f1_score(y_true, y_pred, average='weighted')

print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")



Accuracy: 0.9886
F1 Score: 0.9874
