In [None]:
!pip install sklearn_crfsuite

Collecting sklearn_crfsuite
  Downloading sklearn_crfsuite-0.5.0-py2.py3-none-any.whl (10 kB)
Collecting python-crfsuite>=0.9.7 (from sklearn_crfsuite)
  Downloading python_crfsuite-0.9.10-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: python-crfsuite, sklearn_crfsuite
Successfully installed python-crfsuite-0.9.10 sklearn_crfsuite-0.5.0


In [None]:
# Gerekli kütüphaneleri içe aktarın

import numpy as np
import pandas as pd
import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics


In [None]:
# Load the augmented train and test CSV files from /content.
df1 = pd.read_csv("/content/aug_train_dataset.csv")
df2 = pd.read_csv("/content/aug_test_dataset.csv")

# Stack both frames row-wise and save the combined data without the index column.
result = pd.concat([df1, df2])
result.to_csv('main_dataset.csv', index=False)

In [None]:
# Reload the final dataframe used by the rest of the notebook.
# NOTE(review): this reads "/content/aug_dataset.csv", which is not the
# 'main_dataset.csv' file written by the previous cell — confirm which file is
# intended, otherwise a fresh "Restart & Run All" depends on a file created elsewhere.

final_df = pd.read_csv("/content/aug_dataset.csv")

In [None]:
# Helper class that groups token rows into sentences

class SentenceGetter(object):
    """Group a token-level dataframe into sentences of ``(word, label)`` pairs.

    Parameters
    ----------
    data : pandas.DataFrame
        One row per token; must provide the ``sentence_id``, ``word`` and
        ``label`` columns.

    Attributes
    ----------
    n_sent : int
        1-based counter used by :meth:`get_next` to build the next lookup key.
    data : pandas.DataFrame
        The dataframe passed to the constructor.
    empty : bool
        Always ``False``; set here but never read by this class.
    grouped
        Result of the per-``sentence_id`` group-by: one list of
        ``(word, label)`` tuples per sentence id.
    sentences : list
        The values of ``grouped`` as a plain list, in group-by order.
    """

    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False

        def agg_func(s):
            # Pair every word of the group with its label, in row order.
            return list(zip(s['word'].values.tolist(),
                            s['label'].values.tolist()))

        # Select the two needed columns explicitly so the grouping column is
        # not passed to ``apply`` (avoids the pandas deprecation warning).
        self.grouped = (
            self.data.groupby('sentence_id')[['word', 'label']].apply(agg_func)
        )
        self.sentences = list(self.grouped)

    def get_next(self):
        """Return the next sentence, or ``None`` when there is none.

        Sentences are looked up by the key ``'Sentence: <n>'`` where ``n`` is
        the current value of ``n_sent``; the counter only advances when the
        lookup succeeds.

        Returns
        -------
        list of (word, label) tuples, or None
            ``None`` if no group is stored under the current key.
        """
        key = 'Sentence: {}'.format(self.n_sent)
        try:
            s = self.grouped[key]
        except KeyError:
            # Only a missing key means "no more sentences"; any other error
            # (or a KeyboardInterrupt) must propagate instead of being hidden.
            return None
        self.n_sent += 1
        return s


# Extract the sentences from the dataset: `sentences` is a list with one entry
# per sentence_id, each entry being a list of (word, label) tuples.

getter = SentenceGetter(final_df)
sentences = getter.sentences

In [None]:
# Build the feature dictionary for a single token

def word2features(sent, i):
    """Return the CRF feature dict for the token at position ``i`` of ``sent``.

    Only the word text (element 0 of each token tuple) is used; the label is
    deliberately never read, so gold labels cannot leak into the features and
    unlabeled tokens can be featurized too.

    Parameters
    ----------
    sent : sequence of tuples
        Tokens of one sentence; element 0 of every tuple is the word string.
    i : int
        Index of the token to describe.

    Returns
    -------
    dict
        Features of the current word (lower-cased form, 3- and 2-character
        suffixes, case/digit flags, constant bias), the same case features
        for the previous and next word when they exist, and ``'BOS'`` /
        ``'EOS'`` markers at the sentence boundaries.
    """
    word = sent[i][0]

    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit()
    }

    if i > 0:
        # Context from the previous token.
        prev_word = sent[i-1][0]
        features.update({
            '-1:word.lower()': prev_word.lower(),
            '-1:word.istitle()': prev_word.istitle(),
            '-1:word.isupper()': prev_word.isupper()
        })
    else:
        # First token: mark beginning of sentence.
        features['BOS'] = True

    if i < len(sent)-1:
        # Context from the following token.
        next_word = sent[i+1][0]
        features.update({
            '+1:word.lower()': next_word.lower(),
            '+1:word.istitle()': next_word.istitle(),
            '+1:word.isupper()': next_word.isupper()
        })
    else:
        # Last token: mark end of sentence.
        features['EOS'] = True

    return features

# Convert a whole sentence into a list of per-token feature dicts
def sent2features(sent):
    """Return one ``word2features`` dict per token of ``sent``, in order."""
    feature_dicts = []
    for position in range(len(sent)):
        feature_dicts.append(word2features(sent, position))
    return feature_dicts

# Return the label sequence of a sentence
def sent2labels(sent):
    """Return the labels (second element of each ``(token, label)`` pair)."""
    labels = []
    for _token, tag in sent:
        labels.append(tag)
    return labels

# Return the word sequence of a sentence
def sent2tokens(sent):
    """Return the words (first element of each ``(token, label)`` pair)."""
    words = []
    for text, _tag in sent:
        words.append(text)
    return words

In [None]:
# Separate features from labels: X holds one list of per-token feature dicts
# per sentence, y holds the matching list of labels per sentence.

X = [sent2features(s) for s in sentences]
y = [sent2labels(s) for s in sentences]

In [None]:
# Only the last expression of a cell is displayed, so a bare `len(X)` above
# `len(y)` showed nothing. Assert the two agree, then display the count.
assert len(X) == len(y), "features and labels must contain the same number of sentences"
len(y)

1145

In [None]:
from sklearn.model_selection import train_test_split

# Split into training and test sets.
# NOTE(review): the unpack order is swapped relative to train_test_split's
# documented return order (X_train, X_test, y_train, y_test), and test_size is
# set to len(X)-250. The two inversions cancel out: X_test/y_test receive the
# 250-sentence part and X_train/y_train the remaining len(X)-250 sentences
# (250 and 895 in the recorded outputs). Rewriting it in the conventional form
# (test_size=250) would select different sentences and change the reported metrics.

X_test ,X_train , y_test , y_train  = train_test_split(X, y, test_size=len(X)-250, random_state=42)

In [None]:
# Number of held-out test sentences (250 in the recorded run).
len(X_test)

250

In [None]:
from sklearn.model_selection import GridSearchCV
import sklearn_crfsuite

# Define parameter grid
# 7 x 7 x 5 x 2 = 490 parameter combinations; with cv=3 below that is 1470 fits.
param_grid = {
    'c1': [0.01, 0.02, 0.03, 0.1, 0.2 ,0.3 , 1],
    'c2': [0.01, 0.02, 0.03, 0.1, 0.2 ,0.3 , 1],
    'max_iterations': [100, 120, 140 , 160 , 200],
    'all_possible_transitions': [True, False]
}

# Define the CRF model
# max_iterations and all_possible_transitions are also keys of param_grid, so
# the values given here are overridden for every candidate during the search.
# NOTE(review): verbose=True makes every one of the fits log its training
# progress — consider turning it off for the search.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    max_iterations=160,
    all_possible_transitions=True,
    verbose=True
)

# Perform grid search with cross-validation
# NOTE(review): no `scoring=` is passed, so candidates are presumably ranked by
# the estimator's own score method — confirm this matches the weighted F1
# reported in the evaluation cells.
grid_search = GridSearchCV(crf, param_grid, cv=3, verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Best parameters and model
# NOTE(review): best_crf is not used by the later cells visible in this
# notebook; they train a new CRF with hard-coded hyperparameters instead.
best_crf = grid_search.best_estimator_
print("Best parameters:", grid_search.best_params_)


In [None]:
# Define and train the CRF model.
# This rebinds `crf` (previously the grid-search base estimator) to a new model
# with hard-coded hyperparameters.
# NOTE(review): these values are not taken from grid_search.best_params_ —
# confirm they are the intended final configuration.

crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True,
    verbose=True
)
crf.fit(X_train, y_train)

loading training data to CRFsuite: 100%|██████████| 895/895 [00:00<00:00, 2578.97it/s]



Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 13214
Seconds required: 0.082

L-BFGS optimization
c1: 0.100000
c2: 0.100000
num_memories: 6
max_iterations: 100
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

Iter 1   time=0.06  loss=77704.99 active=13019 feature_norm=1.00
Iter 2   time=0.03  loss=72816.82 active=12214 feature_norm=1.50
Iter 3   time=0.03  loss=65217.65 active=12646 feature_norm=2.38
Iter 4   time=0.04  loss=57565.19 active=12738 feature_norm=4.17
Iter 5   time=0.03  loss=53508.08 active=12877 feature_norm=5.34
Iter 6   time=0.03  loss=51210.06 active=13015 feature_norm=6.13
Iter 7   time=0.03  loss=49487.21 active=13004 feature_norm=7.69
Iter 8   time=0.03  loss=47917.87 active=13037 feature_norm=8.24
Iter 9   time=0.03  loss=46300.75 active=13010 feature_norm=9.49
Iter 10  time=

In [None]:
# Collect every distinct label in the full dataset, sorted alphabetically.
# The list includes 'O' (see the classification report further down).

all_entities = sorted(final_df.label.unique().tolist())


In [None]:
# Predict label sequences for the test set
y_pred = crf.predict(X_test)

# Compute the weighted F1 score over all labels (including 'O')
metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=all_entities)


0.7718777139729169

In [None]:
# Weighted F1 restricted to the entity labels, i.e. every label except 'O'.
# The prediction call repeats the one in the previous cell, which keeps this
# cell runnable on its own once `crf` and `X_test` exist.
y_pred = crf.predict(X_test)
metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=[i for i in all_entities if i != 'O'])

0.7358367006852877

In [None]:
from sklearn.metrics import classification_report

# Convert y_test and y_pred to flat lists
# (both are lists of per-sentence label lists; the report needs one flat
# sequence of token labels for each)
y_test_flat = [label for sublist in y_test for label in sublist]
y_pred_flat = [label for sublist in y_pred for label in sublist]

# Generate the classification report
# (per-label precision / recall / F1 / support, rows in all_entities order)
print(classification_report(y_test_flat, y_pred_flat, labels=all_entities))

               precision    recall  f1-score   support

         ANAT       0.72      0.76      0.74      4279
            O       0.81      0.79      0.80     11936
   OBS-ABSENT       0.74      0.85      0.79      1962
  OBS-PRESENT       0.72      0.65      0.68      3777
OBS-UNCERTAIN       0.65      0.68      0.66       167

     accuracy                           0.77     22121
    macro avg       0.73      0.75      0.74     22121
 weighted avg       0.77      0.77      0.77     22121

