In [1]:
!pip install git+https://github.com/MeMartijn/updated-sklearn-crfsuite.git

Collecting git+https://github.com/MeMartijn/updated-sklearn-crfsuite.git
  Cloning https://github.com/MeMartijn/updated-sklearn-crfsuite.git to /tmp/pip-req-build-r3d08th8
  Running command git clone --filter=blob:none --quiet https://github.com/MeMartijn/updated-sklearn-crfsuite.git /tmp/pip-req-build-r3d08th8
  Resolved https://github.com/MeMartijn/updated-sklearn-crfsuite.git to commit 675038761b4405f04691a83339d04903790e2b95
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting python-crfsuite>=0.8.3 (from sklearn-crfsuite==0.3.6)
  Downloading python_crfsuite-0.9.10-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: sklearn-crfsuite
  Building wheel for sklearn-crfsuite (setup.py) ... [?25l[?25hdone
  Created wheel for sklearn-crfsuite: filename=sklearn_crfsuite-0.3.6-py2.py3-none-any.whl si

In [2]:
!pip show scikit-learn

Name: scikit-learn
Version: 1.2.2
Summary: A set of python modules for machine learning and data mining
Home-page: http://scikit-learn.org
Author: 
Author-email: 
License: new BSD
Location: /usr/local/lib/python3.10/dist-packages
Requires: joblib, numpy, scipy, threadpoolctl
Required-by: bigframes, fastai, imbalanced-learn, librosa, mlxtend, qudida, sklearn-pandas, yellowbrick


In [3]:
import nltk

nltk.download('treebank')

from nltk.corpus import treebank

penn_treebank = []
for fileid in treebank.fileids():
  tokens = []
  tags = []
  for word, tag in treebank.tagged_words(fileid):
    tokens.append(word)
    tags.append(tag)
  penn_treebank.append((tokens, tags))

[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Unzipping corpora/treebank.zip.


In [4]:
!pip install conllu
! wget https://raw.githubusercontent.com/UniversalDependencies/UD_Spanish-GSD/master/es_gsd-ud-test.conllu

from io import open
from conllu import parse_incr

data_file = open("es_gsd-ud-test.conllu", "r", encoding="utf-8")
ud_files = []
for tokenlist in parse_incr(data_file):
    ud_files.append(tokenlist)

ud_treebank = []
for sentence in ud_files:
  tokens = []
  tags = []
  for token in sentence:
    tokens.append(token['form'])
    tags.append(token['upostag'])
  ud_treebank.append((tokens, tags))

Collecting conllu
  Downloading conllu-4.5.3-py2.py3-none-any.whl (16 kB)
Installing collected packages: conllu
Successfully installed conllu-4.5.3
--2024-03-30 16:12:27--  https://raw.githubusercontent.com/UniversalDependencies/UD_Spanish-GSD/master/es_gsd-ud-test.conllu
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.109.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 737048 (720K) [text/plain]
Saving to: ‘es_gsd-ud-test.conllu’


2024-03-30 16:12:27 (23.2 MB/s) - ‘es_gsd-ud-test.conllu’ saved [737048/737048]



In [5]:
import re
def extract_features(sentence, index):
  return {
      'word':sentence[index],
      'is_first':index==0,
      'is_last':index ==len(sentence)-1,
      'is_capitalized':sentence[index][0].upper() == sentence[index][0],
      'is_all_caps': sentence[index].upper() == sentence[index],
      'is_all_lower': sentence[index].lower() == sentence[index],
      'is_alphanumeric': int(bool((re.match('^(?=.*[0-9]$)(?=.*[a-zA-Z])',sentence[index])))),
      'prefix-1':sentence[index][0],
      'prefix-2':sentence[index][:2],
      'prefix-3':sentence[index][:3],
      'prefix-3':sentence[index][:4],
      'suffix-1':sentence[index][-1],
      'suffix-2':sentence[index][-2:],
      'suffix-3':sentence[index][-3:],
      'suffix-3':sentence[index][-4:],
      'prev_word':'' if index == 0 else sentence[index-1],
      'next_word':'' if index < len(sentence) else sentence[index+1],
      'has_hyphen': '-' in sentence[index],
      'is_numeric': sentence[index].isdigit(),
      'capitals_inside': sentence[index][1:].lower() != sentence[index][1:]
  }

In [6]:
def transform_to_dataset(tagged_sentences):
  X, y = [], []
  for sentence, tags in tagged_sentences:
    sent_word_features, sent_tags = [],[]
    for index in range(len(sentence)):
        sent_word_features.append(extract_features(sentence, index)),
        sent_tags.append(tags[index])
    X.append(sent_word_features)
    y.append(sent_tags)
  return X, y

penn_train_size = int(0.8*len(penn_treebank))
penn_training = penn_treebank[:penn_train_size]
penn_testing = penn_treebank[penn_train_size:]
X_penn_train, y_penn_train = transform_to_dataset(penn_training)
X_penn_test, y_penn_test = transform_to_dataset(penn_testing)

ud_train_size = int(0.8*len(ud_treebank))
ud_training = ud_treebank[:ud_train_size]
ud_testing = ud_treebank[ud_train_size:]
X_ud_train, y_ud_train = transform_to_dataset(ud_training)
X_ud_test, y_ud_test = transform_to_dataset(ud_testing)

In [7]:
import warnings
warnings.filterwarnings('ignore')

!pip install sklearn_crfsuite
from sklearn_crfsuite import CRF

penn_crf = CRF(
    algorithm='lbfgs',
    c1=0.01,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
print("Started training on Penn Treebank corpus!")
penn_crf.fit(X_penn_train, y_penn_train)
print("Finished training on Penn Treebank corpus!")

ud_crf = CRF(
    algorithm='lbfgs',
    c1=0.01,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
print("Started training on UD corpus!")
ud_crf.fit(X_ud_train, y_ud_train)
print("Finished training on UD corpus!")

Started training on Penn Treebank corpus!
Finished training on Penn Treebank corpus!
Started training on UD corpus!
Finished training on UD corpus!


In [8]:
from sklearn_crfsuite import metrics
from sklearn_crfsuite import scorers
print("## Penn ##")

y_penn_pred=penn_crf.predict(X_penn_test)
print("F1 score on Test Data")
print(metrics.flat_f1_score(y_penn_test, y_penn_pred,average='weighted',labels=penn_crf.classes_))
y_penn_pred_train=penn_crf.predict(X_penn_train)
print("F1 score on Training Data ")
print(metrics.flat_f1_score(y_penn_train, y_penn_pred_train,average='weighted',labels=penn_crf.classes_))

print("Class wise score:")
print(metrics.flat_classification_report(
    y_true=y_penn_test, y_pred=y_penn_pred, labels=penn_crf.classes_, digits=3
))

print("## UD ##")

y_ud_pred=ud_crf.predict(X_ud_test)
print("F1 score on Test Data ")
print(metrics.flat_f1_score(y_ud_test, y_ud_pred,average='weighted',labels=ud_crf.classes_))
y_ud_pred_train=ud_crf.predict(X_ud_train)
print("F1 score on Training Data ")
print(metrics.flat_f1_score(y_ud_train, y_ud_pred_train,average='weighted',labels=ud_crf.classes_))

print("Class wise score:")
print(metrics.flat_classification_report(
    y_ud_test, y_ud_pred, labels=ud_crf.classes_, digits=3
))

## Penn ##
F1 score on Test Data
0.9668646324625245
F1 score on Training Data 
0.9936643188628935
Class wise score:
              precision    recall  f1-score   support

         NNP      0.952     0.963     0.957      1213
           ,      1.000     1.000     1.000       592
          CD      1.000     0.999     0.999       683
         NNS      0.964     0.986     0.975       740
          JJ      0.879     0.912     0.895       731
          MD      0.993     1.000     0.996       135
          VB      0.980     0.946     0.963       313
          DT      0.992     0.993     0.992      1062
          NN      0.962     0.955     0.958      1899
          IN      0.981     0.980     0.981      1285
           .      1.000     1.000     1.000       509
         VBZ      0.958     0.936     0.947       219
         VBG      0.936     0.876     0.905       185
          CC      1.000     0.997     0.998       287
         VBD      0.965     0.945     0.955       492
         VBN      0

In [9]:
sent = "The tagger produced good results"
features = [extract_features(sent.split(), idx) for idx in range(len(sent.split()))]

penn_results = penn_crf.predict_single(features)
ud_results = ud_crf.predict_single(features)

penn_tups = [(sent.split()[idx], penn_results[idx]) for idx in range(len(sent.split()))]
ud_tups = [(sent.split()[idx], ud_results[idx]) for idx in range(len(sent.split()))]

print(penn_tups)
print(ud_tups)

[('The', 'DT'), ('tagger', 'NN'), ('produced', 'VBN'), ('good', 'JJ'), ('results', 'NNS')]
[('The', 'DET'), ('tagger', 'NOUN'), ('produced', 'ADJ'), ('good', 'NOUN'), ('results', 'ADJ')]


In [10]:
import pickle

penn_filename = 'penn_treebank_crf_postagger.sav'
pickle.dump(penn_crf, open(penn_filename, 'wb'))

ud_filename = 'ud_crf_postagger.sav'
pickle.dump(ud_crf, open(ud_filename,'wb'))