In [1]:
! git clone https://github.com/UniversalDependencies/UD_Turkish-IMST.git
! mkdir data
! mv UD_Turkish-IMST/tr_imst* data/

Cloning into 'UD_Turkish-IMST'...
remote: Enumerating objects: 444, done.[K
remote: Counting objects: 100% (159/159), done.[K
remote: Compressing objects: 100% (106/106), done.[K
remote: Total 444 (delta 104), reused 104 (delta 53), pack-reused 285[K
Receiving objects: 100% (444/444), 4.22 MiB | 17.47 MiB/s, done.
Resolving deltas: 100% (275/275), done.


In [2]:
! pip install flair==0.9

Collecting flair==0.9
  Downloading flair-0.9-py3-none-any.whl (319 kB)
[K     |████████████████████████████████| 319 kB 31.0 MB/s 
[?25hCollecting sentencepiece==0.1.95
  Downloading sentencepiece-0.1.95-cp37-cp37m-manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 54.7 MB/s 
[?25hCollecting konoha<5.0.0,>=4.0.0
  Downloading konoha-4.6.5-py3-none-any.whl (20 kB)
Collecting more-itertools~=8.8.0
  Downloading more_itertools-8.8.0-py3-none-any.whl (48 kB)
[K     |████████████████████████████████| 48 kB 7.5 MB/s 
[?25hCollecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[K     |████████████████████████████████| 981 kB 57.3 MB/s 
Collecting sqlitedict>=1.6.0
  Downloading sqlitedict-2.0.0.tar.gz (46 kB)
[K     |████████████████████████████████| 46 kB 4.8 MB/s 
[?25hCollecting deprecated>=1.2.4
  Downloading Deprecated-1.2.13-py2.py3-none-any.whl (9.6 kB)
Collecting ftfy
  Downloading ftfy-6.1.1-py3-none-any.whl (53 kB)
[K     |███

In [3]:
from argparse import ArgumentParser
import torch, flair

# dataset, model and embedding imports
from flair.datasets import UniversalDependenciesCorpus
from flair.embeddings import TransformerWordEmbeddings
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer

seed = 42
cuda = 0
model = 'dbmdz/bert-base-turkish-cased'
dataset = 'imst'

# use cuda device as passed
flair.device = f'cuda:{str(cuda)}'

# for each passed seed, do one experimental run
flair.set_seed(seed)

# model
hf_model = model

# initialize embeddings
embeddings = TransformerWordEmbeddings(
    model=hf_model,
    layers="-1",
    subtoken_pooling="first",
    fine_tune=True,
    use_context=False,
    respect_document_boundaries=False,
)

# select dataset depending on which language variable is passed

tag_type = None

if dataset in ["imst", "boun"]:
    tag_type = "upos"
    corpus = UniversalDependenciesCorpus(data_folder="./data",
                                        train_file=f"tr_{dataset}-ud-train.conllu",
                                        dev_file=f"tr_{dataset}-ud-dev.conllu",
                                        test_file=f"tr_{dataset}-ud-test.conllu")

# make the dictionary of tags to predict
tag_dictionary = corpus.make_tag_dictionary(tag_type)

# init bare-bones sequence tagger (no reprojection, LSTM or CRF)
tagger: SequenceTagger = SequenceTagger(
    hidden_size=256,
    embeddings=embeddings,
    tag_dictionary=tag_dictionary,
    tag_type=tag_type,
    use_crf=False,
    use_rnn=False,
    reproject_embeddings=False,
)

# init the model trainer
trainer = ModelTrainer(tagger, corpus, optimizer=torch.optim.AdamW)

# make string for output folder
output_folder = f"flert-{dataset}-{hf_model}-{seed}"

# train with XLM parameters (AdamW, 20 epochs, small LR)
from torch.optim.lr_scheduler import OneCycleLR

trainer.train(
    output_folder,
    learning_rate=5.0e-5,
    mini_batch_size=16,
    mini_batch_chunk_size=1,
    max_epochs=10,
    scheduler=OneCycleLR,
    embeddings_storage_mode='none',
    weight_decay=0.,
    train_with_dev=False,
    use_final_model_for_eval=True
)


Downloading:   0%|          | 0.00/60.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/385 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/245k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/424M [00:00<?, ?B/s]

2022-05-04 16:49:03,823 Reading data from data
2022-05-04 16:49:03,824 Train: data/tr_imst-ud-train.conllu
2022-05-04 16:49:03,830 Dev: data/tr_imst-ud-dev.conllu
2022-05-04 16:49:03,831 Test: data/tr_imst-ud-test.conllu
2022-05-04 16:49:06,339 ----------------------------------------------------------------------------------------------------
2022-05-04 16:49:06,345 Model: "SequenceTagger(
  (embeddings): TransformerWordEmbeddings(
    (model): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(32000, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0): BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_feat

{'dev_loss_history': [tensor(0.1739, device='cuda:0'),
  tensor(0.1279, device='cuda:0'),
  tensor(0.1292, device='cuda:0'),
  tensor(0.1465, device='cuda:0'),
  tensor(0.1743, device='cuda:0'),
  tensor(0.2230, device='cuda:0'),
  tensor(0.2307, device='cuda:0'),
  tensor(0.2581, device='cuda:0'),
  tensor(0.2629, device='cuda:0'),
  tensor(0.2641, device='cuda:0')],
 'dev_score_history': [0.9484371889309178,
  0.9638662154091181,
  0.9653593470037826,
  0.9660561417479594,
  0.9676488154489349,
  0.9666533943858252,
  0.9681465259804898,
  0.9670515628110692,
  0.9671511049173801,
  0.967250647023691],
 'test_score': 0.9668960015953734,
 'train_loss_history': [0.6023193328168743,
  0.2601176800715316,
  0.20592318529537698,
  0.1854141712413241,
  0.16290564363874144,
  0.15608359270702168,
  0.1426529177672671,
  0.14473560649758704,
  0.14686607256520523,
  0.1376262863712947]}

In [4]:
! tar -czvf model.tar.gz /content/flert-imst-dbmdz

tar: Removing leading `/' from member names
/content/flert-imst-dbmdz/
/content/flert-imst-dbmdz/bert-base-turkish-cased-42/
/content/flert-imst-dbmdz/bert-base-turkish-cased-42/final-model.pt
/content/flert-imst-dbmdz/bert-base-turkish-cased-42/dev.tsv
/content/flert-imst-dbmdz/bert-base-turkish-cased-42/test.tsv
/content/flert-imst-dbmdz/bert-base-turkish-cased-42/loss.tsv
/content/flert-imst-dbmdz/bert-base-turkish-cased-42/weights.txt
/content/flert-imst-dbmdz/bert-base-turkish-cased-42/training.log


In [6]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
! mv model.tar.gz /content/drive/MyDrive/taboo/models/