<a href="https://colab.research.google.com/github/CristinaGHolgado/old-french-lemmatization/blob/master/train_tag_udpipe_old_french.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%load_ext rpy2.ipython

In [None]:
%%R
install.packages("udpipe")

In [3]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [4]:
import os
os.chdir("gdrive/MyDrive/Lemmatisation AF") # path to folders with training data
!ls

TEST_1			  TEST_3_tagged_cc_udpipe  TEST_6_udpipe_model
TEST_10			  TEST_3_udpipe_model	   TEST_7
TEST_10_tagged_cc_udpipe  TEST_4		   TEST_7_tagged_cc_udpipe
TEST_10_udpipe_model	  TEST_4_tagged_cc_udpipe  TEST_7_udpipe_model
TEST_1_tagged_cc_udpipe   TEST_4_udpipe_model	   TEST_8
TEST_1_udpipe_model	  TEST_5		   TEST_8_tagged_cc_udpipe
TEST_2			  TEST_5_tagged_cc_udpipe  TEST_8_udpipe_model
TEST_2_tagged_cc_udpipe   TEST_5_udpipe_model	   TEST_9
TEST_2_udpipe_model	  TEST_6		   TEST_9_tagged_cc_udpipe
TEST_3			  TEST_6_tagged_cc_udpipe  TEST_9_udpipe_model


## Run for training

In [5]:
import glob
# Collect every corpus file below the current directory whose name ends with
# the suffix used for the UDPipe training corpora.
corpus_suffix = 'UDPIPE_ca_final.conllu'
path_list = [
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('.')
    for name in names
    if name.endswith(corpus_suffix)
]
path_list

['./TEST_7/TEST_7_UDPIPE_ca_final.conllu',
 './TEST_3/TEST_3_UDPIPE_ca_final.conllu',
 './TEST_1/TEST_1_UDPIPE_ca_final.conllu',
 './TEST_8/TEST_8_UDPIPE_ca_final.conllu',
 './TEST_10/TEST_10_UDPIPE_ca_final.conllu',
 './TEST_9/TEST_9_UDPIPE_ca_final.conllu',
 './TEST_6/TEST_6_UDPIPE_ca_final.conllu',
 './TEST_2/TEST_2_UDPIPE_ca_final.conllu',
 './TEST_4/TEST_4_UDPIPE_ca_final.conllu',
 './TEST_5/TEST_5_UDPIPE_ca_final.conllu']

In [8]:
### SPLIT A % OF THE TRAINING CORPUS AND USE IT AS DEVELOPMENT SET

import pandas as pd
import csv
import subprocess
import re

def validation_set(file, percentage_nb):
    ''' Split a CoNLL-U corpus into a train file and a dev file.

    A sentence is taken to start at a line beginning with ``# sent_id``.
    The last ``round(percentage_nb * n_sentences / 100) + 1`` sentences are
    written to ``<file minus ".conllu">_dev.conllu``; every line before them
    is written to ``<file minus ".conllu">_train.conllu``.

    Parameters
    ----------
    file : str
        input corpus (CoNLL-U format)

    percentage_nb : int
        % of input training corpus to be used as dev set

    Raises
    ------
    ValueError
        if the file contains no ``# sent_id`` line
    '''

    # newline="" keeps the original line terminators untouched so the two
    # output files are exact slices of the input.
    with open(file, "r", encoding="utf8", newline="") as f:
        corpus = f.readlines()

    # 0-based positions of every sentence header line
    sents_index = [i for i, line in enumerate(corpus) if line.startswith("# sent_id")]
    if not sents_index:
        raise ValueError(f"No `# sent_id` line found in {file}")

    # number of trailing sentences that go to the dev set
    split_at = round((percentage_nb * len(sents_index)) / 100) + 1

    # position of the first sentence header of the dev set
    split_from_line = sents_index[-split_at:][0]

    print(f"File length: {len(corpus)}")
    print(f"Sentences (`sent_id` count) in file: {len(sents_index)}")

    filename = list(filter(None, re.split(r"\.conllu|\\", file)))[-1]

    # Slice exactly at the `# sent_id` line: the dev file then starts with a
    # sentence header, and the blank line closing the previous sentence stays
    # at the end of the train file. Writing from Python (instead of shelling
    # out to head/tail) also keeps paths containing spaces working.
    with open(filename + '_train.conllu', "w", encoding="utf8", newline="") as train_out:
        train_out.writelines(corpus[:split_from_line])
    with open(filename + '_dev.conllu', "w", encoding="utf8", newline="") as dev_out:
        dev_out.writelines(corpus[split_from_line:])


# Split every collected corpus, keeping 10% (see validation_set) as dev set.
DEV_PERCENTAGE = 10
for corpus_path in path_list:
  print(corpus_path)
  validation_set(corpus_path, DEV_PERCENTAGE)
  print()

./TEST_7/TEST_7_UDPIPE_ca_final.conllu
File length: 454169
Sentences (`sent_id` count) in file: 15328
./TEST_3/TEST_3_UDPIPE_ca_final.conllu
File length: 473788
Sentences (`sent_id` count) in file: 16118
./TEST_1/TEST_1_UDPIPE_ca_final.conllu
File length: 197503
Sentences (`sent_id` count) in file: 6885
./TEST_8/TEST_8_UDPIPE_ca_final.conllu
File length: 474057
Sentences (`sent_id` count) in file: 16063
./TEST_10/TEST_10_UDPIPE_ca_final.conllu
File length: 467438
Sentences (`sent_id` count) in file: 16159
./TEST_9/TEST_9_UDPIPE_ca_final.conllu
File length: 468987
Sentences (`sent_id` count) in file: 16179
./TEST_6/TEST_6_UDPIPE_ca_final.conllu
File length: 468217
Sentences (`sent_id` count) in file: 16101
./TEST_2/TEST_2_UDPIPE_ca_final.conllu
File length: 426975
Sentences (`sent_id` count) in file: 14666
./TEST_4/TEST_4_UDPIPE_ca_final.conllu
File length: 438432
Sentences (`sent_id` count) in file: 14249
./TEST_5/TEST_5_UDPIPE_ca_final.conllu
File length: 460008
Sentences (`sent_id` c

# Train models

In [68]:
%%R
library(udpipe)

# Train / dev corpora, searched recursively below the working directory.
training_corpus <- list.files(
  recursive = TRUE,
  pattern = "UDPIPE_ca_final_train.conllu$"
)
dev_corpus <- list.files(
  recursive = TRUE,
  pattern = "UDPIPE_ca_final_dev.conllu$"
)

# Tagger options: two models are declared. The `_1` options use the "tagger"
# template and provide xpostag/feats but no lemma; the `_2` options use the
# "lemmatizer" template and provide the lemma only.
params <- list(
  tagger = c(
    list(models = 2),
    list(
      templates_1 = "tagger",
      guesser_suffix_rules_1 = 8,
      guesser_enrich_dictionary_1 = 6,
      guesser_prefixes_max_1 = 0,
      use_lemma_1 = 1,
      use_xpostag_1 = 1,
      use_feats_1 = 1,
      provide_lemma_1 = 0,
      provide_xpostag_1 = 1,
      provide_feats_1 = 1,
      prune_features_1 = 0
    ),
    list(
      templates_2 = "lemmatizer",
      guesser_suffix_rules_2 = 2,
      guesser_enrich_dictionary_2 = 4,
      guesser_prefixes_max_2 = 4,
      use_lemma_2 = 1,
      use_xpostag_2 = 1,
      use_feats_2 = 1,
      provide_lemma_2 = 1,
      provide_xpostag_2 = 0,
      provide_feats_2 = 0,
      prune_features_2 = 0
    )
  )
)


# Train one UDPipe model (tokenizer + tagger, no parser) from a train file
# and a held-out dev file.
#
# traindata: path to the training CoNLL-U file; its first path component
#            (the text before the first "/") is used as the model prefix.
# devdata:   path to the held-out CoNLL-U file.
#
# The model is written to "<prefix>_udpipe_model" and the value returned by
# udpipe_train() is returned invisibly. Tagger options are read from the
# global `params$tagger`.
train_udpipe <- function(traindata, devdata) {
    # paste0(): paste() would insert its default " " separator and produce
    # "<prefix> _udpipe_model", a file name with a space in it.
    modelname <- paste0(strsplit(traindata, "/")[[1]][1], "_udpipe_model")
    print("Files used at training:")
    print(c(traindata, devdata))

    invisible(
        udpipe_train(file = modelname, files_conllu_training = traindata, files_conllu_holdout  = devdata,
                     annotation_tokenizer = list(dimension = 24, epochs = 100, initialization_range = 0.1,
                                                 batch_size = 100, learning_rate=0.005, dropout = 0.1,
                                                 early_stopping = 1),
                     annotation_tagger = params$tagger,
                     annotation_parser = "none")
    )
}


train <- training_corpus
dev <- dev_corpus

# Train/dev files are paired by position, so both listings must have the
# same length.
stopifnot(length(train) == length(dev))

# Iterate over the training files themselves (the original looped over an
# undefined `x`); seq_along() also does nothing when the list is empty.
for (i in seq_along(train)) {
    train_udpipe(train[i], dev[i])
}

[1] "Files used at training:"
[1] "TEST_1/TEST_1_UDPIPE_ca_final_train.conllu"
[2] "TEST_1/TEST_1_UDPIPE_ca_final_dev.conllu"  
Training tokenizer with the following options: tokenize_url=1, allow_spaces=0, dimension=24
  epochs=100, batch_size=100, segment_size=50, learning_rate=0.0050, learning_rate_final=0.0000
  dropout=0.1000, early_stopping=1
Epoch 1, logprob: -1.7499e+05, training acc: 87.36%, heldout tokens: 100.00%P/100.00%R/100.00%, sentences: 100.00%P/100.00%R/100.00%
Epoch 2, logprob: -6.3619e+03, training acc: 99.61%, heldout tokens: 100.00%P/100.00%R/100.00%, sentences: 100.00%P/100.00%R/100.00%
Epoch 3, logprob: -5.2538e+03, training acc: 99.63%, heldout tokens: 100.00%P/100.00%R/100.00%, sentences: 100.00%P/100.00%R/100.00%
Epoch 4, logprob: -4.9589e+03, training acc: 99.62%, heldout tokens: 100.00%P/100.00%R/100.00%, sentences: 100.00%P/100.00%R/100.00%
Epoch 5, logprob: -4.6226e+03, training acc: 99.63%, heldout tokens: 100.00%P/100.00%R/100.00%, sentences: 100.00%P/1

# Tag and lemmatize the test set

In [None]:
%%R

library(stringi)
library(udpipe)

# Control-corpus files (with their relative "./" path) and trained models,
# both searched recursively below the working directory.
test_files <- list.files(
  recursive = TRUE,
  full.names = TRUE,
  pattern = "corpus_controle_final.csv$"
)
models_udpipe <- list.files(
  recursive = TRUE,
  full.names = FALSE,
  pattern = "_udpipe_model"
)

# Pair the i-th test file with the i-th model and encode each pair as
# "<test file> -- <model>"; `d` ends up as that character vector.
# NOTE(review): pairing is purely positional and relies on both listings
# sorting the same way (e.g. TEST_1 vs TEST_10) -- confirm the pairs.
d <- data.frame(cc = test_files, mod = models_udpipe)
d[["x"]] <- paste(d[["cc"]], "--", d[["mod"]])
d <- d[["x"]]

# Annotate one control-corpus file with one UDPipe model and write the
# result as a tab-separated file without header.
#
# cc_file: path to the file to annotate, read line by line and passed to the
#          "vertical" tokenizer.
# model:   path to the UDPipe model file; the text before "_udpipe_model"
#          (blanks removed) plus "_tagged_cc_udpipe" is the output file name.
tag_udpipe <- function(cc_file, model) {
  ud_model <- udpipe_load_model(file = model)
  control_lines <- readLines(cc_file)

  annotation <- udpipe_annotate(
    ud_model,
    x = control_lines,
    tokenizer = "vertical",
    tagger = "default",
    parser = "none",
    trace = FALSE
  )
  annotation_df <- as.data.frame(annotation)

  model_prefix <- strsplit(model, "_udpipe_model")[[1]][1]
  outputname <- gsub(
    "[[:blank:]]", "",
    paste(model_prefix, "_tagged_cc_udpipe")
  )

  # Columns 6 to 10 of the annotation table are kept.
  # NOTE(review): presumably token, lemma, upos, xpos, feats -- confirm
  # against the installed udpipe version.
  write.table(
    annotation_df[6:10],
    outputname,
    sep = "\t",
    fileEncoding = "UTF-8",
    quote = FALSE,
    col.names = FALSE,
    row.names = FALSE
  )
}

# Each element of `d` is "<test file> -- <model>": split it on "--", strip
# the blanks from both halves and tag the test file with the model.
for (pair in d) {
  fields <- strsplit(pair, "--")[[1]]
  cc <- gsub("[[:blank:]]", "", fields[1])
  models <- gsub("[[:blank:]]", "", fields[2])
  tag_udpipe(cc, models)
}