In [None]:
!pip install pytorch_lightning

import pandas as pd
from transformers import AutoModel, AutoTokenizer
from google.colab import drive
drive.mount('/content/drive')
import numpy as np

%pip install camel-tools

from pathlib import Path

# camel-tools classes used below. They are imported explicitly (after the
# `%pip install camel-tools` line) so this cell also runs on a fresh kernel
# instead of relying on names left over from an earlier session.
from camel_tools.disambig.bert import BERTUnfactoredDisambiguator
from camel_tools.morphology.analyzer import Analyzer
from camel_tools.morphology.database import MorphologyDB

# Morphological database file, opened with the 'a' flag.
S31_DB_PATH = Path('../data/disambig_db/calima-msa-s31.db')
S31_DB = MorphologyDB(S31_DB_PATH, 'a')
# Analyzer built on that database, configured with 'NOAN_ALL' and a
# 100000-entry cache.
S31_AN = Analyzer(S31_DB, 'NOAN_ALL', cache_size=100000)
# Pretrained 'msa' BERT disambiguator, asked for up to 1000 analyses per token.
bert_disambig = BERTUnfactoredDisambiguator.pretrained('msa', top=1000, pretrained_cache = False)
# Replace the disambiguator's built-in analyzer with the S31 analyzer above.
# NOTE(review): `_analyzer` is a private attribute -- confirm it still exists
# when upgrading camel-tools.
bert_disambig._analyzer = S31_AN


Mounted at /content/drive
Collecting camel-tools
  Downloading camel_tools-1.5.2-py3-none-any.whl (124 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m124.3/124.3 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
Collecting docopt (from camel-tools)
  Downloading docopt-0.6.2.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting dill (from camel-tools)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
Collecting emoji (from camel-tools)
  Downloading emoji-2.10.1-py2.py3-none-any.whl (421 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m421.5/421.5 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pyrsistent (from camel-tools)
  Downloading pyrsistent-0.20.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (117 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m117.7

Some weights of the model checkpoint at /content/drive/MyDrive/camel_tools/data/disambig_bert_unfactored/msa were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


### Objective: on the ALIGNED-tagged corpus, count the tokens for every POS × level combination


In [6]:
# Aligned fragments data import.

def _load_aligned_fragments(csv_path):
  """Read one aligned-fragment CSV and keep only rows whose '0' column is a str.

  Args:
    csv_path: path of the CSV file to read; it must have a column named '0'.

  Returns:
    A DataFrame containing only the rows whose '0' value has type `str`
    (this drops, for example, NaN cells that pandas reads as floats).
  """
  fragments = pd.read_csv(csv_path)
  # Column-wise check instead of a row-wise DataFrame.apply(axis=1).
  # astype(bool) keeps the mask boolean even when the frame has no rows, so
  # the indexing below is always a row filter.
  is_text = fragments['0'].map(lambda value: type(value) == str).astype(bool)
  return fragments[is_text]


frag_train = _load_aligned_fragments('../data/all_train_aligned.csv')
frag_dev = _load_aligned_fragments('../data/all_dev_aligned.csv')
frag_test = _load_aligned_fragments('../data/all_test_aligned.csv')

def sort_score(list_of_analyses):
  """Return the analyses that share the highest score.

  Args:
    list_of_analyses: sequence of objects exposing a numeric `.score`
      attribute. The sequence is NOT modified.

  Returns:
    A new list with every element whose score equals the maximum score, in
    the same relative order as in the input. An empty input gives an empty
    list.
  """
  # Nothing to rank: return an empty result instead of failing on [0].
  if not list_of_analyses:
    return []
  highest_score = max(x.score for x in list_of_analyses)
  # Filtering in input order matches what a stable descending sort followed by
  # a filter would give, without reordering the caller's list.
  return [x for x in list_of_analyses if x.score == highest_score]

def score_select(list_of_analyses):
  """Pick one analysis: the `.analysis` of the first top-scoring candidate.

  Args:
    list_of_analyses: candidates carrying `.score` and `.analysis` attributes.

  Returns:
    The `.analysis` attribute of the first element returned by `sort_score`.
  """
  top_scored = sort_score(list_of_analyses)
  first_candidate = top_scored[0]
  return first_candidate.analysis

def counts_pipeline(fragment):
  """Turn one tagged fragment into [token, level, pos] rows.

  Args:
    fragment: string of items separated by single spaces, each item being a
      token and its level joined by '#'.

  Returns:
    A list with one `[token, level, pos]` entry per item, where `pos` is the
    'pos' field of the analysis chosen by `score_select` for that token.
  """
  tokens = []
  gt_levels = []
  for tagged_item in fragment.split(' '):
    pieces = tagged_item.split('#')
    tokens.append(pieces[0])
    gt_levels.append(pieces[1])

  # Disambiguate the whole fragment at once, then keep one analysis per token.
  disambiguated = bert_disambig.disambiguate(tokens)
  picked_analyses = [score_select(item.analyses) for item in disambiguated]

  rows = []
  for word, level, chosen in zip(tokens, gt_levels, picked_analyses):
    rows.append([word, level, chosen['pos']])
  return rows


In [9]:
from tqdm import tqdm

# Collect [token, level, pos] rows over train, dev and test, in that order.
# Each split is wrapped in its own tqdm call, so each gets its own progress bar.
all_counts = []
for split_frame in (frag_train, frag_dev, frag_test):
  for fragment in tqdm(split_frame['0']):
    all_counts.extend(counts_pipeline(fragment))

100%|██████████| 14332/14332 [21:41<00:00, 11.01it/s]
100%|██████████| 2969/2969 [04:25<00:00, 11.20it/s]
100%|██████████| 3274/3274 [04:55<00:00, 11.09it/s]


#### Pivot on pos x level.

In [10]:
# One row per token; columns are 0 = token, 1 = level, 2 = POS tag.
all_counts_df = pd.DataFrame(all_counts)
# Number of tokens for every (level, POS) pair, in long format.
summary = (
    all_counts_df
    .groupby([1, 2])
    .size()
    .reset_index(name='count')
)
# Wide format: POS tags as rows, levels as columns; missing pairs become 0.
pivot_table = (
    summary
    .pivot(index=2, columns=1, values='count')
    .fillna(0)
)

In [18]:
# Save the POS x level pivot table built from train + dev + test as CSV.
pivot_table.to_csv('../data/analysis/all_pos_aligned.csv')

### Recalculate the counts on the train split only

In [35]:
# [token, level, pos] rows for the train split only. This runs the
# disambiguator over every train fragment again.
all_counts_train = [
    row
    for fragment in tqdm(frag_train['0'])
    for row in counts_pipeline(fragment)
]

100%|██████████| 14332/14332 [21:51<00:00, 10.93it/s]


In [36]:
# Train-only rows; columns are 0 = token, 1 = level, 2 = POS tag.
all_counts_train_df = pd.DataFrame(all_counts_train)
# Number of train tokens for every (level, POS) pair, in long format.
summary_train = (
    all_counts_train_df
    .groupby([1, 2])
    .size()
    .reset_index(name='count')
)
# Wide format: POS tags as rows, levels as columns; missing pairs become 0.
pivot_table_train = (
    summary_train
    .pivot(index=2, columns=1, values='count')
    .fillna(0)
)

In [37]:
# Display the train-only POS x level pivot table.
pivot_table_train