In [None]:
import os.path

import numpy as np
import pandas as pd

import vjp.preprocess as preprocess
import vjp.folds as folds
import vjp.text as text

from sklearn.feature_extraction.text import TfidfVectorizer

import vjp.preprocess as preprocess

Retrieve a preprocessed dataframe using the pipeline defined by `data_exploration.ipynb`.

In [None]:
# Reuse a cached dataframe when one is available on disk (e.g. pregenerated
# through the preprocess CLI); otherwise build it on the fly.
DF_FILENAME = 'connected_components.parquet'
if not os.path.exists(DF_FILENAME):
    print('File not found, generating dataframe...')
    # Configure the preprocessing run through a namespace object, the same
    # way the CLI would, then execute it.
    namespace = preprocess.Namespace()
    namespace.connected_component_tags = ('req', 'arg', 'claim', 'mot', 'dec')
    namespace.use_child_text_tag_names = ('mot', 'dec')
    namespace.level = preprocess.PreprocessingLevels.CONNECTED_COMPONENTS
    df = preprocess.preprocess(namespace)
else:
    print(f'Reading from {DF_FILENAME}...')
    df = pd.read_parquet(DF_FILENAME)

df.head()

All tag types are gathered, so that multiple experiments may be carried out by excluding some of the columns.

Balanced KFold splits are computed at document level using a MIP formulation. 

In [None]:
# Compute the k-fold assignment for `df` with verbose output. Each entry of
# `fold_maps` is later used to index `df` (e.g. `df[fold_maps[0]]`).
# NOTE(review): presumably one boolean mask per fold — confirm in vjp.folds.
fold_maps = folds.compute_decision_folds(df, verbose=True)

Preview of the first split:

In [None]:
# Select the rows of the first fold once, then show its size and a preview.
first_fold = df[fold_maps[0]]
print(first_fold.shape)
first_fold.head()

A `split` function encapsulates all the k-fold logic and provides train-test splits based on its results. The function has an interface similar to that of scikit-learn's cross-validators and is suitable for use with `GridSearchCV`.

In [None]:
# Report the number of train and test samples produced for every split.
for train_indices, test_indices in folds.split(df):
    n_train = len(train_indices)
    n_test = len(test_indices)
    print(n_train, n_test)

In [None]:
# Being the last expression of the cell, the return value (if any) is displayed.
# NOTE(review): presumably loads the stopwords used by the text pipeline
# applied in the following cell — confirm in vjp.text.
text.load_stopwords()

In [None]:
tags = ['fact', 'req', 'arg', 'claim', 'mot', 'dec']
# NOTE(review): `file_name` is not used by any visible cell — confirm before
# removing it.
file_name = "italian.txt"

# Run the text pipeline on every cell of the tag columns. Column-wise
# `Series.map` replaces `DataFrame.applymap`, which is deprecated since
# pandas 2.1, while `Series.map` is available on old and new versions alike.
# Beware: `df` is overwritten in place, so re-running this cell applies the
# pipeline a second time to already-processed text.
df[tags] = df[tags].apply(
    lambda column: column.map(text.count_drop_pipeline)
)

df.head()

In [None]:
# Merge the `req`, `arg` and `claim` texts into a single document per row.
# The columns are joined with an explicit space: a bare `+` would glue the
# last word of one column to the first word of the next, producing spurious
# merged tokens in the TF-IDF vocabulary built from `df_unified`.
# Assumes the three columns hold strings, as the vectorizer requires.
df_unified = df["req"] + " " + df["arg"] + " " + df["claim"]

In [None]:
# Fit a TF-IDF model on the unified texts and keep the document-term matrix.
vectorizer = TfidfVectorizer()
result = vectorizer.fit_transform(df_unified)

# Inspect the learned vocabulary: the terms themselves, then how many there are.
feature_names = vectorizer.get_feature_names_out()
print(feature_names, len(feature_names), sep="\n")

In [None]:
# Imported here because no visible cell imports `lemmatization`; without it
# this cell raises NameError on a fresh kernel.
# NOTE(review): module path assumed from the sibling `vjp.*` imports —
# confirm, then move this line to the top import cell.
import vjp.lemmatization as lemmatization

# Load the lemma mapping and peek at its type and its first 30 keys.
lemmas = lemmatization.load_lemmas()
print(type(lemmas))
print(list(lemmas.keys())[:30])

In [None]:
# Count the distinct values stored in the `lemmas` mapping
# (`len(lemmas.items())` would give the number of entries instead).
distinct_values = set(lemmas.values())
len(distinct_values)