In [None]:
import os.path

import numpy as np
import pandas as pd

import vjp.data as data
import vjp.preprocess as preprocess
import vjp.folds as folds
import vjp.text as text

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier

import vjp.preprocess as preprocess

Retrieve a preprocessed dataframe using the pipeline defined by `data_exploration.ipynb`.

In [None]:
# Prefer a dataframe cached on disk (e.g. pregenerated via preprocess CLI);
# otherwise run the preprocessing pipeline on the go.
DF_FILENAME = 'connected_components.parquet'
if not os.path.exists(DF_FILENAME):
    print('File not found, generating dataframe...')
    # Options consumed by `preprocess.preprocess`
    namespace = preprocess.Namespace()
    namespace.connected_component_tags = ('req', 'arg', 'claim', 'mot', 'dec')
    namespace.use_child_text_tag_names = ('mot', 'dec')
    namespace.level = preprocess.PreprocessingLevels.CONNECTED_COMPONENTS
    df = preprocess.preprocess(namespace)
else:
    print(f'Reading from {DF_FILENAME}...')
    df = pd.read_parquet(DF_FILENAME)

df.head()

In [None]:
# Reproducibility: a single fixed seed, passed as `random_state` to the
# estimators built further down (DummyClassifier, RandomForestClassifier)
random_state = 1717

## Splits

All tag types are gathered, so that multiple experiments may be carried out by excluding some of the columns.

Balanced k-fold splits are computed at document level using a mixed-integer programming (MIP) formulation.

In [None]:
# Compute the folds once; element 0 is used in the next cell to index `df`.
# NOTE(review): presumably one row selector per fold, produced by the
# balanced document-level k-fold procedure -- confirm in `vjp.folds`.
fold_maps = folds.compute_decision_folds(df, verbose=True)

Preview of the first split:

In [None]:
# Select the first fold once, then report its shape and show its first rows
first_fold = df[fold_maps[0]]
print(first_fold.shape)
first_fold.head()

A `split` function encapsulates all the k-fold logic and provides train-test splits based on its results. The function has an interface similar to that of scikit-learn's validators, and is suitable for use with `GridSearchCV`.

In [None]:
# Sanity check: print the train and test sizes yielded for every split
for train_idx, test_idx in folds.split(df):
    print(len(train_idx), len(test_idx))

## Count based encodings
For count-based encodings (e.g. tf-idf), text data should be cleaned accordingly. Punctuation and symbols, most stopwords, etc. are not required, as the order and structure of sentences are generally lost.

In [None]:
# NOTE(review): presumably loads the stopword list used by the `vjp.text`
# cleaning pipelines -- confirm in `vjp.text`. As the last expression of the
# cell, its return value (if not None) is displayed as the cell output.
text.load_stopwords()

`vjp.text` contains some pipelines that lowercase text, remove punctuation, remove stopwords and lemmatize. Such transformations can be applied to the desired features before feeding them to the model.

In [None]:
# Tag columns whose text is cleaned for count-based encodings
tags = ['fact', 'req', 'arg', 'claim', 'mot', 'dec']

# Apply the count-oriented text pipeline to every cell of the tag columns.
# `DataFrame.applymap` is deprecated since pandas 2.1 (renamed to
# `DataFrame.map`); mapping column by column through `Series.map` is
# element-wise equivalent and works on both old and new pandas versions.
df[tags] = df[tags].apply(lambda column: column.map(text.count_drop_pipeline))

Features are concatenated for easier vectorization, and labels are split off as required by `sklearn` models.

In [None]:
# Build the model inputs X and the labels y from the selected tag columns,
# then print the first rows of each
feature_tags = ['fact', 'req', 'arg', 'claim']
X, y = data.count_based_X_y(df, feature_tags)
for preview in (X, y):
    print(preview.head())

## Models

A simple dummy baseline is defined (defaults to majority class).

In [None]:
# Baseline: DummyClassifier with its default strategy, scored with macro F1
# over the custom folds; the per-fold scores are averaged
dummy_baseline = DummyClassifier(random_state=random_state)
dummy_scores = cross_val_score(
    dummy_baseline, X, y,
    cv=folds.split(df),
    scoring='f1_macro',
    n_jobs=-1,
)
avg_results = dummy_scores.mean()

print('Prior dummy, F1 macro avg:', avg_results)

In [None]:
# Tf-idf vectorizer feeding a random forest, scored with macro F1 over the
# custom folds; the per-fold scores are averaged
tfidf_forest_steps = [
    ('vectorizer', TfidfVectorizer()),
    ('model', RandomForestClassifier(random_state=random_state)),
]
random_forest = Pipeline(tfidf_forest_steps)
forest_scores = cross_val_score(
    random_forest, X, y,
    cv=folds.split(df),
    scoring='f1_macro',
    n_jobs=-1,
)
avg_results = forest_scores.mean()

print('Random forest, F1 macro avg:', avg_results)