In [None]:
import os.path

import numpy as np
import pandas as pd

import vjp.data as data
import vjp.preprocess as preprocess
import vjp.folds as folds
import vjp.text as text

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
import matplotlib.pyplot as plt

import vjp.preprocess as preprocess
import vjp.models as models

Retrieve a preprocessed dataframe using the pipeline defined by `data_exploration.ipynb`.

In [None]:
# Reuse a pregenerated parquet dump when one is available (e.g. written by
# the preprocess CLI); otherwise build the dataframe on the fly.
DF_FILENAME = 'connected_components.parquet'
if not os.path.exists(DF_FILENAME):
    print('File not found, generating dataframe...')
    # Options object handed to preprocess.preprocess
    namespace = preprocess.Namespace()
    namespace.connected_component_tags = ('req', 'arg', 'claim', 'mot', 'dec')
    namespace.use_child_text_tag_names = ('mot', 'dec')
    namespace.level = preprocess.PreprocessingLevels.CONNECTED_COMPONENTS
    df = preprocess.preprocess(namespace)
else:
    print(f'Reading from {DF_FILENAME}...')
    df = pd.read_parquet(DF_FILENAME)

df.head()

In [None]:
# Reproducibility: single seed shared by the fold computation and by the
# stochastic estimators defined further down in this notebook.
random_state = 1717

## Splits

All tag types are gathered, so that multiple experiments may be carried out by optionally excluding some of the columns.

Balanced KFold splits are computed at document level using a MIP formulation. 

In [None]:
# Compute the folds once, seeded for reproducibility; the returned entries
# are used below to index `df`.
fold_maps = folds.compute_decision_folds(
    df,
    seed=random_state,
    verbose=True,
)

Preview of the first split:

In [None]:
# Index the dataframe with the first fold map once, then show its shape and
# its first rows.
first_fold_df = df[fold_maps[0]]
print(first_fold_df.shape)
first_fold_df.head()

A `split` function encapsulates all the k-fold logic and provides train-test splits based on its results. The function has an interface similar to that of scikit-learn's validators, and is suitable for use with its cross-validation utilities (`GridSearchCV`, `cross_validate`, etc.).

In [None]:
# One line per split: size of the train part, then size of the test part.
for train_indeces, test_indeces in folds.split(df):
    n_train, n_test = len(train_indeces), len(test_indeces)
    print(n_train, n_test)

## Count based encodings
For count based encodings (e.g. tf-idf) text data shall be cleaned in a specific way. Punctuation, symbols, most stopwords, etc. are not required, as the order and structure of sentences are generally lost.

In [None]:
# Presumably loads the stopword list used by the `vjp.text` cleaning
# pipelines applied below -- TODO confirm against vjp.text.
text.load_stopwords()

`vjp.text` contains some pipelines that: lowercase text, remove punctuation, remove stopwords and lemmatize. Such transformations can be applied to the desired features before feeding them to the model.

In [None]:
tags = ['fact', 'req', 'arg', 'claim', 'mot', 'dec']


def _clean_tag_columns(frame, pipeline):
    """Return a copy of `frame` with `pipeline` applied to its tag columns.

    `pipeline` is applied element-wise to every cell of the columns listed
    in `tags`; all other columns are copied over unchanged.
    """
    cleaned = frame.copy()
    cleaned[tags] = frame[tags].applymap(pipeline)
    return cleaned


# Lemmatize but keep unknown values
df_keep = _clean_tag_columns(df, text.count_keep_pipeline)

# Lemmatize but drop unknown values
df_drop = _clean_tag_columns(df, text.count_drop_pipeline)

# Don't lemmatize
df_no_lem = _clean_tag_columns(df, text.count_pipeline_head)

# (name, dataframe) pairs iterated over by the model evaluation cells
dataframes = (
    ('keep', df_keep),
    ('drop', df_drop),
    ('no_lem', df_no_lem),
)

Features will be concatenated for easier vectorization and labels are split off, as required by `sklearn` models, via `data.count_based_X_y`.

## Models

A simple dummy baseline is defined (defaults to majority class). Models are evaluated on three different preprocessing pipelines in order to inspect the effect of lemmatization.

In [None]:
# Baseline that ignores the input features; seeded for reproducibility.
dummy_baseline = DummyClassifier(random_state=random_state)

# {df_name: (mean, std), ...} computed over the values returned by
# models.cross_validate (presumably one score per fold -- confirm in
# vjp.models).
results = {}
# The loop variable is deliberately NOT called `df`: a top-level `for` target
# stays bound after the loop, so reusing `df` would silently replace the
# notebook-wide raw dataframe with the last preprocessed variant and make
# re-running earlier cells (e.g. the cleaning one) operate on the wrong data.
for name, variant_df in dataframes:
    dummy_results = models.cross_validate(dummy_baseline, variant_df,
                                          cv=folds.split(variant_df))
    results[name] = dummy_results.mean(), dummy_results.std()

# One column per preprocessing variant, rows labelled 'mean' and 'std'.
pd.DataFrame(results, index=('mean', 'std'))

A random forest and a linear SVC are built. We focus on these two models as they provide some form of interpretability of the features' weights.

In [None]:
# Tf-idf vectorization followed by a seeded random forest.
random_forest = Pipeline([('vectorizer', TfidfVectorizer()),
                          ('model', RandomForestClassifier(
                              random_state=random_state))])

# {df_name: (mean, std), ...} computed over the values returned by
# models.cross_validate (presumably one score per fold -- confirm in
# vjp.models).
results = {}
# The loop variable is deliberately NOT called `df`: a top-level `for` target
# stays bound after the loop, so reusing `df` would silently replace the
# notebook-wide raw dataframe with the last preprocessed variant.
for name, variant_df in dataframes:
    forest_results = models.cross_validate(random_forest, variant_df,
                                           cv=folds.split(variant_df))
    results[name] = forest_results.mean(), forest_results.std()

# One column per preprocessing variant, rows labelled 'mean' and 'std'.
pd.DataFrame(results, index=('mean', 'std'))

In [None]:
# Tf-idf vectorization followed by a seeded linear SVC.
linear_svc = Pipeline([('vectorizer', TfidfVectorizer()),
                       ('model', LinearSVC(random_state=random_state))])

# {df_name: (mean, std), ...} computed over the values returned by
# models.cross_validate (presumably one score per fold -- confirm in
# vjp.models).
results = {}
# The loop variable is deliberately NOT called `df`: a top-level `for` target
# stays bound after the loop, so reusing `df` would silently replace the
# notebook-wide raw dataframe with the last preprocessed variant.
for name, variant_df in dataframes:
    svc_results = models.cross_validate(linear_svc, variant_df,
                                        cv=folds.split(variant_df))
    results[name] = svc_results.mean(), svc_results.std()

# One column per preprocessing variant, rows labelled 'mean' and 'std'.
pd.DataFrame(results, index=('mean', 'std'))

## Feature importance
In an effort to interpret model results, Gini importance values are extracted from a random forest and weights are extracted from a linear SVC. For convenience (not having to deal with multiple splits) the models are fit on the whole dataset. The performance of such models is not evaluated (how could it be, with no held-out data left?).

In [None]:
# Fit the forest pipeline on the whole non-lemmatized dataset.
forest_fit_args = data.count_based_X_y(df_no_lem, models.DEFAULT_TAGS)
random_forest = random_forest.fit(*forest_fit_args)

# Pair every importance value with the feature name reported by the
# vectorizer step.
forest_model = random_forest.named_steps['model']
forest_vectorizer = random_forest.named_steps['vectorizer']
importances = pd.Series(forest_model.feature_importances_,
                        index=forest_vectorizer.get_feature_names_out())

In [None]:

# Keep only the 30 largest importances, in decreasing order.
top_importances = importances.sort_values(ascending=False).head(30)
importance_labels, importance_values = zip(*top_importances.items())

plt.figure(figsize=(15, 4))
plt.xticks(rotation=45)
plt.ylabel('Gini importance')
plt.title('Random forest feature importance')
plt.bar(importance_labels, importance_values)
plt.show()

In [None]:
# Fit the SVC pipeline on the whole non-lemmatized dataset.
svc_fit_args = data.count_based_X_y(df_no_lem, models.DEFAULT_TAGS)
linear_svc = linear_svc.fit(*svc_fit_args)

# Pair every weight with the feature name reported by the vectorizer step.
# NOTE(review): only the first row of `coef_` is used; that covers all the
# weights only if the task is binary -- confirm against the label set.
svc_model = linear_svc.named_steps['model']
svc_vectorizer = linear_svc.named_steps['vectorizer']
coefficients = pd.Series(svc_model.coef_[0],
                         index=svc_vectorizer.get_feature_names_out())

In [None]:
# Rank features by absolute weight and keep the 30 largest.
top_weights = coefficients.abs().sort_values(ascending=False).head(30)
weight_labels, weight_values = zip(*top_weights.items())

plt.figure(figsize=(15, 4))
plt.xticks(rotation=45)
plt.ylabel('$|w_i|$')
plt.title('Linear SVC Best weights')
plt.bar(weight_labels, weight_values)
plt.show()