In [None]:
import os, logging, random
from zipfile import ZipFile

import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import preprocessing

from contrxt.contrxt import ContrXT

# Load Data

In [None]:
# Import the dataset from time_1 and time_2.
# Each archive member is opened in its own `with` block so the member handle
# is closed deterministically instead of being left to garbage collection.
with ZipFile('tests/test_data/20newsgroups.zip', 'r') as archive:
    with archive.open('df_time_1.csv') as csv_time_1:
        df_time_1 = pd.read_csv(csv_time_1, delimiter=',')
    with archive.open('df_time_2.csv') as csv_time_2:
        df_time_2 = pd.read_csv(csv_time_2, delimiter=',')

# Preview the first rows of the time_1 frame as the cell output
df_time_1.head()

# Train Simple Classifier

In [None]:
# Seed NumPy's global RNG and the stdlib RNG before any further work in this cell
np.random.seed(42)
random.seed(42)

# Text inputs for the two time periods
X_t1, X_t2 = df_time_1['corpus'], df_time_2['corpus']

# Encode the categorical target.
# The encoder is fitted once on the categories of BOTH periods, so a given
# integer code refers to the same category at time_1 and time_2. Re-fitting
# the same encoder per period would silently change the code -> category
# mapping whenever the two category sets differ.
encoder = preprocessing.LabelEncoder()
encoder.fit(
    pd.concat([df_time_1['category'], df_time_2['category']], ignore_index=True)
)
Y_t1 = encoder.transform(df_time_1['category'])
Y_t2 = encoder.transform(df_time_2['category'])

# Define vectorizer (unigrams only; vocabulary capped at int(21e6) features)
vect_t1 = TfidfVectorizer(max_features=int(21e6), ngram_range=(1, 1))
vect_t2 = TfidfVectorizer(max_features=int(21e6), ngram_range=(1, 1))

# Fit and transform text data; each period gets its own fitted vocabulary
sparse_t1 = vect_t1.fit_transform(X_t1)
sparse_t2 = vect_t2.fit_transform(X_t2)

# Train simple Naive Bayes classifiers, one per time period
classifier_t1, classifier_t2 = MultinomialNB(), MultinomialNB()
classifier_t1.fit(sparse_t1, Y_t1)
classifier_t2.fit(sparse_t2, Y_t2)

# Get class names: the encoder's sorted classes, so class_names[i] is exactly
# the category that was encoded as integer i for both periods
class_names = encoder.classes_

# Get model predictions on the same data each classifier was fitted on,
# decoded back to category names
predicted_labels_t1 = [class_names[i] for i in classifier_t1.predict(sparse_t1)]
predicted_labels_t2 = [class_names[i] for i in classifier_t2.predict(sparse_t2)]

# ContrXT

In [None]:
# Initialize ContrXT with the text corpus and the predicted labels of each
# time period, in the order (time_1 texts, time_1 labels, time_2 texts,
# time_2 labels).
exp = ContrXT(
    X_t1, predicted_labels_t1,
    X_t2, predicted_labels_t2,
    hyperparameters_selection=True,
    save_path='results/',  # plain literal: the f-prefix had no placeholders
    save_surrogates=True,
    save_bdds=True,
)

# Run the trace step of the experiment
exp.run_trace()

In [None]:
# Run the explain step on the ContrXT experiment object `exp`.
# NOTE(review): presumably relies on the output of exp.run_trace() from the
# previous cell — confirm against the ContrXT documentation.
exp.run_explain()

# BDD2Text

In [None]:
# Invoke BDD2Text on the experiment's `explain` attribute.
# NOTE(review): presumably renders the BDD-based explanation as
# natural-language text — confirm against the ContrXT documentation.
exp.explain.BDD2Text()