# Experiment: Generalist vs Specialists - Naive Bayes, SVM, Random Forest

In [1]:
from echr import *
from nb_tfidf import *
from bert import *
import os
import re
from tqdm import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from csv import DictWriter
from tqdm import tqdm
from joblib import Parallel, delayed
from sklearn.base import clone
import gc

%load_ext autoreload
%autoreload 2

In [2]:
# Experiment configuration shared by the cells below.

# NOTE(review): this module-level `path` is not read by any code visible in
# this file; generate_ensemble_dataset takes its own `path` argument.
path = 'datasets/Medvedeva/'
# JSON case file handed to create_dataset / generate_ensemble_dataset.
json_path = 'datasets/echrod/cases.json'
# Passed as the third argument to create_dataset
# (presumably selects which section of a case is used as text - confirm in echr).
part = 'facts'
# NOTE(review): not referenced by the code visible in this file.
article = '6'
# NOTE(review): not referenced by the code visible in this file; the training
# functions below call balance_dataset unconditionally.
balance = True
# Alpha used for MultinomialNB in the driver loop and recorded in the
# 'alpha' column of the results CSV.
best_alpha = 5
# When True, the functions below print progress/debug messages.
debug = False

In [3]:
def train_test(model, df_train, df_test):
    """Fit ``model`` on TF-IDF features of a balanced training set and score it.

    Args:
        model: Estimator exposing ``fit`` and ``predict``; it is fitted in place.
        df_train: DataFrame with ``text`` and ``violation`` columns. It is
            passed through ``balance_dataset`` before use.
        df_test: DataFrame with ``text`` and ``violation`` columns.

    Returns:
        A pair ``(metrics, predictions)``: ``metrics`` is the result of
        ``return_metrics`` (callers unpack it as acc, mcc, f1) and
        ``predictions`` is a DataFrame with the columns ``final_pred`` and
        ``true_pred``.
    """
    balanced = balance_dataset(df_train)

    # Features (raw text) and labels for both splits.
    train_texts = balanced['text'].to_numpy()
    train_labels = balanced['violation'].to_numpy()
    test_texts = df_test['text'].to_numpy()
    test_labels = df_test['violation'].to_numpy()

    # Clean the raw text with the project's preprocessing routine.
    train_docs = np.array(list(map(text_preprocessing, train_texts)))
    test_docs = np.array(list(map(text_preprocessing, test_texts)))

    # The vocabulary is learned on the training documents only.
    vectorizer = TfidfVectorizer(ngram_range=(1, 3), binary=True, smooth_idf=False)
    train_features = vectorizer.fit_transform(train_docs)
    test_features = vectorizer.transform(test_docs)

    model.fit(train_features, train_labels)
    predictions = model.predict(test_features)

    metrics = return_metrics(predictions, test_labels)  # acc, mcc, f1
    prediction_table = pd.DataFrame.from_dict(
        {'final_pred': predictions, 'true_pred': test_labels}
    )
    return metrics, prediction_table


def train_test_ensemble(model, df_train_sets, df_test):
    """Train one clone of ``model`` per training set and OR their predictions.

    Every training set gets its own TF-IDF vocabulary and its own clone of
    ``model``. The ensemble prediction for a test case is the element-wise
    maximum of the individual predictions, i.e. with 0/1 labels the ensemble
    predicts a violation as soon as any member does.

    Args:
        model: Unfitted estimator; it is cloned for every training set.
        df_train_sets: Mapping from a key (the article) to either a DataFrame
            with ``text`` and ``violation`` columns, or the path of a CSV
            file holding such a DataFrame.
        df_test: DataFrame with ``text`` and ``violation`` columns, shared by
            all ensemble members.

    Returns:
        A pair ``(metrics, predictions)``: ``metrics`` is the result of
        ``return_metrics`` for the ensemble prediction (callers unpack it as
        acc, mcc, f1); ``predictions`` is a DataFrame with one column per key
        of ``df_train_sets`` plus ``final_pred`` and ``true_pred``.

    Raises:
        ValueError: If ``df_train_sets`` is empty.
    """
    if not df_train_sets:
        raise ValueError('df_train_sets must contain at least one training set')

    # The test split is the same for every member, so extract and preprocess
    # it once instead of once per training set.
    if debug: print('Splitting test data')
    X_test = df_test['text'].to_numpy()
    y_test = df_test['violation'].to_numpy()
    if debug: print('Preprocessing test data, x_test has length', len(X_test))
    X_test_preprocessed = np.array([text_preprocessing(text) for text in X_test])
    del X_test

    full_preds = {}
    all_preds = None

    for art, df_train in tqdm(df_train_sets.items()):

        c_model = clone(model)

        # Training sets may be passed as CSV paths to keep memory usage low.
        if isinstance(df_train, str):
            if debug: print('reading data')
            df_train = pd.read_csv(df_train).dropna()
            if debug: print(art, len(df_train))

        if debug: print('balancing data')
        # Balance the training data
        df_train = balance_dataset(df_train)
        if debug: print('after balance, train_df length is', len(df_train))

        if debug: print('Splitting data')
        # Split the features and label
        X_train = df_train['text'].to_numpy()
        y_train = df_train['violation'].to_numpy()
        del df_train

        if debug: print('Preprocessing data, x_train has length', len(X_train))
        # Preprocess
        X_train_preprocessed = np.array([text_preprocessing(text) for text in X_train])
        del X_train

        # Calculate TF-IDF (vocabulary is fitted on this training set only)
        if debug: print('TFIDF')
        tf_idf = TfidfVectorizer(ngram_range=(1, 3),
                                 binary=True,
                                 smooth_idf=False)
        X_train_tfidf = tf_idf.fit_transform(X_train_preprocessed)
        X_test_tfidf = tf_idf.transform(X_test_preprocessed)
        del X_train_preprocessed

        if debug: print('Training model')
        # Train model
        c_model.fit(X_train_tfidf, y_train)

        # Test model
        preds = c_model.predict(X_test_tfidf)

        full_preds[art] = preds

        # OR layer: if any model predicts violation the ensemble prediction is violation.
        # The accumulator must be a copy: aliasing the first member's array
        # would overwrite that member's column in full_preds.
        if all_preds is None:
            all_preds = np.array(preds)
        else:
            all_preds = np.maximum(all_preds, preds)
        if debug: print('Finished round')

        # Cleanup to reduce memory
        del X_train_tfidf
        del X_test_tfidf
        del c_model
        del y_train
        del preds
        gc.collect()

    del X_test_preprocessed

    full_preds['final_pred'] = all_preds
    full_preds['true_pred'] = y_test

    return return_metrics(all_preds, y_test), pd.DataFrame.from_dict(full_preds) #acc, mcc, f1


In [4]:
# Module-level seed. The functions in this file take `random_state` as a
# parameter instead of reading this value, and the driver loop at the end of
# the file rebinds the same name as its loop variable.
random_state = 1995
# Seeds 1..10, the same values the driver loop at the end of the file iterates over.
random_states = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

def generate_ensemble_dataset(path, random_state, write=False):
    """Build the general, per-article and "improved" per-article datasets.

    For each article a stratified 10% test split is held out. The union of
    those test cases (looked up by ``id`` in the 'All' dataset) forms the
    general test set; the rest of the 'All' dataset is the general training
    set. The "improved" training set of an article consists of up to 2500 of
    its violation cases, all of its non-violation cases and, if violations
    outnumber non-violations, filler cases from outside the article that are
    labelled as non-violation (``violation=0``, ``article=999``).

    Args:
        path: Location handed to ``create_dataset`` for the per-article data.
        random_state: Seed for the train/test split and for all sampling, so
            the same seed always yields the same datasets.
        write: If True, balance the training sets, write everything as CSV
            to ``results/ensemble/temp/`` and return four ``None`` values.

    Returns:
        ``(train_dbs, improved_train_dbs, train_general, test_general)``;
        the first two are dicts keyed by article number. When ``write`` is
        True the result is ``(None, None, None, None)``.
    """
    article_numbers = ['2', '3', '5', '6', '8', '10', '11', '13', '14']
    datasets = {art: create_dataset(path, art, part) for art in tqdm(article_numbers)}
    # NOTE(review): the 'All' dataset is read from the module-level json_path,
    # not from `path`; every caller in this file passes json_path as `path`.
    df_all = create_dataset(json_path, 'All', part)

    # Individual training datasets (for each article)
    train_dbs = {}

    # Individual training datasets including irrelevant cases (with non-violation label for that case)
    improved_train_dbs = {}

    # Held-out test cases of every article
    test_parts = []
    test_size = 0.1

    # Remove 10% of all the datasets
    for art, db in datasets.items():
        db['article'] = art
        train, test = train_test_split(db, test_size=test_size, random_state=random_state,
                                       stratify=db['violation'])
        train_dbs[art] = train
        test_parts.append(test)

    # All individual 10% test cases form the big test dataset
    test_db = pd.concat(test_parts)

    # The general test set has binary labels; the general training set is
    # everything in the 'All' dataset that is not a test case.
    is_test_case = df_all.id.isin(test_db['id'])
    test_general = df_all[is_test_case]
    train_general = df_all[~is_test_case]
    # NOTE(review): if a case id occurs in several article datasets, a test
    # case of one article can still sit in the training split of another.

    # improved ensemble datasets
    for art, db in train_dbs.items():

        # Filler candidates: cases outside this article's training set.
        # They are taken from train_general rather than df_all so that
        # held-out test cases can never end up in a training set.
        # assign() returns a new frame, so no slice of df_all is modified.
        additional_cases = train_general[~train_general.id.isin(db['id'])].assign(
            violation=0,
            article=999,  # 999 symbolises that the case belongs to another article
        )

        # split violation and non violation of the article db
        violation_cases = db[db['violation'] == 1]
        nonviolation_cases = db[db['violation'] == 0]
        # take a sample to reduce space
        violation_cases = violation_cases.sample(n=min(2500, len(violation_cases)),
                                                 random_state=random_state)

        # There are usually less nonviolation cases. Add additional cases from other articles such that
        # the number of non-violation cases is equal to the amount of violation cases, making a larger,
        # balanced dataset. If there are more non-violation cases, add nothing.
        n_additional = max(0, len(violation_cases) - len(nonviolation_cases))
        n_additional = min(n_additional, len(additional_cases))  # cannot draw more than available
        additional_cases = additional_cases.sample(n=n_additional, random_state=random_state)

        # Add the three datasets together
        improved_train_dbs[art] = pd.concat([violation_cases, nonviolation_cases, additional_cases])

        if debug: print(art, len(db), len(violation_cases), len(nonviolation_cases),
                        len(additional_cases), len(improved_train_dbs[art]))
        del additional_cases
        del violation_cases
        del nonviolation_cases

    # Write all datasets to csv to reduce memory
    if write:
        os.makedirs('results/ensemble/temp', exist_ok=True)

        balance_dataset(train_general).to_csv('results/ensemble/temp/train_general.csv')
        test_general.to_csv('results/ensemble/temp/test_general.csv')

        for art, db in train_dbs.items():
            balance_dataset(db).to_csv('results/ensemble/temp/train_dbs'+str(art)+'.csv')

        for art, db in improved_train_dbs.items():
            balance_dataset(db).to_csv('results/ensemble/temp/improved_train_dbs'+str(art)+'.csv')

        del train_dbs
        del improved_train_dbs
        del train_general
        del test_general
        gc.collect()

        return None, None, None, None

    return train_dbs, improved_train_dbs, train_general, test_general

def read_ensemble_dataset(db_name):
    """Load, or locate, one of the datasets stored in ``results/ensemble/temp/``.

    Args:
        db_name: ``'train_general'`` or ``'test_general'`` to load that CSV
            as a DataFrame with rows containing missing values dropped, or
            ``'train_dbs'`` / ``'improved_train_dbs'`` to get a dict mapping
            each article number to the path of its CSV file (the files are
            not read).

    Returns:
        A DataFrame, a dict of paths, or ``None`` for any other ``db_name``.
    """
    temp_dir = 'results/ensemble/temp/'
    articles = ['2', '3', '5', '6', '8', '10', '11', '13', '14']

    # Single-file datasets are loaded right away.
    if db_name in ('train_general', 'test_general'):
        frame = pd.read_csv(temp_dir + db_name + '.csv')
        return frame.dropna()

    # Per-article datasets are returned as paths so they can be read lazily.
    if db_name in ('train_dbs', 'improved_train_dbs'):
        prefix = temp_dir + db_name
        return {art: prefix + str(art) + '.csv' for art in articles}

    return None

In [5]:
def _save_full_preds(full_preds, model_name, variant, random_state):
    """Write a prediction table to results/ensemble/full_preds/<model>/<variant>/."""
    out_path = ('results/ensemble/full_preds/' + model_name + '/' + variant
                + '/full_preds_' + variant + '_' + str(random_state) + '.csv')
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    full_preds.to_csv(out_path)


def run_experiment(model, model_name, random_state, provided_dbs=None):
    """Evaluate the general classifier and both ensembles for one model.

    Trains three set-ups with clones of ``model`` on the same test set: a
    single general classifier, an ensemble of per-article classifiers, and
    an ensemble trained on the "improved" per-article datasets. The
    per-case predictions of each set-up are written below
    ``results/ensemble/full_preds/<model_name>/`` and one summary row is
    appended to ``results/ensemble/ensemble_general.csv``.

    Args:
        model: Unfitted estimator; it is cloned for every set-up.
        model_name: Label used in the output paths and in the summary row.
        random_state: Seed of the current run; only used to label outputs.
        provided_dbs: Optional dict with the keys ``'train_dbs'``,
            ``'improved_train_dbs'``, ``'train_general'`` and
            ``'test_general'``. If omitted or empty, the datasets previously
            written to ``results/ensemble/temp/`` are used.

    Returns:
        0 on completion.
    """
    if provided_dbs:
        train_dbs = provided_dbs['train_dbs']
        improved_train_dbs = provided_dbs['improved_train_dbs']
        train_general = provided_dbs['train_general']
        test_general = provided_dbs['test_general']
    else:
        # Use what generate_ensemble_dataset(..., write=True) stored on disk.
        # The per-article sets are returned as CSV paths and only loaded one
        # at a time inside train_test_ensemble.
        print('\tNo provided data, reading from files')
        train_dbs = read_ensemble_dataset('train_dbs')
        improved_train_dbs = read_ensemble_dataset('improved_train_dbs')
        train_general = read_ensemble_dataset('train_general')
        test_general = read_ensemble_dataset('test_general')

    # Collect the summary statistics first so the general training set can be
    # released before the (memory-hungry) ensembles are trained.
    training_size = len(train_general)
    test_size = len(test_general)
    train_distribution = round(train_general['violation'].mean()*100, 2)
    test_distribution = round(test_general['violation'].mean()*100, 2)

    # General
    print('\tTraining general classifier')
    (acc_g, mcc_g, f1_g), full_preds = train_test(clone(model), train_general, test_general)
    _save_full_preds(full_preds, model_name, 'general', random_state)
    del train_general

    # Ensemble
    print('\tTraining ensemble')
    (acc_e, mcc_e, f1_e), full_preds = train_test_ensemble(clone(model), train_dbs, test_general)
    _save_full_preds(full_preds, model_name, 'ensemble', random_state)

    # Improved ensemble
    print('\tTraining improved ensemble')
    (acc_e2, mcc_e2, f1_e2), full_preds = train_test_ensemble(clone(model), improved_train_dbs, test_general)
    _save_full_preds(full_preds, model_name, 'improved_ensemble', random_state)

    print('\tWriting data')
    field_names = ['model', 'random_state', 'accuracy_general', 'MCC_general', 'F1_general',
                   'accuracy_ensemble', 'MCC_ensemble', 'F1_ensemble',
                   'accuracy_improved_ensemble', 'MCC_improved_ensemble', 'F1_improved_ensemble',
                   'alpha', 'training_size', 'test_size', 'train_distribution', 'test_distribution']

    dct = {
        'model': model_name,
        'random_state': random_state,
        'accuracy_general': acc_g,
        'MCC_general': mcc_g,
        'F1_general': f1_g,
        'accuracy_ensemble': acc_e,
        'MCC_ensemble': mcc_e,
        'F1_ensemble': f1_e,
        'accuracy_improved_ensemble': acc_e2,
        'MCC_improved_ensemble': mcc_e2,
        'F1_improved_ensemble': f1_e2,
        'alpha': best_alpha,
        'training_size': training_size,
        'test_size': test_size,
        'train_distribution': train_distribution,
        'test_distribution': test_distribution,
    }

    filename = 'results/ensemble/ensemble_general.csv'
    file_exists = os.path.isfile(filename)
    # newline='' is what the csv module expects; without it extra blank
    # lines can appear on platforms that translate line endings.
    with open(filename, 'a', newline='') as f_object:
        dictwriter_object = DictWriter(f_object, fieldnames=field_names)
        if not file_exists:
            dictwriter_object.writeheader()  # file doesn't exist yet, write a header
        dictwriter_object.writerow(dct)
    return 0

In [None]:
# Datasets are exchanged through the CSV files in results/ensemble/temp/, so
# nothing is handed to run_experiment directly.
provided_dbs = None

# Repeat the whole experiment once per seed.
for random_state in random_states:
    print('Random_state:', random_state)

    # Generate the splits for this seed and write them to disk
    # (with write=True the function returns only None placeholders).
    generate_ensemble_dataset(json_path, random_state, write=True)

    # Run experiment for each model
    print('NB')
    run_experiment(MultinomialNB(alpha=best_alpha), 'NB', random_state, provided_dbs=provided_dbs)

    print('SVM')
    run_experiment(
        SGDClassifier(n_jobs=1, alpha=0.0001, max_iter=100, penalty='l2'),
        'SVM', random_state, provided_dbs=provided_dbs)

    print('RF')
    # max_depth must be an integer, and 'sqrt' is the explicit spelling of
    # what max_features='auto' meant for classifiers; newer scikit-learn
    # releases reject both the float depth and 'auto'.
    run_experiment(
        RandomForestClassifier(n_jobs=-1, bootstrap=False, max_depth=100, max_features='sqrt',
                               min_samples_leaf=2, min_samples_split=10, n_estimators=2000),
        'RF', random_state, provided_dbs=provided_dbs)