# Train and test models using different parts of the case

In [4]:
from echr import *
import numpy as np
import pickle

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [5]:
# Estimator prototypes, keyed by the short name used in log output and in
# the result file names.
classifiers = dict(
    SVM=SGDClassifier(n_jobs=-1),
    NB=MultinomialNB(),
    RF=RandomForestClassifier(n_jobs=-1),
    SVM_med=LinearSVC(),
)

# Article identifiers to evaluate; 'All' comes first, followed by the
# individual article numbers (kept as strings).
articles = ['All'] + [str(number) for number in (2, 3, 5, 6, 8, 10, 11, 13, 14)]

# Locations on disk.
result_dir = 'results/parameter_optimization/'
path = 'datasets/Medvedeva/'  # rebound by the corpus loop in the experiment cell
json_path = 'datasets/echrod/cases.json'

# Run settings forwarded to train_model_cross_val in the experiment cell.
debug = False
num_runs = 5  # how many times each cross-validation is repeated and averaged
cv = 10
n_jobs = -1

# Default case-part selection; rebound by the loop in the experiment cell.
use_parts = 'facts'

In [7]:
import os  # cell-local: only needed to make sure the output directory exists

# For every corpus, classifier and case-part selection, cross-validate one
# model per article and write the averaged scores to one CSV per combination
# at results/parts/<corpus>/results_<classifier>_<parts>.csv.
for path_name, path in {'med': 'datasets/Medvedeva/', 'echrod': 'datasets/echrod/cases.json'}.items():
    # NOTE: `classifier` (the prototype instance) is not used below; the
    # estimator that is actually trained comes from get_classifier().
    for classifier_name, classifier in classifiers.items():
        print(classifier_name)
        for use_parts in ['procedure+facts', 'procedure', 'facts']:
            # The random forest is only evaluated on the facts section.
            if classifier_name == 'RF' and use_parts != 'facts':
                continue
            print(use_parts)
            filename = 'results/parts/' + path_name + '/results_' + classifier_name + '_' + use_parts
            # Create the target directory up front: to_csv() fails on a missing
            # directory, and that would only surface after every article has
            # been cross-validated, discarding the scores for this run.
            os.makedirs(os.path.dirname(filename), exist_ok=True)
            results_df = []
            for article_name in articles:
                print('\t', article_name)

                # Build the data
                df_complete = create_dataset(path, article_name, use_parts)
                df_train = balance_dataset(df_complete)
                X_train = df_train['text'].to_numpy()
                y_train = df_train['violation'].to_numpy()

                # Get the best params: start from the ones stored for 'SVM_med'
                # and let the classifier-specific ones override them key by key.
                params = get_best_params('Article' + article_name, 'SVM_med')
                clf_params = get_best_params('Article' + article_name, classifier_name)
                for key, value in clf_params.items():
                    params[key] = value

                # Normalise the stored values into what TfidfVectorizer expects.
                if 'norm' not in params or params['norm'] == 'None':
                    params['norm'] = None
                if 'use_idf' not in params:
                    params['use_idf'] = True
                params['remove_stopwords'] = 'english' if params['remove_stopwords'] else None
                # Anything that is not already an int is parsed as a Python
                # literal (presumably a string such as "(1, 2)" - TODO confirm).
                if not isinstance(params['ngram_range'], int):
                    params['ngram_range'] = ast.literal_eval(params['ngram_range'])
                # NOTE(review): 'parts' is normalised here but not read again in
                # this cell; the dataset above is built from `use_parts`.
                params['parts'] = '+'.join(ast.literal_eval(params['parts']))

                # Named (label, vectorizer) pair handed to train_model_cross_val.
                vec = ('wordvec', TfidfVectorizer(analyzer='word',
                                                  ngram_range=params['ngram_range'],
                                                  binary=params['binary'],
                                                  lowercase=params['remove_upper'],
                                                  min_df=params['min_df'],
                                                  norm=params['norm'],
                                                  stop_words=params['remove_stopwords'],
                                                  use_idf=params['use_idf']))
                clf = get_classifier('Article' + article_name, classifier_name)

                # Repeat the cross-validation num_runs times and average the scores.
                accs, mccs, f1s = [], [], []
                for _ in range(num_runs):
                    acc, mcc, f1 = train_model_cross_val(X_train, y_train, vec, clf, debug=debug, cv=cv, n_jobs=n_jobs)
                    accs.append(acc)
                    mccs.append(mcc)
                    f1s.append(f1)

                # Compute each mean once; used for both the record and the log line.
                mean_acc = np.mean(accs)
                mean_mcc = np.mean(mccs)
                mean_f1 = np.mean(f1s)

                results_df.append({
                    'Article': article_name,
                    'Classifier': classifier_name,
                    'Accuracy': mean_acc,
                    'MCC': mean_mcc,
                    'F1': mean_f1,
                    'part': use_parts,
                    'training_size': len(df_train),
                    # Mean of the 'violation' column of the balanced training set, in percent.
                    'train_distribution': round(df_train['violation'].mean()*100, 2),
                })
                # Log order is accuracy, F1, MCC (all in percent).
                print('\t\t-->', round(mean_acc*100, 2), '%', round(mean_f1*100, 2), '%', round(mean_mcc*100, 2), '%')
            pd.DataFrame(results_df).to_csv(filename + '.csv', index=False)

RF
facts
	 All
		--> 75.4 % 74.55 % 50.91 %
	 2
		--> 67.4 % 64.46 % 35.29 %
	 3
		--> 60.8 % 59.66 % 21.63 %
	 5
		--> 62.81 % 63.15 % 25.63 %
	 6
		--> 74.37 % 75.0 % 48.79 %
	 8
		--> 61.79 % 65.31 % 24.08 %
	 10
		--> 64.5 % 62.37 % 29.19 %
	 11
		--> 76.92 % 78.01 % 54.14 %
	 13
		--> 77.09 % 75.21 % 54.82 %
	 14
		--> 71.54 % 72.16 % 43.12 %
SVM_med
procedure+facts
	 All
		--> 71.75 % 70.08 % 43.77 %
	 2
		--> 67.97 % 64.08 % 36.8 %
	 3
		--> 64.18 % 64.46 % 28.37 %
	 5
		--> 67.67 % 67.93 % 35.35 %
	 6
		--> 76.82 % 75.82 % 53.81 %
	 8
		--> 67.79 % 68.84 % 35.66 %
	 10
		--> 61.58 % 61.58 % 23.16 %
	 11
		--> 77.88 % 77.67 % 55.78 %
	 13
		--> 80.36 % 79.36 % 61.0 %
	 14
		--> 76.56 % 76.98 % 53.15 %
procedure
	 All
		--> 69.26 % 67.49 % 38.75 %
	 2
		--> 65.8 % 64.89 % 31.64 %
	 3
		--> 62.64 % 63.36 % 25.3 %
	 5
		--> 62.25 % 63.06 % 24.52 %
	 6
		--> 71.88 % 72.16 % 43.77 %
	 8
		--> 59.9 % 60.55 % 19.82 %
	 10
		--> 61.05 % 58.66 % 22.25 %
	 11
		--> 65.38 % 67.27 % 30.98 %