In [3]:
import sys
sys.path.append('code/')

from preprocess import preprocess
from preprocess import load_config
from train import train_classifier

from datetime import datetime

# Initialise Configuration 

<p>A configuration file is necessary to set hyperparameters and variables for preprocessing and training. Find the description of each key in the following frame

In [4]:
config = {'suppliment_testing': 0, # amount of samples to be taken from training and added to testing 
                                   # Necessary if not all labels present in testing 
                                   # A script that balances the dataset would be better
          
          'taglist': {'1-RAPPORT'       : 0,
                      '2-NEGOTIATE'     : 1,
                      '3-EMOTION'       : 2,
                      '4-LOGIC'         : 3,
                      '5-AUTHORITY'     : 4,
                      '6-SOCIAL'        : 5,
                      '7-PRESSURE'      : 6,
                      '8-NO-PERSUASION' : 7,
                      'NO-TAG'          : 8},
          
         'testing_data': {'conf_threshold' : 0.5,                 # Confidence threshold to allow testing sample 
                          'output_path'    : 'data/dataloaders/', # output path if generating dataloader
                          'path'           : 'data/post_processed/testing_data.jsonl'}, # path to jsonlist file
          
         'training_data': {'conf_threshold': 0.5,
                           'output_path'   : 'data/dataloaders/',
                           'path'          : 'data/post_processed/training_data.jsonl'},
          
         'vectoriser_config': {'type'        : 1 ,         # 1 counts 2 tfidf
                               'lowercase'   : True,       # lowercase tokens
                               'ngram_range' : '(1,1)',    # ngram range (1,1) is only unigrams (1, 2) is uni and bigrams 
                               'stop_words'  : 'english'}, # remove stopwords from language
          
         'model_config': {'type'        : 2,         # 1 - MLP 2 - SVM
                          'onevsrest'   : True,     # wrap in onevsrest
                          'output_path' : 'models/', # output folder for model
                          'max_iter'    : 400}}      # max iterations in training 


# Preprocess Training Data 

We can now preprocess the data using the configuration.

In [5]:
config, vectorizer_name, vectorizer, train_x, train_y, test_x, test_y = preprocess(config)

Total Training: 7755
Total Testing: 1694


In [8]:
import matplotlib.pyplot as plt

def show_tags_counts(counts):
    
    x = list(counts.keys())[:-1]
    y = list(counts.values())[:-1]
    
    plt.bar(x,y)
    locs, labels = plt.xticks()
    plt.setp(labels, rotation=45)
    plt.show()

def count_training_tags(config, train_y):
    idx2tag = { idx : tag for tag, idx in config.get('taglist').items()}
    counts = {tag : 0 for tag in idx2tag.values()}

    for tagidx in train_y:
        counts[idx2tag[tagidx]] += 1
        
    config['training_dist'] = counts
    
    show_tags_counts(counts)
    return idx2tag, counts, config

In [12]:
#idx2tag, counts, config = count_training_tags(config, train_y)

# Train Model 

In [13]:
clf, test_x, test_y = train_classifier(config, vectorizer_name=vectorizer_name,vectorizer=vectorizer,
                            train_x=train_x, train_y=train_y, test_x=test_x, test_y=test_y)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   9 | elapsed:   25.5s remaining:   32.0s
[Parallel(n_jobs=-1)]: Done   9 out of   9 | elapsed:   41.2s finished


# Evaluate Model 

In [14]:
from code.train import evaluate
from sklearn.metrics import classification_report
from code.train import output_eval

In [15]:
def evaluate(trained_model, test_x, test_y, config, verbose=None):

    idx2tag = {idx : tag for tag, idx in config.get('taglist').items()}
    
    try:
        labels = list(sorted(set([idx2tag[i] for i in test_y])))
    except:
        labels = list(idx2tag.values())
    
    
    preds = trained_model.predict(test_x)
    report = classification_report(preds, test_y, target_names = labels,zero_division=True, output_dict=True)
    
    if verbose:
        text = classification_report(preds, test_y, target_names = labels,zero_division=True)
        print(text)
        
    config['classification_report'] = report
    config['time'] = datetime.now().strftime('%d-%m-%Y %H:%M')

    if config['model_config'].get('output_path'):
        output_eval(trained_model, config)
    
    return config, text

In [16]:
report, text = evaluate(clf, test_x, test_y, config, verbose=True)

                 precision    recall  f1-score   support

      1-RAPPORT       0.82      0.83      0.83      1244
    2-NEGOTIATE       0.31      0.98      0.48       170
      3-EMOTION       0.18      0.94      0.30       106
        4-LOGIC       0.11      0.88      0.19        32
    5-AUTHORITY       0.14      0.85      0.25        26
       6-SOCIAL       0.00      1.00      0.00         0
     7-PRESSURE       0.00      1.00      0.00         0
8-NO-PERSUASION       0.55      0.89      0.68       838
         NO-TAG       1.00      1.00      1.00         0

      micro avg       0.48      0.87      0.62      2416
      macro avg       0.35      0.93      0.41      2416
   weighted avg       0.65      0.87      0.71      2416
    samples avg       0.55      0.89      0.62      2416

Outputting model to models/model_20_bow_unigrams_svm_True.pkl
Outputting config to models/model_20_bow_unigrams_svm_True.yaml
