# Argument Prediction

In this notebook, the argument prediction model is trained.

This includes the baseline and the ensemble model.

## Prepare Data

In [None]:
# Load data from file

import json

# Parse the labelled sentences. The file is read as JSON Lines: one JSON object
# per line, with the sentence under "content" and its labels under
# "annotation" -> "labels". Only the first label of each sentence is kept.
dataset = []

# Pin the encoding: JSON text is UTF-8, and relying on the platform default
# makes parsing of non-ASCII sentences depend on the machine's locale.
with open('./labelled_data/1000_labelled_argument_sentences_3.json', encoding='utf-8') as f:
    for line in f:
        # Skip blank lines (e.g. a trailing newline) instead of failing in json.loads
        if not line.strip():
            continue

        json_line = json.loads(line)
        arg = {"text": json_line["content"], "label": json_line["annotation"]["labels"][0]}

        dataset.append(arg)

dataset

In [None]:
# Remove punctuation

import string

# Show exactly which characters are about to be stripped
print(string.punctuation)

# Translation table that deletes every punctuation character; built once and
# reused for all samples
punctuation_table = str.maketrans('', '', string.punctuation)

dataset = [
    {"text": entry["text"].translate(punctuation_table), "label": entry["label"]}
    for entry in dataset
]
dataset

In [None]:
# Split dataset into training and testing set

from sklearn.model_selection import train_test_split

# Original dataset.
# The seed makes the split (and every score derived from it) reproducible after
# a kernel restart. Stratifying on the label keeps the class balance of the
# small 10% test set in line with the training set, which matters for the
# macro-averaged F1 used throughout.
train, test = train_test_split(
    dataset,
    test_size=0.1,
    random_state=42,
    stratify=[sample["label"] for sample in dataset],
)

train_x = [sample["text"] for sample in train]
train_y = [sample["label"] for sample in train]

test_x = [sample["text"] for sample in test]
test_y = [sample["label"] for sample in test]

## Classification

### Grid Search for Hyperparameters

Find the best parameters for each classifier using an exhaustive search. Randomised search was first used to narrow the search space.

In [None]:
# It is possible to get an undefined f1-score: in an exhaustive search (grid search)
# some labels may never be predicted, which leads to 0 precision or recall.
# Ignore only these warnings, so that unrelated ones (deprecations, convergence
# problems, ...) stay visible.

import warnings
from sklearn.exceptions import UndefinedMetricWarning

warnings.filterwarnings('ignore', category=UndefinedMetricWarning)

from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, StratifiedKFold
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer


In [None]:
import nltk
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Fetch every NLTK resource this notebook relies on, not just WordNet:
# word_tokenize needs the tokenizer models and pos_tag needs the tagger model.
# Without them the notebook fails on a fresh environment.
for nltk_resource in ('wordnet', 'punkt', 'averaged_perceptron_tagger'):
    nltk.download(nltk_resource)

# Lemmatize all words in a sentence. Uses PoS to identify lemma
# Function taken from: https://stackoverflow.com/a/39498745
def lemmatize_all(sentence):
    """Yield the tokens of ``sentence`` in order, lemmatized where possible.

    The sentence is tokenized and PoS-tagged. Tokens whose tag starts with
    "NN", "VB" or "JJ" are lemmatized with pos 'n', 'v' or 'a' respectively;
    every other token is yielded unchanged.
    """
    lemmatizer = WordNetLemmatizer()
    # (tag prefix, pos code handed to the lemmatizer), checked in this order
    pos_by_tag_prefix = (("NN", 'n'), ('VB', 'v'), ('JJ', 'a'))

    for token, tag in pos_tag(word_tokenize(sentence)):
        lemma_pos = next(
            (code for prefix, code in pos_by_tag_prefix if tag.startswith(prefix)),
            None,
        )
        if lemma_pos is None:
            yield token
        else:
            yield lemmatizer.lemmatize(token, pos=lemma_pos)

#Custom Transformer that lemmatizes samples in the dataset
from sklearn.base import TransformerMixin, BaseEstimator

class LemmaTransformer(BaseEstimator, TransformerMixin):
    """Pipeline step that optionally lemmatizes every text sample.

    ``use_lemma`` is stored under the same name as the constructor argument,
    which lets the grid searches toggle it via ``lemma__use_lemma``.
    """

    def __init__(self, use_lemma=True):
        self.use_lemma = use_lemma

    def fit(self, X, y=None):
        # Nothing is learned from the data; the step is stateless
        return self

    def transform(self, X, y=None):
        # With lemmatization switched off the samples pass through untouched
        if not self.use_lemma:
            return X

        return [' '.join(lemmatize_all(text)) for text in X]

### SVM

In [None]:
from sklearn.svm import SVC

# Record of the best combination found by an earlier search (kept for
# reference; the search below uses parameters_svm)
best_parameters_svm = {'clf__C': [10],
                       'clf__gamma': [0.15],
                       'clf__kernel': ['rbf'],
                       'vect__stop_words': [None],
                       'vect__ngram_range': [(1, 1)],
                       'vect__norm': [None],
                       'vect__use_idf': [False],
                       'lemma__use_lemma': [True]}

# Search space: classifier, vectorizer and lemmatization options
parameters_svm = {'clf__C': [0.1, 1, 10, 100, 10000],
                  'clf__gamma': [0.1, 0.15, 0.3],
                  'clf__kernel': ['rbf', 'linear', 'sigmoid'],
                  'vect__stop_words': ['english', None],
                  'vect__ngram_range': [(1, 1), (1, 2)],
                  'vect__norm': ['l2', None],
                  'vect__use_idf': [True, False],
                  'lemma__use_lemma': [True, False]}

pipeline_svm = Pipeline([('lemma', LemmaTransformer()), ('vect', TfidfVectorizer()), ('clf', SVC())])

# StratifiedKFold does not shuffle by default, so the folds are already
# deterministic. Passing random_state without shuffle=True has no effect and is
# rejected with a ValueError by current scikit-learn, so it is omitted.
clf_svm = GridSearchCV(pipeline_svm, parameters_svm, cv=StratifiedKFold(n_splits=3), scoring='f1_macro', n_jobs=-1)

clf_svm.fit(train_x, train_y)

clf_svm.best_params_

### Naive Bayes

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Record of the best combination found by an earlier search (kept for
# reference; the search below uses parameters_nb)
best_parameters_nb = {'clf__alpha': [0.1],
                      'vect__stop_words': [None],
                      'vect__ngram_range': [(1, 1)],
                      'vect__norm': ['l2'],
                      'vect__use_idf': [False],
                      'lemma__use_lemma': [False]}

# Search space: smoothing strength, vectorizer and lemmatization options
parameters_nb = {'clf__alpha': [0.5, 0.75, 0.1, 0.075, 0.05],
                 'vect__stop_words': ['english', None],
                 'vect__ngram_range': [(1, 1), (1, 2)],
                 'vect__norm': ['l2', None],
                 'vect__use_idf': [True, False],
                 'lemma__use_lemma': [True, False]}

pipeline_nb = Pipeline([('lemma', LemmaTransformer()), ('vect', TfidfVectorizer()), ('clf', MultinomialNB())])

# StratifiedKFold does not shuffle by default, so the folds are already
# deterministic. Passing random_state without shuffle=True has no effect and is
# rejected with a ValueError by current scikit-learn, so it is omitted.
clf_nb = GridSearchCV(pipeline_nb, parameters_nb, cv=StratifiedKFold(n_splits=3), scoring='f1_macro', n_jobs=-1)

clf_nb.fit(train_x, train_y)

clf_nb.best_params_

### Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Record of the best combination found by an earlier search (kept for
# reference; the search below uses parameters_dec)
best_parameters_dec = {'clf__criterion': ['gini'],
                       'clf__max_depth': [5],
                       'clf__max_features': [None],
                       'vect__stop_words': [None],
                       'vect__ngram_range': [(1, 1)],
                       'vect__norm': [None],
                       'vect__use_idf': [True],
                       'lemma__use_lemma': [True]}

# Search space: tree shape, vectorizer and lemmatization options
parameters_dec = {'clf__criterion': ['gini', 'entropy'],
                  'clf__max_depth': range(4, 10),
                  'clf__max_features': ['sqrt', 'log2', None],
                  'vect__stop_words': ['english', None],
                  'vect__ngram_range': [(1, 1), (1, 2)],
                  'vect__norm': ['l2', None],
                  'vect__use_idf': [True, False],
                  'lemma__use_lemma': [True, False]}

pipeline_dec = Pipeline([('lemma', LemmaTransformer()), ('vect', TfidfVectorizer()), ('clf', DecisionTreeClassifier())])

# StratifiedKFold does not shuffle by default, so the folds are already
# deterministic. Passing random_state without shuffle=True has no effect and is
# rejected with a ValueError by current scikit-learn, so it is omitted.
clf_dec = GridSearchCV(pipeline_dec, parameters_dec, cv=StratifiedKFold(n_splits=3), scoring='f1_macro', n_jobs=-1)

clf_dec.fit(train_x, train_y)

clf_dec.best_params_

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Record of the best combination found by an earlier search (kept for
# reference; the search below uses parameters_log)
best_parameters_log = {'clf__penalty': ['l2'],
                       'clf__C': [10000],
                       'vect__stop_words': [None],
                       'vect__ngram_range': [(1, 2)],
                       'vect__norm': ['l2'],
                       'vect__use_idf': [True],
                       'lemma__use_lemma': [False]}

# Search space: regularisation, vectorizer and lemmatization options
parameters_log = {'clf__penalty': ['l1', 'l2'],
                  'clf__C': [1000, 10000, 100000],
                  'vect__stop_words': ['english', None],
                  'vect__ngram_range': [(1, 1), (1, 2)],
                  'vect__norm': ['l2', None],
                  'vect__use_idf': [True, False],
                  'lemma__use_lemma': [True, False]}

# The grid searches both 'l1' and 'l2' penalties, so the solver must support
# both. 'liblinear' does; the default solver of current scikit-learn ('lbfgs')
# rejects 'l1', which would make every l1 candidate fail.
pipeline_log = Pipeline([('lemma', LemmaTransformer()), ('vect', TfidfVectorizer()), ('clf', LogisticRegression(solver='liblinear'))])

# StratifiedKFold does not shuffle by default, so the folds are already
# deterministic. Passing random_state without shuffle=True has no effect and is
# rejected with a ValueError by current scikit-learn, so it is omitted.
clf_log = GridSearchCV(pipeline_log, parameters_log, cv=StratifiedKFold(n_splits=3), scoring='f1_macro', n_jobs=-1)

clf_log.fit(train_x, train_y)

clf_log.best_params_

### Dummy Classifier

In [None]:
from sklearn.dummy import DummyClassifier

# Record of the best combination found by an earlier search (kept for
# reference; the search below uses parameters_dum)
best_parameters_dum = {'dum__strategy': ['uniform'],
                       'vect__stop_words': [None],
                       'vect__ngram_range': [(1, 2)],
                       'vect__norm': [None],
                       'vect__use_idf': [True],
                       'lemma__use_lemma': [True]}

# Search space: dummy strategy plus the vectorizer options of the pipeline
parameters_dum = {'dum__strategy': ['stratified', 'most_frequent', 'prior', 'uniform'],
                  'vect__stop_words': ['english', None],
                  'vect__ngram_range': [(1, 1), (1, 2)],
                  'vect__norm': ['l2', None],
                  'vect__use_idf': [True],
                  'lemma__use_lemma': [True]}

pipeline_dum = Pipeline([('lemma', LemmaTransformer()), ('vect', TfidfVectorizer()), ('dum', DummyClassifier())])

# StratifiedKFold does not shuffle by default, so the folds are already
# deterministic. Passing random_state without shuffle=True has no effect and is
# rejected with a ValueError by current scikit-learn, so it is omitted.
clf_dum = GridSearchCV(pipeline_dum, parameters_dum, cv=StratifiedKFold(n_splits=3), scoring='f1_macro', n_jobs=-1)

clf_dum.fit(train_x, train_y)

clf_dum.best_params_

In [None]:

from sklearn.metrics import classification_report

# Per-class precision/recall/F1 on the held-out test set, one report per
# fitted grid search
for title, fitted_search in (
    ("SVM", clf_svm),
    ("Naive Bayes", clf_nb),
    ("Decision Tree", clf_dec),
    ("Logistic Regression", clf_log),
    ("Dummy Classifier", clf_dum),
):
    print(title)
    print(classification_report(test_y, fitted_search.predict(test_x)))
    print()

### Score the classifiers

In [None]:
from sklearn import metrics

# For every fitted grid search: macro F1 on the held-out test set next to the
# best cross-validated score found during the search
for title, fitted_search in (
    ("SVM", clf_svm),
    ("Naive Bayes", clf_nb),
    ("Decision Tree", clf_dec),
    ("Logistic Regression", clf_log),
    ("Dummy", clf_dum),
):
    test_f1 = metrics.f1_score(test_y, fitted_search.predict(test_x), average='macro')

    print(title)
    print("Test Score: " + str(test_f1))
    print("Best Score: " + str(fitted_search.best_score_))
    print()



## Improving the Model

As SVM performed the best, we will use this as a baseline. We will now explore adding different features to improve the classifier.

### Feature Engineering

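Build the training set for the second model. Its ngram feature is the cross-validated prediction of the first model, i.e. each sample is predicted by a model that was not fitted on that sample. This avoids overfitting the second model: otherwise the data would effectively have been seen twice, and the first model's predictions on its own training data would look better than they are on unseen data.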

The actual classifier trained on the ngrams will include all the data.

In [None]:
# Predictions using the classifier trained on ngrams

from sklearn.model_selection import cross_val_predict

# Rebuild the best classifier with the best parameters
best_pipeline_svm = Pipeline([
    ('lemma', LemmaTransformer(use_lemma=True)),
    ('vect', TfidfVectorizer(stop_words=None, ngram_range=(1, 1), norm=None, use_idf=False)),
    ('clf', SVC(C=10, gamma=0.15, kernel='rbf')),
])

# Use cross-validated predictions to avoid overfitting
cross_val_predictions = cross_val_predict(best_pipeline_svm, train_x, train_y, cv=3)

# Encode the predicted label as a binary feature: 1 for "arg", 0 otherwise
ngram_predictions = []
for prediction in cross_val_predictions:
    ngram_predictions.append(1 if prediction == "arg" else 0)
ngram_predictions

In [None]:
# Sentiment

import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

nltk.download('vader_lexicon')

sid = SentimentIntensityAnalyzer()

# Test sentiment extraction: the 'compound' polarity score of every training sentence
sentiment = []
for sample in train_x:
    sentiment.append(sid.polarity_scores(sample)['compound'])
sentiment


In [None]:
# Sentence length

# Test sentence length extraction: number of tokens in every training sentence
sentence_length = list(map(len, map(word_tokenize, train_x)))
sentence_length


In [None]:
# PoS

# For every training sentence, count the tokens whose tag starts with
# NN, VB and JJ respectively. The three prefixes are mutually exclusive,
# so each tag is counted at most once.
pos_counts = []

for sample in train_x:
    tags = [tag for _, tag in pos_tag(word_tokenize(sample))]
    pos_counts.append([
        sum(1 for tag in tags if tag.startswith('NN')),
        sum(1 for tag in tags if tag.startswith('VB')),
        sum(1 for tag in tags if tag.startswith('JJ')),
    ])

pos_counts

In [None]:
import numpy as np

# Precompute the features to increase grid-search efficiency.
# Column order: ngram prediction, sentiment, sentence length, then the three PoS counts
feature_columns = [ngram_predictions, sentiment, sentence_length, pos_counts]
train_x_all_features = np.column_stack(feature_columns)
train_x_all_features

In [None]:
# Extract features from a dataset
def extract_features(data_x, ngram_model=None):
    """Build the meta-model feature matrix for a list of sentences.

    Parameters
    ----------
    data_x : list of str
        Sentences to extract features from.
    ngram_model : fitted estimator with ``predict``, optional
        Model whose predicted label becomes the first feature. Defaults to
        ``clf_svm``, the grid-searched ngram SVM fitted on the training split.

    Returns
    -------
    numpy.ndarray
        One row per sentence with the columns: ngram prediction (1 for "arg",
        0 otherwise), compound sentiment score, token count, and the number of
        tokens tagged NN*, VB* and JJ*.
    """
    # Resolved at call time so that only calling (not defining) this function
    # requires the SVM grid-search cell to have run
    if ngram_model is None:
        ngram_model = clf_svm

    # Predictions based on ngrams
    ngram_predictions = [1 if prediction == "arg" else 0 for prediction in ngram_model.predict(data_x)]

    # Sentiment of sentence
    sid = SentimentIntensityAnalyzer()
    sentiment = [sid.polarity_scores(sample)['compound'] for sample in data_x]

    # Tokenize each sentence once; the tokens feed both the length and the PoS features
    tokenized = [word_tokenize(sample) for sample in data_x]

    # Sentence Length
    sentence_length = [len(tokens) for tokens in tokenized]

    # Number of each parts of speech
    pos_counts = []
    for tokens in tokenized:
        nn = 0
        vb = 0
        jj = 0
        for word, pos in pos_tag(tokens):
            if pos.startswith('NN'):
                nn += 1
            elif pos.startswith('VB'):
                vb += 1
            elif pos.startswith('JJ'):
                jj += 1
        pos_counts.append([nn, vb, jj])

    # Combine features into numpy matrix
    return np.column_stack([ngram_predictions, sentiment, sentence_length, pos_counts])

test_x_all_features = extract_features(test_x)
test_x_all_features

### Classification



### SVM

In [None]:
from sklearn.preprocessing import Normalizer
import sklearn.utils.validation

# Search space for the stacked SVM. 'nor' swaps the normalisation step itself:
# None removes it, otherwise the given Normalizer is used.
parameters_stack_svm = {'clf__C': [100, 10, 1],
                        'clf__gamma': [0.001, 0.01, 0.1],
                        'clf__kernel': ['rbf', 'linear', 'sigmoid'],
                        'nor': [None, Normalizer(norm='l1'), Normalizer(norm='l2'), Normalizer(norm='max')]}

pipeline_stack_svm = Pipeline([('nor', Normalizer()), ('clf', SVC())])

# StratifiedKFold does not shuffle by default, so the folds are already
# deterministic. Passing random_state without shuffle=True has no effect and is
# rejected with a ValueError by current scikit-learn, so it is omitted.
clf_stack_svm = GridSearchCV(pipeline_stack_svm, parameters_stack_svm, cv=StratifiedKFold(n_splits=3), scoring='f1_macro', n_jobs=-1)

clf_stack_svm.fit(train_x_all_features, train_y)

clf_stack_svm.best_params_

### Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

# Search space for the stacked Naive Bayes. 'nor' swaps the normalisation step
# itself: None removes it, otherwise the given Normalizer is used.
parameters_stack_nb = {'clf__var_smoothing': [0.00001, 0.000001, 0.0000001],
                       'nor': [None, Normalizer(norm='l1'), Normalizer(norm='l2'), Normalizer(norm='max')]}

pipeline_stack_nb = Pipeline([('nor', Normalizer()), ('clf', GaussianNB())])

# StratifiedKFold does not shuffle by default, so the folds are already
# deterministic. Passing random_state without shuffle=True has no effect and is
# rejected with a ValueError by current scikit-learn, so it is omitted.
clf_stack_nb = GridSearchCV(pipeline_stack_nb, parameters_stack_nb, cv=StratifiedKFold(n_splits=3), scoring='f1_macro', n_jobs=-1)

clf_stack_nb.fit(train_x_all_features, train_y)

clf_stack_nb.best_params_

### Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Search space for the stacked decision tree. 'nor' swaps the normalisation
# step itself: None removes it, otherwise the given Normalizer is used.
parameters_stack_dec = {'clf__criterion': ['gini', 'entropy'],
                        'clf__max_depth': range(2, 10),
                        'clf__max_features': ['sqrt', 'log2', None],
                        'nor': [None, Normalizer(norm='l1'), Normalizer(norm='l2'), Normalizer(norm='max')]}

pipeline_stack_dec = Pipeline([('nor', Normalizer()), ('clf', DecisionTreeClassifier())])

# StratifiedKFold does not shuffle by default, so the folds are already
# deterministic. Passing random_state without shuffle=True has no effect and is
# rejected with a ValueError by current scikit-learn, so it is omitted.
clf_stack_dec = GridSearchCV(pipeline_stack_dec, parameters_stack_dec, cv=StratifiedKFold(n_splits=3), scoring='f1_macro', n_jobs=-1)

clf_stack_dec.fit(train_x_all_features, train_y)

clf_stack_dec.best_params_

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Search space for the stacked logistic regression. 'nor' swaps the
# normalisation step itself: None removes it, otherwise the given Normalizer is used.
parameters_stack_log = {'clf__penalty': ['l1', 'l2'],
                        'clf__C': [0.01, 0.1, 1, 10],
                        'nor': [None, Normalizer(norm='l1'), Normalizer(norm='l2'), Normalizer(norm='max')]}

# The grid searches both 'l1' and 'l2' penalties, so the solver must support
# both. 'liblinear' does; the default solver of current scikit-learn ('lbfgs')
# rejects 'l1', which would make every l1 candidate fail.
pipeline_stack_log = Pipeline([('nor', Normalizer()), ('clf', LogisticRegression(solver='liblinear'))])

# StratifiedKFold does not shuffle by default, so the folds are already
# deterministic. Passing random_state without shuffle=True has no effect and is
# rejected with a ValueError by current scikit-learn, so it is omitted.
clf_stack_log = GridSearchCV(pipeline_stack_log, parameters_stack_log, cv=StratifiedKFold(n_splits=3), scoring='f1_macro', n_jobs=-1)

clf_stack_log.fit(train_x_all_features, train_y)

clf_stack_log.best_params_

In [None]:
from sklearn.metrics import classification_report

# Per-class precision/recall/F1 of every stacked model on the held-out test features
for title, fitted_search in (
    ("SVM", clf_stack_svm),
    ("Naive Bayes", clf_stack_nb),
    ("Decision Tree", clf_stack_dec),
    ("Logistic Regression", clf_stack_log),
):
    print(title)
    print(classification_report(test_y, fitted_search.predict(test_x_all_features)))
    print()

In [None]:
from sklearn import metrics

# For every stacked model: macro F1 on the held-out test features next to the
# best cross-validated score found during the search
for title, fitted_search in (
    ("SVM", clf_stack_svm),
    ("Naive Bayes", clf_stack_nb),
    ("Decision Tree", clf_stack_dec),
    ("Logistic Regression", clf_stack_log),
):
    test_f1 = metrics.f1_score(test_y, fitted_search.predict(test_x_all_features), average='macro')

    print(title)
    print("Test Score: " + str(test_f1))
    print("Best Score: " + str(fitted_search.best_score_))
    print()

### Rebuild the best models using all the data

In theory, this maximises the amount of training data and should improve results. However, because no held-out test data remains, statistical claims cannot be made about the final model.

In [None]:
# Datasets including all data

# Texts and labels of every sample (training and test portions together)
train_x_all_data = []
train_y_all_data = []
for sample in dataset:
    train_x_all_data.append(sample["text"])
    train_y_all_data.append(sample["label"])

### ngram Model

In [None]:
# Lemmatize the data in a preprocessing step. joblib does not save custom classes in pipelines

# Each sentence is rebuilt by joining its lemmatized tokens with single spaces
train_x_lemma_all_data = []
for sentence in train_x_all_data:
    train_x_lemma_all_data.append(' '.join(lemmatize_all(sentence)))
train_x_lemma_all_data

In [None]:
# Train the weak learner with the best parameters but use all the data.
# There is no lemma step in this pipeline: it is fitted on the pre-lemmatized sentences.

ngram_vectorizer = TfidfVectorizer(stop_words=None, ngram_range=(1, 1), norm=None, use_idf=False)
ngram_classifier = SVC(C=10, gamma=0.15, kernel='rbf')

ngram_model = Pipeline([('vect', ngram_vectorizer), ('clf', ngram_classifier)])

ngram_model.fit(train_x_lemma_all_data, train_y_all_data)


### Meta Model



In [None]:
# Extract features for all the data

# Use cross-validated predictions to avoid overfitting.
# ngram_model has no lemma step and is fitted on the lemmatized sentences, so
# the cross-validated predictions must be made on the lemmatized sentences too.
# Feeding it the raw sentences would train the meta model on predictions from a
# differently-preprocessed ngram model than the one that gets saved.
cross_val_predictions_all_data = cross_val_predict(ngram_model, train_x_lemma_all_data, train_y_all_data, cv=3)

# The remaining features are computed on the original (non-lemmatized)
# sentences, matching extract_features
ngram_predictions_all_data = [1 if prediction == "arg" else 0 for prediction in cross_val_predictions_all_data]
sentiment_all_data = [sid.polarity_scores(sample)['compound'] for sample in train_x_all_data]
sentence_length_all_data = [len(word_tokenize(sample)) for sample in train_x_all_data]
pos_counts_all_data = []

# Count the tokens tagged NN*, VB* and JJ* in every sentence
for sample in train_x_all_data:
    nn = 0
    vb = 0
    jj = 0
    for word, pos in pos_tag(word_tokenize(sample)):
        if pos.startswith('NN'):
            nn += 1
        elif pos.startswith('VB'):
            vb += 1
        elif pos.startswith('JJ'):
            jj += 1

    pos_counts_all_data.append([nn, vb, jj])

train_x_all_features_all_data = np.column_stack([ngram_predictions_all_data, sentiment_all_data, sentence_length_all_data, pos_counts_all_data])
train_x_all_features_all_data.shape

In [None]:
# Train the meta model using all the data
meta_normalizer = Normalizer(norm='max')
meta_classifier = DecisionTreeClassifier(criterion='gini', max_depth=3, max_features=None)

meta_model = Pipeline([('nor', meta_normalizer), ('clf', meta_classifier)])

meta_model.fit(train_x_all_features_all_data, train_y_all_data)


## Save the Models

In [None]:
import os

# sklearn.externals.joblib no longer exists in current scikit-learn releases;
# use the standalone joblib package and fall back to the bundled copy only on
# old installations that lack it.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

import pickle

# joblib.dump does not create missing directories
os.makedirs("./models", exist_ok=True)

# Note: ngram_model was fitted on lemmatized sentences and contains no lemma
# step, so callers must lemmatize their input before calling predict.
joblib.dump(ngram_model, "./models/arg_prediction_ngram_model.pkl")
joblib.dump(meta_model, "./models/arg_prediction_meta_model.pkl")

