# Argument Detection

## Prepare Data

In [3]:
# Load data from file

import json

dataset = []

with open('./labelled_data/1000_labelled_argument_sentences_3.json') as f:
    for line in f:
        json_line = json.loads(line)
        arg = {"text": json_line["content"], "label": json_line["annotation"]["labels"][0]}

        dataset.append(arg)

dataset

[{'text': "The motivation for the age restriction, like a lot of the Constitution, might have roots in the political situation in Europe in the 1700's.",
  'label': 'arg'},
 {'text': 'If Alexandria-Ocasio Cortez wanted to run for President in 2020, and people thought she was too young and inexperienced, they could vote against her for that reason.',
  'label': 'arg'},
 {'text': '(Various articles I could quote to support this lmk) Women generally live a couple years longer.',
  'label': 'arg'},
 {'text': 'Why, exactly?', 'label': 'not_arg'},
 {'text': "The minimum age requirement does at least give you some potential limit as to which someone can fill out a political 'resume', at the very least make themselves a known quantity, giving people better ideas as to what sort of a person a politician is/can be, while setting a maximum only hampers potentially still competent, still capable public servants from fulfilling an important role.",
  'label': 'arg'},
 {'text': 'As a foot note after

In [4]:
# Remove punctuation

import string

print(string.punctuation)

dataset = [{"text": sample["text"].translate(str.maketrans('', '', string.punctuation)), "label":sample["label"]} for sample in dataset]
dataset

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


[{'text': 'The motivation for the age restriction like a lot of the Constitution might have roots in the political situation in Europe in the 1700s',
  'label': 'arg'},
 {'text': 'If AlexandriaOcasio Cortez wanted to run for President in 2020 and people thought she was too young and inexperienced they could vote against her for that reason',
  'label': 'arg'},
 {'text': 'Various articles I could quote to support this lmk Women generally live a couple years longer',
  'label': 'arg'},
 {'text': 'Why exactly', 'label': 'not_arg'},
 {'text': 'The minimum age requirement does at least give you some potential limit as to which someone can fill out a political resume at the very least make themselves a known quantity giving people better ideas as to what sort of a person a politician iscan be while setting a maximum only hampers potentially still competent still capable public servants from fulfilling an important role',
  'label': 'arg'},
 {'text': 'As a foot note after the US left Vietnam 

In [6]:
# Split dataset into training and testing set

from sklearn.model_selection import train_test_split

# Original dataset
train, test = train_test_split(dataset, test_size=0.1)

train_x = [sample["text"] for sample in train]
train_y = [sample["label"] for sample in train]

test_x = [sample["text"] for sample in test]
test_y = [sample["label"] for sample in test]

## Classification

### Grid Search for Hyperparameters

Find the best parameters for each classifier using an exhaustive search.

In [7]:
# It is possible to get undefined f1-score as in an exhaustive search (grid search), 
# some labels may never be predicted. This can lead to 0 precision or recall.
# Ignore these warnings

import warnings
warnings.filterwarnings('ignore')

from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, StratifiedKFold
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer


In [8]:
import nltk
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

nltk.download('wordnet')

# Lemmatize all words in a sentence. Uses PoS to identify lemma
def lemmatize_all(sentence):
    wnl = WordNetLemmatizer()
    for word, tag in pos_tag(word_tokenize(sentence)):
        if tag.startswith("NN"):
            yield wnl.lemmatize(word, pos='n')
        elif tag.startswith('VB'):
            yield wnl.lemmatize(word, pos='v')
        elif tag.startswith('JJ'):
            yield wnl.lemmatize(word, pos='a')
        else:
            yield word

#Custom Transformer that lemmatizes samples in the dataset
from sklearn.base import TransformerMixin, BaseEstimator

class LemmaTransformer(BaseEstimator, TransformerMixin):

    def __init__(self, use_lemma=True):
        self.use_lemma = use_lemma
        
    def fit(self, X, y=None):
        return self 
    
    def transform(self, X, y=None):
        if self.use_lemma:
            return [' '.join(lemmatize_all(sample)) for sample in X]
        else:
            return X

[nltk_data] Downloading package wordnet to /home/effsy/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


### SVM

In [20]:
from sklearn.svm import SVC

best_parameters_svm = {'clf__C': [10],  
              'clf__gamma': [0.15], 
              'clf__kernel': ['rbf'],
                'vect__stop_words': [None],
                 'vect__ngram_range': [(1, 1)],
                 'vect__norm': [None],
                 'vect__use_idf': [False],
                 'lemma__use_lemma': [True]}

parameters_svm = {'clf__C': [0.1, 1, 10, 100, 10000],  
              'clf__gamma': [0.1, 0.15, 0.3], 
              'clf__kernel': ['rbf', 'linear', 'sigmoid'],
                'vect__stop_words': ['english', None],
                 'vect__ngram_range': [(1, 1), (1, 2)],
                 'vect__norm': ['l2', None],
                 'vect__use_idf': [True, False],
                 'lemma__use_lemma': [True, False]}

pipeline_svm = Pipeline([('lemma', LemmaTransformer()), ('vect', TfidfVectorizer()), ('clf', SVC())])

clf_svm = GridSearchCV(pipeline_svm, parameters_svm, cv=StratifiedKFold(n_splits=3, random_state=999), scoring='f1_macro', n_jobs=-1)

clf_svm.fit(train_x, train_y)

clf_svm.best_params_

{'clf__C': 10,
 'clf__gamma': 0.15,
 'clf__kernel': 'rbf',
 'lemma__use_lemma': True,
 'vect__ngram_range': (1, 1),
 'vect__norm': None,
 'vect__stop_words': None,
 'vect__use_idf': False}

### Naive Bayes

In [10]:
from sklearn.naive_bayes import MultinomialNB

best_parameters_nb = {'clf__alpha': [0.1],
                'vect__stop_words': [None],
                 'vect__ngram_range': [(1, 1)],
                 'vect__norm': ['l2'],
                 'vect__use_idf': [False],
                 'lemma__use_lemma': [False]}

parameters_nb = {'clf__alpha': [0.5, 0.75, 0.1, 0.075, 0.05],
                'vect__stop_words': ['english', None],
                 'vect__ngram_range': [(1, 1), (1, 2)],
                 'vect__norm': ['l2', None],
                 'vect__use_idf': [True, False],
                 'lemma__use_lemma': [True, False]}
    
pipeline_nb = Pipeline([('lemma', LemmaTransformer()), ('vect', TfidfVectorizer()), ('clf', MultinomialNB())])

clf_nb = GridSearchCV(pipeline_nb, parameters_nb, cv=StratifiedKFold(n_splits=3, random_state=999), scoring='f1_macro',  n_jobs=-1)

clf_nb.fit(train_x, train_y)

clf_nb.best_params_

{'clf__alpha': 0.1,
 'lemma__use_lemma': False,
 'vect__ngram_range': (1, 2),
 'vect__norm': 'l2',
 'vect__stop_words': None,
 'vect__use_idf': False}

### Decision Tree

In [11]:
from sklearn.tree import DecisionTreeClassifier

best_parameters_dec = {'clf__criterion':['gini'],
                  'clf__max_depth': [5],
                  'clf__max_features': [None],
                'vect__stop_words': [None],
                 'vect__ngram_range': [(1, 1)],
                 'vect__norm': [None],
                 'vect__use_idf': [True],
                 'lemma__use_lemma': [True]}

parameters_dec = {'clf__criterion':['gini','entropy'],
                  'clf__max_depth': range(4, 10),
                  'clf__max_features': ['sqrt', 'log2', None],
                'vect__stop_words': ['english', None],
                 'vect__ngram_range': [(1, 1), (1, 2)],
                 'vect__norm': ['l2', None],
                 'vect__use_idf': [True, False],
                 'lemma__use_lemma': [True, False]}

pipeline_dec = Pipeline([('lemma', LemmaTransformer()), ('vect', TfidfVectorizer()), ('clf', DecisionTreeClassifier())])

clf_dec = GridSearchCV(pipeline_dec, parameters_dec, cv=StratifiedKFold(n_splits=3, random_state=999), scoring='f1_macro', n_jobs=-1)

clf_dec.fit(train_x, train_y)

clf_dec.best_params_

{'clf__criterion': 'gini',
 'clf__max_depth': 5,
 'clf__max_features': None,
 'lemma__use_lemma': True,
 'vect__ngram_range': (1, 1),
 'vect__norm': None,
 'vect__stop_words': None,
 'vect__use_idf': True}

### Logistic Regression

In [12]:
from sklearn.linear_model import LogisticRegression

best_parameters_log = {'clf__penalty': ['l2'],
                  'clf__C': [10000],
                'vect__stop_words': [None],
                 'vect__ngram_range': [(1, 2)],
                 'vect__norm': ['l2'],
                 'vect__use_idf': [True],
                 'lemma__use_lemma': [False]}

parameters_log = {'clf__penalty': ['l1', 'l2'],
                  'clf__C': [1000, 10000, 100000],
                'vect__stop_words': ['english', None],
                 'vect__ngram_range': [(1, 1), (1, 2)],
                 'vect__norm': ['l2', None],
                 'vect__use_idf': [True, False],
                 'lemma__use_lemma': [True, False]}

pipeline_log = Pipeline([('lemma', LemmaTransformer()), ('vect', TfidfVectorizer()), ('clf', LogisticRegression())])

clf_log = GridSearchCV(pipeline_log, parameters_log, cv=StratifiedKFold(n_splits=3, random_state=999), scoring='f1_macro', n_jobs=-1)

clf_log.fit(train_x, train_y)

clf_log.best_params_

{'clf__C': 10000,
 'clf__penalty': 'l2',
 'lemma__use_lemma': True,
 'vect__ngram_range': (1, 2),
 'vect__norm': 'l2',
 'vect__stop_words': None,
 'vect__use_idf': True}

### Dummy Classifier

In [13]:
from sklearn.dummy import DummyClassifier

best_parameters_dum = {'dum__strategy': ['uniform'], 
                'vect__stop_words': [None],
                 'vect__ngram_range': [(1, 2)],
                 'vect__norm': [None],
                 'vect__use_idf': [True],
                 'lemma__use_lemma': [True]}

parameters_dum = {'dum__strategy': ['stratified', 'most_frequent', 'prior', 'uniform'], 
                'vect__stop_words': ['english', None],
                 'vect__ngram_range': [(1, 1), (1, 2)],
                 'vect__norm': ['l2', None],
                 'vect__use_idf': [True],
                 'lemma__use_lemma': [True]}

pipeline_dum = Pipeline([('lemma', LemmaTransformer()), ('vect', TfidfVectorizer()), ('dum', DummyClassifier())])

clf_dum = GridSearchCV(pipeline_dum, parameters_dum, cv=StratifiedKFold(n_splits=3, random_state=999), scoring='f1_macro', n_jobs=-1)

clf_dum.fit(train_x, train_y)

clf_dum.best_params_

{'dum__strategy': 'uniform',
 'lemma__use_lemma': True,
 'vect__ngram_range': (1, 2),
 'vect__norm': None,
 'vect__stop_words': None,
 'vect__use_idf': True}

In [15]:

from sklearn.metrics import classification_report

print("SVM")
print (classification_report(test_y, clf_svm.predict(test_x)))

print("Naive Bayes")
print (classification_report(test_y, clf_nb.predict(test_x)))

print("Decision Tree")
print (classification_report(test_y, clf_dec.predict(test_x)))

print("Logistic Regression")
print (classification_report(test_y, clf_log.predict(test_x)))

print("Dummy Classifier")
print (classification_report(test_y, clf_dum.predict(test_x)))

SVM
              precision    recall  f1-score   support

         arg       0.75      0.90      0.81        49
     not_arg       0.88      0.71      0.78        51

    accuracy                           0.80       100
   macro avg       0.81      0.80      0.80       100
weighted avg       0.81      0.80      0.80       100

Naive Bayes
              precision    recall  f1-score   support

         arg       0.59      0.76      0.66        49
     not_arg       0.68      0.49      0.57        51

    accuracy                           0.62       100
   macro avg       0.63      0.62      0.61       100
weighted avg       0.63      0.62      0.61       100

Logistic Regression
              precision    recall  f1-score   support

         arg       0.64      0.86      0.73        49
     not_arg       0.79      0.53      0.64        51

    accuracy                           0.69       100
   macro avg       0.72      0.69      0.68       100
weighted avg       0.72      0.69     

### Score the classifiers

In [21]:
from sklearn import metrics

print("SVM")
print("Test Score: " + str(metrics.f1_score(test_y, clf_svm.predict(test_x), average='macro')))
print("Best Score: " + str(clf_svm.best_score_))

print("Naive Bayes")
print("Test Score: " + str(metrics.f1_score(test_y, clf_nb.predict(test_x), average='macro')))
print("Best Score: " + str(clf_nb.best_score_))

print("Decision Tree")
print("Test Score: " + str(metrics.f1_score(test_y, clf_dec.predict(test_x), average='macro')))
print("Best Score: " + str(clf_dec.best_score_))

print("Logistic Regression")
print("Test Score: " + str(metrics.f1_score(test_y, clf_log.predict(test_x), average='macro')))
print("Best Score: " + str(clf_log.best_score_))

print("Dummy")
print("Test Score: " + str(metrics.f1_score(test_y, clf_dum.predict(test_x), average='macro')))
print("Best Score: " + str(clf_dum.best_score_))



SVM
Test Score: 0.7987117552334944
Best Score: 0.7008395346716232
Naive Bayes
Test Score: 0.614448051948052
Best Score: 0.6171121186083784
Decision Tree
Test Score: 0.7277951406391774
Best Score: 0.6668587072507435
Logistic Regression
Test Score: 0.6828644501278773
Best Score: 0.6947049867912573
Dummy
Test Score: 0.4967793880837359
Best Score: 0.5163683859633524



## SVM Performed the best

In [None]:
metrics.confusion_matrix(test_y, clf_svm.predict(test_x))

## Improving the Model

As SVM performed the best, we will use this as a baseline. We will now explore adding different features to improve the classifier.

### Feature Engineering

Build the training set. The training set uses the cross-validated predictions of the first model. This is to avoid overfitting of the second model, as it would effectively have seen the data twice.

The actual classifier trained on the ngrams will include all the data.

In [41]:
# Predictions using the classifier trained on ngrams

from sklearn.model_selection import cross_val_predict

# Rebuild the best classifier with the best parameters
best_pipeline_svm = Pipeline([('lemma', LemmaTransformer(use_lemma=True)), ('vect', TfidfVectorizer(stop_words=None, ngram_range=(1, 1), norm=None, use_idf=False)), ('clf', SVC(C=10, gamma=0.15, kernel='rbf'))])

# Use cross-validated predictions to avoid overfitting
cross_val_predictions = cross_val_predict(best_pipeline_svm, train_x, train_y, cv=3)

ngram_predictions = [1 if prediction == "arg" else 0 for prediction in cross_val_predictions]
ngram_predictions

[0,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,


In [37]:
# Sentiment

import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

nltk.download('vader_lexicon')

sid = SentimentIntensityAnalyzer()

# Test sentiment extraction
sentiment = [sid.polarity_scores(sample)['compound'] for sample in train_x]
sentiment


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/effsy/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


[0.0,
 0.5106,
 0.0008,
 0.2732,
 0.0,
 -0.1761,
 -0.2732,
 -0.7717,
 0.0,
 0.2023,
 -0.5897,
 -0.34,
 0.0,
 -0.296,
 0.4588,
 -0.6249,
 0.4404,
 0.3182,
 -0.5358,
 0.2263,
 0.0,
 -0.765,
 0.3983,
 0.6808,
 0.1326,
 -0.296,
 0.0,
 0.2263,
 0.0,
 0.0,
 -0.5574,
 0.0772,
 0.6705,
 0.3182,
 0.0,
 0.0,
 0.0,
 0.4215,
 -0.0258,
 -0.8625,
 -0.7184,
 -0.6408,
 -0.7579,
 -0.4019,
 -0.5859,
 0.197,
 0.024,
 0.3182,
 0.1321,
 0.4449,
 0.0,
 0.0,
 0.0,
 0.0772,
 0.0,
 0.4404,
 -0.2392,
 0.4019,
 0.4019,
 -0.575,
 0.4019,
 0.6124,
 0.0,
 0.0,
 0.4059,
 -0.4767,
 0.838,
 0.0,
 0.3182,
 0.0,
 0.2023,
 0.0,
 0.0,
 -0.9144,
 -0.0772,
 0.25,
 0.1531,
 0.3612,
 0.4767,
 -0.7786,
 -0.6369,
 0.3612,
 -0.296,
 0.0,
 0.0,
 -0.765,
 0.34,
 0.6452,
 0.0,
 0.3182,
 0.4215,
 -0.2617,
 -0.5106,
 0.1154,
 0.4939,
 0.357,
 0.0498,
 -0.2732,
 -0.4588,
 0.0,
 -0.4588,
 -0.0772,
 0.4404,
 -0.5423,
 0.25,
 0.2263,
 0.4404,
 -0.0,
 0.25,
 -0.7269,
 0.3182,
 0.0,
 -0.7425,
 0.0,
 -0.4019,
 0.6597,
 -0.5423,
 0.0,
 0.648

In [38]:
# Sentence length

# Test sentence length extraction
sentence_length = [len(word_tokenize(sample)) for sample in train_x]
sentence_length


[13,
 46,
 16,
 15,
 18,
 15,
 15,
 65,
 23,
 6,
 8,
 19,
 19,
 20,
 9,
 11,
 4,
 9,
 12,
 11,
 2,
 23,
 30,
 19,
 4,
 33,
 12,
 22,
 1,
 4,
 23,
 10,
 30,
 9,
 8,
 9,
 15,
 14,
 20,
 29,
 14,
 18,
 15,
 9,
 17,
 20,
 17,
 9,
 17,
 15,
 4,
 12,
 18,
 10,
 11,
 12,
 12,
 1,
 21,
 22,
 7,
 20,
 5,
 4,
 32,
 5,
 31,
 1,
 21,
 19,
 16,
 9,
 2,
 28,
 22,
 14,
 20,
 18,
 32,
 23,
 11,
 21,
 8,
 4,
 1,
 15,
 22,
 28,
 11,
 13,
 17,
 41,
 13,
 21,
 10,
 16,
 17,
 25,
 29,
 6,
 5,
 5,
 20,
 7,
 6,
 11,
 13,
 36,
 22,
 5,
 1,
 23,
 19,
 18,
 43,
 18,
 12,
 19,
 23,
 12,
 8,
 36,
 25,
 12,
 13,
 11,
 12,
 1,
 18,
 9,
 15,
 9,
 26,
 10,
 6,
 36,
 11,
 2,
 13,
 21,
 6,
 12,
 19,
 20,
 7,
 9,
 13,
 17,
 43,
 14,
 18,
 24,
 12,
 31,
 8,
 9,
 23,
 16,
 7,
 7,
 24,
 2,
 14,
 9,
 15,
 27,
 28,
 19,
 5,
 15,
 4,
 1,
 5,
 7,
 1,
 19,
 46,
 9,
 7,
 3,
 19,
 0,
 42,
 7,
 18,
 7,
 12,
 21,
 13,
 10,
 13,
 27,
 22,
 24,
 5,
 20,
 14,
 7,
 31,
 9,
 46,
 12,
 7,
 20,
 18,
 16,
 7,
 20,
 6,
 7,
 7,
 43,
 11,
 11

In [39]:
# PoS

pos_counts = []

for sample in train_x:
    nn = 0
    vb = 0
    jj = 0
    for word, pos in pos_tag(word_tokenize(sample)):
        if pos.startswith('NN'):
            nn += 1
        elif pos.startswith('VB'):
            vb += 1
        elif pos.startswith('JJ'):
            jj += 1
            
    pos_counts.append([nn, vb, jj])

    
pos_counts

[[5, 2, 1],
 [13, 10, 0],
 [4, 3, 3],
 [2, 4, 1],
 [3, 2, 2],
 [4, 3, 2],
 [2, 5, 1],
 [13, 13, 7],
 [6, 5, 0],
 [1, 0, 1],
 [2, 1, 3],
 [2, 3, 2],
 [3, 5, 1],
 [7, 5, 0],
 [3, 1, 0],
 [2, 3, 0],
 [1, 1, 1],
 [2, 1, 2],
 [3, 2, 3],
 [4, 2, 2],
 [1, 1, 0],
 [5, 4, 4],
 [7, 3, 5],
 [3, 4, 4],
 [1, 1, 0],
 [10, 4, 3],
 [4, 1, 2],
 [4, 2, 1],
 [1, 0, 0],
 [2, 0, 0],
 [4, 5, 2],
 [3, 2, 2],
 [12, 4, 4],
 [3, 1, 1],
 [3, 2, 0],
 [0, 3, 0],
 [6, 2, 1],
 [4, 2, 0],
 [4, 4, 2],
 [9, 3, 3],
 [3, 3, 1],
 [4, 3, 1],
 [4, 1, 2],
 [2, 2, 1],
 [4, 4, 3],
 [5, 3, 1],
 [4, 3, 2],
 [2, 2, 1],
 [5, 2, 3],
 [5, 2, 1],
 [1, 0, 1],
 [1, 3, 4],
 [5, 6, 0],
 [2, 2, 1],
 [2, 2, 1],
 [3, 2, 2],
 [2, 3, 0],
 [0, 0, 0],
 [5, 4, 2],
 [6, 3, 2],
 [2, 1, 1],
 [6, 3, 1],
 [1, 1, 1],
 [2, 0, 1],
 [5, 11, 2],
 [0, 2, 1],
 [4, 6, 4],
 [0, 0, 0],
 [6, 6, 1],
 [5, 5, 0],
 [3, 4, 1],
 [3, 1, 1],
 [0, 0, 0],
 [9, 5, 2],
 [4, 6, 0],
 [4, 3, 3],
 [5, 3, 3],
 [4, 4, 2],
 [6, 5, 2],
 [3, 6, 1],
 [3, 1, 2],
 [2, 6, 0],
 [2, 1, 0

In [40]:
import numpy as np

# Precompute the features to increase grid-search efficiency
train_x_all_features = np.column_stack([ngram_predictions, sentiment, sentence_length, pos_counts])
train_x_all_features

NameError: name 'ngram_predictions' is not defined

In [32]:
# Extract features from a dataset
def extract_features(data_x):
    
    # Predictions based on ngrams from classifier trained on all data
    ngram_predictions = [1 if prediction == "arg" else 0 for prediction in clf_svm.predict(data_x)]
    
    # Sentiment of sentence
    sid = SentimentIntensityAnalyzer()
    sentiment = [sid.polarity_scores(sample)['compound'] for sample in data_x]
    
    # Sentence Length
    sentence_length = [len(word_tokenize(sample)) for sample in data_x]
    
    # Number of each parts of speech
    pos_counts = []
    for sample in data_x:
        nn = 0
        vb = 0
        jj = 0
        for word, pos in pos_tag(word_tokenize(sample)):
            if pos.startswith('NN'):
                nn += 1
            elif pos.startswith('VB'):
                vb += 1
            elif pos.startswith('JJ'):
                jj += 1
        pos_counts.append([nn, vb, jj])
    
    # Combine features into numpy matrix
    return np.column_stack([ngram_predictions, sentiment, sentence_length, pos_counts])

test_x_all_features = extract_features(test_x)
test_x_all_features

array([[ 0.000e+00,  4.215e-01,  7.000e+00,  0.000e+00,  3.000e+00,
         1.000e+00],
       [ 1.000e+00,  7.445e-01,  1.900e+01,  5.000e+00,  3.000e+00,
         2.000e+00],
       [ 1.000e+00, -9.217e-01,  1.700e+01,  3.000e+00,  2.000e+00,
         1.000e+00],
       [ 0.000e+00,  3.612e-01,  2.700e+01,  7.000e+00,  7.000e+00,
         0.000e+00],
       [ 1.000e+00,  4.404e-01,  2.500e+01,  5.000e+00,  5.000e+00,
         2.000e+00],
       [ 1.000e+00, -6.623e-01,  1.100e+01,  5.000e+00,  0.000e+00,
         1.000e+00],
       [ 1.000e+00, -2.755e-01,  3.300e+01,  4.000e+00,  6.000e+00,
         2.000e+00],
       [ 1.000e+00,  3.612e-01,  1.200e+01,  4.000e+00,  4.000e+00,
         1.000e+00],
       [ 0.000e+00,  3.612e-01,  6.000e+00,  2.000e+00,  1.000e+00,
         0.000e+00],
       [ 1.000e+00, -4.019e-01,  1.600e+01,  3.000e+00,  3.000e+00,
         1.000e+00],
       [ 0.000e+00,  0.000e+00,  1.200e+01,  3.000e+00,  2.000e+00,
         2.000e+00],
       [ 1.000e+00,  

### Classification



### SVM

In [27]:
# from sklearn.pipeline import FeatureUnion

from sklearn.preprocessing import Normalizer
import sklearn.utils.validation

parameters_stack_svm = {'clf__C': [100, 10, 1],
                        'clf__gamma': [0.001, 0.01, 0.1],
                        'clf__kernel': ['rbf', 'linear', 'sigmoid'],
                        'nor': [None, Normalizer(norm='l1'), Normalizer(norm='l2'), Normalizer(norm='max')]}

pipeline_stack_svm = Pipeline([('nor', Normalizer()), ('clf', SVC())])

clf_stack_svm = GridSearchCV(pipeline_stack_svm, parameters_stack_svm, cv=StratifiedKFold(n_splits=3, random_state=998), scoring='f1_macro', n_jobs=-1)

clf_stack_svm.fit(train_x_all_features, train_y)

clf_stack_svm.best_params_

NameError: name 'train_x_all_features' is not defined

### Naive Bayes

In [165]:
from sklearn.naive_bayes import GaussianNB

parameters_stack_nb = {'clf__var_smoothing': [0.00001, 0.000001, 0.0000001],
                       'nor': [None, Normalizer(norm='l1'), Normalizer(norm='l2'), Normalizer(norm='max')]}
    
pipeline_stack_nb = Pipeline([('nor', Normalizer()), ('clf', GaussianNB())])

clf_stack_nb = GridSearchCV(pipeline_stack_nb, parameters_stack_nb, cv=StratifiedKFold(n_splits=3, random_state=998), scoring='f1_macro',  n_jobs=-1)

clf_stack_nb.fit(train_x_all_features, train_y)

clf_stack_nb.best_params_

{'clf__var_smoothing': 1e-06, 'nor': None}

### Decision Tree

In [182]:
from sklearn.tree import DecisionTreeClassifier

parameters_stack_dec = {'clf__criterion':['gini','entropy'],
                        'clf__max_depth': range(2, 10),
                        'clf__max_features': ['sqrt', 'log2', None],
                        'nor': [None, Normalizer(norm='l1'), Normalizer(norm='l2'), Normalizer(norm='max')]}
pipeline_stack_dec = Pipeline([('nor', Normalizer()), ('clf', DecisionTreeClassifier())])

clf_stack_dec = GridSearchCV(pipeline_stack_dec, parameters_stack_dec, cv=StratifiedKFold(n_splits=3, random_state=998), scoring='f1_macro', n_jobs=-1)

clf_stack_dec.fit(train_x_all_features, train_y)

clf_stack_dec.best_params_

{'clf__criterion': 'gini',
 'clf__max_depth': 3,
 'clf__max_features': None,
 'nor': Normalizer(copy=True, norm='max')}

### Logistic Regression

In [167]:
from sklearn.linear_model import LogisticRegression

parameters_stack_log = {'clf__penalty': ['l1', 'l2'],
                        'clf__C': [0.01, 0.1, 1, 10],
                        'nor': [None, Normalizer(norm='l1'), Normalizer(norm='l2'), Normalizer(norm='max')]}

pipeline_stack_log = Pipeline([('nor', Normalizer()), ('clf', LogisticRegression())])

clf_stack_log = GridSearchCV(pipeline_stack_log, parameters_stack_log, cv=StratifiedKFold(n_splits=3, random_state=998), scoring='f1_macro', n_jobs=-1)

clf_stack_log.fit(train_x_all_features, train_y)

clf_stack_log.best_params_

{'clf__C': 0.1, 'clf__penalty': 'l2', 'nor': None}

In [170]:
from sklearn.metrics import classification_report

print("SVM")
print(classification_report(test_y, clf_stack_svm.predict(test_x_all_features)))

print("Naive Bayes")
print(classification_report(test_y, clf_stack_nb.predict(test_x_all_features)))

print("Decision Tree")
print(classification_report(test_y, clf_stack_dec.predict(test_x_all_features)))

print("Logistic Regression")
print(classification_report(test_y, clf_stack_log.predict(test_x_all_features)))

SVM
              precision    recall  f1-score   support

         arg       0.68      0.90      0.77        49
     not_arg       0.86      0.59      0.70        51

    accuracy                           0.74       100
   macro avg       0.77      0.74      0.73       100
weighted avg       0.77      0.74      0.73       100

Naive Bayes
              precision    recall  f1-score   support

         arg       0.70      0.82      0.75        49
     not_arg       0.79      0.67      0.72        51

    accuracy                           0.74       100
   macro avg       0.75      0.74      0.74       100
weighted avg       0.75      0.74      0.74       100

Decision Tree
              precision    recall  f1-score   support

         arg       0.74      0.76      0.75        49
     not_arg       0.76      0.75      0.75        51

    accuracy                           0.75       100
   macro avg       0.75      0.75      0.75       100
weighted avg       0.75      0.75      0.75 

In [179]:
from sklearn import metrics

print("SVM")
print("Test Score: " + str(metrics.f1_score(test_y, clf_stack_svm.predict(test_x_all_features), average='macro')))
print("Best Score: " + str(clf_stack_svm.best_score_))

print("Naive Bayes")
print("Test Score: " + str(metrics.f1_score(test_y, clf_stack_nb.predict(test_x_all_features), average='macro')))
print("Best Score: " + str(clf_stack_nb.best_score_))

print("Decision Tree")
print("Test Score: " + str(metrics.f1_score(test_y, clf_stack_dec.predict(test_x_all_features), average='macro')))
print("Best Score: " + str(clf_stack_dec.best_score_))

print("Logistic Regression")
print("Test Score: " + str(metrics.f1_score(test_y, clf_stack_log.predict(test_x_all_features), average='macro')))
print("Best Score: " + str(clf_stack_log.best_score_))

SVM
Test Score: 0.7348021215830274
Best Score: 0.7226988173874596
Naive Bayes
Test Score: 0.7390606182256121
Best Score: 0.6852952890935926
Decision Tree
Test Score: 0.7898108297467721
Best Score: 0.7155664123369173
Logistic Regression
Test Score: 0.7688674505074868
Best Score: 0.7219398796262165


### Rebuild the best classifiers using all the data

In order to maximise the data and improve results

In [9]:
# Datasets including all data

train_x_all_data = [sample["text"] for sample in dataset]
train_y_all_data = [sample["label"] for sample in dataset]

### ngram Model

In [15]:
# Lemmatize the data in a preprocessing step. joblib does not save custom classes in pipelines

train_x_lemma_all_data = [' '.join(lemmatize_all(sentence)) for sentence in train_x_all_data]
train_x_lemma_all_data

['The motivation for the age restriction like a lot of the Constitution might have root in the political situation in Europe in the 1700s',
 'If AlexandriaOcasio Cortez want to run for President in 2020 and people think she be too young and inexperienced they could vote against her for that reason',
 'Various article I could quote to support this lmk Women generally live a couple year longer',
 'Why exactly',
 'The minimum age requirement do at least give you some potential limit as to which someone can fill out a political resume at the very least make themselves a known quantity give people better idea as to what sort of a person a politician iscan be while set a maximum only hamper potentially still competent still capable public servant from fulfil an important role',
 'As a foot note after the US leave Vietnam China invade and fail too if that say anything about the strength of modern guerrilla fighting',
 'How be it tyranny if the citizenry who be most afraid of tyranny be on the

In [16]:
# Train the weak learner with the best parameters but use all the data

ngram_model = Pipeline([('vect', TfidfVectorizer(stop_words=None, ngram_range=(1, 1), norm=None, use_idf=False)), ('clf', SVC(C=10, gamma=0.15, kernel='rbf'))])

ngram_model.fit(train_x_lemma_all_data, train_y_all_data)


Pipeline(memory=None,
         steps=[('vect',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm=None,
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=False,
                                 vocabulary=None)),
                ('clf',
                 SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
                     decision_function_shape='ovr', degree=3, gamma=0.15,


### Meta Classifier



In [42]:
# Extract features for all the data

# Use cross-validated predictions to avoid overfitting
cross_val_predictions_all_data = cross_val_predict(ngram_model, train_x_all_data, train_y_all_data, cv=3)

ngram_predictions_all_data = [1 if prediction == "arg" else 0 for prediction in cross_val_predictions_all_data]
sentiment_all_data = [sid.polarity_scores(sample)['compound'] for sample in train_x_all_data]
sentence_length_all_data = [len(word_tokenize(sample)) for sample in train_x_all_data]
pos_counts_all_data = []

for sample in train_x_all_data:
    nn = 0
    vb = 0
    jj = 0
    for word, pos in pos_tag(word_tokenize(sample)):
        if pos.startswith('NN'):
            nn += 1
        elif pos.startswith('VB'):
            vb += 1
        elif pos.startswith('JJ'):
            jj += 1
            
    pos_counts_all_data.append([nn, vb, jj])
    
train_x_all_features_all_data = np.column_stack([ngram_predictions_all_data, sentiment_all_data, sentence_length_all_data, pos_counts_all_data])
train_x_all_features_all_data.shape

(1000, 6)

In [43]:
# Train the meta model using all the data
meta_model = Pipeline([('nor', Normalizer(norm='max')), ('clf', DecisionTreeClassifier(criterion='gini', max_depth=3, max_features=None))])

meta_model.fit(train_x_all_features_all_data, train_y_all_data)

Pipeline(memory=None,
         steps=[('nor', Normalizer(copy=True, norm='max')),
                ('clf',
                 DecisionTreeClassifier(class_weight=None, criterion='gini',
                                        max_depth=3, max_features=None,
                                        max_leaf_nodes=None,
                                        min_impurity_decrease=0.0,
                                        min_impurity_split=None,
                                        min_samples_leaf=1, min_samples_split=2,
                                        min_weight_fraction_leaf=0.0,
                                        presort=False, random_state=None,
                                        splitter='best'))],
         verbose=False)


## Save the Models

In [17]:
from sklearn.externals import joblib

import pickle

joblib.dump(ngram_model, "./models/arg_detection_ngram_model.pkl")
joblib.dump(meta_model, "./models/arg_detection_meta_model.pkl")



['./models/arg_detection_ngram_model.pkl']

In [49]:
ngram_model.predict(["People having good careers, means more revenue for the government and happier citizens."])[0]
ngram_model.predict([""])[0]

'not_arg'

In [48]:
meta_model.predict(np.array(["People having good careers, means more revenue for the government and happier citizens."]).reshape(1, -1))

ValueError: could not convert string to float: 'People having good careers, means more revenue for the government and happier citizens.'