# Argument Detection

## Prepare Data

In [124]:
# Load data from file

import json

dataset = []

with open('./labelled_data/1000_labelled_argument_sentences_8.json') as f:
    for line in f:
        json_line = json.loads(line)
        arg = {"text": json_line["content"], "label": json_line["annotation"]["labels"][0]}

        dataset.append(arg)

dataset

[{'text': 'I assume that she was talking about muscle memory?',
  'label': 'not-arg'},
 {'text': 'So what behavior is it that you object to in people who are against cultural appropriation?',
  'label': 'not-arg'},
 {'text': 'The rule has to be that the person who used the word gets to define what they meant by it.',
  'label': 'arg'},
 {'text': 'I would understand every word but sometimes not get the overall meaning.',
  'label': 'not-arg'},
 {'text': 'When I was studying the psychology of learning in university, almost all of the studies we looked at corresponded to automatic/subconsciously learned things.',
  'label': 'arg'},
 {'text': "If a term is not fitting or you don't know how to name it you can use additional more basic words to describe your thoughts.",
  'label': 'not-arg'},
 {'text': "I still disagree with it In absolute principle as the worst thing I've ever seen but it's not made up at least.",
  'label': 'arg'},
 {'text': 'Discrepancies in medical treatment, behavior of

In [127]:
# Create alternative dataset based on lemma

from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

nltk.download('wordnet')

def lemmatize_all(sentence):
    wnl = WordNetLemmatizer()
    for word, tag in pos_tag(word_tokenize(sentence)):
        if tag.startswith("NN"):
            yield wnl.lemmatize(word, pos='n')
        elif tag.startswith('VB'):
            yield wnl.lemmatize(word, pos='v')
        elif tag.startswith('JJ'):
            yield wnl.lemmatize(word, pos='a')
        else:
            yield word

dataset_lemma = [{"text": ' '.join(lemmatize_all(sample["text"])), "label":sample["label"]} for sample in dataset]

dataset_lemma

[nltk_data] Downloading package wordnet to /home/effsy/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


[{'text': 'I assume that she be talk about muscle memory ?',
  'label': 'not-arg'},
 {'text': 'So what behavior be it that you object to in people who be against cultural appropriation ?',
  'label': 'not-arg'},
 {'text': 'The rule have to be that the person who use the word get to define what they mean by it .',
  'label': 'arg'},
 {'text': 'I would understand every word but sometimes not get the overall meaning .',
  'label': 'not-arg'},
 {'text': 'When I be study the psychology of learn in university , almost all of the study we look at correspond to automatic/subconsciously learned thing .',
  'label': 'arg'},
 {'text': "If a term be not fitting or you do n't know how to name it you can use additional more basic word to describe your thought .",
  'label': 'not-arg'},
 {'text': "I still disagree with it In absolute principle as the bad thing I 've ever see but it 's not make up at least .",
  'label': 'arg'},
 {'text': 'Discrepancies in medical treatment , behavior of law enforceme

In [61]:
# Split dataset into training and testing set

from sklearn.model_selection import train_test_split

train, test = train_test_split(dataset, test_size=0.33, random_state=42)

train_x = [x["text"] for x in train]
train_y = [x["label"] for x in train]

test_x = [x["text"] for x in test]
test_y = [x["label"] for x in test]


In [92]:
# Represent text as BoW 

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

cv = TfidfVectorizer(ngram_range=(1, 2))
train_x_bow = cv.fit_transform(train_x)
test_x_bow = cv.transform(test_x)


## Classification

#### SVM

In [93]:
from sklearn import svm

clf_svm = svm.SVC(kernel='linear')

clf_svm.fit(train_x_bow, train_y)

clf_svm.predict(test_x_bow[0])
test_y[0]

'not-arg'

#### Decision Tree

In [94]:
from sklearn.tree import DecisionTreeClassifier

clf_dec = DecisionTreeClassifier()

clf_dec.fit(train_x_bow, train_y)

clf_dec.predict(test_x_bow[0])
test_y[0]

'not-arg'

#### Logistic Regression

In [95]:
from sklearn.linear_model import LogisticRegression

clf_log = LogisticRegression()

clf_log.fit(train_x_bow, train_y)

clf_log.predict(test_x_bow[0])
test_y[0]



'not-arg'

#### Naive Bayes

In [96]:
from sklearn.naive_bayes import GaussianNB

clf_gnb = GaussianNB()

clf_gnb.fit(train_x_bow.toarray(), train_y)

clf_gnb.predict(test_x_bow[0].toarray())
test_y[0]

'not-arg'

## Evaluation

In [67]:
# print(clf_svm.score(test_x_bow, test_y))
# print(clf_dec.score(test_x_bow, test_y))
# print(clf_gnb.score(test_x_bow.toarray(), test_y))
# print(clf_log.score(test_x_bow, test_y))


In [97]:
# f1 score

from sklearn.metrics import f1_score

display(f1_score(test_y, clf_svm.predict(test_x_bow), average=None, labels=["arg", "not-arg"]))
display(f1_score(test_y, clf_gnb.predict(test_x_bow.toarray()), average=None, labels=["arg", "not-arg"]))
display(f1_score(test_y, clf_log.predict(test_x_bow), average=None, labels=["arg", "not-arg"]))
display(f1_score(test_y, clf_dec.predict(test_x_bow), average=None, labels=["arg", "not-arg"]))


array([0.81222707, 0.57425743])

array([0.78923767, 0.56074766])

array([0.75396825, 0.20512821])

array([0.66666667, 0.58503401])

In [None]:
# Tune the model parameters with grid search

## Improving the Model

This is the baseline. We will now explore adding different features to improve the classifier

In [108]:
# Create new features. 

import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

nltk.download('vader_lexicon')

train_x_bow
train_x



# Sentiment
sid = SentimentIntensityAnalyzer()


for sentence in train_x:
    print(sentence)
    ss = sid.polarity_scores(sentence)
    for k in sorted(ss):
        print('{0}: {1}, '.format(k, ss[k]), end='')
        print()

# URLs

# Reddit features

# POS number of each

# Sentence length
train_x_bow



[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/effsy/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


But for now, the arena is such that people are forced to pay for insurance, not because of the cost of preventative care, but because of the unforseen costs of having to be hospitalized for emergency care.
compound: 0.2023, 
neg: 0.147, 
neu: 0.711, 
pos: 0.142, 
(I expanded a bit on that [here](https://www.reddit.com/r/changemyview/comments/eunbnm/cmv_saying_definitions_change_or_language_is/ffqtlhp/).)
compound: 0.0, 
neg: 0.0, 
neu: 1.0, 
pos: 0.0, 
Discrepancies in medical treatment, behavior of law enforcement, upwards social mobility, etc.
compound: 0.0, 
neg: 0.0, 
neu: 1.0, 
pos: 0.0, 
If you look at the cost of Medicare currently and then extrapolate it to cover the rest of the population, you would have to increase taxes to a point where private insurance premiums are cheaper anyway.
compound: 0.3182, 
neg: 0.0, 
neu: 0.937, 
pos: 0.063, 
Yes.
compound: 0.4019, 
neg: 0.0, 
neu: 0.0, 
pos: 1.0, 
So what are the prior assumptions that would work for you?
compound: 0.0, 
neg: 0.

compound: -0.1935, 
neg: 0.08, 
neu: 0.834, 
pos: 0.086, 
HxNx is just a naming scheme with some protein.
compound: 0.0, 
neg: 0.0, 
neu: 1.0, 
pos: 0.0, 
The public awareness and intergovernmental activity that has happened when one of these scares arise also allows for the practice and preparedness for when a real epidemic occurs.
compound: -0.34, 
neg: 0.085, 
neu: 0.915, 
pos: 0.0, 
Please note that a change of view doesn't necessarily mean a reversal, or that the conversation has ended.
compound: 0.3182, 
neg: 0.0, 
neu: 0.867, 
pos: 0.133, 
Wtf, I've never heard this before.
compound: -0.5859, 
neg: 0.432, 
neu: 0.568, 
pos: 0.0, 
All of Gotham believes this.
compound: 0.0, 
neg: 0.0, 
neu: 1.0, 
pos: 0.0, 
Income is all relative
compound: 0.0, 
neg: 0.0, 
neu: 1.0, 
pos: 0.0, 
https://www.merriamwebster.com/dictionary/sacrifice
compound: 0.0, 
neg: 0.0, 
neu: 1.0, 
pos: 0.0, 
If you're taking the epistemological approach that you can't prove someone isn't a racist because you ca

<335x5951 sparse matrix of type '<class 'numpy.float64'>'
	with 10103 stored elements in Compressed Sparse Row format>

In [121]:
# Add extra features into original train/test sets

train_new

[nltk_data] Downloading package wordnet to /home/effsy/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


['But for now , the arena be such that people be force to pay for insurance , not because of the cost of preventative care , but because of the unforseen cost of have to be hospitalize for emergency care .',
 '( I expand a bit on that [ here ] ( http : //www.reddit.com/r/changemyview/comments/eunbnm/cmv_saying_definitions_change_or_language_is/ffqtlhp/ ) . )',
 'Discrepancies in medical treatment , behavior of law enforcement , upwards social mobility , etc .',
 'If you look at the cost of Medicare currently and then extrapolate it to cover the rest of the population , you would have to increase tax to a point where private insurance premium be cheap anyway .',
 'Yes .',
 'So what be the prior assumption that would work for you ?',
 'For example my wife want to have this stupid rubber mat in all our cabinet and i hate them .',
 'Can it be tranmitted in the incubation and nonsymptomatic phase of infection ?',
 'The transcript outline the quid pro quo',
 'Does a trillion dollar bailout t

In [122]:
train_x

['But for now, the arena is such that people are forced to pay for insurance, not because of the cost of preventative care, but because of the unforseen costs of having to be hospitalized for emergency care.',
 '(I expanded a bit on that [here](https://www.reddit.com/r/changemyview/comments/eunbnm/cmv_saying_definitions_change_or_language_is/ffqtlhp/).)',
 'Discrepancies in medical treatment, behavior of law enforcement, upwards social mobility, etc.',
 'If you look at the cost of Medicare currently and then extrapolate it to cover the rest of the population, you would have to increase taxes to a point where private insurance premiums are cheaper anyway.',
 'Yes.',
 'So what are the prior assumptions that would work for you?',
 'For example my wife wants to have this stupid rubber mats in all our cabinets and i hate them.',
 'Can it be tranmitted in the incubation and nonsymptomatic phase of infection?',
 'The transcript outlined the quid pro quo',
 'Does a trillion dollar bailout teac


## Save the Model