# Argument Detection

## Prepare Data

In [12]:
# Load the labelled sentences from file: one JSON object per line.

import json

dataset = []

# Explicit UTF-8 so decoding does not depend on the platform's default locale encoding.
with open('./labelled_data/1000_labelled_argument_sentences_8.json', encoding='utf-8') as f:
    for line in f:
        # Skip blank lines (e.g. stray trailing newlines); json.loads would raise on them.
        if not line.strip():
            continue

        json_line = json.loads(line)
        # Keep the sentence text and only the first annotated label.
        arg = {"text": json_line["content"], "label": json_line["annotation"]["labels"][0]}

        dataset.append(arg)

In [15]:
# Partition the labelled examples: 33% held out for testing, the rest for training.
# The fixed random_state keeps the partition identical on every run.

from sklearn.model_selection import train_test_split

train, test = train_test_split(dataset, random_state=42, test_size=0.33)

# Sentence texts (model inputs)
train_x = [example["text"] for example in train]
test_x = [example["text"] for example in test]

# Annotated labels (model targets)
train_y = [example["label"] for example in train]
test_y = [example["label"] for example in test]


In [26]:
# Bag-of-words features: token counts with English stop words removed.

from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(stop_words='english')

# Learn the vocabulary from the training sentences only, then encode both splits
# with that same vocabulary so the test set never influences the features.
cv.fit(train_x)
train_x_bow = cv.transform(train_x)
test_x_bow = cv.transform(test_x)


## Classification

#### SVM

In [30]:
from sklearn import svm

# Support vector classifier with a linear kernel, trained on the bag-of-words counts.
clf_svm = svm.SVC(kernel='linear')

clf_svm.fit(train_x_bow, train_y)

# Spot check on the first test sentence. A notebook cell only displays its final
# expression, so the prediction is printed explicitly; the true label follows.
print("predicted:", clf_svm.predict(test_x_bow[0])[0])
test_y[0]

'not-arg'

#### Decision Tree

In [35]:
from sklearn.tree import DecisionTreeClassifier

# Seed the tree so the fitted model (and its score) is reproducible across runs,
# matching the fixed seed already used for the train/test split.
clf_dec = DecisionTreeClassifier(random_state=42)

clf_dec.fit(train_x_bow, train_y)

# Spot check on the first test sentence. A notebook cell only displays its final
# expression, so the prediction is printed explicitly; the true label follows.
print("predicted:", clf_dec.predict(test_x_bow[0])[0])
test_y[0]

'not-arg'

#### Logistic Regression

In [37]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with library defaults, trained on the bag-of-words counts.
clf_log = LogisticRegression()

clf_log.fit(train_x_bow, train_y)

# Spot check on the first test sentence. A notebook cell only displays its final
# expression, so the prediction is printed explicitly; the true label follows.
print("predicted:", clf_log.predict(test_x_bow[0])[0])
test_y[0]



'not-arg'

#### Naive Bayes

In [43]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes; the sparse bag-of-words matrices are converted to dense
# arrays with .toarray() before being passed to fit/predict.
clf_gnb = GaussianNB()

clf_gnb.fit(train_x_bow.toarray(), train_y)

# Spot check on the first test sentence. A notebook cell only displays its final
# expression, so the prediction is printed explicitly; the true label follows.
print("predicted:", clf_gnb.predict(test_x_bow[0].toarray())[0])
test_y[0]

'not-arg'

## Evaluation

In [51]:
# Mean accuracy of each classifier on the held-out test set.
# The calls were commented out, so a fresh run printed nothing; they are restored
# here in their original order.
print(clf_svm.score(test_x_bow, test_y))            # SVM (linear kernel)
print(clf_dec.score(test_x_bow, test_y))            # Decision tree
print(clf_gnb.score(test_x_bow.toarray(), test_y))  # Gaussian naive Bayes (dense input)
print(clf_log.score(test_x_bow, test_y))            # Logistic regression


0.6121212121212121
0.6060606060606061
0.6484848484848484
0.6787878787878788


In [50]:
# F1 of the SVM on the test set, reported separately for each class
# (average=None), in the order given by `labels`.

from sklearn.metrics import f1_score

svm_test_pred = clf_svm.predict(test_x_bow)
f1_score(y_true=test_y, y_pred=svm_test_pred, labels=["arg", "not-arg"], average=None)

array([0.6       , 0.62352941])