# Exploratory Data Analysis

In this notebook, exploratory data analysis (EDA) is carried out to understand the data. The findings will inform feature extraction for the argumentation prediction classifier.

In [None]:
# Load data from file
#
# The file is read line by line, one JSON object per line. Each object supplies
# the sentence under "content" and its labels under "annotation" -> "labels".

import json

data = []

# Explicit UTF-8 so decoding does not depend on the platform's default locale.
with open('./labelled_data/1000_labelled_argument_sentences.json', encoding='utf-8') as f:
    for line in f:
        # Skip blank lines (e.g. a trailing newline at the end of the file),
        # which json.loads would otherwise reject.
        if not line.strip():
            continue

        json_line = json.loads(line)
        # Only the first label of each annotation is kept.
        arg = {"text": json_line["content"], "label": json_line["annotation"]["labels"][0]}

        data.append(arg)

In [None]:
# Create document term matrices (dtm) and corpus

from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd


def _vocabulary_terms(vectorizer):
    """Return the fitted vectorizer's terms, in document-term-matrix column order.

    scikit-learn 1.0 introduced ``get_feature_names_out`` and 1.2 removed
    ``get_feature_names``; prefer the new method and fall back to the old one
    so the cell runs on either side of that change.
    """
    if hasattr(vectorizer, "get_feature_names_out"):
        return vectorizer.get_feature_names_out()
    return vectorizer.get_feature_names()


# Corpus: one row per sentence, with "text" and "label" columns
df_corpus = pd.DataFrame(data)

# dtm without stop words
cv = CountVectorizer(stop_words='english')
df_cv = cv.fit_transform(df_corpus.text)
df_dtm = pd.DataFrame(df_cv.toarray(), columns=_vocabulary_terms(cv))
# Share the corpus index so rows of the dtm can be selected by corpus label.
df_dtm.index = df_corpus.index

# dtm with stop words
cv_all = CountVectorizer()
df_cv_all = cv_all.fit_transform(df_corpus.text)
df_dtm_all = pd.DataFrame(df_cv_all.toarray(), columns=_vocabulary_terms(cv_all))
df_dtm_all.index = df_corpus.index

display(df_corpus.head())
display(df_dtm.head())
display(df_dtm_all.head())

In [None]:
# Class balance: number of sentences labelled "arg" and "not_arg"
is_arg = df_corpus["label"] == "arg"
is_not_arg = df_corpus["label"] == "not_arg"

number_of_arg = len(df_corpus.loc[is_arg])
number_of_not_arg = len(df_corpus.loc[is_not_arg])

display(number_of_arg)
display(number_of_not_arg)

## Without stop words

In [None]:
# Mean number of counted words per sentence, for arg and not_arg
# (df_dtm was built with English stop words removed)

# df_dtm shares df_corpus's index, so a label mask selects the matching rows.
dtm_arg = df_dtm.loc[df_corpus["label"] == "arg"]
dtm_notarg = df_dtm.loc[df_corpus["label"] == "not_arg"]

# Row sums give the word count of each sentence.
words_per_arg_sentence = dtm_arg.sum(axis=1)
words_per_notarg_sentence = dtm_notarg.sum(axis=1)

print("Average number of words in arg sentences")
display(words_per_arg_sentence.mean())

print("Average number of words in not_arg sentences")
display(words_per_notarg_sentence.mean())

In [None]:
# Most Common Words - Quantitative
# Top 20 terms by total count, shown for arg first and then not_arg

for class_dtm in (dtm_arg, dtm_notarg):
    term_totals = class_dtm.sum()
    display(term_totals.sort_values(ascending=False).head(20))

In [None]:
# Most Common Words - Visual

from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import wordcloud

import matplotlib.pyplot as plt

# Split the corpus by label and join each class's sentences into one string
df_arg = df_corpus[df_corpus["label"] == "arg"]
df_notarg = df_corpus[df_corpus["label"] == "not_arg"]
concat_arg = " ".join(df_arg.text)
concat_notarg = " ".join(df_notarg.text)

# Both clouds use the same rendering options
cloud_options = dict(max_font_size=50, max_words=100, background_color="white")
wordcloud_arg = WordCloud(**cloud_options).generate(concat_arg)
wordcloud_notarg = WordCloud(**cloud_options).generate(concat_notarg)

# Render each cloud with its caption printed just before the figure is shown
for caption, cloud in (
    ("most common words: arg", wordcloud_arg),
    ("most common words: not_arg", wordcloud_notarg),
):
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis("off")
    print(caption)
    plt.show()

In [None]:
# Full vocabulary of the dataset and number of occurrences of each term
vocabulary_counts = df_dtm.sum()
print(vocabulary_counts.to_frame())

## With stop words

In [None]:
# Mean number of words per sentence, for arg and not_arg
# (df_dtm_all was built without stop-word filtering)

# df_dtm_all shares df_corpus's index, so a label mask selects the matching rows.
dtm_arg_all = df_dtm_all.loc[df_corpus["label"] == "arg"]
dtm_notarg_all = df_dtm_all.loc[df_corpus["label"] == "not_arg"]

# Row sums give the word count of each sentence.
words_per_arg_sentence_all = dtm_arg_all.sum(axis=1)
words_per_notarg_sentence_all = dtm_notarg_all.sum(axis=1)

print("Average number of words in arg sentences")
display(words_per_arg_sentence_all.mean())

print("Average number of words in not_arg sentences")
display(words_per_notarg_sentence_all.mean())

In [None]:
# Most Common Words - Quantitative
# Top 20 terms by total count (stop words included), arg first and then not_arg
for class_dtm_all in (dtm_arg_all, dtm_notarg_all):
    term_totals_all = class_dtm_all.sum()
    display(term_totals_all.sort_values(ascending=False).head(20))

In [None]:
# Most Common Words - Visual (stop words included)

from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import wordcloud

import matplotlib.pyplot as plt

# Split the corpus by label and join each class's sentences into one string
df_arg = df_corpus[df_corpus["label"] == "arg"]
df_notarg = df_corpus[df_corpus["label"] == "not_arg"]
concat_arg = " ".join(df_arg.text)
concat_notarg = " ".join(df_notarg.text)

# WordCloud treats stopwords=None as "use the built-in STOPWORDS list", which
# would make these clouds filter stop words just like the earlier section.
# An empty set is what actually disables stop-word filtering.
cloud_options = dict(stopwords=set(), max_font_size=50, max_words=100, background_color="white")
wordcloud_arg = WordCloud(**cloud_options).generate(concat_arg)
wordcloud_notarg = WordCloud(**cloud_options).generate(concat_notarg)

plt.imshow(wordcloud_arg, interpolation='bilinear')
plt.axis("off")
print("most common words: arg")
plt.show()

plt.imshow(wordcloud_notarg, interpolation='bilinear')
plt.axis("off")
print("most common words: not_arg")
plt.show()

In [None]:
# Full vocabulary of the dataset and number of occurrences of each term.
# This section covers the corpus *with* stop words, so use df_dtm_all
# (df_dtm has English stop words removed).
print(pd.DataFrame(df_dtm_all.sum()))

## Fraction of sentences that contain numbers



In [None]:
def contains_numbers(text):
    """Return True if at least one character of ``text`` is a digit, else False.

    Digits are detected with ``str.isdigit``; an empty ``text`` yields False.
    """
    for char in text:
        if char.isdigit():
            return True
    return False

# Share of each class's sentences that contain at least one digit character.
# True counts as 1 when summed, so the sum is the number of matching sentences.
print("Fraction of sentences that contain numbers: arg")
arg_sentences = df_corpus[df_corpus["label"] == "arg"].text.tolist()
display(sum(map(contains_numbers, arg_sentences)) / number_of_arg)

print("Fraction of sentences that contain numbers: not_arg")
not_arg_sentences = df_corpus[df_corpus["label"] == "not_arg"].text.tolist()
display(sum(map(contains_numbers, not_arg_sentences)) / number_of_not_arg)

### Sentiment

In [None]:
# Sentiment

import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

nltk.download('vader_lexicon')

sid = SentimentIntensityAnalyzer()

# For each class, average the 'compound' polarity score over its sentences
for label, heading in (
    ("arg", "Average sentiment in arg sentences"),
    ("not_arg", "Average sentiment in not_arg sentences"),
):
    sentences = df_corpus[df_corpus["label"] == label].text.tolist()

    sentiment = []
    for sentence in sentences:
        scores = sid.polarity_scores(sentence)
        sentiment.append(scores['compound'])

    print(heading)
    display(sum(sentiment)/len(sentiment))


### Part-of-speech (PoS) counts

In [None]:
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize

import numpy
from sklearn.preprocessing import normalize
from sklearn.preprocessing import minmax_scale


def _mean_pos_counts(sentences):
    """Return the per-sentence mean of [noun, verb, adjective] token counts.

    Each sentence is tokenized and tagged. A token counts as a noun, verb or
    adjective when its tag starts with 'NN', 'VB' or 'JJ' respectively; tokens
    with any other tag are ignored.

    Parameters
    ----------
    sentences : list of str
        Sentences to analyse.

    Returns
    -------
    numpy.ndarray
        Mean counts in the order [NN, VB, JJ].
    """
    tag_prefixes = ('NN', 'VB', 'JJ')
    counts_per_sentence = []

    for sentence in sentences:
        counts = [0, 0, 0]
        for _word, pos in pos_tag(word_tokenize(sentence)):
            # A tag is credited to the first matching prefix only.
            for slot, prefix in enumerate(tag_prefixes):
                if pos.startswith(prefix):
                    counts[slot] += 1
                    break
        counts_per_sentence.append(counts)

    # Column-wise mean over sentences
    return numpy.mean(counts_per_sentence, axis=0)


pos_counts_arg = _mean_pos_counts(df_corpus[df_corpus["label"] == "arg"].text.tolist())

print("Average number of PoS of NN (Nouns), VB (Verbs), JJ (Adjectives) in arg sentences")
print(pos_counts_arg)
print("Normalised")
# L1 normalisation rescales the three means so that they sum to 1.
print(normalize([pos_counts_arg], norm='l1'))

pos_counts_notarg = _mean_pos_counts(df_corpus[df_corpus["label"] == "not_arg"].text.tolist())

print("Average number of PoS of NN (Nouns), VB (Verbs), JJ (Adjectives) in not_arg sentences")
print(pos_counts_notarg)
print("Normalised")
print(normalize([pos_counts_notarg], norm='l1'))