<a href="https://colab.research.google.com/github/Charles-Scott-Green/Argument-mining/blob/master/2_Extract_Metadata.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
#!python -m spacy download en_core_web_lg --quiet

In [0]:
#!pip install vaderSentiment

In [0]:
#!pip install spacy-wordnet

## Import Libraries and initialize settings

In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

sns.set()

In [0]:
import re
import string
import math
from tqdm import tqdm

In [0]:
import nltk
from nltk import pos_tag
from nltk.corpus import wordnet as wn
from nltk.tokenize import sent_tokenize, word_tokenize
from pprint import pprint
from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktTrainer

In [0]:
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('omw')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package omw to /root/nltk_data...
[nltk_data]   Package omw is already up-to-date!


True

In [0]:
import spacy
from spacy.matcher import Matcher 
from spacy.tokens import Span 
from spacy import displacy 
from spacy.lang.en.stop_words import STOP_WORDS
from spacy_wordnet.wordnet_annotator import WordnetAnnotator

nlp = spacy.load("en_core_web_lg")
nlp.add_pipe(WordnetAnnotator(nlp.lang), after='tagger')

In [0]:
import warnings
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
warnings.filterwarnings('ignore')

In [0]:
from sklearn.model_selection import train_test_split as tts 
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, recall_score, precision_score

import lightgbm as lgb

from imblearn.over_sampling import SMOTE

## Define Functions

In [0]:
def topic_domains(topic):
    raw_domains = []
    common_domains = []

    for chunk in nlp(topic).noun_chunks:
        rt_nn = chunk.root
        raw_domains.append(rt_nn._.wordnet.wordnet_domains())

    if len(raw_domains) > 0:
        reference = raw_domains[0]
    else:
        return list(raw_domains)
        
    for domain in raw_domains:
        common_domains = set(reference) & set(domain)
        
    return list(common_domains)

In [0]:
def expanded_sentence_similarity(topic, sentence):
    
    extended_topic = []
    domains = topic_domains(topic)
    
    if len(domains) < 1:
        return nlp(topic).similarity(nlp(sentence))
    else:
        for token in nlp(topic):
            synsets = token._.wordnet.wordnet_synsets_for_domain(domains)
            if synsets: 
                lfs = []
                for syn in synsets:
                    lfs.extend(syn.lemma_names())
                    extended_topic.append('({})'.format('|'.join(set(lfs))))
            else:
                extended_topic.append(token.text)

        tokenized_extended_topic = nlp(' '.join(extended_topic))

    return tokenized_extended_topic.similarity(nlp(sentence))            

In [0]:
def subject_match(topic, sentence):
    
    ntopic = nlp(topic)
    scores = []
    for chunk in nlp(sentence).noun_chunks:
        scores.append(chunk.similarity(ntopic))
    
    return np.mean(scores)

In [0]:
def subject_match2(topic, sentence):
    
    ntopic = nlp(topic)
    scores = []
    for chnk in nlp(sentence).noun_chunks:
        for tchnk in ntopic.noun_chunks:
            scores.append(chnk.similarity(tchnk))
    
    return np.mean(scores)

In [0]:
def includes_that(sentence):
    check = []
    for tok in sentence:
        if tok.tag_ == 'IN':
            check.append(tok.text)
    if 'that' in check:
        return 1
    else:
        return 0

In [0]:
def patterns(sentence, pattern_type='dep'):
    
    pat = []
    if pattern_type == 'dep':
        pat = [tok.dep_ for tok in sentence]
    elif pattern_type == 'tag':
        pat = [tok.tag_ for tok in sentence]
    
    return '|'.join(pat)

In [0]:
def pos_pattern(sentence):
    pat = []
    for tok in sentence:
        pat.append(tok.pos_)

    return '|'.join(pat)


In [0]:
def pattern_in_text(pattern, sentence):
    text_pattern = pos_pattern(sentence)

    if pattern in text_pattern:
        return 1
    else:
        return 0

In [0]:
def infinitive_verb_present(sentence):
    text_pattern = pos_pattern(sentence)
    infinitive_pattern = 'PART|VERB'

    if infinitive_pattern in text_pattern:
        return 1
    else:
        return 0


In [0]:
def sentiment_subjectivity(sentence, score_type='sentiment'):
    analyzer = SentimentIntensityAnalyzer()

    if score_type == 'sentiment':
        pos = analyzer.polarity_scores(sentence)['pos']
        neg = analyzer.polarity_scores(sentence)['neg']
        return pos/neg
    
    else:
        return analyzer.polarity_scores(sentence)['compound']

In [0]:
def is_sentence(clause):
    nn = 2
    vb = 1
    cls = nlp(clause)
    for tok in cls:
        if tok.pos_ in ['NOUN', 'PROPN', 'PRON']:
            nn -= 1
        elif tok.pos_ in ['VERB']:
            vb -= 1
    if nn < 1 and vb < 1:
        return True
    else:
        return False

In [0]:
def get_clauses(sentence):
    """Sentence type is a string."""
    subtree_phrases = []
    
    for tok in nlp(sentence):
        if tok.head.pos_ == 'VERB':
            if len([tok for tok in tok.subtree]) >= 3:
                subtree_prhase = ''.join(tok.text_with_ws for tok in tok.subtree)
                if subtree_prhase != sentence:
                    if is_sentence(subtree_prhase):
                        splits = subtree_prhase.split(',')
                        if len(splits) == 1:
                            subtree_phrases.append(subtree_prhase)
                        else:
                            for split in splits:
                                if is_sentence(split):
                                    if split not in subtree_phrases:
                                        subtree_phrases.append(split) 
    
    raw_clauses = []
    clauses = []

    for phrase in subtree_phrases:
        tok_phrase = nlp(phrase)
        for chunk in tok_phrase.noun_chunks:
            span = tok_phrase[chunk.root.head.left_edge.i:chunk.root.head.right_edge.i+1]
            if span[0].pos_ in ['DET', 'ADP']:
                if span[1:].text not in raw_clauses:
                    if is_sentence(span[1:].text):
                        raw_clauses.append(span[1:].text)
            elif span.text not in raw_clauses:
                if is_sentence(span.text):
                    raw_clauses.append(span.text)
    
    if len(raw_clauses) == 0:
        clauses.append(sentence)
    else:
        for clause in raw_clauses:
            dup_checks = [(clause in check_clause) for check_clause in raw_clauses]
            if sum(dup_checks) == 1:
                clauses.append(clause)
                
    return clauses

In [0]:
def conf_matrix(y_true, y_predict):
    
    from sklearn.metrics import confusion_matrix
    
    data = confusion_matrix(y_true, y_predict)
    index = ['Actual_0', 'Actual_1']
    columns = ['Predicted_0', 'Predicted_1']
    
    return pd.DataFrame(data, index, columns)

In [0]:
def get_target_data(path):
    df = pd.read_csv(path)
    df['cdc_present'] = (df['cdc'] != '---').astype(int)

    return df['cdc_present']

In [0]:
def count_sentiment_words(sentence):
    analyzer = SentimentIntensityAnalyzer()
    wd_cnt = 0

    for tok in nlp(sentence):
        if analyzer.polarity_scores(tok.text)['compound'] != 0:
            wd_cnt += 1

    return wd_cnt

In [0]:
def count_syllables(sentence):
    """Given a string, calculates and returns the number of syllables of the word."""
    import string
    
    word_list = sentence.translate(string.punctuation).split()
    vowels = "aeiouy"
    tally = 0
    
    for word in word_list:
        word = word.lower()
        vowels = "aeiouy"
        count = 0
        
        if word[0] in vowels:
            count += 1

        for i in range(1, len(word)):
            if word[i] in vowels and word[i - 1] not in vowels:
                count += 1

        if word.endswith('e'):
            count -= 1

        if count == 0:
            count += 1
    
        tally += count
    
    return tally

In [0]:
def punct_marks(sentence):
    """Given a sentence, returns count and/or list of punctiation marks"""
    
    pm = []
    
    for tok in nlp(sentence):
        if tok.pos_ == 'PUNCT':
            pm.append(tok)
    return len(pm)

In [0]:
def noun_counts(sentence):
    
    chnk = []
    
    for chunk in nlp(sentence).noun_chunks:
        chnk.append(chunk)

    return len(chnk)

In [0]:
def word_count(sentence):
    wds = []
    for tok in nlp(sentence):
        if tok.pos_ != 'PUNCT':
            wds.append(tok)
    return len(wds)

In [0]:
def phrase_chunks(sentence):
    s = []
    for tok in nlp(sentence):
        if len([tok for tok in tok.subtree]) >=3:
            if ''.join(tok.text_with_ws for tok in tok.subtree) != str(sentence):
                s.append(''.join(tok.text_with_ws for tok in tok.subtree))
    return len(s)

## Load & Explore Data

In [0]:
# paths to articles and topics
path = '/content/drive/My Drive/Colab Notebooks/Thinkful/Module 34 - Final Capstone/data/CDC Detection/cl_datt.csv'


In [0]:
# Load topic/article index data
raw_data = pd.read_csv(path)


In [0]:
# Unique Claim counts
raw_data.shape

(41999, 5)

In [0]:
raw_data.columns

Index(['Unnamed: 0', 'text', 'topic', 'article', 'cdc'], dtype='object')

In [0]:
# Explore columns and types.
raw_data.head()

Unnamed: 0.1,Unnamed: 0,text,topic,article,cdc
0,0,Deficit spending is the amount by which a gove...,Europe should weaken its austerity measures to...,Deficit spending,---
1,1,Government deficit spending is a central point...,Europe should weaken its austerity measures to...,Deficit spending,---
2,2,Government deficit spending is a central point...,Europe should weaken its austerity measures to...,Deficit spending,---
3,3,The mainstream economics position is that defi...,Europe should weaken its austerity measures to...,Deficit spending,---
4,4,"This is derived from Keynesian economics, and ...",Europe should weaken its austerity measures to...,Deficit spending,---


In [0]:
# Convert target into a binary class
raw_data['cdc_present'] = (raw_data['cdc'] != '---')

### Topics

In [0]:
# Review topics
raw_data['topic'].unique()

array(['Europe should weaken its austerity measures to guarantee its citizens greater social support ',
       'year round schooling ',
       'wind power should be a primary focus of future energy supply ',
       'trade aid',
       'the use of performance enhancing drugs in professional sports ',
       'the use of affirmative action ',
       'the sale of violent video games to minors ',
       'the one child policy of the republic of China ', 'the monarchy ',
       "the United States is responsible for Mexico's drugs war ",
       'the US is justified in using force to prevent states from acquiring nuclear weapons ',
       'that the right to asylum should not be absolute ',
       'subsidise poor communities ', 'reintroduce national service ',
       're engage with Myanmar ', 'partial birth abortions ',
       'parents to genetically screen foetuses for heritable diseases ',
       'multiculturalism ', 'make physical education compulsory ',
       'limit the right to bear arms 

In [0]:
displacy.render(nlp('trade aide'), jupyter=True, 
                options={'compact':True, 'distance':100})

In [0]:
txt = 'Europe should weaken its austerity measures to guarantee its citizens greater social support'

displacy.render(nlp(txt), jupyter=True, 
                options={'compact':True, 'distance':100})

Topics vary in structure.  In many cases, topics are not statements or even sentences, rather they are specific words, or word combinations, that are "evaluative".  

In debate, evalutive terms are stand-alone terms that provide the general foundation, or context, of discussion or debate.  Additional phrase/sentence structures build on the context to frame a topic.  Take for example the diagramed topics:

* "trade aid" - This is not a topic; it is a context.  

* "Europe should weakin its austerity measures to guarantee its citizens greater social support."  This a topic.  In this case, the context is "austerity measures".  

Applying domain knowledge, the context of each unique "topic" will be extracted.



In [0]:
# Contexts
evaluatives = ['austerity measures', 'year round school', 'wind power', 'traid aid',
               'performing enhancing drugs', 'affirmative aciton', 
               'violent video games', 'one child policy', 'monarchy', 'drug war', 
               'nuclear weapons', 'right to asylum', 'subsidise community', 
               'national service', 'Myanmar', 'abortions', 'genetic screening', 
               'multiculturalism', 'physical education', 'right to bear arms', 
               'intellectual property rights', 'retirement age', 'pay housewives', 
               'gambling', 'endangered species', 'voter identification', 
               'human sex ratio', 'bribery', 'boxing', 'atheism', 'nuclear weapons',
               'collective bargaining', 'censorship']

context_dict = {}

for i,tpc in enumerate(raw_data['topic'].unique()):
    context_dict[tpc] = evaluatives[i]

In [0]:
raw_data['context'] = raw_data['topic'].apply(lambda x: context_dict.get(x))

raw_data.head()

Unnamed: 0.1,Unnamed: 0,text,topic,article,cdc,cdc_present,context
0,0,Deficit spending is the amount by which a gove...,Europe should weaken its austerity measures to...,Deficit spending,---,False,austerity measures
1,1,Government deficit spending is a central point...,Europe should weaken its austerity measures to...,Deficit spending,---,False,austerity measures
2,2,Government deficit spending is a central point...,Europe should weaken its austerity measures to...,Deficit spending,---,False,austerity measures
3,3,The mainstream economics position is that defi...,Europe should weaken its austerity measures to...,Deficit spending,---,False,austerity measures
4,4,"This is derived from Keynesian economics, and ...",Europe should weaken its austerity measures to...,Deficit spending,---,False,austerity measures


### Text

Given the text data, some stats will be extracted for analysis:

* word count for each sentence
* syllable count for each sentence
* count of sentiment words in sentence
* punctuation count of each sentence
* count of noun objects in each sentence
* count of phrases in each sentence



##### Stats

In [0]:
# Exctract information from text for evaluation
raw_data['text_words'] = raw_data['text'].apply(lambda x: word_count(x))
raw_data['text_syllables'] = raw_data['text'].apply(lambda x: count_syllables(x))
raw_data['text_sentiment_words'] = raw_data['text'].apply(lambda x: count_sentiment_words(x))
raw_data['text_punctuation'] = raw_data['text'].apply(lambda x: punct_marks(x))
raw_data['text_nouns'] = raw_data['text'].apply(lambda x: noun_counts(x))
raw_data['text_phrases'] = raw_data['text'].apply(lambda x: phrase_chunks(x))


#### Sentence Cleaning

When discussing topics, it was discovered that topic statements needed "cleaning" for context.  Similarly, sentences will require cleaning as well.  

In many NLP cases, the removal of stopwords is part of the process.  Unfortunately, stopword removal can impact sentence context.  In order to retain context, stopwords will not be removed.  However, given the goal of extracting Context Dependent Claims, extracting independent clauses from sentences may help remove "stop data" from the sentence while retaining context.

This can be done by:

1. Identify sentence complexity.
 * Complex sentences can contain several clauses
 * Simple sentences are retained.
2. Parse complext sentences into clauses
 * Dependent clauses discarded  
 * Independent clauses retained

 



In [0]:
# Calculate ratio of sentiment words to words in sentence
raw_data['sent_rat'] = raw_data['text_sentiment_words'] / raw_data['text_words']


In [0]:
# Extract independent clauses found in each sentence
raw_data['clauses'] = raw_data['text'].apply(lambda x: get_clauses(x))

In [0]:
# Count the number of clauses in each sentence
raw_data['num_clauses'] = raw_data['clauses'].apply(lambda x: len(x))

In [0]:
# Join independent clauses into a single sentence
raw_data['claused_sentence'] = raw_data['clauses'].apply(lambda x: ','.join(x))


In [0]:
# Compute context similarity of each sentence
context_similarity = []

for i,txt in enumerate(raw_data['claused_sentence']):
    context_similarity.append(subject_match(raw_data['context'][i], txt))

raw_data['context_similarity'] = context_similarity

In [0]:
# Compute similarity of sentence with expanded context
expanded_similarity = []
for i,txt in enumerate(raw_data['claused_sentence']):
    expanded_similarity.append(expanded_sentence_similarity(raw_data['context'][i], txt))

raw_data['expanded_similarity'] = expanded_similarity

In [0]:
raw_data[['context_similarity', 'expanded_similarity', 'cdc_present']].head(20)

Unnamed: 0,context_similarity,expanded_similarity,cdc_present
0,0.326836,0.259504,False
1,0.373849,0.267986,False
2,0.267243,0.106333,False
3,0.461772,0.240641,False
4,0.318571,0.306673,False
5,0.43085,0.267669,True
6,0.419114,0.383118,True
7,0.352407,0.277571,False
8,0.206059,0.233298,False
9,0.613701,0.162653,False


In [0]:
raw_data['subjectivity_score'] = raw_data['claused_sentence'].apply(lambda x: sentiment_subjectivity(x, 'subjectivity'))

In [0]:
raw_data.head(10).transpose()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
Unnamed: 0,0,1,2,3,4,5,6,7,8,9
text,Deficit spending is the amount by which a gove...,Government deficit spending is a central point...,Government deficit spending is a central point...,The mainstream economics position is that defi...,"This is derived from Keynesian economics, and ...",The mainstream position is attacked from both ...,Advocates of sound finance (in the US known as...,"Sound finance has academic support, predominan...",Proponents of sound finance date back to Adam ...,Sound finance was the dominant position until ...
topic,Europe should weaken its austerity measures to...,Europe should weaken its austerity measures to...,Europe should weaken its austerity measures to...,Europe should weaken its austerity measures to...,Europe should weaken its austerity measures to...,Europe should weaken its austerity measures to...,Europe should weaken its austerity measures to...,Europe should weaken its austerity measures to...,Europe should weaken its austerity measures to...,Europe should weaken its austerity measures to...
article,Deficit spending,Deficit spending,Deficit spending,Deficit spending,Deficit spending,Deficit spending,Deficit spending,Deficit spending,Deficit spending,Deficit spending
cdc,---,---,---,---,---,deficit spending is necessary,government should always run a balanced budget,---,---,---
cdc_present,False,False,False,False,False,True,True,False,False,False
context,austerity measures,austerity measures,austerity measures,austerity measures,austerity measures,austerity measures,austerity measures,austerity measures,austerity measures,austerity measures
text_words,35,14,17,66,34,40,44,64,13,27
text_syllables,67,29,37,122,64,77,73,128,22,52
text_sentiment_words,3,1,2,7,3,6,8,7,0,4


In [0]:
write_path = '/content/drive/My Drive/Colab Notebooks/Thinkful/Module 34 - Final Capstone/data/CDC Detection/cleaned.csv'

raw_data.to_csv(write_path)