<a href="https://colab.research.google.com/github/Charles-Scott-Green/Argument-mining/blob/master/1_Compile_Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
!python -m spacy download en_core_web_lg --quiet

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_lg')


In [0]:
!pip install vaderSentiment



In [0]:
!pip install spacy-wordnet



## Import Libraries and initialize settings

In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

sns.set()

In [0]:
import re
import string
import math
from tqdm import tqdm

In [0]:
import nltk
from nltk import pos_tag
from nltk.corpus import wordnet as wn
from nltk.tokenize import sent_tokenize, word_tokenize
from pprint import pprint
from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktTrainer

In [0]:
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('omw')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package omw to /root/nltk_data...
[nltk_data]   Unzipping corpora/omw.zip.


True

In [0]:
import spacy
from spacy.matcher import Matcher 
from spacy.tokens import Span 
from spacy import displacy 
from spacy.lang.en.stop_words import STOP_WORDS
from spacy_wordnet.wordnet_annotator import WordnetAnnotator

nlp = spacy.load("en_core_web_lg")
nlp.add_pipe(WordnetAnnotator(nlp.lang), after='tagger')

In [0]:
from glob import glob

In [0]:
import warnings

warnings.filterwarnings('ignore')

In [0]:
from sklearn.model_selection import train_test_split as tts 
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

# Define Functions

In [0]:
def text_cleaner(text):
    # visual inspection identifies a form of punctuation spaCy does not
    # recognize: the double dash '--'.  better get rid of it now!
    text = text.strip()
    text = re.sub(r'--',' ',text)
    text = re.sub("[\[].*?[\]]", "", text)
    text = re.sub(r"(\b|\s+\-?|^\-?)(\d+|\d*\.\d+)\b", " ", text)
    text = text.replace(' .', '.')
    text = text.replace('/', '')
    text = re.sub("\'s", "'s", text)
    text = ' '.join(text.split())

    return text

In [0]:
def compile_train_corpus(text_folder):
    train_doc = ''

    for text_file in text_folder:
            with open(text_file) as article:
                doc = article.read()
                train_doc += doc
    return train_doc

In [0]:
def is_sentence(clause):
    nn = 2
    vb = 1
    cls = nlp(clause)
    for tok in cls:
        if tok.pos_ in ['NOUN', 'PROPN', 'PRON']:
            nn -= 1
        elif tok.pos_ in ['VERB']:
            vb -= 1
    if nn < 1 and vb < 1:
        return True
    else:
        return False

In [0]:
def get_clauses(sentence):
    """Sentence type is a string."""
    subtree_phrases = []
    
    for tok in nlp(sentence):
        if tok.head.pos_ == 'VERB':
            if len([tok for tok in tok.subtree]) >= 3:
                subtree_prhase = ''.join(tok.text_with_ws for tok in tok.subtree)
                if subtree_prhase != sentence:
                    if is_sentence(subtree_prhase):
                        splits = subtree_prhase.split(',')
                        if len(splits) == 1:
                            subtree_phrases.append(subtree_prhase)
                        else:
                            for split in splits:
                                if is_sentence(split):
                                    if split not in subtree_phrases:
                                        subtree_phrases.append(split) 
    
    raw_clauses = []
    clauses = []

    for phrase in subtree_phrases:
        tok_phrase = nlp(phrase)
        for chunk in tok_phrase.noun_chunks:
            span = tok_phrase[chunk.root.head.left_edge.i:chunk.root.head.right_edge.i+1]
            if span[0].pos_ in ['DET', 'ADP']:
                if span[1:].text not in raw_clauses:
                    if is_sentence(span[1:].text):
                        raw_clauses.append(span[1:].text)
            elif span.text not in raw_clauses:
                if is_sentence(span.text):
                    raw_clauses.append(span.text)
    
    if len(raw_clauses) == 0:
        clauses.append(sentence)
    else:
        for clause in raw_clauses:
            dup_checks = [(clause in check_clause) for check_clause in raw_clauses]
            if sum(dup_checks) == 1:
                clauses.append(clause)
                
    return clauses

# Load Data

In [0]:
# paths to articles and topics
path_topic = '/content/drive/My Drive/Colab Notebooks/Thinkful/Module 34 - Final Capstone/data/CDC Detection/debater_ce_acl/2014_7_18_ibm_CDCdata.xls'
path_articles = '/content/drive/My Drive/Colab Notebooks/Thinkful/Module 34 - Final Capstone/data/CDC Detection/debater_ce_acl/articles'

## Load Reference data set

In [0]:
# Load topic/article index data
ref_data = pd.read_excel(path_topic)


In [0]:
# Unique Claim counts
ref_data['Claim'].nunique(), ref_data.shape[0]

(1332, 1387)

In [0]:
ref_data.drop_duplicates('Claim', inplace=True)
ref_data['Claim'].nunique(), ref_data.shape[0]
ref_data.reset_index(inplace=True)

In [0]:
ref_data.head()

Unnamed: 0,index,Topic,Article,Claim,Requires correction,Correction type,Corrected Text
0,0,the sale of violent video games to minors,Video game controversies,exposure to violent video games causes at leas...,True,,Exposure to violent video games causes at leas...
1,1,the sale of violent video games to minors,Video game controversies,video game violence is not related to serious ...,False,,
2,2,the sale of violent video games to minors,Video game controversies,some violent video games may actually have a p...,False,,
3,3,the sale of violent video games to minors,Video game controversies,exposure to violent video games causes both sh...,False,,
4,4,the sale of violent video games to minors,Video game controversies,they increase the violent tendencies among youth,True,,Violent video games increase the violent tende...


In [0]:
ref_data['Article'].nunique()

313

## Clean Reference Data and Train tokenizer

Some text preparation of the file names, as they relate to the article names in the reference data, is needed. Additionally, articles will be organized into an appropriate  list and/or dictionary for ease of use.

In [0]:
# Clean up article names to make them match filenames
clean_article = [x.replace('.', '').replace('&', 'and'). 
                 replace('Mיrida', 'Merida').replace('?', '') for 
                 x in ref_data['Article']]

ref_data['c_Article'] = clean_article

# Unique Topic and article counts
ref_data['Topic'].nunique(), ref_data['c_Article'].nunique()

(33, 313)

In [0]:
topic_article_dict = {}

for t in ref_data['Topic']:
    topic_article_dict[t] = list(set(ref_data.loc[ref_data['Topic'] == t, 'c_Article']))

topic_article_dict

{'Europe should weaken its austerity measures to guarantee its citizens greater social support ': ['Anti-austerity protests',
  'Welfare state',
  'Austerity',
  'Greek government-debt crisis',
  'Deficit spending'],
 "Google shouldn't censor its search results in China ": ['Public Pledge on Self-Discipline for the Chinese Internet Industry',
  'Internet censorship',
  'Google China',
  "Internet censorship in the People's Republic of China",
  'Criticism of Google',
  'Corporate social responsibility'],
 'all collective bargaining rights claimed by trades unions ': ['Trade union',
  'Federal Labor Relations Act',
  'Demir and Baykara v Turkey',
  'Collective bargaining',
  'Industrial unionism',
  'Opposition to trade unions'],
 'all nations have a right to nuclear weapons ': ['Nuclear proliferation',
  'History of nuclear weapons',
  'Mutual assured destruction',
  'Nuclear warfare',
  'India and weapons of mass destruction',
  'Deterrence theory',
  'Treaty on the Non-Proliferation 

In [0]:
# Dictionary for paths for articles
articles = [file.replace(path_articles+'/', '') for 
            file in glob(path_articles+'/*.txt')]
article_paths = [file for file in glob(path_articles+'/*.txt')]

articles_dict = dict(zip(articles,article_paths))


### Train 'PunktSentenceTokenizer' 

To organize data into a usabel format, partitioning sentences is necessary.  Since all sentences are from a common source (Wikipedia), training the tokenizer on the sentences is useful.  

In [0]:
# compile training corpus
training_corpus = compile_train_corpus(article_paths)

# train tokenizer
trainer = PunktTrainer()
trainer.INCLUDE_ALL_COLLOCS = True
trainer.train(training_corpus)
 
tokenizer = PunktSentenceTokenizer(trainer.get_params())

### Clean the claim sentences in the reference data

In [0]:
cleaned_claims = []

for claim in ref_data['Claim']:
    cleaned = text_cleaner(claim)
    cleaned_claims.append(cleaned)

ref_data['clean_Claim'] = cleaned_claims

In [0]:
# Initialize dataframe that will be primary.
df = pd.DataFrame(columns=['text', 'topic', 'article', 'text_clauses', 
                           'nm_clauses', 'cdc'])

## Load Articles and Compile Data by Topic


### "Google shouldn't censor its search results in China "

In [0]:
topic = "Google shouldn't censor its search results in China "

articles = topic_article_dict[topic]
articles

['Public Pledge on Self-Discipline for the Chinese Internet Industry',
 'Internet censorship',
 'Google China',
 "Internet censorship in the People's Republic of China",
 'Criticism of Google',
 'Corporate social responsibility']

#### Article 0 

In [0]:
with open(articles_dict[articles[0].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[0]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
ndf['text_clauses'] = ndf['text'].apply(lambda x: get_clauses(x))

In [0]:
ndf['nm_clauses'] = ndf['text_clauses'].apply(lambda x: len(x))

In [0]:
ndf['nm_clauses'].describe()

count    20.000000
mean      1.450000
std       0.944513
min       1.000000
25%       1.000000
50%       1.000000
75%       1.000000
max       4.000000
Name: nm_clauses, dtype: float64

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(2, 1332)

In [0]:
cdc_idx_dict

{7: 'their presence in China will support economic development leading to political change',
 8: 'their activities are facilitating and sanctioning government censorship rather than challenging it'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 1 

In [0]:
with open(articles_dict[articles[1].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[1]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
ndf['text_clauses'] = ndf['text'].apply(lambda x: get_clauses(x))
ndf['nm_clauses'] = ndf['text_clauses'].apply(lambda x: len(x))
ndf['nm_clauses'].describe()

count    212.000000
mean       1.382075
std        0.848896
min        1.000000
25%        1.000000
50%        1.000000
75%        2.000000
max        8.000000
Name: nm_clauses, dtype: float64

In [0]:
chk = ndf.loc[ndf['nm_clauses'] > 4]
chk.head()

Unnamed: 0,text,topic,article,text_clauses,nm_clauses
99,"Examples include: Sex and erotic, fetishism, p...",Google shouldn't censor its search results in ...,Internet censorship,"[Examples include: Sex and erotic, Gay and Les...",5
190,Facebook: Among other things the Facebook Stat...,Google shouldn't censor its search results in ...,Internet censorship,[other things the Facebook Statement of Rights...,8
191,Google: Google's general Terms of Service were...,Google shouldn't censor its search results in ...,Internet censorship,[Google's general Terms of Service were update...,6


In [0]:
chk['text'][191]

'Google: Google\'s general Terms of Service were updated on March , and state: "We may suspend or stop providing our Services to you if you do not comply with our terms or policies or if we are investigating suspected misconduct", "We may review content to determine whether it is illegal or violates our policies, and we may remove or refuse to display content that we reasonably believe violates our policies or the law", and "We respond to notices of alleged copyright infringement and terminate accounts of repeat infringers according to the process set out in the U.S. Digital Millennium Copyright Act".'

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(4, 1332)

In [0]:
cdc_idx_dict

{67: 'the internet should never be regulated by any level of government anywhere',
 71: 'access to the Internet was a fundamental right',
 72: 'access to the Internet was a fundamental right of all people',
 192: 'Google may temporarily or permanently remove sites from its index and search results if it believes it is obligated to do so by law'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([df, ndf], axis=0, ignore_index=True)

#### Article 2 

In [0]:
with open(articles_dict[articles[2].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[2]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
ndf['text_clauses'] = ndf['text'].apply(lambda x: get_clauses(x))
ndf['nm_clauses'] = ndf['text_clauses'].apply(lambda x: len(x))
ndf['nm_clauses'].describe()

count    80.000000
mean      1.300000
std       0.682512
min       1.000000
25%       1.000000
50%       1.000000
75%       1.000000
max       5.000000
Name: nm_clauses, dtype: float64

In [0]:
chk = ndf.loc[ndf['nm_clauses']>4]
chk['text'][73]

'People\'s Daily published a scathing op-ed on Google which criticized western leaders for politicizing the way in which China controls citizen\'s access to the Internet, saying "implementing monitoring according to a country\'s national context is what any government has to do," and that China\'s need to censor the internet is greater than that of developed countries, "The Chinese society has generally less information bearing capacity than developed countries such as the U.S.&nbsp;..".'

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(4, 1332)

In [0]:
cdc_idx_dict

{43: "it could play a role more useful to the cause of free speech by participating in China's IT industry than by refusing to comply",
 44: "removing search results is inconsistent with Google's mission",
 49: 'Google China is a flagrant violation of the Google motto, "Don\'t be evil',
 73: "implementing monitoring according to a country's national context is what any government has to do"}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 3 

In [0]:
with open(articles_dict[articles[3].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[3]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
ndf['text_clauses'] = ndf['text'].apply(lambda x: get_clauses(x))
ndf['nm_clauses'] = ndf['text_clauses'].apply(lambda x: len(x))
ndf['nm_clauses'].describe()

count    175.000000
mean       1.354286
std        0.727278
min        1.000000
25%        1.000000
50%        1.000000
75%        2.000000
max        7.000000
Name: nm_clauses, dtype: float64

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(4, 1332)

In [0]:
cdc_idx_dict

{43: "it could play a role more useful to the cause of free speech by participating in China's IT industry than by refusing to comply",
 44: "removing search results is inconsistent with Google's mission",
 49: 'Google China is a flagrant violation of the Google motto, "Don\'t be evil',
 73: "implementing monitoring according to a country's national context is what any government has to do"}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 4 

In [0]:
with open(articles_dict[articles[4].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[4]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(3, 1332)

In [0]:
cdc_idx_dict

{4: 'Google\'s stated mission is "to organize the world\'s information and make it universally accessible',
 174: 'some censorship is necessary in order to keep the Chinese government from blocking Google entirely',
 177: 'Google for assisting the Chinese government in repressing its own citizens'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 5 

In [0]:
with open(articles_dict[articles[5].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[5]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(4, 1332)

In [0]:
cdc_idx_dict

{67: 'the internet should never be regulated by any level of government anywhere',
 71: 'access to the Internet was a fundamental right',
 72: 'access to the Internet was a fundamental right of all people',
 192: 'Google may temporarily or permanently remove sites from its index and search results if it believes it is obligated to do so by law'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

### 'all collective bargaining rights claimed by trades unions '

In [0]:
topic = 'all collective bargaining rights claimed by trades unions '

articles = topic_article_dict[topic]
articles

['Demir and Baykara v Turkey',
 'Collective bargaining',
 'Federal Labor Relations Act',
 'Industrial unionism',
 'Trade union',
 'Opposition to trade unions']

#### Article 0 

In [0]:
with open(articles_dict[articles[0].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[0]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(1, 1332)

In [0]:
cdc_idx_dict

{18: 'there is an inherent right to collective bargaining'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 1 

In [0]:
with open(articles_dict[articles[1].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[1]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(5, 1332)

In [0]:
cdc_idx_dict

{9: 'The right to collectively bargain is recognized through international human rights conventions',
 11: 'the "freedom of association and the effective recognition of the right to collective bargaining" as an essential right of workers',
 12: 'collective bargaining as a human right',
 14: 'The right to bargain collectively with an employer enhances the human dignity',
 16: 'Collective bargaining permits workers to achieve a form of workplace democracy'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([df, ndf], axis=0, ignore_index=True)

#### Article 2 

In [0]:
with open(articles_dict[articles[2].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[2]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(1, 1332)

In [0]:
cdc_idx_dict

{3: 'facilitates and encourages the amicable settlements of disputes between employees and their employers'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 3 

In [0]:
with open(articles_dict[articles[3].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[3]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(1, 1332)

In [0]:
cdc_idx_dict

{28: 'an individual cannot stand alone against the power of the company'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 4 

In [0]:
with open(articles_dict[articles[4].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[4]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(1, 1332)

In [0]:
cdc_idx_dict

{318: 'unionisation produces higher wages (for the union members) at the expense of fewer jobs'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 5 

In [0]:
with open(articles_dict[articles[5].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[5]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(6, 1332)

In [0]:
cdc_idx_dict

{3: 'frequently produces higher wages at the expense of fewer jobs',
 11: 'unions promote deadweight loss',
 18: 'The effect of union activities to influence pricing is potentially very harmful',
 36: 'unions never raise productivity to compensate for higher wages',
 40: 'unionized workers will spend their higher wages, driving economic growth and creating new jobs',
 46: 'Unions may serve the practical purpose of leveling the playing-field between workers and powerful oligopolies'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

### 'all nations have a right to nuclear weapons '

In [0]:
topic = 'all nations have a right to nuclear weapons '

articles = topic_article_dict[topic]
articles

['Anti-nuclear movement',
 'Nuclear warfare',
 'Nuclear proliferation',
 'Nuclear holocaust',
 'Kenneth Waltz',
 'Nuclear weapons debate',
 'International Atomic Energy Agency',
 'History of nuclear weapons',
 'Iran and weapons of mass destruction',
 'Deterrence theory',
 'Nuclear peace',
 'Mutual assured destruction',
 'Treaty on the Non-Proliferation of Nuclear Weapons',
 'Salted bomb',
 'India and weapons of mass destruction',
 'UK Trident programme',
 'Nuclear weapon',
 "Japan's non-nuclear weapons policy",
 'Nuclear disarmament',
 'Stability-instability paradox']

In [0]:
len(articles)

20

#### Article 0 

In [0]:
with open(articles_dict[articles[0].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[0]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(1, 1332)

In [0]:
cdc_idx_dict

{33: 'nuclear weapons had become a source of extreme risk'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 1 

In [0]:
with open(articles_dict[articles[1].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[1]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(2, 1332)

In [0]:
cdc_idx_dict

{26: 'a full-scale nuclear war could potentially bring about the extinction of the human race',
 127: 'a full-scale nuclear war would result in the extinction of the human species'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([df, ndf], axis=0, ignore_index=True)

#### Article 2 

In [0]:
with open(articles_dict[articles[2].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[2]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(8, 1332)

In [0]:
cdc_idx_dict

{1: 'more countries with nuclear weapons may increase the possibility of nuclear warfare',
 38: 'A fundamental goal for American and global security is to minimize the proliferation risks associated with the expansion of nuclear power',
 401: 'the spread of nuclear weapons could increase international stability',
 403: 'it will decrease the likelihood of war',
 407: 'nuclear weapons promote caution in decision-makers',
 421: 'weak states will be unable to prevent – or will actively provide for – the disastrous possibility of nuclear terrorism',
 431: 'If one state produces a nuclear weapon it creates almost a domino effect within the region',
 438: 'prohibition on nuclear proliferation has been characterised as a form of technological apartheid'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 3 

In [0]:
with open(articles_dict[articles[3].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[3]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(1, 1332)

In [0]:
cdc_idx_dict

{8: 'nuclear holocaust could result in an end to human life'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 4 

In [0]:
with open(articles_dict[articles[4].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[4]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(2, 1332)

In [0]:
cdc_idx_dict

{16: 'states must act in a way that ensures their security above all',
 17: 'they cannot count on the good will of others to help them, so they must always be ready to fend for themselves'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 5

In [0]:
with open(articles_dict[articles[5].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[5]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(7, 1332)

In [0]:
cdc_idx_dict

{7: 'it would undermine deterrence',
 10: 'no issue carries more importance to the long-term health and security of humanity than the effort to reduce, and perhaps one day, rid the world of nuclear weapons',
 29: 'it would undermine deterrence',
 31: 'Nuclear weapons are said to have induced "nuclear peace',
 34: 'is obsolete',
 41: 'the likelihood that non-state terrorists will get their hands on nuclear weaponry is increasing',
 44: 'no issue carries more importance to the long-term health and security of humanity than the effort to reduce, and perhaps one day, rid the world of nuclear weapons'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 6

In [0]:
with open(articles_dict[articles[6].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[6]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(1, 1332)

In [0]:
cdc_idx_dict

{26: 'if we hope to escape self-destruction, then nuclear weapons should have no place in our collective conscience, and no role in our security'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 7

In [0]:
with open(articles_dict[articles[7].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[7]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(1, 1332)

In [0]:
cdc_idx_dict

{217: 'the greater the threat of mutual destruction, the safer the world would be'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 8

In [0]:
with open(articles_dict[articles[8].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[8]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(2, 1332)

In [0]:
cdc_idx_dict

{165: 'the present situation whereby Nuclear Weapon States monopolise the right to possess nuclear weapons is "highly discriminatory',
 404: 'the present situation whereby Nuclear Weapon States monopolise the right to possess nuclear weapons is "highly discriminatory'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 9 

In [0]:
with open(articles_dict[articles[9].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[9]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(5, 1332)

In [0]:
cdc_idx_dict

{3: 'an inferior nuclear force, by virtue of its extreme destructive power, could deter a more powerful adversary',
 16: 'nuclear weapons had become a source of extreme risk',
 37: 'nuclear weapons are intended to deter other states from attacking with their nuclear weapons',
 98: 'Nuclear weapons give nations the potential to not only destroy their enemies but humanity itself',
 127: 'nuclear weapons had become a source of extreme risk'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 10 

In [0]:
with open(articles_dict[articles[10].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[10]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(9, 1332)

In [0]:
cdc_idx_dict

{0: 'decrease the chances of crisis escalation',
 1: 'nuclear weapons are said to have induced stability',
 2: 'nuclear proliferation may be beneficial for inducing stability',
 3: 'increases the chances of nuclear material falling into the hands of non-state groups who are free from the threat of nuclear retaliation',
 5: 'new nuclear states will use their acquired nuclear capabilities to deter threats and preserve peace',
 6: 'new nuclear states often lack adequate organizational controls over their new weapons, which makes for a high risk of either deliberate or accidental nuclear war',
 13: "Nuclear weapons may also lessen a state's reliance on allies for security, thus preventing allies from dragging each other into wars",
 26: 'nuclear weapons induce stability',
 27: 'nuclear weapons contribute to stability'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 11

In [0]:
with open(articles_dict[articles[11].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[11]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(2, 1332)

In [0]:
cdc_idx_dict

{1: 'the deployment, and implicit menace of use, of strong weapons is essential to threaten the enemy in order to prevent the use by said-enemy of the same weapons',
 75: 'a nuclear nation might be hijacked by a despot or other person or persons who might use nuclear weapons without sane regard for the consequences'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 12

In [0]:
with open(articles_dict[articles[12].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[12]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(3, 1332)

In [0]:
cdc_idx_dict

{16: 'the NPT cannot stop the proliferation of nuclear weapons or the motivation to acquire them',
 75: 'Having more nuclear nuclear-weapon states would reduce security for all',
 100: 'nuclear forces continue to play an essential role in war prevention'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 13

In [0]:
with open(articles_dict[articles[13].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[13]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(1, 1332)

In [0]:
cdc_idx_dict

{4: 'nuclear weapon technology would soon reach the point where it could end human life on Earth'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 14

In [0]:
with open(articles_dict[articles[14].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[14]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(1, 1332)

In [0]:
cdc_idx_dict

{11: 'every country will have to devise and use the latest devices for its protection'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 15

In [0]:
with open(articles_dict[articles[15].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[15]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(1, 1332)

In [0]:
cdc_idx_dict

{88: 'In certain circumstances, they can play a positive role'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 16

In [0]:
with open(articles_dict[articles[16].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[16]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(6, 1332)

In [0]:
cdc_idx_dict

{86: 'the significance of nuclear weapons is purely to deter war',
 89: 'would generally be contrary to the rules of international law applicable in armed conflict',
 90: 'nuclear proliferation would be desirable',
 91: 'nuclear weapons successfully deter all-out war between states',
 125: 'could lead to increased global instability',
 128: 'no issue carries more importance to the long-term health and security of humanity than the effort to reduce, and perhaps one day, rid the world of nuclear weapons'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 17

In [0]:
with open(articles_dict[articles[17].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[17]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(1, 1332)

In [0]:
cdc_idx_dict

{33: 'public opinion is overwhelmingly opposed to nuclearization'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 18

In [0]:
with open(articles_dict[articles[18].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[18]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(3, 1332)

In [0]:
cdc_idx_dict

{7: 'it would undermine deterrence',
 17: 'extreme danger intrinsic to nuclear war and the possession of nuclear weapons',
 71: 'with nuclear weapons more widely available, deterrence is decreasingly effective and increasingly hazardous'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 19 

In [0]:
with open(articles_dict[articles[19].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[19]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(1, 1332)

In [0]:
cdc_idx_dict

{1: 'when two countries each have nuclear weapons, the probability of a direct war between them greatly decreases'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

### 'atheism is the only way '

In [0]:
topic = 'atheism is the only way '

articles = topic_article_dict[topic]
articles

['Antitheism',
 'Problem of evil',
 'The Root of All Evil',
 'Quinque viae',
 'Cosmological argument',
 'Criticism of atheism',
 'Argument from morality',
 'Criticism of religion',
 'Existence of God',
 'God Is Not Great',
 'Agnosticism',
 'Ultimate Boeing 747 gambit',
 'The God Delusion',
 'Atheism',
 'God of the gaps',
 'Naturalism (philosophy)',
 'God',
 'The System of Nature',
 'Richard Dawkins']

In [0]:
len(articles)

19

#### Article 0 

In [0]:
with open(articles_dict[articles[0].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[0]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(2, 1332)

In [0]:
cdc_idx_dict

{7: 'the effect of religious belief, is positively harmful',
 26: 'there is evidence even for the existence of a God'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 1 

In [0]:
with open(articles_dict[articles[1].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[1]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(4, 1332)

In [0]:
cdc_idx_dict

{2: 'the existences of such a god and of evil are logically incompatible',
 24: 'God cannot exist with, or would want to prevent, all evils',
 29: 'God and evil are logically incompatible',
 34: 'An omniscient, wholly good being would prevent the occurrence of any intense suffering'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([df, ndf], axis=0, ignore_index=True)

#### Article 2 

In [0]:
with open(articles_dict[articles[2].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[2]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(5, 1332)

In [0]:
cdc_idx_dict

{0: 'humanity would be better off without religion or belief in God',
 10: 'is divisive and dangerous',
 120: 'atheism is not a recipe for despair but just the opposite',
 121: 'is life-affirming in a way that religion can never be',
 132: 'religion does more harm than good'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 3 

In [0]:
with open(articles_dict[articles[3].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[3]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(2, 1332)

In [0]:
cdc_idx_dict

{35: 'everything in the Universe has a purpose, which must have been caused by God',
 40: 'everything in the Universe follows laws, which must have been created by God'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 4 

In [0]:
with open(articles_dict[articles[4].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[4]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(4, 1332)

In [0]:
cdc_idx_dict

{3: 'something caused the Universe to exist, and this First Cause must be God',
 24: 'existence must be due to an agent cause',
 38: 'the existence of the Universe requires an explanation, and the creation of the Universe by a First Cause, generally assumed to be God, is that explanation',
 88: 'a god created the Universe'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 5 

In [0]:
with open(articles_dict[articles[5].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[5]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(4, 1332)

In [0]:
cdc_idx_dict

{7: 'there are insufficient grounds to assert authoritatively that any supreme being does not exist',
 17: 'acknowledgment of God or the gods is a major factor in motivating people towards moral behavior',
 40: 'Atheism has been criticized as a faith in itself',
 98: 'atheism systematically influences people to do bad things'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 6

In [0]:
with open(articles_dict[articles[6].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[6]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(8, 1332)

In [0]:
cdc_idx_dict

{3: 'God must exist to give force to moral obligations',
 5: 'objective moral truths and the binding nature of obligations suggests a high power to enforce them, regarded as God',
 15: 'only the existence of God as orthodoxly conceived could support the existence of moral order in the world',
 19: 'A natural moral order requires the existence of God as orthodoxly conceived, so god must exist',
 20: "there is not good argument for God's existence that arises from pure reason alone",
 22: 'all moral thought requires the assumption that God exists',
 26: 'theists are able to offer justification for morality, while atheists are not',
 33: 'the natural is all that exists'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 7

In [0]:
with open(articles_dict[articles[7].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[7]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(16, 1332)

In [0]:
cdc_idx_dict

{5: 'all phenomena could be understood as resulting from purely natural causes',
 6: 'religion was born of fear and ignorance',
 14: 'they require beliefs that are irrational',
 15: 'religious beliefs and traditions lack scientific or rational foundations',
 17: 'Religions often posit facts that are contradicted by scientific evidence',
 23: 'their teachings are outdated in comparison with modern Western morals',
 26: 'Religions have promoted facts and histories that are contradicted by science',
 48: 'theist religions and their holy books are not divinely inspired, but instead are fabrications of non-divine human individuals',
 60: 'there are reasonable arguments supporting the existence of God',
 90: 'religious belief is a delusion',
 94: 'religious belief is a delusion',
 95: 'religion is nothing more than a social construct that primitive humans evolved',
 103: 'the need for explaining life and death can be met by science and philosophy',
 196: 'atheism was responsible for "some 20

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 8

In [0]:
with open(articles_dict[articles[8].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[8]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(26, 1332)

In [0]:
cdc_idx_dict

{6: 'arguments for the existence of God show insufficient reason to believe',
 54: 'the universe includes "ideas" not perceptible to mankind (or not always perceptible), and that there must therefore exist an omniscient superobserver',
 62: 'there was a "first cause", or "prime mover" who is identified as God',
 64: "the universe's order and complexity are best explained by reference to a creator God",
 68: 'certain features of the universe and of living things are the product of an intelligent cause',
 74: "basic facts, such as humanity's existence, are best explained by the existence of God",
 76: 'atheistic arguments must ultimately refute themselves if pressed with rigorous consistency',
 115: "the theism of people throughout most of recorded history and in many different places provides prima facie demonstration of God's existence",
 127: "when a person's understanding ponders over the existence of God it encounters nothing but contradictions",
 146: 'natural (non-supernatural) th

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 9

In [0]:
with open(articles_dict[articles[9].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[9]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(4, 1332)

In [0]:
cdc_idx_dict

{32: 'we do not need God to explain things',
 81: 'the most immoral acts in human history were performed by atheists',
 88: 'the human race no longer needs religion',
 97: 'all attempts to reconcile faith with science and reason are consigned to failure'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 10

In [0]:
with open(articles_dict[articles[10].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[10]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(2, 1332)

In [0]:
cdc_idx_dict

{5: 'human reason is incapable of providing sufficient rational grounds to justify the belief that deities either do or do not exist',
 30: 'God, the beginning and end of all, can, by the natural light of human reason, be known with certainty from the works of creation'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 11 

In [0]:
with open(articles_dict[articles[11].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[11]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(6, 1332)

In [0]:
cdc_idx_dict

{6: 'the God hypothesis inferior to evolution by natural selection as explanations for the complexity of life',
 12: 'where design fails to explain complexity, evolution by natural selection succeeds',
 20: 'The theory of natural selection is much simpler than the theory of the existence of such a complex being, and thus preferable',
 58: 'God is improbable',
 60: 'a creator of a universe with such complexity would have to be complex and improbable',
 64: 'there must be a first cause, which can be given the name God'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 12

In [0]:
with open(articles_dict[articles[12].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[12]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(9, 1332)

In [0]:
cdc_idx_dict

{11: 'evolution can explain the apparent design in nature',
 23: 'Natural selection and similar scientific theories are superior to a "God hypothesis"—the illusion of intelligent design—in explaining the living world and the cosmos',
 26: 'atheism is evidence of a healthy, independent mind',
 33: 'evolution by natural selection can explain apparent design in nature',
 37: 'the designer hypothesis immediately raises the larger problem of who designed the designer',
 43: 'the theory of a universe without a God is preferable to the theory of a universe with a God',
 50: 'religion is needed to make us behave morally',
 59: 'fills a "much needed gap',
 72: 'religion is socially dangerous'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 13

In [0]:
with open(articles_dict[articles[13].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[13]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(10, 1332)

In [0]:
cdc_idx_dict

{76: 'unproven religious propositions deserve as much disbelief as all other unproven propositions',
 81: 'atheists are quick to believe in God in times of crisis',
 107: 'God and other religious beliefs are human inventions',
 109: 'belief in God and religion are social functions, used by those in power to oppress the working class',
 110: 'necessarily ends in the enslavement of mankind',
 115: 'renders life meaningless and miserable',
 139: 'religion as a human invention used to frighten people into following moral order',
 261: 'atheism is a superior basis for ethics',
 265: 'religions provide a net benefit to individuals and society',
 266: 'reliance on divine authority lends itself to authoritarianism and dogmatism'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 14 

In [0]:
with open(articles_dict[articles[14].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[14]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(1, 1332)

In [0]:
cdc_idx_dict

{27: "Because current science can't figure out exactly how life started, it must be God who caused life to start"}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 15

In [0]:
with open(articles_dict[articles[15].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[15]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(2, 1332)

In [0]:
cdc_idx_dict

{1: 'nothing exists beyond the natural universe', 8: 'nature is all there is'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 16

In [0]:
with open(articles_dict[articles[16].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[16]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(3, 1332)

In [0]:
cdc_idx_dict

{49: 'the universe can be explained without any reference to the supernatural',
 60: 'God exists and this can be proven',
 102: "none of the arguments for God's existence are compelling"}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 17

In [0]:
with open(articles_dict[articles[17].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[17]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(1, 1332)

In [0]:
cdc_idx_dict

{3: 'belief in a higher being is the product of fear'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 18 

In [0]:
with open(articles_dict[articles[18].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[18]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(2, 1332)

In [0]:
cdc_idx_dict

{85: 'life and the universe were created by a deity',
 123: 'atheism is evidence of a healthy, independent mind'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

### 'boxing '

In [0]:
topic = 'boxing '

articles = topic_article_dict[topic]
articles

['Boxing styles and technique',
 'Knockout',
 'Boxing',
 'Concussion',
 'Combat sport',
 'Dementia pugilistica',
 'Contact sport',
 'Boxing in China',
 'The distance (boxing)',
 'Amateur boxing']

In [0]:
len(articles)

10

#### Article 0 

In [0]:
with open(articles_dict[articles[0].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[0]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(1, 1332)

In [0]:
cdc_idx_dict

{65: 'a powerpunch can do a lot of damage to a boxer'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 1 

In [0]:
with open(articles_dict[articles[1].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[1]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(5, 1332)

In [0]:
cdc_idx_dict

{25: 'The referee intervenes to avoid unnecessary damage or potential injury',
 29: 'the typical knock out which results in a sustained loss of consciousness',
 33: 'Repeated blows to the head are known to gradually cause permanent brain damage',
 34: 'In severe cases may cause strokes or paralysis',
 36: 'many physicians advise against sports involving knockouts'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([df, ndf], axis=0, ignore_index=True)

#### Article 2 

In [0]:
with open(articles_dict[articles[2].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[2]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(14, 1332)

In [0]:
cdc_idx_dict

{59: 'boxing commissions and other sanctioning bodies were established to regulate the sport',
 101: 'fighters wear protective headgear, reducing the number of injuries, knockdowns, and knockouts',
 111: 'A referee monitors the fight to ensure that competitors use only legal blows',
 116: 'Referees will stop the bout if a boxer is seriously injured',
 122: 'Headgear is not permitted in professional bouts, and boxers are generally allowed to take much more damage before a fight is halted',
 123: 'the referee may stop the contest if he believes that one participant cannot defend himself due to injury',
 381: 'Knocking a person unconscious or even causing concussion may cause permanent brain damage',
 382: 'There is no clear division between the force required to knock a person out and the force likely to kill a person',
 383: 'amateur boxers, professional boxers and Toughman fighters have died as the result of ring or training injuries',
 385: 'boxing an "obscenity" that "should not be s

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 3 

In [0]:
with open(articles_dict[articles[3].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[3]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(1, 1332)

In [0]:
cdc_idx_dict

{200: 'medical groups have called for a ban on the sport'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 4 

In [0]:
with open(articles_dict[articles[4].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[4]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(1, 1332)

In [0]:
cdc_idx_dict

{3: 'Sports related to combat skills have been a part of human culture for thousands of years'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 5 

In [0]:
with open(articles_dict[articles[5].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[5]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(1, 1332)

In [0]:
cdc_idx_dict

{6: 'boxing may cause DP'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 6

In [0]:
with open(articles_dict[articles[6].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[6]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(2, 1332)

In [0]:
cdc_idx_dict

{8: 'Contact sports have a higher risk of transmission of blood-borne disease between players',
 16: 'tend to cause injuries'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 7

In [0]:
with open(articles_dict[articles[7].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[7]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(1, 1332)

In [0]:
cdc_idx_dict

{5: 'boxing was very brutal'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 8

In [0]:
with open(articles_dict[articles[8].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[8]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(2, 1332)

In [0]:
cdc_idx_dict

{32: 'fatalities are rare in heavyweight matches',
 35: 'boxing remains the 8th most deadly sport'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 9

In [0]:
with open(articles_dict[articles[9].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[9]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(2, 1332)

In [0]:
cdc_idx_dict

{7: 'A referee monitors the fight to ensure that competitors use only legal blows',
 10: 'Referees will stop the bout if a boxer is seriously injured'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(2, 1332)

In [0]:
cdc_idx_dict

{7: 'A referee monitors the fight to ensure that competitors use only legal blows',
 10: 'Referees will stop the bout if a boxer is seriously injured'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

### 'bribery is sometimes acceptable '

In [0]:
topic = 'bribery is sometimes acceptable '

articles = topic_article_dict[topic]
articles

['Bribery',
 'United Nations Convention against Corruption',
 'Political corruption',
 "Corruption in the People's Republic of China"]

#### Article 0 

In [0]:
with open(articles_dict[articles[0].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[0]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(5, 1332)

In [0]:
cdc_idx_dict

{5: 'Bribery in bureaucracy has been viewed as a reason for higher cost of production of goods and services',
 8: 'Expectations of when a monetary transaction is appropriate can differ from place to place',
 34: 'it encourages rent seeking behaviour',
 36: 'may interfere with good government',
 53: 'In some cases where the system of law is not well-implemented, bribes may be a way for companies to continue their businesses'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 1 

In [0]:
with open(articles_dict[articles[1].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[1]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(1, 1332)

In [0]:
cdc_idx_dict

{23: 'undermine the value of democracy'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([df, ndf], axis=0, ignore_index=True)

#### Article 2 

In [0]:
with open(articles_dict[articles[2].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[2]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(7, 1332)

In [0]:
cdc_idx_dict

{12: 'Corruption poses a serious development challenge',
 13: 'it undermines democracy and good governance',
 18: 'corruption increases the cost of business',
 19: 'corruption reduces costs by cutting bureaucracy',
 21: 'distorts the playing field',
 22: 'Corruption also generates economic distortions',
 24: 'reduces the quality of government services and infrastructure'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 3 

In [0]:
with open(articles_dict[articles[3].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[3]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(5, 1332)

In [0]:
cdc_idx_dict

{7: 'adds to economic inequality',
 93: 'Corruption favors the most connected and unscrupulous, rather than the efficient',
 95: 'Bribes also lead to a misdirection of resources',
 101: 'it distorts and retards development',
 102: 'corruption is a necessary trade-off'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

### 'countries with an imbalanced male female ratio skewed towards males should encourage parents to produce girls '

In [0]:
topic = 'countries with an imbalanced male female ratio skewed towards males should encourage parents to produce girls '

articles = topic_article_dict[topic]
articles

['Human sex ratio',
 'Sex selection',
 'Sex-selective abortion',
 'One-child policy']

#### Article 0 

In [0]:
with open(articles_dict[articles[0].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[0]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(1, 1332)

In [0]:
cdc_idx_dict

{143: 'Gender imbalance may result in the threat of social unrest, especially in the case of an excess of low-status young males unable to find spouses'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 1 

In [0]:
with open(articles_dict[articles[1].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[1]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(4, 1332)

In [0]:
cdc_idx_dict

{52: 'sex selection is an expression of reproductive rights',
 61: "China's gender imbalance is further increased by the One Child Policy",
 62: 'a lack of opportunity for many men to marry is believed to be producing increases in crime',
 68: 'if female babies worth their weight in rupees and yuan, economic and educational opportunities for girls would soon follow'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([df, ndf], axis=0, ignore_index=True)

#### Article 2 

In [0]:
with open(articles_dict[articles[2].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[2]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(2, 1332)

In [0]:
cdc_idx_dict

{58: 'Shortage of females has the effect of driving human trafficking',
 61: 'son preference, which results in harmful and unethical practices'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 3 

In [0]:
with open(articles_dict[articles[3].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[3]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(17, 1332)

In [0]:
cdc_idx_dict

{8: 'The policy is controversial both within and outside China because of the manner in which the policy has been implemented, and because of concerns about negative social consequences',
 9: "possible cause behind China's gender imbalance",
 55: 'it had proved "remarkably effective',
 62: "great success in helping to implement China's current economic growth",
 63: 'The reduction in the fertility rate and thus population growth has reduced the severity of problems that come with overpopulation',
 65: 'the focus of China on population control helps provide a better health service for women',
 67: 'The individual savings rate has increased since the one-child policy was introduced',
 72: 'less intrusive options, including those that emphasized delay and spacing of births, could have achieved the same results over an extended period of time',
 83: 'China could have expected a continued reduction in its fertility rate just from continued economic development, had it kept to the previous p

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

### 'democratic governments should require voters to present photo identification at the polling station '

In [0]:
topic = 'democratic governments should require voters to present photo identification at the polling station '

articles = topic_article_dict[topic]
articles

['Crawford v Marion County Election Board',
 'Voter suppression',
 'Help America Vote Act']

#### Article 0 

In [0]:
with open(articles_dict[articles[0].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[0]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(1, 1332)

In [0]:
cdc_idx_dict

{20: 'procedures for acquiring an ID were too burdensome and costly for some low income or elderly voters'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 1 

In [0]:
with open(articles_dict[articles[1].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[1]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(6, 1332)

In [0]:
cdc_idx_dict

{12: 'presenting them is a minor inconvenience when weighed against the possibility of ineligible voters affecting elections',
 13: 'photo ID requirements disproportionately affect minority and elderly voters',
 17: 'photo identification was necessary to prevent widespread voter fraud',
 18: 'would decrease voting',
 22: 'The danger of voter fraud threatens the integrity of the entire electoral process',
 28: 'it "has the potential to block millions of eligible American voters, and thus suppress the right to vote'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([df, ndf], axis=0, ignore_index=True)

#### Article 2 

In [0]:
with open(articles_dict[articles[2].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[2]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(2, 1332)

In [0]:
cdc_idx_dict

{59: 'may reduce rather than expand the electorate',
 63: 'voter identification laws that could suppress the turnout by voters'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

### 'endangered species should be protected '

In [0]:
topic = 'endangered species should be protected '

articles = topic_article_dict[topic]
articles

['Habitat conservation',
 'Conservation biology',
 'Biodiversity',
 'Habitat destruction',
 'Conservation in Australia',
 'Deep ecology',
 'Biocentrism (ethics)',
 'Extinction',
 'Melbourne Principles',
 'Environmental ethics',
 'Ecological effects of biodiversity',
 'Economics of biodiversity',
 'Convention on Biological Diversity']

In [0]:
len(articles)

13

#### Article 0 

In [0]:
with open(articles_dict[articles[0].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[0]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(1, 1332)

In [0]:
cdc_idx_dict

{9: 'The cost of repairing damaged ecosystems is considered to be much higher than the cost of conserving natural ecosystems'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 1 

In [0]:
with open(articles_dict[articles[1].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[1]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(2, 1332)

In [0]:
cdc_idx_dict

{124: 'rapid rates of biodiversity loss threatens the sustained well-being of humanity',
 144: 'species are irreplaceable components of the global ecosystem'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([df, ndf], axis=0, ignore_index=True)

#### Article 2 

In [0]:
with open(articles_dict[articles[2].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[2]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(10, 1332)

In [0]:
cdc_idx_dict

{114: "Earth's surviving biodiversity provides resources for increasing the range of food and other products suitable for human use",
 124: 'Biodiversity is also known to have an important role in reducing disaster risk, and in post-disaster relief',
 125: 'Biodiversity provides critical support for drug discovery and the availability of medicinal resources',
 128: 'Biodiversity has been critical to advances throughout the field of bionics',
 133: 'Biodiversity is also important to the security of resources',
 135: 'Biodiversity enriches leisure activities',
 142: 'biodiversity has intrinsic aesthetic and spiritual value to mankind',
 144: 'Biodiversity supports many ecosystem services',
 146: 'Biodiversity is directly involved in water purification',
 218: 'Loss of biodiversity results in the loss of natural capital that supplies ecosystem goods and services'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 3 

In [0]:
with open(articles_dict[articles[3].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[3]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(3, 1332)

In [0]:
cdc_idx_dict

{94: 'When biodiversity is lost, the environment loses many species that provide valuable and unique roles to the ecosystem',
 95: 'The environment and all its inhabitants rely on biodiversity',
 97: 'humans are losing animals that could have served as biological control agents'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 4 

In [0]:
with open(articles_dict[articles[4].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[4]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(3, 1332)

In [0]:
cdc_idx_dict

{2: 'wealth of biodiversity is important for future generations',
 4: 'biological richness is an unmeasurable aesthetic that may be developed into commercial recreational attractions',
 6: 'Research on natural processes can only occur if habitat is preserved and organisms continue to thrive'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 5 

In [0]:
with open(articles_dict[articles[5].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[5]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(5, 1332)

In [0]:
cdc_idx_dict

{4: 'like humanity, the living environment as a whole has the same right to live and flourish',
 15: 'the right of all forms to live is a universal right',
 42: 'Human life is dependent on the harmonious balance of interdependent relationships between organisms',
 71: 'the world does not exist as a resource to be freely exploited by humans',
 118: 'All life has intrinsic value'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 6

In [0]:
with open(articles_dict[articles[6].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[6]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(3, 1332)

In [0]:
cdc_idx_dict

{3: 'any actions which negatively affect the living systems of which we are a part, adversely affect us as well',
 4: 'all species have inherent value',
 8: 'each organism has a purpose and a reason for being'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 7

In [0]:
with open(articles_dict[articles[7].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[7]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(5, 1332)

In [0]:
cdc_idx_dict

{40: 'As long as species have been evolving, species have been going extinct',
 136: 'the loss of native species as a loss to ecotourism',
 140: 'People who live close to nature can be dependent on the survival of all the species in their environment',
 142: 'The very fact that a species is near extinction implies that its final demise will have negligible impact',
 150: 'since species become extinct "all the time" the disappearance of a few more will not destroy the ecosystem'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 8

In [0]:
with open(articles_dict[articles[8].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[8]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(3, 1332)

In [0]:
cdc_idx_dict

{20: 'Nature is more than a commodity for the benefit of humans',
 22: 'They warrant our respect, whether or not they are of immediate benefit to us',
 26: 'we have a responsibility to act as custodians for nature'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 9

In [0]:
with open(articles_dict[articles[9].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[9]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(1, 1332)

In [0]:
cdc_idx_dict

{39: 'it has extrinsic value – instrumental to the welfare of human beings'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 10

In [0]:
with open(articles_dict[articles[10].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[10]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(4, 1332)

In [0]:
cdc_idx_dict

{4: 'they appear to be more stable',
 22: 'there are direct economic consequences of losing diversity in certain ecosystems and in the world as a whole',
 23: 'Losing species means losing potential foods',
 70: 'diverse ecosystems actually resist invasion and disease better than their less diverse equivalents'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 11 

In [0]:
with open(articles_dict[articles[11].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[11]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(4, 1332)

In [0]:
cdc_idx_dict

{1: 'biodiversity is inherently valuable',
 2: 'Diverse ecosystems are typically more productive than non-diverse ones',
 3: "human economic productivity is largely reliant on Earth's ecosystems",
 44: 'Biodiversity is a source of economic wealth'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 12

In [0]:
with open(articles_dict[articles[12].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[12]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(3, 1332)

In [0]:
cdc_idx_dict

{12: 'is an integral part of the development process',
 21: 'there is a threat of significant reduction or loss of biological diversity',
 23: 'conservation will bring us significant environmental, economic and social benefits in return'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

### 'gambling '

In [0]:
topic = 'gambling '

articles = topic_article_dict[topic]
articles

['Problem gambling',
 'Charity gambling',
 'Gambling',
 'Gambling in the United States',
 'Lottery',
 'Economics of gambling',
 'Southern District of New York action against online poker players',
 'Gamblers Anonymous',
 'Unlawful Internet Gambling Enforcement Act of 2006',
 'Online gambling',
 'Casino']

In [0]:
len(articles)

11

#### Article 0 

In [0]:
with open(articles_dict[articles[0].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[0]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(10, 1332)

In [0]:
cdc_idx_dict

{3: 'Pathological gambling is a common disorder that is associated with social costs',
 7: 'leads to adverse consequences for the gambler, others, or for the community',
 35: 'pathological gambling is an addiction similar to chemical addiction',
 46: 'Problem gamblers have the erroneous belief that if they keep playing, they will eventually win',
 60: 'Pathological gambling is similar to many other impulse control disorders',
 63: 'As debts build up people turn to other sources of money such as theft',
 67: 'Compulsive gambling is often very detrimental to personal relationships',
 70: 'Abuse is also common in homes where pathological gambling is present',
 73: 'Problem gambling is often associated with increased suicidal ideation',
 74: 'problem gambling increases the lifetime risk of suicide'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 1 

In [0]:
with open(articles_dict[articles[1].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[1]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(1, 1332)

In [0]:
cdc_idx_dict

{2: 'profits from the venture go to the charity or group of charities, rather than to a municipality or private casino'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([df, ndf], axis=0, ignore_index=True)

#### Article 2 

In [0]:
with open(articles_dict[articles[2].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[2]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(11, 1332)

In [0]:
cdc_idx_dict

{7: 'Gambling is also a major international commercial activity',
 10: 'religious authorities generally disapprove of gambling',
 12: 'there is no moral impediment to gambling',
 13: 'most legal jurisdictions limit gambling',
 19: 'Many jurisdictions, local as well as national, either ban gambling or heavily control it by licensing',
 20: 'regulation generally leads to gambling tourism and illegal gambling',
 21: 'legal gambling provides significant government revenue',
 33: 'high-payoffs have very low probability, a house bias can quite easily be missed',
 46: 'Betting on team sports has become an important service industry in many countries',
 79: 'gambling, like any behavior which involves variation in brain chemistry, can become a psychologically addictive and harmful behavior',
 80: 'gamblers persist in gambling even after repeated losses'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 3 

In [0]:
with open(articles_dict[articles[3].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[3]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(4, 1332)

In [0]:
cdc_idx_dict

{3: 'it leads to increased political corruption, compulsive gambling and higher crime rates',
 4: 'gambling is a type of regressive tax on the individuals',
 25: "nearly all the western states' governments outlawed gambling",
 50: 'the majority of the states run some type of lottery to raise funds for state operations'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 4 

In [0]:
with open(articles_dict[articles[4].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[4]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(6, 1332)

In [0]:
cdc_idx_dict

{37: 'The lotteries proved very popular and were hailed as a painless form of taxation',
 128: 'There can be some problems associated with winning a lottery jackpot',
 130: 'Winners sometimes feel anomie from the dramatic change of lifestyles',
 131: 'lotteries facilitate a higher degree of inequality than a society should have',
 136: 'any social system that allocates resources based on chance is one that is corrupt',
 137: 'any form of gambling, is susceptible to fraud'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 5 

In [0]:
with open(articles_dict[articles[5].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[5]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(9, 1332)

In [0]:
cdc_idx_dict

{5: 'As a result of gambling, some are driven to extreme lengths to cover debt',
 7: 'addicted gamblers spend most of their energy following their addiction',
 8: 'They cost companies loss of productivity and profit',
 9: 'Gamblers themselves may suffer from depression and bankruptcy',
 11: 'The social costs to society are varied',
 16: 'Gambling provides jobs',
 24: 'gambling increases aggregate demand for goods and services in the economy',
 26: 'money goes directly towards stimulating the economy',
 41: 'it is economically beneficial for a state to allow and support gambling institutions'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 6

In [0]:
with open(articles_dict[articles[6].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[6]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(1, 1332)

In [0]:
cdc_idx_dict

{28: 'internet gambling as a legitimate activity that citizens have the right to engage in'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 7

In [0]:
with open(articles_dict[articles[7].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[7]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(1, 1332)

In [0]:
cdc_idx_dict

{4: 'problem gambling has been shown to cause dysfunctional families'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 8

In [0]:
with open(articles_dict[articles[8].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[8]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(2, 1332)

In [0]:
cdc_idx_dict

{36: 'Internet gambling is a growing problem for banks and credit card companies',
 86: 'regulation of online gambling is a better alternative'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 9

In [0]:
with open(articles_dict[articles[9].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[9]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(5, 1332)

In [0]:
cdc_idx_dict

{20: 'Internet gambling has become one of the most popular and lucrative business present on the Internet',
 132: 'a popular leisure activity enjoyed in many forms by millions of people',
 147: 'Various forms of online gambling are legal and regulated in many countries',
 161: 'the high-speed instant gratification of Internet games and the high level of privacy they offer may exacerbate problem and pathological gambling',
 168: 'electronic funds transfers inherent in online gambling are being exploited by criminal interests'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 10

In [0]:
with open(articles_dict[articles[10].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[10]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(7, 1332)

In [0]:
cdc_idx_dict

{3: 'the social and economic consequences of casino gambling outweigh the initial revenue that may be generated',
 22: 'gambling in some form or another has been seen in almost every society in history',
 35: 'Most games played have mathematically-determined odds that ensure the house has at all times an advantage over the players',
 43: 'the modern day slot machine is addictive',
 97: 'Given the large amounts of currency handled within a casino, both patrons and staff may be tempted to cheat and steal',
 108: 'One area of controversy surrounding casinos is their relationship to crime rates',
 109: 'a positive relationship between casinos and crime'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

### 'housewives should be paid for their work '

In [0]:
topic = 'housewives should be paid for their work '
articles = topic_article_dict[topic]
articles

['Care work', 'Feminist economics']

#### Article 0 

In [0]:
with open(articles_dict[articles[0].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[0]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(9, 1332)

In [0]:
cdc_idx_dict

{12: 'Care work is essential to human well-being',
 16: 'care work is directly related to the health of a society as well as to the economic development of that society',
 30: 'the household sector is a wealth spender, and not a wealth creator',
 31: 'the household sector plays a very important role in wealth creation',
 42: "women's jobs in the home were not part of any market",
 102: 'the household does not form wealth',
 106: 'the quality of care may decrease in response to the call for profit-making and efficiency',
 133: 'care work should not be done for pay because pay will undermine the intrinsic motivations for this work',
 136: 'care work should be better compensated by the market'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 1 

In [0]:
with open(articles_dict[articles[1].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[1]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(3, 1332)

In [0]:
cdc_idx_dict

{111: 'care work as central to economic development and human well-being',
 114: 'unpaid domestic work is just as valuable as paid work',
 161: 'domestic labor is work'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([df, ndf], axis=0, ignore_index=True)

### 'institute a mandatory retirement age '

In [0]:
topic = 'institute a mandatory retirement age '

articles = topic_article_dict[topic]
articles

['Ageing', 'Memory and aging', 'US age discrimination', 'Mandatory retirement']

#### Article 0 

In [0]:
with open(articles_dict[articles[0].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[0]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(2, 1332)

In [0]:
cdc_idx_dict

{224: 'benefits both society and older individuals',
 228: 'the more active elderly people are, the more likely they are to be satisfied with life'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 1 

In [0]:
with open(articles_dict[articles[1].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[1]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(1, 1332)

In [0]:
cdc_idx_dict

{14: 'Normal aging is associated with a decline in various memory abilities in many cognitive tasks'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([df, ndf], axis=0, ignore_index=True)

#### Article 2 

In [0]:
with open(articles_dict[articles[2].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[2]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(3, 1332)

In [0]:
cdc_idx_dict

{7: 'they lack up-to-date skills',
 15: 'older people as a group that acts as more of a cost than an asset to the company',
 17: 'The older generation workers often require more benefits from the company due to their age'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 3 

In [0]:
with open(articles_dict[articles[3].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[3]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(2, 1332)

In [0]:
cdc_idx_dict

{2: 'the practice as a form of age discrimination, or ageism',
 7: 'it is unlawful to discriminate against a person because of hisher age'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

### 'intellectual property rights '

In [0]:
topic = 'intellectual property rights '

articles = topic_article_dict[topic]
articles

['Property',
 'Patent',
 'Intellectual property',
 'Societal views on patents',
 'Libertarian perspectives on intellectual property',
 'Missionary Church of Kopimism',
 'Philosophy of copyright',
 'Anti-copyright',
 'Copyright']

In [0]:
len(articles)

9

#### Article 0 

In [0]:
with open(articles_dict[articles[0].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[0]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(3, 1332)

In [0]:
cdc_idx_dict

{34: 'property rights encourage their holders to develop the property',
 53: 'property rights encourage their holders to develop their property or generate wealth',
 193: 'interference by the state over the centuries in property ownership has had dire consequences for justice as well as for economic productivity'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 1 

In [0]:
with open(articles_dict[articles[1].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[1]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(6, 1332)

In [0]:
cdc_idx_dict

{117: 'Patents provide incentives for economically efficient research and development',
 124: 'patents facilitate and encourage disclosure of innovations into the public domain for the common good',
 125: 'If inventors did not have the legal protection of patents, in many cases, they would prefer or tend to keep their inventions secret',
 126: 'Awarding patents generally makes the details of new technology publicly available',
 127: "when a patent's term has expired, the public record ensures that the patentee's idea is not lost to humanity",
 138: 'patents have been criticized as inconsistent with free trade'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([df, ndf], axis=0, ignore_index=True)

#### Article 2 

In [0]:
with open(articles_dict[articles[2].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[2]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(11, 1332)

In [0]:
cdc_idx_dict

{23: 'intellectual property is desirable because it encourages innovation',
 24: 'creators will not have sufficient incentive to invent unless they are legally entitled to capture the full social value of their inventions',
 28: 'intellectual property rights are essential to maintaining economic growth',
 29: 'give statutory expression to the moral and economic rights of creators in their creations',
 30: 'encourage fair trading which would contribute to economic and social development',
 31: 'effective enforcement of intellectual property rights is critical to sustaining economic growth',
 34: 'a positive correlation between the strengthening of the IP system and subsequent economic growth',
 35: 'IP can be a disincentive to innovation',
 49: 'they promote public welfare',
 54: 'To violate intellectual property is therefore no different morally than violating other property rights',
 64: 'infringes on the right to own tangible property'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 3 

In [0]:
with open(articles_dict[articles[3].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[3]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(4, 1332)

In [0]:
cdc_idx_dict

{4: 'patents were obstructing research',
 8: 'patents as state-granted monopolies inconsistent with free trade',
 10: 'give rise to "troll" entities',
 27: 'intellectual property rights may become so fragmented that, effectively, no one can take advantage of them'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 4 

In [0]:
with open(articles_dict[articles[4].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[4]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(8, 1332)

In [0]:
cdc_idx_dict

{3: "Patents and copyrights are the legal implementation of the base of all property rights: a man's right to the product of his mind",
 18: 'intellectual property laws can actually hinder innovation',
 22: 'they divert resources from research and development to patent filing and lawsuits',
 24: 'are not a legitimate subject of property rights',
 25: "the only way that intellectual property rights can be implemented is by limiting others' physical property rights",
 29: 'one cannot own information without owning other people',
 31: 'the patent monopoly..consists in protecting inventors..against competition for a period long enough to extort from the people a reward enormously in excess of the labor measure of their services',
 35: 'have hindered the ability of consumers to buy the products they want'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 5 

In [0]:
with open(articles_dict[articles[5].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[5]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(1, 1332)

In [0]:
cdc_idx_dict

{4: 'all information should be freely distributed and unrestricted'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 6

In [0]:
with open(articles_dict[articles[6].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[6]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(10, 1332)

In [0]:
cdc_idx_dict

{16: 'intellectual property as a necessary way of incentivising the creation of new creative works',
 22: 'in the absence of intellectual property protections such as copyright and patents, various types of intangible assets would be under-produced',
 28: 'without a significant period of legal protection of their future income, many valuable books and artworks would not be created',
 32: 'Without a feasible way to recoup investments of creative time through copyright, there would be little economic incentive to produce',
 35: 'it has been largely successful in financing the creation and distribution of a wide variety of works',
 45: 'has always served simply to enrich a few at the expense of creativity',
 50: 'the current (international) copyright system undermines its own goal',
 59: 'quality works can be created even in the absence of a copyright-enforced monopoly rent',
 63: 'Copyright can also be used to stifle political criticism',
 69: 'has made and continues to make a valuable e

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 7

In [0]:
with open(articles_dict[articles[7].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[7]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(10, 1332)

In [0]:
cdc_idx_dict

{2: 'granting developers temporary monopolies over their works encourages further development and creativity',
 3: 'serves to enrich a few at the expense of creativity',
 28: 'copyright is invalid because, unlike physical property, intellectual property is not scarce',
 32: 'is obsolete',
 38: 'the cost of trying to enforce it is unreasonable',
 39: 'knowledge should be "shared in solidarity',
 42: 'copyright law as preventing or slowing human progress',
 51: 'artists cannot produce new works without an economic incentive',
 56: 'is a fundamental right for both creators and consumers',
 57: 'content creators would not have incentive to produce their products if they cannot be guaranteed payment'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 8

In [0]:
with open(articles_dict[articles[8].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[8]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(2, 1332)

In [0]:
cdc_idx_dict

{20: 'the law is fair and just',
 32: 'patent and copyright laws support in fundamental and thoroughgoing ways the expansion of the range of creative human activities that can be commodified'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

### 'limit the right to bear arms '

In [0]:
topic = 'limit the right to bear arms '

articles = topic_article_dict[topic]
articles

['Gun violence in the United States',
 'Gun culture',
 'National Rifle Association',
 'Second Amendment to the United States Constitution',
 'Gun politics in the United States',
 'Political arguments of gun politics in the United States',
 'Gun politics in Brazil',
 'Gun politics',
 'Gun violence and gun control in Texas',
 'Gun control']

In [0]:
len(articles)

10

#### Article 0 

In [0]:
with open(articles_dict[articles[0].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[0]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(6, 1332)

In [0]:
cdc_idx_dict

{43: 'the likelihood that a death will result is significantly increased when either the victim or the attacker has a firearm',
 59: 'if guns were less available, criminals may likely commit the crime anyway',
 89: 'criminal use of guns is far more common than self-defense use of guns',
 99: 'more guns can reduce crime',
 100: 'limiting access to guns by law-abiding people makes them more vulnerable to armed criminals',
 171: 'potential crime victims might be carrying firearms, and thus serve as a deterrent against crime'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 1 

In [0]:
with open(articles_dict[articles[1].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[1]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(3, 1332)

In [0]:
cdc_idx_dict

{21: 'citizens should not be prevented from having guns unless they have done something to show that they are not to be trusted with them',
 22: 'guns provide some level of protection against criminality and tyranny',
 23: 'widespread gun ownership is protection against tyranny'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([df, ndf], axis=0, ignore_index=True)

#### Article 2 

In [0]:
with open(articles_dict[articles[2].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[2]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(2, 1332)

In [0]:
cdc_idx_dict

{4: 'gun ownership is a civil liberty',
 204: 'citizens need to arm themselves to safeguard political liberties against threats by the government'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 3 

In [0]:
with open(articles_dict[articles[3].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[3]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(1, 1332)

In [0]:
cdc_idx_dict

{41: 'people have a right to bear arms for the defence of themselves and the state'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 4 

In [0]:
with open(articles_dict[articles[4].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[4]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(3, 1332)

In [0]:
cdc_idx_dict

{18: 'most guns are in the hands of people who are unlikely to misuse them',
 68: 'the right to bear arms is absolute and unqualified',
 119: 'Americans have an individual right described in the Second Amendment to possess firearms'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 5 

In [0]:
with open(articles_dict[articles[5].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[5]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(10, 1332)

In [0]:
cdc_idx_dict

{4: 'gun possession is a fundamental civil right',
 41: 'the people have a right to "keep and bear arms" as a protection from the government',
 43: 'the right to bear arms is necessary for the citizens to protect themselves from the "tyranny in government',
 44: "an armed citizenry is the population's last line of defense against tyranny by their own government",
 86: 'making civilian ownership of firearms illegal would increase the crime rate',
 88: 'increased gun ownership leads to higher levels of crime',
 129: 'are effectively deterred by armed intended victims',
 149: 'people who keep a gun at home increase their risk of homicide',
 158: 'allowing law-abiding citizens to carry concealed firearms, deters crime',
 159: 'The possibility of getting shot by an armed victim is a substantial deterrent to crime'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 6

In [0]:
with open(articles_dict[articles[6].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[6]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(5, 1332)

In [0]:
cdc_idx_dict

{16: 'no rights should ever be allowed to be taken away by the government',
 18: 'it would be of no use to forbid law-abiding citizens to own legal registered guns in accordance to the law',
 19: 'regions where gun ownership is widespread were the ones with the smallest number of gun-related deaths',
 21: 'their only reason to exist is to harm others',
 24: 'guns are needed for personal security'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 7

In [0]:
with open(articles_dict[articles[7].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[7]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(9, 1332)

In [0]:
cdc_idx_dict

{364: 'there were substantial correlations between gun ownership and gun-related suicide and homicide rates',
 368: 'easier access to guns lead to more violence',
 390: 'laws which make it easier for law-abiding citizens to get a permit to carry a gun in public places, cause reductions in crime',
 391: 'allowing law-abiding citizens to carry concealed firearms deters crime',
 394: 'gun laws generally had no significant effect on violent crime rates or suicide rates',
 407: 'laws that forbid the carrying of arms.. disarm only those who are neither inclined nor determined to commit crimes',
 408: 'they serve rather to encourage than to prevent homicides',
 428: 'gun possession is a civil right',
 447: "an armed citizens' militia can help deter crime and tyranny"}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 8

In [0]:
with open(articles_dict[articles[8].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[8]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(6, 1332)

In [0]:
cdc_idx_dict

{3: 'gun control laws are effective in reducing gun-related accidents and crime',
 4: 'gun control laws are ineffective in reducing gun-related accidents and crime',
 6: "the Second Amendment guarantees an individual's right to own a firearm",
 71: 'fewer restrictions on handguns will result in increasing numbers of injuries and deaths',
 78: 'Defensive use of guns is both common and effective in preventing injury and property loss',
 85: 'gun bans would increase injuries and deaths'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 9

In [0]:
with open(articles_dict[articles[9].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[9]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(11, 1332)

In [0]:
cdc_idx_dict

{5: 'there were significant correlations between gun ownership and gun-related suicide and homicide rates',
 9: 'the access to guns leads to more violence',
 38: 'laws which make it easier for law-abiding citizens to get a permit to carry a gun in public places, cause reductions in crime',
 39: 'allowing law-abiding citizens to carry concealed firearms deters crime',
 45: 'gun laws generally had no significant effect on violent crime rates or suicide rates',
 56: 'laws that forbid the carrying of arms.. disarm only those who are neither inclined nor determined to commit crimes',
 57: 'they serve rather to encourage than to prevent homicides',
 78: 'gun ownership as a civil right',
 81: 'gun possession is a civil right',
 105: "an armed citizens' militia can help deter crime and tyranny",
 172: 'gun control works'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

### 'make physical education compulsory '

In [0]:
topic = 'make physical education compulsory '

articles = topic_article_dict[topic]
articles

['Physical exercise',
 'Physical fitness',
 'Overweight',
 'Active Living',
 'Exercise trends',
 'Health',
 'Physical education',
 'Democratic education',
 'Obesity',
 'Recess (break)',
 'Childhood obesity',
 'Summerhill School',
 'Sedentary lifestyle']

In [0]:
len(articles)

13

#### Article 0 

In [0]:
with open(articles_dict[articles[0].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[0]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(10, 1332)

In [0]:
cdc_idx_dict

{3: 'helps prevent the "diseases of affluence',
 4: 'It also improves mental health',
 5: 'physical exercise may help decrease some of the effects of childhood and adult obesity',
 21: 'strengthening the immune system',
 23: 'Frequent and regular aerobic exercise has been shown to help prevent or treat serious and life-threatening chronic conditions',
 33: 'Not everyone benefits equally from exercise',
 45: 'moderate exercise has a beneficial effect on the human immune system',
 68: 'Exercise alone is a potential prevention method andor treatment for mild forms of depression',
 76: 'Too much exercise can be harmful',
 79: 'Inappropriate exercise can do more harm than good'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 1 

In [0]:
with open(articles_dict[articles[1].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[1]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(2, 1332)

In [0]:
cdc_idx_dict

{11: 'Physical fitness can also prevent or treat many chronic health conditions',
 13: 'To stay healthy it is important to engage in physical activity'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([df, ndf], axis=0, ignore_index=True)

#### Article 2 

In [0]:
with open(articles_dict[articles[2].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[2]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(1, 1332)

In [0]:
cdc_idx_dict

{3: 'Excess weight has reached epidemic proportions globally'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 3 

In [0]:
with open(articles_dict[articles[3].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[3]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(1, 1332)

In [0]:
cdc_idx_dict

{12: 'There are many health related benefits to being physically active'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 4 

In [0]:
with open(articles_dict[articles[4].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[4]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(2, 1332)

In [0]:
cdc_idx_dict

{10: 'Physical inactivity is increasing or high among many groups in the population',
 15: 'inactivity one of the leading preventable causes of death worldwide'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 5 

In [0]:
with open(articles_dict[articles[5].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[5]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(1, 1332)

In [0]:
cdc_idx_dict

{21: 'people can improve their health via exercise'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 6

In [0]:
with open(articles_dict[articles[6].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[6]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(2, 1332)

In [0]:
cdc_idx_dict

{5: 'Introducing students to activities like bowling, walkinghiking, or Frisbee at an early age can help students develop good activity habits that will carry over into adulthood',
 7: 'Teaching non-traditional sports to students may also provide the necessary motivation for students to increase their activity, and can help students learn about different cultures'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 7

In [0]:
with open(articles_dict[articles[7].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[7]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(1, 1332)

In [0]:
cdc_idx_dict

{91: 'the model of ideal education is that which occurs when people go on their own initiative to discover things'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 8

In [0]:
with open(articles_dict[articles[8].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[8]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(1, 1332)

In [0]:
cdc_idx_dict

{65: 'a lack of physical activity is thought to explain most cases of obesity'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 9

In [0]:
with open(articles_dict[articles[9].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[9]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(1, 1332)

In [0]:
cdc_idx_dict

{17: 'youth do not get the physical outlet needed not only for their cognitive development but for their physical health'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 10

In [0]:
with open(articles_dict[articles[10].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[10]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(3, 1332)

In [0]:
cdc_idx_dict

{56: 'children who fail to engage in regular physical activity are at greater risk of obesity',
 59: 'Physical inactivity as a child could result in physical inactivity as an adult',
 73: 'Childhood inactivity is linked to obesity'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 11 

In [0]:
with open(articles_dict[articles[11].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[11]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(2, 1332)

In [0]:
cdc_idx_dict

{0: 'school should be made to fit the child, rather than the other way around',
 16: 'children learn best with freedom from coercion'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 12

In [0]:
with open(articles_dict[articles[12].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[12]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(1, 1332)

In [0]:
cdc_idx_dict

{22: 'many children lead a relatively sedentary lifestyle'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

### 'multiculturalism '

In [0]:
topic = 'multiculturalism '

articles = topic_article_dict[topic]
articles

['Multiculturalism',
 'Multiculturalism in Canada',
 'Multiculturalism in Australia',
 'Multiculturalism in the Netherlands',
 'Cultural diversity',
 'All for Australia',
 'Muscular liberalism',
 'Melting pot',
 'Convention on the Protection and Promotion of the Diversity of Cultural Expressions',
 'Cultural imperialism',
 'Declaration on the Rights of Indigenous Peoples',
 'Universal Declaration on Cultural Diversity',
 'Canadian identity',
 'Nationalism',
 'Multicultural education',
 'Minority group',
 'Criticism of multiculturalism',
 'Cultural competence',
 'Alliance of Civilizations',
 'Acculturation',
 'Cosmopolitanism',
 'Interminority racism',
 'Leitkultur',
 'Interculturalism']

In [0]:
len(articles)

24

#### Article 0 

In [0]:
with open(articles_dict[articles[0].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[0]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(7, 1332)

In [0]:
cdc_idx_dict

{78: 'state multiculturalism has failed',
 82: 'is seen as an essential feature of the nation and the nation-state',
 200: 'allows people to truly express who they are within a society',
 204: 'promotes respect for the dignity of the lives and voices of the forgotten',
 205: 'multiculturalism tries to restore a sense of wholeness in a postmodern era that fragments human life and thought',
 210: "ultimately erodes the host nations' distinct culture",
 212: 'the more racially diverse a community is, the greater the loss of trust'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 1 

In [0]:
with open(articles_dict[articles[1].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[1]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(5, 1332)

In [0]:
cdc_idx_dict

{61: 'the strongest nations are those that are made up of different racial elements',
 135: 'multiculturalism helps in bringing together immigrants and minorities in the country',
 147: 'official multiculturalism limits the freedom of minority members',
 153: 'multiculturalism works better in theory than in practice',
 154: 'it hinders equity and equality in society'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([df, ndf], axis=0, ignore_index=True)

#### Article 2 

In [0]:
with open(articles_dict[articles[2].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[2]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(4, 1332)

In [0]:
cdc_idx_dict

{38: 'a multicultural society could never be strong',
 49: 'many multicultural societies have failed',
 51: 'it is a perilous concept on which to found policy',
 58: 'multiculturalism has resulted in political corruption'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 3 

In [0]:
with open(articles_dict[articles[3].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[3]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(9, 1332)

In [0]:
cdc_idx_dict

{26: 'would lead to acceptance of barbaric practices',
 32: 'homogeneity and integration are necessary for a society',
 36: 'multiculturalism and immigration led to adaptation problems',
 40: 'immigrants must always lose their own culture - that is the price of immigration',
 43: 'lack of integration threatened society',
 47: 'A liberal democracy cannot be multicultural',
 52: 'Democracy and the rule of law could only be restored by abolishing multiculturalism',
 61: 'the emphasis on group identity and group rights diminished individual liberty for those within the minorities',
 69: 'Human beings are equal; cultures are not'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 4 

In [0]:
with open(articles_dict[articles[4].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[4]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(4, 1332)

In [0]:
cdc_idx_dict

{9: 'cultural diversity may be vital for the long-term survival of humanity',
 10: 'cultural diversity is as necessary for humankind as biodiversity is for nature',
 13: 'it is unethical deliberately to conserve "less developed" societies',
 23: 'it is in the best interests of individuals and of humanity as a whole that all people adhere to a specific model for society'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 5 

In [0]:
with open(articles_dict[articles[5].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[5]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(1, 1332)

In [0]:
cdc_idx_dict

{10: 'Multiculturalism tends to emphasize the rights of ethnic minorities at the expense of the majority'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 6

In [0]:
with open(articles_dict[articles[6].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[6]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(1, 1332)

In [0]:
cdc_idx_dict

{5: 'multiculturalism has shifted from tolerating multiple cultures to tolerating multiple value systems, which can be hostile to liberalism'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 7

In [0]:
with open(articles_dict[articles[7].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[7]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(5, 1332)

In [0]:
cdc_idx_dict

{2: 'cultural differences within society are valuable and should be preserved',
 120: "one does not need to assimilate or abandon one's heritage in order to blend in",
 128: 'assimilation can hurt minority cultures by stripping away their distinctive features',
 141: 'separating citizens by ethnicity or race and providing immigrant groups "special privileges" can harm the very groups they are intended to help',
 145: 'the multiculturalist policy of freer immigration is unworkable in an era in which the supply of immigrants from third world countries seems limitless'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 8

In [0]:
with open(articles_dict[articles[8].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[8]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(1, 1332)

In [0]:
cdc_idx_dict

{6: 'promotion and maintenance of cultural diversity are an essential requirement for sustainable development'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 9

In [0]:
with open(articles_dict[articles[9].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[9]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(2, 1332)

In [0]:
cdc_idx_dict

{63: 'it makes available more ways of solving problems and responding to catastrophes',
 108: 'The greater public good warrants eliminating those cultural characteristics that promote conflict or prevent harmony'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 10

In [0]:
with open(articles_dict[articles[10].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[10]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(2, 1332)

In [0]:
cdc_idx_dict

{35: 'Indigenous peoples have the right to the dignity and diversity of their cultures',
 84: "the distinctiveness of people's identity and their rights to preserve their heritage should be acknowledged"}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 11 

In [0]:
with open(articles_dict[articles[11].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[11]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(2, 1332)

In [0]:
cdc_idx_dict

{2: 'cultural diversity is as necessary for humankind as biodiversity is for the nature',
 3: 'it is the common heritage of humanity'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 12

In [0]:
with open(articles_dict[articles[12].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[12]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(1, 1332)

In [0]:
cdc_idx_dict

{203: 'cultural appreciation of ethnic and religious diversity promotes a greater willingness to tolerate political differences'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 13

In [0]:
with open(articles_dict[articles[13].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[13]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(1, 1332)

In [0]:
cdc_idx_dict

{4: 'multinationality in a single state should necessarily comprise the right to express and exercise national identity even by minorities'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 14 

In [0]:
with open(articles_dict[articles[14].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[14]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(11, 1332)

In [0]:
cdc_idx_dict

{7: 'Multicultural classrooms promote decision-making and critical thinking',
 16: 'multiculturalism is a tool for instilling students with pride and confidence in their unique and special backgrounds',
 21: 'Multicultural education provides a relatively fairer learning environment for international students',
 24: 'multicultural education may cause abandonment of original cultural',
 57: 'Diversity is intrinsically valuable to the dominant culture',
 99: 'A homogeneous community grounded on consensus may be unable to criticize the injustice and exclusionary practices that undermine it',
 100: 'Reform of cultural pathology often comes from the recognition of difference',
 112: 'Multiculturalism is a developmental journey through which an individual enhances knowledge and skills about different cultures',
 131: 'Multicultural education in public schools would promote acceptance of diversity',
 139: 'Citizens in a diverse democratic society should be able to maintain attachments to their

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 15

In [0]:
with open(articles_dict[articles[15].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[15]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(4, 1332)

In [0]:
cdc_idx_dict

{78: "recognition and rights accorded to specific groups may interfere with the state's need to establish a cohesive identity",
 80: 'where members of minorities see that their specific needs and ambitions have been acknowledged and catered for, they will commit themselves more willingly to accepting the legitimacy of the nation',
 83: 'These may be considered necessary because the minority group in question is socially disadvantaged',
 89: 'the political function of rights is precisely to protect minorities from oppression by majorities'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 16

In [0]:
with open(articles_dict[articles[16].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[16]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(16, 1332)

In [0]:
cdc_idx_dict

{10: 'A community of separate cultures fosters a rights mentality, rather than a responsibilities mentality',
 11: 'It is divisive',
 12: 'It works against quick and effective integration',
 16: 'multiculturalism obscures the social costs associated with large scale immigration',
 22: 'many multicultural societies have failed',
 24: 'It is divisive',
 25: 'It threatens social cohesion',
 33: 'official multiculturalism limits the freedom of minority members',
 39: 'multiculturalism works better in theory than in practice',
 40: 'it hinders equity and equality in society',
 54: 'would lead to acceptance of barbaric practices',
 109: 'multiculturalism undermined national unity',
 130: 'some forms of multiculturalism can divide people',
 132: 'multiculturalism to be dangerous to the West',
 140: 'the more racially diverse a community is, the greater the loss of trust',
 156: 'it creates friction within society'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 17

In [0]:
with open(articles_dict[articles[17].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[17]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(2, 1332)

In [0]:
cdc_idx_dict

{9: 'Diversity must be prevalent and valued',
 11: 'differences are recognized as a uniting component rather than a separating one'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 18 

In [0]:
with open(articles_dict[articles[18].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[18]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(1, 1332)

In [0]:
cdc_idx_dict

{60: 'diversity brings progress and social cohesion'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 19

In [0]:
with open(articles_dict[articles[19].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[19]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(1, 1332)

In [0]:
cdc_idx_dict

{233: 'the community is enriched as difference accrues'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 20

In [0]:
with open(articles_dict[articles[20].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[20]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(1, 1332)

In [0]:
cdc_idx_dict

{72: 'there is no rational ground for curtailing the cultural freedoms'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 21

In [0]:
with open(articles_dict[articles[21].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[21]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(1, 1332)

In [0]:
cdc_idx_dict

{86: 'Multiculturalism can become a polite and euphemistic way of affirming and persisting unequal power relationships'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 22 

In [0]:
with open(articles_dict[articles[22].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[22]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(2, 1332)

In [0]:
cdc_idx_dict

{36: 'had reached the end of its useful life',
 37: 'Multiculturalism could not be allowed to create a society where all values were equal'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 23

In [0]:
with open(articles_dict[articles[23].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[23]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(2, 1332)

In [0]:
cdc_idx_dict

{14: 'should be treated and promoted equally',
 26: "people have the right to maintain an affiliation with one's ethnic group"}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

### 'parents to genetically screen foetuses for heritable diseases '

In [0]:
topic = 'parents to genetically screen foetuses for heritable diseases '

articles = topic_article_dict[topic]
articles

['Preimplantation genetic diagnosis',
 'Genetic testing',
 'Human genetic engineering',
 'Prenatal diagnosis',
 'In vitro fertilisation']

#### Article 0 

In [0]:
with open(articles_dict[articles[0].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[0]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(1, 1332)

In [0]:
cdc_idx_dict

{200: 'it involves the destruction of human life'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 1 

In [0]:
with open(articles_dict[articles[1].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[1]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(2, 1332)

In [0]:
cdc_idx_dict

{72: 'The procedures used for prenatal testing carry a small but real risk of losing the pregnancy',
 97: 'There is no stronger antidote for fear than information'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([df, ndf], axis=0, ignore_index=True)

#### Article 2 

In [0]:
with open(articles_dict[articles[2].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[2]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(1, 1332)

In [0]:
cdc_idx_dict

{19: 'every child has the right to be born free from preventable diseases'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 3 

In [0]:
with open(articles_dict[articles[3].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[3]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(3, 1332)

In [0]:
cdc_idx_dict

{21: 'Having this information in advance of the birth means that healthcare staff as well as parents can better prepare themselves for the delivery of a child with a health problem',
 119: 'may give the option of fetal surgery during pregnancy',
 128: 'Early diagnosis gives the parents time to research and discuss post-natal treatment and care'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 4 

In [0]:
with open(articles_dict[articles[4].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[4]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(2, 1332)

In [0]:
cdc_idx_dict

{187: 'PGS can reduce the risk of multiple pregnancies because fewer embryos are needed for implantation',
 250: 'intentionally culling out blind or deaf embryos might prevent considerable future suffering'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

### 'partial birth abortions '

In [0]:
topic = 'partial birth abortions '

articles = topic_article_dict[topic]
articles

['Religion and abortion',
 'Legalized abortion and crime effect',
 'Gonzales v Carhart',
 'Roe v Wade',
 'Stenberg v Carhart',
 'Societal attitudes towards abortion',
 'Philosophical aspects of the abortion debate',
 'Intact dilation and extraction',
 'Partial-Birth Abortion Ban Act',
 'Support for the legalization of abortion',
 'Judaism and abortion',
 'Abortion and mental health',
 'Abortion debate',
 'Abortion in the United States']

In [0]:
len(articles)

14

#### Article 0 

In [0]:
with open(articles_dict[articles[0].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[0]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(2, 1332)

In [0]:
cdc_idx_dict

{2: 'abortion, which would then involve the deliberate destruction of life, should be rejected',
 6: 'abortion should be approved or disapproved according to each circumstance'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 1 

In [0]:
with open(articles_dict[articles[1].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[1]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(1, 1332)

In [0]:
cdc_idx_dict

{5: 'abortion has negative effects on society'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([df, ndf], axis=0, ignore_index=True)

#### Article 2 

In [0]:
with open(articles_dict[articles[2].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[2]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(2, 1332)

In [0]:
cdc_idx_dict

{26: 'the state has an interest in preserving fetal life',
 27: 'intact dilation and extraction procedure is never needed to protect the health of a pregnant woman'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 3 

In [0]:
with open(articles_dict[articles[3].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[3]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(3, 1332)

In [0]:
cdc_idx_dict

{46: 'abortion a fundamental right',
 50: 'the decision to abort must be left to the mother and her physician',
 150: 'a state\'s ban on "partial birth abortion" was unconstitutional'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 4 

In [0]:
with open(articles_dict[articles[4].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[4]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(4, 1332)

In [0]:
cdc_idx_dict

{5: 'a state law banning certain forms of abortion was unconstitutional',
 11: 'it would be safer and would involve fewer risks for the women',
 20: 'D&X abortions were never medically necessary',
 29: 'government had no right to force doctors to perform any procedure other than what they felt would be the safest'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 5 

In [0]:
with open(articles_dict[articles[5].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[5]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(2, 1332)

In [0]:
cdc_idx_dict

{10: 'abortion should be legal in all circumstances',
 18: 'The government should not interfere with a woman’s ability to have an abortion'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 6

In [0]:
with open(articles_dict[articles[6].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[6]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(12, 1332)

In [0]:
cdc_idx_dict

{3: 'induced abortion is the deliberate and unjust killing of the embryo in violation of its right to life',
 4: 'the law should not criminalize just exercises of the right to control one’s own body',
 8: 'criminalizing abortion will lead to the deaths of many women through ‘back-alley abortions',
 11: 'the embryo has a right to life',
 13: 'the embryo has a right to life',
 46: 'life begins at conception',
 55: 'abortion is wrong because it deprives the embryo of a valuable future',
 59: 'as a standard embryo does have a highly valuable future, killing it is seriously wrong',
 60: 'deliberate abortions are seriously immoral',
 86: 'the embryo has a right to life',
 93: 'the embryo has a right to life',
 108: 'the fetus has a right to life'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 7

In [0]:
with open(articles_dict[articles[7].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[7]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(2, 1332)

In [0]:
cdc_idx_dict

{35: 'abortion, and especially late-term abortion, is unjust',
 36: 'Critics consider the procedure to be infanticide'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 8

In [0]:
with open(articles_dict[articles[8].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[8]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(2, 1332)

In [0]:
cdc_idx_dict

{9: 'partial-birth abortion.. is.. unnecessary to preserve the health of the mother',
 11: 'a partial-birth abortion bore no relevance to any measure needed to advance the health of any woman'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 9

In [0]:
with open(articles_dict[articles[9].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[9]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(5, 1332)

In [0]:
cdc_idx_dict

{3: 'is a person and therefore has a right to life',
 4: 'whether or not to continue with a pregnancy is an inviolable personal choice',
 9: 'legal abortion under medically controlled conditions is preferable to illegal back-alley abortion without proper medical supervision',
 17: 'they could be used to form a slippery slope against all abortions',
 62: "women's lives are lost due to unsafe abortions when abortion is illegal"}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 10

In [0]:
with open(articles_dict[articles[10].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[10]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(4, 1332)

In [0]:
cdc_idx_dict

{18: 'one must abort a fetus if the continuation of pregnancy might imperil the life of the woman',
 81: 'abortion is justifiable if a continuation of pregnancy might cause the woman severe physical or psychological harm',
 87: 'In all circumstances, it should be her decision whether or not to terminate a pregnancy',
 89: 'any decision should be left up to the woman within whose body the fetus is growing'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 11 

In [0]:
with open(articles_dict[articles[11].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[11]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(2, 1332)

In [0]:
cdc_idx_dict

{47: 'abortion causes mental health problems',
 48: 'abortion causes mental health problems'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 12

In [0]:
with open(articles_dict[articles[12].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[12]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(16, 1332)

In [0]:
cdc_idx_dict

{34: 'it should be illegal for governments to regulate abortion',
 54: 'the state has an "important and legitimate interest in protecting the potentiality of human life',
 63: 'while the state has an interest in protecting the fetus "at some point", this interest cannot override that of the pregnant woman',
 79: 'the fetus has a right to life',
 118: 'abortion is morally wrong',
 119: 'while the fetus is innocent and biologically human, it is not a person with a right to life',
 131: 'a right to life beginning at conception',
 139: 'the fetus has a right to life',
 157: 'abortion involves unjust discrimination against the unborn',
 160: 'abortion is morally wrong',
 163: 'deliberate abortions are placed in the "same moral category" as killing an innocent adult human being',
 169: 'the fetus has a right to life',
 171: 'the fetus has a right to life',
 177: 'life begins at conception',
 178: 'abortion to be morally wrong',
 186: 'abortion should be legal in all circumstances'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 13

In [0]:
with open(articles_dict[articles[13].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[13]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(5, 1332)

In [0]:
cdc_idx_dict

{2: 'the right of personal privacy includes the abortion decision',
 4: 'there is a fundamental right to privacy encompassing the decision about abortion',
 152: 'abortions should be legal under any circumstances',
 155: 'abortions should be legal under any circumstances',
 173: 'unborn children to have an inherent right to life'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

### 're engage with Myanmar '

In [0]:
topic = 're engage with Myanmar '

articles = topic_article_dict[topic]
articles

['2007 Burmese anti-government protests',
 'Maung Zarni',
 'Burma',
 'Burma Campaign UK',
 'Foreign relations of Burma',
 'International reaction to the 2007 Burmese anti-government protests',
 'Burmese general election, 2010']

In [0]:
len(articles)

7

#### Article 0 

In [0]:
with open(articles_dict[articles[0].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[0]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(2, 1332)

In [0]:
cdc_idx_dict

{324: "Burma's rulers continue to defy the world's just demand to stop their vicious persecution",
 335: 'the government wants to engage again in constructive dialogue'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 1 

In [0]:
with open(articles_dict[articles[1].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[1]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(3, 1332)

In [0]:
cdc_idx_dict

{14: 'economic sanctions and political pressure by Western countries on Burma was counter-productive and futile',
 30: 'economic sanctions and political pressure on the Burmese military regime remain the only feasible policy to achieve a restoration of democracy and human rights in Burma',
 33: 'efforts to sanction Burma were useless'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([df, ndf], axis=0, ignore_index=True)

#### Article 2 

In [0]:
with open(articles_dict[articles[2].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[2]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(4, 1332)

In [0]:
cdc_idx_dict

{157: 'the government has embarked on a series of reforms toward liberal democracy',
 223: "the military regime in Burma is one of the world's most repressive and abusive regimes",
 236: "Burma's human rights record has been improving",
 273: 'the American-led sanctions have had adverse effects on the civilian population'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 3 

In [0]:
with open(articles_dict[articles[3].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[3]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(1, 1332)

In [0]:
cdc_idx_dict

{23: 'sanctions are designed to pressure the regime to enter into dialogue with the demcoracy movement'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 4 

In [0]:
with open(articles_dict[articles[4].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[4]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(1, 1332)

In [0]:
cdc_idx_dict

{13: 'the American-led sanctions have had adverse effects on the civilian population'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 5 

In [0]:
with open(articles_dict[articles[5].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[5]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(2, 1332)

In [0]:
cdc_idx_dict

{42: 'the best way to go is through engagement and encouragement',
 48: 'constructive engagement with the Burmese junta has failed'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 6

In [0]:
with open(articles_dict[articles[6].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[6]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(2, 1332)

In [0]:
cdc_idx_dict

{63: 'The international community can provide constructive help and refrain from any negative impact on the domestic political process of Myanmar',
 93: 'When peaceful democratic movements are suppressed – as in Burma – then the democracies of the world cannot remain silent&nbsp'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

### 'reintroduce national service '

In [0]:
topic = 'reintroduce national service '

articles = topic_article_dict[topic]
articles

['Conscription',
 'Charles B Rangel',
 'Conscription in Germany',
 'Conscription in the United States',
 'National Service Act of 2006',
 'Counter-recruitment']

In [0]:
len(articles)

6

#### Article 0 

In [0]:
with open(articles_dict[articles[0].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[0]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(13, 1332)

In [0]:
cdc_idx_dict

{7: 'it violates individual rights',
 327: 'represents slavery and involuntary servitude',
 328: 'Of all the statist violations of individual rights in a mixed economy, the military draft is the worst',
 329: 'It is an abrogation of rights',
 330: 'It negates man’s fundamental right—the right to life',
 334: 'the very conception of a just government in its duty to the citizen includes the reciprocal obligation of the citizen to render military service in case of need',
 335: 'in a cost-to-benefit ratio, conscription during peace time is not worthwhile',
 336: 'Months or years of service amongst the most fit and capable subtracts from the productivity of the economy',
 341: 'The work effort of the conscripts is effectively wasted',
 343: 'professionally-skilled conscripts are also difficult to replace in the civilian workforce',
 344: 'Every soldier conscripted in the army is taken away from his civilian work, and away from contributing to the economy which funds the military',
 348: 'i

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 1 

In [0]:
with open(articles_dict[articles[1].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[1]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(1, 1332)

In [0]:
cdc_idx_dict

{213: 'Fairness dictates that the sons and daughters of the white middle and upper classes share the burden of war'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([df, ndf], axis=0, ignore_index=True)

#### Article 2 

In [0]:
with open(articles_dict[articles[2].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[2]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(6, 1332)

In [0]:
cdc_idx_dict

{84: 'some service is better than none at all, bringing citizens in contact with their military',
 88: 'the abolition of the draft would lead to recruitment shortages even for higher ranking positions',
 89: 'considerable savings in defence spending from abolishing the draft',
 92: 'professional armed forces can be more expensive than a draft-based military',
 95: 'the draft was simply anachronistic',
 97: 'The draft obliged male citizens to pay society back through their military or civilian service'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 3 

In [0]:
with open(articles_dict[articles[3].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[3]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(7, 1332)

In [0]:
cdc_idx_dict

{86: 'conscription would not provide adequate protection for the rights of conscientious objectors',
 212: 'adequate military strength could be maintained without having conscription',
 275: 'the draft is fundamentally unfair',
 280: 'The draft has been perceived by some as unfairly targeting the poor and lower middle classes',
 298: 'it was less likely that a republic with conscription would engage in preemptive wars',
 328: 'the draft should be reinstated to make the military more equal',
 329: 'the draft "does bring people from all quarters of our society together in the common purpose of serving'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 4 

In [0]:
with open(articles_dict[articles[4].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[4]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(2, 1332)

In [0]:
cdc_idx_dict

{4: 'in the case of a mandatory draft members of the U.S. Congress would be much less likely to authorize an unnecessary war',
 5: 'bringing back the draft would remedy the social disparity'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 5 

In [0]:
with open(articles_dict[articles[5].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[5]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(3, 1332)

In [0]:
cdc_idx_dict

{64: 'military service provides job skills',
 65: 'personal and technical skills learned in the military will improve later employment prospects in civilian life',
 68: 'leadership skills acquired during military training can absolutely enhance one’s chances for success in corporate life'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

### 'subsidise poor communities '

In [0]:
topic = 'subsidise poor communities '

articles = topic_article_dict[topic]
articles

["Welfare's effect on poverty",
 'Prodesis',
 'Welfare culture',
 'Subsidy',
 'Criticisms of welfare',
 'Poverty reduction',
 'Cycle of poverty',
 'Redistribution of wealth',
 'Economic inequality',
 'Social safety net']

In [0]:
len(articles)

10

#### Article 0 

In [0]:
with open(articles_dict[articles[0].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[0]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(4, 1332)

In [0]:
cdc_idx_dict

{2: 'sustains or even creates poverty',
 3: 'poverty decreases after countries adapt welfare programs',
 33: 'makes an incentive to not find work',
 36: 'welfare not only increases poverty but also increases other problems'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 1 

In [0]:
with open(articles_dict[articles[1].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[1]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(1, 1332)

In [0]:
cdc_idx_dict

{101: 'the availability of government funds has led to tensions within the community'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([df, ndf], axis=0, ignore_index=True)

#### Article 2 

In [0]:
with open(articles_dict[articles[2].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[2]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(3, 1332)

In [0]:
cdc_idx_dict

{2: 'breeds dependence on government aid',
 26: 'welfare only bred dependence on the government',
 44: 'welfare has demonstrated some proven effects for helping impoverished families'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 3 

In [0]:
with open(articles_dict[articles[3].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[3]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(3, 1332)

In [0]:
cdc_idx_dict

{8: 'Subsidies may distort markets',
 24: 'produce inefficiencies',
 38: 'they are inefficient'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 4 

In [0]:
with open(articles_dict[articles[4].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[4]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(2, 1332)

In [0]:
cdc_idx_dict

{4: 'the welfare state has produced a generation of dependents who, instead of working, rely solely upon the state for income',
 11: 'it creates dependence to the state'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 5 

In [0]:
with open(articles_dict[articles[5].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[5]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(6, 1332)

In [0]:
cdc_idx_dict

{26: 'helps growth',
 28: 'is essential in providing better lives',
 47: 'trade rules are often unfair as they block access to richer nations’ markets and ban poorer nations from supporting their industries',
 130: 'Western monetary aid often only serves to increase poverty and social inequality',
 131: 'higher aid levels erode the quality of governance',
 133: 'aid is not spread properly'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 6

In [0]:
with open(articles_dict[articles[6].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[6]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(1, 1332)

In [0]:
cdc_idx_dict

{0: 'poverty, once started, is likely to continue unless there is outside intervention'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 7

In [0]:
with open(articles_dict[articles[7].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[7]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(11, 1332)

In [0]:
cdc_idx_dict

{17: 'less stratified economies are more socially just',
 19: 'the rich have an obligation to assist the poor, thus creating a more financially egalitarian society',
 23: 'the rich exploit the poor or otherwise gain unfair benefits, and thus should return some of those benefits',
 24: 'a larger middle class benefits an economy by enabling more people to be consumers',
 26: 'economic inequality contributes to crime',
 28: 'a lower rate of redistribution in a given society increases the inequality found among future incomes',
 31: 'reducing these inequalities is one way to prevent or ameliorate economic crises',
 38: 'it improves social stability',
 72: 'there is no encouragement of those receiving aid to resume working',
 90: 'redistribution of legitimately obtained property cannot ever be just',
 91: 'redistribution tends to benefit those with political clout to set spending priorities more than those in need'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 8

In [0]:
with open(articles_dict[articles[8].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[8]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(12, 1332)

In [0]:
cdc_idx_dict

{217: 'Higher levels of economic inequality tend to intensify social hierarchies and generally degrade the quality of social relations',
 222: 'Economic inequality is thought to reduce distributive efficiency within society',
 227: 'a society with more equality will have higher aggregate utility',
 228: 'in societies where inequality is lower, population-wide satisfaction and happiness tend to be higher',
 246: 'an increased gap between rich and poor increases the incentives for competition and innovation within an economy',
 269: 'redistributive policies that have an adverse effect on investment and economic growth',
 308: 'when there is economic inequality then political inequality is assured',
 320: 'a certain amount of redistribution would be justified',
 328: "without redistribution, one generation's successful individuals would become the next generation's embedded caste",
 329: 'social justice requires redistribution of high incomes and large concentrations of wealth in a way th

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 9

In [0]:
with open(articles_dict[articles[9].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[9]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(9, 1332)

In [0]:
cdc_idx_dict

{9: 'Safety nets enable households to make productive investments in their future that they may otherwise miss',
 44: 'CCT Programs have been proved to be very well-targeted and effective',
 48: 'CCT Programs are efficient tools for reducing poverty and inequality',
 77: 'Subsidies guarantee access to essential commodities at prices that consumers can afford',
 79: 'they tend to be expensive and regressive',
 88: 'May distort production incentives',
 128: 'Safety nets in low-income countries are increasingly being recognized as effective tools to reach out to the most vulnerable',
 129: 'they protect households facing hard times from falling into deeper poverty and help them manage risk',
 130: 'they can provide households with a cushion to invest resources more efficiently'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

### 'that the right to asylum should not be absolute '

In [0]:
topic = 'that the right to asylum should not be absolute '

articles = topic_article_dict[topic]
articles

['Russian Federation Law on Refugees',
 'Boat people',
 'Nativism (politics)',
 'Convention Relating to the Status of Refugees',
 'Immigration to the United Kingdom since 1922',
 'UK Immigration Service',
 'Right of asylum',
 'United Nations High Commissioner for Refugees',
 'Immigration',
 'Immigration and crime',
 'Illegal immigration',
 'Refugee',
 'Illegal immigration from Africa to Israel']

In [0]:
len(articles)

13

#### Article 0 

In [0]:
with open(articles_dict[articles[0].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[0]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(1, 1332)

In [0]:
cdc_idx_dict

{11: 'application for asylum could be denied regardless of the legitimacy of their claim'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 1 

In [0]:
with open(articles_dict[articles[1].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[1]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(2, 1332)

In [0]:
cdc_idx_dict

{6: 'political refugees, may be fleeing for their lives',
 41: 'The plight of the boat people became an international humanitarian crisis'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([df, ndf], axis=0, ignore_index=True)

#### Article 2 

In [0]:
with open(articles_dict[articles[2].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[2]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(7, 1332)

In [0]:
cdc_idx_dict

{2: 'the groups are considered hostile or alien to the natural culture',
 5: 'the immigrants will distort or spoil existing cultural values',
 128: 'Acquire jobs which would have otherwise been available to native citizens, suppressing wages',
 129: 'Damage a sense of community and nationality',
 131: 'May overpopulate countries',
 132: 'Can swamp a native population and replace its culture with their own',
 134: 'immigrants can "swamp" a local population'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 3 

In [0]:
with open(articles_dict[articles[3].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[3]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(1, 1332)

In [0]:
cdc_idx_dict

{35: 'prohibition of forcible return is part of customary international law'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 4 

In [0]:
with open(articles_dict[articles[4].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[4]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(1, 1332)

In [0]:
cdc_idx_dict

{99: 'the opposition to high levels of immigration by refugees is based on racism'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 5 

In [0]:
with open(articles_dict[articles[5].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[5]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(3, 1332)

In [0]:
cdc_idx_dict

{23: 'social and political issues surrounding the issue of immigration',
 475: 'asylum seekers were motivated by the availability of benefits',
 608: 'migration provided a positive resource for the economy'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 6

In [0]:
with open(articles_dict[articles[6].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[6]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(1, 1332)

In [0]:
cdc_idx_dict

{41: 'everyone has the right to seek and to enjoy in other countries asylum from persecution'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 7

In [0]:
with open(articles_dict[articles[7].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[7]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(1, 1332)

In [0]:
cdc_idx_dict

{34: 'everyone can exercise the right to seek asylum and find safe refuge in another state'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 8

In [0]:
with open(articles_dict[articles[8].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[8]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(7, 1332)

In [0]:
cdc_idx_dict

{58: 'immigrants are thought to compete with employees who are already in the country',
 84: 'immigration threatens national identity',
 95: 'freedom of movement is often recognized as a civil right',
 98: 'everyone has the right to leave or enter a country, along with movement within it',
 99: 'everyone has the right to leave any country, including his own, and to return to his country',
 100: 'the freedom of movement both within and between countries is a basic human right',
 103: 'everyone has the right to seek and to enjoy in other countries asylum from persecution'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 9

In [0]:
with open(articles_dict[articles[9].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[9]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(1, 1332)

In [0]:
cdc_idx_dict

{21: 'immigrants have high rates of criminality'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 10

In [0]:
with open(articles_dict[articles[10].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[10]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(3, 1332)

In [0]:
cdc_idx_dict

{131: 'refugees (legally defined to be people who are persecuted in their original country and then enter another country seeking safety) should be exempted from immigration laws',
 134: 'the freedom of movement both within and between countries is a basic human right',
 159: 'repression and intolerance against immigrants will not solve the problems caused by the economic crisis'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 11 

In [0]:
with open(articles_dict[articles[11].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[11]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(6, 1332)

In [0]:
cdc_idx_dict

{6: 'countries must be prepared to allow Open borders for people fleeing conflict',
 102: 'everyone can exercise the right to seek asylum and find safe refuge in another state',
 196: 'refugees have much to bring to the countries in which they are resettled in terms of culture and labor',
 197: 'Frequently, these countries of asylum are some of the world’s poorest nations and cannot handle the large influx of persons',
 367: 'The plight of the boat people became an international humanitarian crisis',
 522: 'Refugee populations consist of people who are terrified and are away from familiar surroundings'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 12

In [0]:
with open(articles_dict[articles[12].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[12]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(5, 1332)

In [0]:
cdc_idx_dict

{3: 'Only a fraction of all the illegal immigrants is actually eligible for this status',
 15: "In some of the illegal immigrants' countries of origin humanitarian hardship exists",
 46: 'they may serve as informants or as operatives of hostile states or terrorist organizations',
 47: 'they are contributing to the congestion in the cities and to the rise in crime',
 49: 'failing to stop the illegal immigration waves at an early stage will only lead to much larger waves of illegal immigration in the future'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

### 'the US is justified in using force to prevent states from acquiring nuclear weapons '

In [0]:
topic = 'the US is justified in using force to prevent states from acquiring nuclear weapons '

articles = topic_article_dict[topic]
articles

['Nuclear proliferation',
 'Nuclear weapons debate',
 'Deterrence theory',
 'Nuclear peace',
 'Treaty on the Non-Proliferation of Nuclear Weapons',
 'Nuclear weapon',
 'Criticism of American foreign policy']

In [0]:
len(articles)

7

#### Article 0 

In [0]:
with open(articles_dict[articles[0].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[0]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(8, 1332)

In [0]:
cdc_idx_dict

{1: 'more countries with nuclear weapons may increase the possibility of nuclear warfare',
 38: 'A fundamental goal for American and global security is to minimize the proliferation risks associated with the expansion of nuclear power',
 401: 'the spread of nuclear weapons could increase international stability',
 403: 'it will decrease the likelihood of war',
 407: 'nuclear weapons promote caution in decision-makers',
 421: 'weak states will be unable to prevent – or will actively provide for – the disastrous possibility of nuclear terrorism',
 431: 'If one state produces a nuclear weapon it creates almost a domino effect within the region',
 438: 'prohibition on nuclear proliferation has been characterised as a form of technological apartheid'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 1 

In [0]:
with open(articles_dict[articles[1].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[1]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(7, 1332)

In [0]:
cdc_idx_dict

{7: 'it would undermine deterrence',
 10: 'no issue carries more importance to the long-term health and security of humanity than the effort to reduce, and perhaps one day, rid the world of nuclear weapons',
 29: 'it would undermine deterrence',
 31: 'Nuclear weapons are said to have induced "nuclear peace',
 34: 'is obsolete',
 41: 'the likelihood that non-state terrorists will get their hands on nuclear weaponry is increasing',
 44: 'no issue carries more importance to the long-term health and security of humanity than the effort to reduce, and perhaps one day, rid the world of nuclear weapons'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([df, ndf], axis=0, ignore_index=True)

#### Article 2 

In [0]:
with open(articles_dict[articles[2].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[2]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(5, 1332)

In [0]:
cdc_idx_dict

{3: 'an inferior nuclear force, by virtue of its extreme destructive power, could deter a more powerful adversary',
 16: 'nuclear weapons had become a source of extreme risk',
 37: 'nuclear weapons are intended to deter other states from attacking with their nuclear weapons',
 98: 'Nuclear weapons give nations the potential to not only destroy their enemies but humanity itself',
 127: 'nuclear weapons had become a source of extreme risk'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 3 

In [0]:
with open(articles_dict[articles[3].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[3]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(9, 1332)

In [0]:
cdc_idx_dict

{0: 'decrease the chances of crisis escalation',
 1: 'nuclear weapons are said to have induced stability',
 2: 'nuclear proliferation may be beneficial for inducing stability',
 3: 'increases the chances of nuclear material falling into the hands of non-state groups who are free from the threat of nuclear retaliation',
 5: 'new nuclear states will use their acquired nuclear capabilities to deter threats and preserve peace',
 6: 'new nuclear states often lack adequate organizational controls over their new weapons, which makes for a high risk of either deliberate or accidental nuclear war',
 13: "Nuclear weapons may also lessen a state's reliance on allies for security, thus preventing allies from dragging each other into wars",
 26: 'nuclear weapons induce stability',
 27: 'nuclear weapons contribute to stability'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 4 

In [0]:
with open(articles_dict[articles[4].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[4]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(3, 1332)

In [0]:
cdc_idx_dict

{16: 'the NPT cannot stop the proliferation of nuclear weapons or the motivation to acquire them',
 75: 'Having more nuclear nuclear-weapon states would reduce security for all',
 100: 'nuclear forces continue to play an essential role in war prevention'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 5 

In [0]:
with open(articles_dict[articles[5].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[5]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(6, 1332)

In [0]:
cdc_idx_dict

{86: 'the significance of nuclear weapons is purely to deter war',
 89: 'would generally be contrary to the rules of international law applicable in armed conflict',
 90: 'nuclear proliferation would be desirable',
 91: 'nuclear weapons successfully deter all-out war between states',
 125: 'could lead to increased global instability',
 128: 'no issue carries more importance to the long-term health and security of humanity than the effort to reduce, and perhaps one day, rid the world of nuclear weapons'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 6

In [0]:
with open(articles_dict[articles[6].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[6]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(2, 1332)

In [0]:
cdc_idx_dict

{31: 'the US keeps a huge stockpile of nuclear weapons while urging other nations not to get them',
 60: 'the United Nations Charter, ratified by the U.S., prohibits members from using force against fellow members except against imminent attack'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

### "the United States is responsible for Mexico's drugs war "

In [0]:
topic = "the United States is responsible for Mexico's drugs war "

articles = topic_article_dict[topic]
articles

['Smuggling of firearms into Mexico',
 'War on Drugs',
 'Merida Initiative',
 'Mexican Drug War',
 'ATF gunwalking scandal']

In [0]:
len(articles)

5

#### Article 0 

In [0]:
with open(articles_dict[articles[0].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[0]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(4, 1332)

In [0]:
cdc_idx_dict

{24: "a 'significant' percentage of their firearms originate from gun stores and other sources in the U.S",
 51: 'the majority of illegal guns in Mexico really come from the United States',
 68: 'American guns are arming the Mexican drug cartels',
 100: 'most weapons and arms trafficked into Mexico are from gun dealers in the United States'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 1 

In [0]:
with open(articles_dict[articles[1].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[1]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(1, 1332)

In [0]:
cdc_idx_dict

{224: 'by making drugs illegal rather than regulating them, the War on Drugs creates a highly profitable black market'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([df, ndf], axis=0, ignore_index=True)

#### Article 2 

In [0]:
with open(articles_dict[articles[2].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[2]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(4, 1332)

In [0]:
cdc_idx_dict

{2: 'most of the financing for the Mexican traffickers comes from American drug consumers',
 63: 'firearms recovered in Mexico came from U.S. gun dealers',
 89: 'the root cause of the problem: U.S. demand',
 104: 'American) government has been sending weapons to Mexico in a premeditated and systematic manner, knowing that their destinations were Mexican criminal organizations'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 3 

In [0]:
with open(articles_dict[articles[3].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[3]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(6, 1332)

In [0]:
cdc_idx_dict

{132: 'A significant number of firearms that make their way to Mexico come from U.S. gunshops',
 141: 'most weapons and arms trafficked into Mexico are from gun dealers in the United States',
 142: 'Mexican crime guns traced to U.S. origins',
 147: "a 'significant' percentage of their firearms originate from gun stores and other sources in the U.S",
 255: 'most of the financing for the Mexican traffickers comes from American drug consumers',
 264: 'the root cause of the problem: U.S. demand'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 4 

In [0]:
with open(articles_dict[articles[4].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[4]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(2, 1332)

In [0]:
cdc_idx_dict

{2: 'the ATF knowingly allowed thousands of guns to be bought by suspected arms traffickers ("gunrunners") working through straw purchasers on behalf of Mexican drug cartels',
 170: 'American) government has been sending weapons to Mexico in a premeditated and systematic manner, knowing that their destinations were Mexican criminal organizations'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

### 'the monarchy '

In [0]:
topic = 'the monarchy '

articles = topic_article_dict[topic]
articles

['Republicanism in Australia',
 'Republicanism in Canada',
 'Monarch',
 'Monarchy of the United Kingdom',
 'Monarchy of New Zealand',
 'Right-wing politics',
 'Enlightened absolutism',
 'Monarchy',
 'Debate on the monarchy in Canada',
 'Constitutional monarchy',
 'Republicanism in the United Kingdom',
 'Monarchism']

In [0]:
len(articles)

12

#### Article 0 

In [0]:
with open(articles_dict[articles[0].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[0]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(2, 1332)

In [0]:
cdc_idx_dict

{21: 'The hereditary nature of the monarchy is said to conflict with egalitarianism and dislike of inherited privilege',
 22: 'The laws of succession are held by some to be sexist'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 1 

In [0]:
with open(articles_dict[articles[1].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[1]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(6, 1332)

In [0]:
cdc_idx_dict

{7: 'the monarchy is inherently contrary to egalitarianism and multiculturalism',
 9: 'national pride is diminished by the monarchy',
 13: 'people are given greater dignity from choosing their head of state',
 28: "the country's head of state should be elected",
 45: 'monarchy as "outdated and irrelevant',
 84: 'The monarchy remains a symbol of imperialism and colonialism'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([df, ndf], axis=0, ignore_index=True)

#### Article 2 

In [0]:
with open(articles_dict[articles[2].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[2]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(1, 1332)

In [0]:
cdc_idx_dict

{25: 'The principal advantage of hereditary monarchy is the immediate continuity of leadership'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 3 

In [0]:
with open(articles_dict[articles[3].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[3]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(1, 1332)

In [0]:
cdc_idx_dict

{79: 'the monarch cannot be prosecuted for criminal offences'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 4 

In [0]:
with open(articles_dict[articles[4].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[4]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(2, 1332)

In [0]:
cdc_idx_dict

{73: 'The monarch is immune from criminal prosecution',
 167: 'a republic is "inevitable'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 5 

In [0]:
with open(articles_dict[articles[5].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[5]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(1, 1332)

In [0]:
cdc_idx_dict

{91: 'social traditions or hierarchies that are essential for social order'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 6

In [0]:
with open(articles_dict[articles[6].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[6]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(2, 1332)

In [0]:
cdc_idx_dict

{8: 'monarchs ruled with the intent of improving the lives of their subjects',
 9: 'the sovereign knew the interests of his subjects better than they themselves'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 7

In [0]:
with open(articles_dict[articles[7].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[7]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(4, 1332)

In [0]:
cdc_idx_dict

{21: 'The system of monarchy since antiquity has contrasted with forms of democracy',
 35: 'The principal advantage of hereditary monarchy is the immediate continuity of leadership',
 47: 'The monarch serves as a ceremonial figurehead symbol of national unity and state continuity',
 75: 'a morally-based, balanced monarchy is stressed as the ideal form of government'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 8

In [0]:
with open(articles_dict[articles[8].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[8]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(4, 1332)

In [0]:
cdc_idx_dict

{17: 'the monarchy had worked well',
 56: 'the monarchy is an outdated and regressive institution',
 64: 'royals were simply celebrities who should not have any formal role',
 76: 'constitutional monarchy was outdated'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 9

In [0]:
with open(articles_dict[articles[9].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[9]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(4, 1332)

In [0]:
cdc_idx_dict

{3: 'monarch may have strictly ceremonial duties',
 33: 'serves the traditional role of embodying and representing the nation',
 53: 'a source of checks and balances against elected politicians who might seek powers in excess of those conferred by their respective constitutions',
 63: 'a check against possible illegal action by politicians'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 10

In [0]:
with open(articles_dict[articles[10].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[10]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(25, 1332)

In [0]:
cdc_idx_dict

{68: 'republicanism is the next logical step toward a fully democratic constitution',
 72: 'monarchy is unfair and elitist',
 73: 'in a modern and democratic society no one should be expected to defer to another simply because of his birth',
 74: 'it encourages attitudes which are more suited to a bygone age of imperialism',
 75: 'encourages a feeling of dependency in many people who should instead have confidence in themselves and their fellow citizens',
 76: "the people', not the members of one family, should be sovereign",
 77: 'it should be a fundamental right of the people of any nation to elect their head of state and for every citizen to be eligible to hold that office',
 79: 'Monarchical prerogative powers can be used to circumvent normal democratic process with no accountability',
 80: 'Monarchy is ethnic-discrimination',
 85: 'Monarchy is gender-discriminative',
 88: 'A monarchy demands deference',
 89: 'It is the enemy of merit and aspiration',
 92: 'It devalues intellect an

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 11 

In [0]:
with open(articles_dict[articles[11].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[11]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(1, 1332)

In [0]:
cdc_idx_dict

{28: 'it strengthens popular liberty'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

### 'the one child policy of the republic of China '

In [0]:
topic = 'the one child policy of the republic of China '

articles = topic_article_dict[topic]
articles

['Family planning',
 'Voluntary Human Extinction Movement',
 'Little Emperor Syndrome',
 'Reproductive rights',
 "Human rights in the People's Republic of China",
 'Only child',
 'Sex selection',
 'One-child policy',
 'Two-child policy',
 'Overpopulation',
 'Demographics of China',
 'Human population control',
 'Compulsory sterilization']

In [0]:
len(articles)

13

#### Article 0 

In [0]:
with open(articles_dict[articles[0].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[0]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(3, 1332)

In [0]:
cdc_idx_dict

{46: 'a continuation of the rapid population growth that had been occurring would hinder their development as a nation',
 52: 'the policy has created abuse for women in China',
 53: 'implementation of the policy has involved forced abortions and forced sterilization'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 1 

In [0]:
with open(articles_dict[articles[1].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[1]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(1, 1332)

In [0]:
cdc_idx_dict

{3: 'a decrease in the human population would prevent a significant amount of man-made human suffering'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([df, ndf], axis=0, ignore_index=True)

#### Article 2 

In [0]:
with open(articles_dict[articles[2].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[2]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(1, 1332)

In [0]:
cdc_idx_dict

{16: 'this four-two-one reconfiguration of the familial structure has distinct ramifications for Chinese society'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 3 

In [0]:
with open(articles_dict[articles[3].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[3]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(8, 1332)

In [0]:
cdc_idx_dict

{7: 'Parents have a basic human right to determine freely and responsibly the number and the spacing of their children',
 14: 'Parents have a basic right to decide freely and responsibly on the number and spacing of their children',
 16: 'Parents have the exclusive right to determine freely and responsibly the number and spacing of their children',
 19: "governments have a responsibility to meet individuals' reproductive needs",
 31: 'The human rights of women include their right to have control over and decide freely and responsibly on matters related to their sexuality',
 54: 'Control over reproduction is a basic need and a basic right for all women',
 58: 'Programs that do not take the interests of women into account are unlikely to succeed',
 106: "the policies' narrow focus led to coercion and decreased quality of care"}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 4 

In [0]:
with open(articles_dict[articles[4].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[4]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(3, 1332)

In [0]:
cdc_idx_dict

{119: 'it contributes to forced abortions',
 120: 'This is thought to have been a significant contribution to the gender imbalance in mainland China',
 122: 'the dramatic decrease in Chinese fertility started before the program began in for unrelated factors'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 5 

In [0]:
with open(articles_dict[articles[5].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[5]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(8, 1332)

In [0]:
cdc_idx_dict

{15: 'only children are spoiled',
 17: 'only children have aversive social skills',
 19: 'the lack of siblings has been blamed for a number of social ills',
 21: 'The one child policy has also been speculated to be the underlying cause of forced abortions',
 23: 'it is more difficult for only children to cooperate in a conventional family environment',
 31: 'only children are higher in achievement motivation',
 36: 'children with many siblings receive fewer resources',
 40: 'Only children are also more likely to make outside friends'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 6

In [0]:
with open(articles_dict[articles[6].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[6]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(4, 1332)

In [0]:
cdc_idx_dict

{52: 'sex selection is an expression of reproductive rights',
 61: "China's gender imbalance is further increased by the One Child Policy",
 62: 'a lack of opportunity for many men to marry is believed to be producing increases in crime',
 68: 'if female babies worth their weight in rupees and yuan, economic and educational opportunities for girls would soon follow'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 7

In [0]:
with open(articles_dict[articles[7].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[7]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(17, 1332)

In [0]:
cdc_idx_dict

{8: 'The policy is controversial both within and outside China because of the manner in which the policy has been implemented, and because of concerns about negative social consequences',
 9: "possible cause behind China's gender imbalance",
 55: 'it had proved "remarkably effective',
 62: "great success in helping to implement China's current economic growth",
 63: 'The reduction in the fertility rate and thus population growth has reduced the severity of problems that come with overpopulation',
 65: 'the focus of China on population control helps provide a better health service for women',
 67: 'The individual savings rate has increased since the one-child policy was introduced',
 72: 'less intrusive options, including those that emphasized delay and spacing of births, could have achieved the same results over an extended period of time',
 83: 'China could have expected a continued reduction in its fertility rate just from continued economic development, had it kept to the previous p

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 8

In [0]:
with open(articles_dict[articles[8].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[8]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(1, 1332)

In [0]:
cdc_idx_dict

{7: 'a single child would be left with having to provide support for his or her two parents and four grandparents'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 9

In [0]:
with open(articles_dict[articles[9].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[9]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(2, 1332)

In [0]:
cdc_idx_dict

{140: "rapid population growth damages the Earth's resources and diminishes human well-being",
 367: 'overpopulation as a serious threat to the quality of human life'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 10

In [0]:
with open(articles_dict[articles[10].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[10]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(5, 1332)

In [0]:
cdc_idx_dict

{57: 'As a result of the policy, China successfully achieved its goal of a more stable and much-reduced fertility rate',
 90: 'rapid population growth as an obstacle to development',
 101: 'population control was necessary for economic growth and improved living standards',
 125: 'coercive measures used to achieve the desired results of the one-child policy',
 134: 'Rapid fertility reduction associated with the one-child policy has potentially negative results'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 11 

In [0]:
with open(articles_dict[articles[11].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[11]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(14, 1332)

In [0]:
cdc_idx_dict

{3: "population control can involve measures that improve people's lives",
 19: 'a large increase in population would bring, "certain poverty on the citizenry',
 21: 'excessive growth may reduce output per worker',
 48: 'a larger population would mean more production',
 80: 'overpopulation has been blamed for a variety of issues, including increasing poverty',
 93: 'reduction of the population is a key to economic growth',
 94: 'economists doubt that a correlation between population reduction and economic growth exists',
 95: 'poverty and famine are caused by bad government and bad economic policies, not by overpopulation',
 96: 'higher population density leads to more specialization and technological innovation, which in turn leads to a higher standard of living',
 97: 'human beings are the ultimate resource',
 99: 'there is no correlation between population density and poverty and starvation',
 129: 'number of problems associated with overpopulation',
 137: 'The success of the policy

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 12

In [0]:
with open(articles_dict[articles[12].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[12]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(1, 1332)

In [0]:
cdc_idx_dict

{34: 'Coercive sterilization to enforce the one child policy has occurred in China'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

### 'the sale of violent video games to minors '

In [0]:
topic = 'the sale of violent video games to minors '

articles = topic_article_dict[topic]
articles

['Gender representation in video games',
 'Video game',
 'Video game content rating system',
 'Nonviolent video game',
 'Violence',
 'Console game',
 'Grand Theft Childhood',
 'Brown v Entertainment Merchants Association',
 'Video game controversies',
 'Media influence',
 'Graphic violence',
 'California Assembly Bills 1792 and 1793',
 'School violence',
 'Media violence research',
 'Video game culture']

In [0]:
len(articles)

15

#### Article 0 

In [0]:
with open(articles_dict[articles[0].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[0]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(1, 1332)

In [0]:
cdc_idx_dict

{29: "violent video games are influencing their children's view about violence"}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 1 

In [0]:
with open(articles_dict[articles[1].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[1]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(1, 1332)

In [0]:
cdc_idx_dict

{183: 'Various games have been accused of causing addiction and even violent behavior'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([df, ndf], axis=0, ignore_index=True)

#### Article 2 

In [0]:
with open(articles_dict[articles[2].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[2]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(1, 1332)

In [0]:
cdc_idx_dict

{41: 'numerous researchers have proposed potential positive effects of video games'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 3 

In [0]:
with open(articles_dict[articles[3].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[3]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(6, 1332)

In [0]:
cdc_idx_dict

{46: 'violent games cause youth violence',
 47: 'a high degree of relationship between violent games and youth violence',
 48: 'video game violence leads to youth violence',
 56: 'video game content was a form of freedom of expression',
 61: 'there is social utility in expressive and imaginative forms of entertainment, even if they contain violence',
 213: 'there are tangible benefits to violence in action games'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 4 

In [0]:
with open(articles_dict[articles[4].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[4]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(1, 1332)

In [0]:
cdc_idx_dict

{295: 'evidence for harmful effects were inconclusive'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 5 

In [0]:
with open(articles_dict[articles[5].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[5]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(2, 1332)

In [0]:
cdc_idx_dict

{33: 'video games allow children to act out crimes',
 35: 'violence in video games is not causally linked with aggressive tendencies'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 6

In [0]:
with open(articles_dict[articles[6].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[6]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(3, 1332)

In [0]:
cdc_idx_dict

{29: 'correlations between violent gameplay and some common childhood problems',
 33: 'most children who play violent games do not have problems',
 34: 'many creative, social and emotional benefits from video game play—even games with violent content'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 7

In [0]:
with open(articles_dict[articles[7].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[7]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(8, 1332)

In [0]:
cdc_idx_dict

{2: 'video games were protected speech under the First Amendment',
 36: 'there was a connection between video games and violence',
 54: 'content-based regulations are presumptively invalid',
 88: 'content-based regulations are presumptively invalid',
 117: 'no evidence linked video games to youth violence',
 136: 'there was no "compelling" link between violent video games and its effects on children',
 148: 'parents, not government bureaucrats, have the right to decide what is appropriate for their children',
 169: 'parents should make the decision” about what video games they purchase for their children, and what constitutes “too violent'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 8

In [0]:
with open(articles_dict[articles[8].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[8]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(31, 1332)

In [0]:
cdc_idx_dict

{4: 'exposure to violent video games causes at least a temporary increase in aggression and that this exposure correlates with aggression in the real world',
 6: 'video game violence is not related to serious aggressive behavior in real life',
 8: 'some violent video games may actually have a prosocial effect in some contexts',
 10: 'exposure to violent video games causes both short term and long term aggression in players and decreases empathy and prosocial behavior',
 21: 'they increase the violent tendencies among youth',
 22: 'have shown no conclusive link between video game usage and violent activity',
 30: 'violent video games are significantly associated with: increased aggressive behavior, thoughts, and affect; increased physiological arousal; and decreased pro-social (helping) behavior',
 37: 'video game publishers unethically train children in the use of weapons and, more importantly, harden them emotionally to the act of murder',
 39: 'violent video games may increase mild f

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 9

In [0]:
with open(articles_dict[articles[9].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[9]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(4, 1332)

In [0]:
cdc_idx_dict

{11: 'not all depictions of violence are even bad to witness',
 42: 'no connection between exposure to media violence and real life violence',
 43: 'exposure alone does not cause a child to commit crimes',
 49: 'there is no convincing evidence that prove that media violence cause violent crime or any type of real life violence'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 10

In [0]:
with open(articles_dict[articles[10].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[10]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(2, 1332)

In [0]:
cdc_idx_dict

{13: 'exposure to graphic violence leads to desensitization to committing acts of violence in person',
 26: 'violence in games hardens children to unethical acts'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 11 

In [0]:
with open(articles_dict[articles[11].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[11]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(1, 1332)

In [0]:
cdc_idx_dict

{6: 'violent video games—especially first-person shooter games—encouraged real-life acts of violence in teenagers'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 12

In [0]:
with open(articles_dict[articles[12].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[12]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(1, 1332)

In [0]:
cdc_idx_dict

{52: 'violent video games is related to increased aggressiveness in children'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 13

In [0]:
with open(articles_dict[articles[13].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[13]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(3, 1332)

In [0]:
cdc_idx_dict

{13: 'children may imitate aggressive behaviors witnessed in media',
 22: "children don't automatically imitate aggression, but rather consider the context of aggression",
 124: 'societal media consumption and violent crime rates are not well associated'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 14 

In [0]:
with open(articles_dict[articles[14].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[14]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(3, 1332)

In [0]:
cdc_idx_dict

{111: 'computer games cause violence',
 113: 'a correlation between violent content conveyed through media (including videogames) and violent or aggressive behavior',
 149: 'Some serious psychological problems have been attributed to desensitization to violence in video games'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

### 'the use of affirmative action '

In [0]:
topic = 'the use of affirmative action '

articles = topic_article_dict[topic]
articles

['Civil Rights Act of 1964',
 'Racial quota',
 'Affirmative Action Around the World',
 'Black Economic Empowerment',
 'Reverse discrimination',
 'Convention on the Elimination of All Forms of Racial Discrimination',
 'Reservation in India',
 'Convention on the Elimination of All Forms of Discrimination Against Women',
 'Racism',
 'Racism in the United States',
 'Discrimination',
 'Affirmative action bake sale',
 'United Kingdom employment equality law',
 'Minority group',
 'Equal opportunity',
 'Color blindness (race)',
 'Symbolic racism',
 'Meritocracy',
 'Affirmative action in the United States',
 'Affirmative action']

In [0]:
len(articles)

20

#### Article 0 

In [0]:
with open(articles_dict[articles[0].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[0]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(1, 1332)

In [0]:
cdc_idx_dict

{99: "You can't legislate morality"}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 1 

In [0]:
with open(articles_dict[articles[1].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[1]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(3, 1332)

In [0]:
cdc_idx_dict

{20: 'one group is favored at the expense of another whenever a quota is invoked',
 21: 'using quotas displaces individuals that would normally be favored based on their individual achievements',
 22: 'qualifications should be the only determining factor'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([df, ndf], axis=0, ignore_index=True)

#### Article 2 

In [0]:
with open(articles_dict[articles[2].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[2]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(3, 1332)

In [0]:
cdc_idx_dict

{3: 'programs have at best a negligible impact on the groups they are intended to assist',
 7: 'they tend to benefit primarily the most fortunate among the preferred group',
 8: 'They reduce the incentives of both the preferred and non-preferred to perform at their best'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 3 

In [0]:
with open(articles_dict[articles[3].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[3]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(1, 1332)

In [0]:
cdc_idx_dict

{2: 'direct intervention in the distribution of assets and opportunities was needed to resolve the economic disparities'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 4 

In [0]:
with open(articles_dict[articles[4].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[4]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(2, 1332)

In [0]:
cdc_idx_dict

{4: 'discrimination inherent in affirmative action programs',
 5: 'identical treatment may sometimes act to preserve inequality rather than eliminate it'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 5 

In [0]:
with open(articles_dict[articles[5].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[5]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(2, 1332)

In [0]:
cdc_idx_dict

{30: 'redress imbalances and promote equality',
 40: 'affirmative action policies for specific racial groups to guarantee "the full and equal enjoyment of human rights and fundamental freedoms'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 6

In [0]:
with open(articles_dict[articles[6].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[6]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(2, 1332)

In [0]:
cdc_idx_dict

{1: 'affirmative action designed to improve the well being of perceived backward and under represented communities',
 110: 'the identification of oppressed classes was difficult to carry out'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 7

In [0]:
with open(articles_dict[articles[7].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[7]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(1, 1332)

In [0]:
cdc_idx_dict

{8: 'States must take measures to seek to eliminate prejudices'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 8

In [0]:
with open(articles_dict[articles[8].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[8]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(1, 1332)

In [0]:
cdc_idx_dict

{5: 'is intended to ameliorate past discrimination'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 9

In [0]:
with open(articles_dict[articles[9].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[9]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(7, 1332)

In [0]:
cdc_idx_dict

{6: 'Historical racism continues to be reflected in socio-economic inequality',
 7: 'Racial stratification continues to occur',
 219: 'have been criticized as a form of "reverse discrimination',
 362: 'Motivation for affirmative action policies is to redress the effects of past discrimination',
 366: 'these policies demonstrate an overt preference for applicants from particular backgrounds over better-qualified (or equally-qualified) candidates from other backgrounds',
 367: 'the only consideration in choosing between applicants should be merit',
 368: 'it perpetuates racial division'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 10

In [0]:
with open(articles_dict[articles[10].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[10]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(3, 1332)

In [0]:
cdc_idx_dict

{129: 'attempts at antidiscrimination have been criticized as reverse discrimination',
 130: 'affirmative action) discriminate against members of a dominant or majority group',
 135: "each individual's civil rights include the right to be free from government sponsored social discrimination"}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 11 

In [0]:
with open(articles_dict[articles[11].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[11]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(1, 1332)

In [0]:
cdc_idx_dict

{16: 'affirmative action is racial division, not racial reconciliation'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 12

In [0]:
with open(articles_dict[articles[12].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[12]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(1, 1332)

In [0]:
cdc_idx_dict

{68: 'it violates the principle of equal treatment just as much as negative discrimination'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 13

In [0]:
with open(articles_dict[articles[13].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[13]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(4, 1332)

In [0]:
cdc_idx_dict

{78: "recognition and rights accorded to specific groups may interfere with the state's need to establish a cohesive identity",
 80: 'where members of minorities see that their specific needs and ambitions have been acknowledged and catered for, they will commit themselves more willingly to accepting the legitimacy of the nation',
 83: 'These may be considered necessary because the minority group in question is socially disadvantaged',
 89: 'the political function of rights is precisely to protect minorities from oppression by majorities'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 14 

In [0]:
with open(articles_dict[articles[14].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[14]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(8, 1332)

In [0]:
cdc_idx_dict

{1: 'jobs should go to those “most qualified',
 29: 'The selection process should not be based on some arbitrary or irrelevant criterion',
 52: "race and sex shouldn't matter when getting a job",
 65: 'final selection for posts must be made according to the principle the best person for the job',
 67: 'the overall idea is to give children from less fortunate backgrounds more of a chance',
 78: 'The idea is to help disadvantaged groups get back to a normal starting position after a long period of discrimination',
 192: 'any equalities achieved will entail future inequalities',
 202: 'it is an ideal that cannot and should not be realized through the actions of the government'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 15

In [0]:
with open(articles_dict[articles[15].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[15]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(3, 1332)

In [0]:
cdc_idx_dict

{11: 'those that give preference to individuals solely based on their race or gender should not be permitted',
 70: 'social inequality today is due to "cultural deficits" of individual people or racial or ethnic groups',
 71: 'there is no need to pay "systematic attention" to any current inequities'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 16

In [0]:
with open(articles_dict[articles[16].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[16]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(1, 1332)

In [0]:
cdc_idx_dict

{4: 'Racial prejudice and discrimination no longer exists'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 17

In [0]:
with open(articles_dict[articles[17].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[17]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(1, 1332)

In [0]:
cdc_idx_dict

{4: 'merit" itself should be a primary consideration during evaluation'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 18 

In [0]:
with open(articles_dict[articles[18].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[18]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(15, 1332)

In [0]:
cdc_idx_dict

{4: 'The impetus towards affirmative action is to redress the disadvantages associated with overt historical discrimination',
 7: 'have been criticised as a form of reverse discrimination',
 37: 'aims "to correct the effects of past and present discrimination',
 90: 'no one has a legal right to have any demographic characteristic they possess be considered a favorable point on their behalf',
 110: 'Race-conscious affirmative action remains necessary to address race-based obstacles',
 115: 'it is often contested on constitutional grounds',
 134: 'affirmative action requires the very discrimination it is seeking to eliminate',
 135: 'affirmative action counter-productive',
 138: 'affirmative action lowers the bar',
 141: 'it fails to achieve its goals',
 142: 'encourages groups to identify themselves as disadvantaged, even if they are not',
 143: 'It may increase racial tension',
 147: "affirmative action devalues the accomplishments of people who belong to a group it's supposed to help"

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 19

In [0]:
with open(articles_dict[articles[19].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[19]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(15, 1332)

In [0]:
cdc_idx_dict

{1: 'countering the effects of a history of discrimination',
 7: 'Affirmative action is intended to promote equal opportunity',
 8: 'to ensure that minority groups within a society are included in all programs',
 9: 'it helps to compensate for past discrimination, persecution or exploitation by the ruling class of a culture',
 18: 'the principle of equality sometimes requires States parties to take affirmative action in order to diminish or eliminate conditions which cause or help to perpetuate discrimination prohibited by the Covenant',
 22: 'In some countries which have laws on racial equality, affirmative action is rendered illegal because it does not treat all races equally',
 159: 'These laws cause disproportionally high costs for small companies and reduce economic growth and employment',
 162: 'it is impossible to favor somebody without discriminating against others',
 173: 'affirmative action devalues the accomplishments of people who are chosen based on the social group to whi

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

### 'the use of performance enhancing drugs in professional sports '

In [0]:
topic = 'the use of performance enhancing drugs in professional sports '

articles = topic_article_dict[topic]
articles

['Substance abuse',
 'Anabolic steroid',
 'Use of performance-enhancing drugs in sport',
 'Ergogenic use of anabolic steroids',
 'Mitchell Report',
 'Drug Enforcement Administration',
 'Doping in East Germany']

In [0]:
len(articles)

7

#### Article 0 

In [0]:
with open(articles_dict[articles[0].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[0]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(1, 1332)

In [0]:
cdc_idx_dict

{82: 'Substance abuse can be harmful to your health and may even be deadly in certain scenarios'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 1 

In [0]:
with open(articles_dict[articles[1].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[1]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(1, 1332)

In [0]:
cdc_idx_dict

{6: 'Health risks can be produced by long-term use or excessive doses of anabolic steroids'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([df, ndf], axis=0, ignore_index=True)

#### Article 2 

In [0]:
with open(articles_dict[articles[2].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[2]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(15, 1332)

In [0]:
cdc_idx_dict

{4: 'it is little different from the use of new materials in the construction of suits and sporting equipment, which similarly aid performance and can give competitors an unfair advantage over others',
 190: 'The use of anabolic steroids is now banned by all major sporting bodies',
 191: 'drug testing can be wildly inconsistent and, in some instances, has gone unenforced',
 232: 'there is little danger from anabolica, as they call it, when the athletes are kept on strictly monitored programmes',
 233: 'the extremely dangerous side-effects are admitted',
 252: 'Often, doping was carried out without the knowledge of the athletes, some of them as young as ten years of age',
 253: 'former athletes bear the physical and mental scars of years of drug abuse',
 292: 'the pursuit of doping athletes has turned into a modern day witch-hunt',
 294: 'Many sports organizations have banned the use of performance enhancing drugs and have very strict rules and consequences for people who are caught usi

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 3 

In [0]:
with open(articles_dict[articles[3].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[3]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(1, 1332)

In [0]:
cdc_idx_dict

{3: 'There is a wide range of health concerns for users'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 4 

In [0]:
with open(articles_dict[articles[4].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[4]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(1, 1332)

In [0]:
cdc_idx_dict

{42: 'The use of performance-enhancing substances by players is illegal and ethically "wrong'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 5 

In [0]:
with open(articles_dict[articles[5].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[5]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(1, 1332)

In [0]:
cdc_idx_dict

{90: 'anybody should be free to put any substance they choose into their own bodies for any reason'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 6

In [0]:
with open(articles_dict[articles[6].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[6]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(5, 1332)

In [0]:
cdc_idx_dict

{3: 'Many former athletes suffer from health problems related to steroid consumption',
 13: 'there is little danger from anabolica, as they call it, when the athletes are kept on strictly monitored programmes',
 14: 'the extremely dangerous side-effects are admitted',
 33: 'Often, doping was carried out without the knowledge of the athletes, some of them as young as ten years of age',
 34: 'former athletes bear the physical and mental scars of years of drug abuse'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

### 'trade aid'

In [0]:
topic = 'trade aid'

articles = topic_article_dict[topic]
articles

['Aid effectiveness',
 'Development geography',
 'Dambisa Moyo',
 'Peter Thomas Bauer',
 'Faith-based foreign aid',
 'Poverty trap',
 'Free trade debate',
 'Poverty reduction',
 'Trade and development',
 'Poverty in Africa',
 'William Easterly',
 'Protectionism',
 'Development aid',
 'Aid',
 'James Shikwati']

In [0]:
len(articles)

15

#### Article 0 

In [0]:
with open(articles_dict[articles[0].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[0]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(10, 1332)

In [0]:
cdc_idx_dict

{24: 'aid is never effective',
 25: 'it has achieved significant impact when it has been properly directed and managed',
 61: 'aid is ineffective',
 69: 'the impact of aid on GDP growth is positive',
 70: 'aid has less or no significant impact in countries with "poor" institutions and policies',
 78: 'aids used for infrastructure and investments will result in a positive economic growth',
 79: 'aid is effective under a wide variety of circumstances',
 96: 'aid alone is not enough to lift developing countries out of poverty',
 97: 'aid actually has a significant impact on growth',
 257: 'trade is an important tool for development'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 1 

In [0]:
with open(articles_dict[articles[1].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[1]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(4, 1332)

In [0]:
cdc_idx_dict

{57: 'Countries which rely on only a few exports for much of their income are very vulnerable to changes in the market value of those commodities',
 71: 'Aid helps make the recipient country (the country that receives aid) get more developed',
 73: 'Often aid does not even reach the poorest people',
 76: 'the recipient country becomes more dependent on aid from a donor country'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([df, ndf], axis=0, ignore_index=True)

#### Article 2 

In [0]:
with open(articles_dict[articles[2].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[2]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(5, 1332)

In [0]:
cdc_idx_dict

{1: 'Aid is Not Working',
 19: 'foreign aid has harmed Africa',
 25: 'hinders economic growth',
 36: 'were aid cut, African governments would respond by turning to other sources of finance that would make them more accountable',
 41: 'the path to long-term development would only be achieved through private sector involvement and free market solutions'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 3 

In [0]:
with open(articles_dict[articles[3].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[3]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(3, 1332)

In [0]:
cdc_idx_dict

{2: 'the most effective manner to help developing countries advance is through state-controlled foreign aid',
 32: 'government-to-government aid was neither necessary nor sufficient for development',
 33: 'erodes civil society'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 4 

In [0]:
with open(articles_dict[articles[4].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[4]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(8, 1332)

In [0]:
cdc_idx_dict

{68: 'direct monetary assistance from industrialized countries has gone to increase the standard of living in many impoverished nations',
 72: 'intervention by financial contributors could ease domestic unrest',
 74: 'The contribution of funding to economically unstable nations has helped to create more economic opportunities',
 83: 'some aid has been proven to help nations develop in the past',
 84: 'When a country is given money because they cannot financially sustain themselves, several negative effects have the potential to develop',
 85: 'it simply does not work',
 87: 'In the case of federally funded aid, contributions have, in effect, crowded out any investment in the private sectors of many nations',
 89: 'pouring vast amounts of money into development aid without any concern for results has failed'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 5 

In [0]:
with open(articles_dict[articles[5].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[5]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(2, 1332)

In [0]:
cdc_idx_dict

{10: 'people continue to die at a high rate due in large part to lack of sufficient aid',
 15: 'If the foreign assistance is substantial enough, and lasts long enough, the capital stock rises sufficiently to lift households above subsistence'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 6

In [0]:
with open(articles_dict[articles[6].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[6]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(6, 1332)

In [0]:
cdc_idx_dict

{6: 'free trade will make society more prosperous',
 84: 'Trade also allows for better quality produce and competitiveness between nations, effectively raising the living standards of those nations',
 95: 'increased trade is the best way to relieve extreme poverty throughout the world',
 109: 'free trade gives optimal economic advantages',
 212: 'Free trade is generally considered to achieve an overall increase in utility in a society',
 213: 'Individuals can be made worse off by an opening of trade'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 7

In [0]:
with open(articles_dict[articles[7].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[7]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(6, 1332)

In [0]:
cdc_idx_dict

{26: 'helps growth',
 28: 'is essential in providing better lives',
 47: 'trade rules are often unfair as they block access to richer nations’ markets and ban poorer nations from supporting their industries',
 130: 'Western monetary aid often only serves to increase poverty and social inequality',
 131: 'higher aid levels erode the quality of governance',
 133: 'aid is not spread properly'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 8

In [0]:
with open(articles_dict[articles[8].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[8]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(8, 1332)

In [0]:
cdc_idx_dict

{0: 'Trade is a key factor in economic development',
 1: "A successful use of trade can boost a country's development",
 2: 'opening up markets to international trade may leave local producers swamped by more competitive foreign producers',
 3: 'trade, development, and poverty reduction are intimately linked',
 4: 'trade and growth are strongly linked',
 5: 'export-led growth has been a key part of many countries’ successful development strategies',
 76: 'liberalization when institutions and the economy are not strong enough to face risks and opportunities can be harmful',
 79: 'trade-led growth is pro-poor'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 9

In [0]:
with open(articles_dict[articles[9].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[9]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(1, 1332)

In [0]:
cdc_idx_dict

{63: 'foreign aid may not even be helpful in the long run to many African nations'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 10

In [0]:
with open(articles_dict[articles[10].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[10]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(1, 1332)

In [0]:
cdc_idx_dict

{15: 'foreign aid to many third world countries has failed to produce sustainable growth'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 11 

In [0]:
with open(articles_dict[articles[11].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[11]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(1, 1332)

In [0]:
cdc_idx_dict

{139: 'free trade helps workers in developing countries'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 12

In [0]:
with open(articles_dict[articles[12].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[12]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(8, 1332)

In [0]:
cdc_idx_dict

{46: 'aid is ineffective',
 47: 'development aid has no effect on the speed with which countries develop',
 49: 'much government-to-government aid was ineffective',
 63: 'has an adverse effect on local production',
 74: 'rich countries have put so many conditions on aid that it has reduced aid effectiveness',
 77: 'a very large part of the spend money on development aid is simply wasted uselessly',
 83: 'Development Assistance to the Third World Has Failed',
 114: "aid's complexity and the ever expanding budgets leave it vulnerable to corruption"}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 13

In [0]:
with open(articles_dict[articles[13].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[13]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(13, 1332)

In [0]:
cdc_idx_dict

{76: 'Aid to underdeveloped countries has sometimes been criticized as being more in the interest of the donor than the recipient',
 80: 'it did not do what it was intended to do or help the people it was intended to help',
 83: 'foreign aid generally does boost growth',
 86: 'foreign assistance, especially foreign capitalism, has been somewhat deleterious to African development',
 89: 'aid can often distort incentives in poor countries in various harmful ways',
 100: 'An implementation of aid can easily be problematic, causing more problems than it solves',
 101: 'hollows out the local economy',
 117: 'encouraging developing economies to develop their agriculture with a focus on exports is not effective on a global market where key players, such as the US and EU, heavily subsidise their products',
 127: 'aid is not targeting the most extreme poverty',
 141: 'it neither goes where it was intended nor helps those intended',
 161: 'Aid can make progress towards reducing poverty worldwide

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 14 

In [0]:
with open(articles_dict[articles[14].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[14]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(1, 1332)

In [0]:
cdc_idx_dict

{1: 'aid towards Africa does more harm than good'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

### 'wind power should be a primary focus of future energy supply '

In [0]:
topic = 'wind power should be a primary focus of future energy supply '

articles = topic_article_dict[topic]
articles

['Environmental impact of wind power',
 'Wind power in Scotland',
 'Wind power in Austria',
 'Cost of electricity by source',
 'Energy development',
 'Wind power grid integration',
 'Environmental impact of the energy industry']

In [0]:
len(articles)

7

#### Article 0 

In [0]:
with open(articles_dict[articles[0].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[0]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(10, 1332)

In [0]:
cdc_idx_dict

{0: 'Compared to the environmental impact of traditional energy sources, the environmental impact of wind power is relatively minor',
 1: 'Wind power consumes no fuel',
 9: 'Wind power consumes no fuel',
 17: 'Producing electricity from wind reduces the consumption of fossil fuels and therefore leads to emissions savings',
 129: 'Wind turbines do not consume fuel or produce pollution during normal operation',
 150: 'They have a smaller footprint than other forms of energy generation',
 153: 'wind farms will damage tourism',
 164: 'may cause physiological problems',
 180: 'people living near wind power facilities are increasingly complaining of health problems',
 201: 'wind farms causing annoyance and ill health in people'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 1 

In [0]:
with open(articles_dict[articles[1].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[1]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(4, 1332)

In [0]:
cdc_idx_dict

{30: 'The production of zero carbon electricity at the wind farm is expected to reduce emissions of carbon dioxide',
 53: 'wind power "cannot be relied upon to provide significant levels of power',
 70: 'wind power as the cleanest source of renewable energy',
 71: 'wind farms are necessary to meet current and future energy needs'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([df, ndf], axis=0, ignore_index=True)

#### Article 2 

In [0]:
with open(articles_dict[articles[2].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[2]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(4, 1332)

In [0]:
cdc_idx_dict

{23: 'Wind power is a product with no hidden costs and economically the most inexpensive form of energy production',
 42: 'The use of wind power reduces the necessity for importing electricity from abroad',
 47: 'Production of wind power does not release any pollutants',
 55: 'wind power does not pose a threat to people or the environment'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 3 

In [0]:
with open(articles_dict[articles[3].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[3]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(1, 1332)

In [0]:
cdc_idx_dict

{163: 'Wind power has poor capacity contribution'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 4 

In [0]:
with open(articles_dict[articles[4].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[4]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(1, 1332)

In [0]:
cdc_idx_dict

{190: 'Renewable energy is sustainable in its production'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 5 

In [0]:
with open(articles_dict[articles[5].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[5]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(6, 1332)

In [0]:
cdc_idx_dict

{1: 'Wind power is an intermittent energy source',
 4: 'integrating wind energy into the utility grid can be problematic',
 40: 'Electricity generated from wind power can be highly variable',
 44: 'predictability of wind plant output remains low',
 46: 'the non-dispatchable nature of wind energy production can raise costs',
 57: 'peak wind speeds may not coincide with peak demand for electrical power'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 6

In [0]:
with open(articles_dict[articles[6].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[6]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(2, 1332)

In [0]:
cdc_idx_dict

{47: 'the environmental effects of wind power are relatively minor',
 48: 'Wind power consumes no fuel'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

### 'year round schooling '

In [0]:
topic = 'year round schooling '

articles = topic_article_dict[topic]
articles

['Year-round school',
 'Summer learning loss',
 'Summer vacation',
 'After-school activity']

In [0]:
len(articles)

4

#### Article 0 

In [0]:
with open(articles_dict[articles[0].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[0]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(12, 1332)

In [0]:
cdc_idx_dict

{14: 'Multitrack schedules reportedly bring many benefits to schools that use them',
 18: 'for every three multitrack schools, one less school must be built',
 25: 'students’ attitudes towards school did significantly increase as they spent more time on a year-round schedule',
 26: 'Students who attend year-round school say that their calendar is more balanced than their peers who have a typical school calendar',
 29: 'year-round schools showed a substantial gain in academic achievement for at-risk, low performing students',
 30: 'More frequent, short breaks provide struggling students more time for help',
 33: 'parents are in favor of the year-round schedule',
 35: 'The year round schedule provides more opportunities for family vacations',
 37: 'If schools are open for longer the operating and maintenance costs may increase',
 41: 'Year round schooling may create difficulties for teens to be able to maintain part-time or summer job',
 42: 'Students with attention learning disabilities

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 1 

In [0]:
with open(articles_dict[articles[1].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[1]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(2, 1332)

In [0]:
cdc_idx_dict

{6: 'summer vacation is a period when students’ rate of academic development declines relative to the school year',
 7: 'All children lose academic skills during the summer months'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([df, ndf], axis=0, ignore_index=True)

#### Article 2 

In [0]:
with open(articles_dict[articles[2].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[2]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(1, 1332)

In [0]:
cdc_idx_dict

{4: 'children need the – months off to relax and also to take a break from other childhood stresses associated with school'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 3 

In [0]:
with open(articles_dict[articles[3].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[3]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(2, 1332)

In [0]:
cdc_idx_dict

{9: 'working parents wish their children to be supervised',
 10: 'if unsupervised, children may fall into criminal or undesirable activity'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

### Topic - 'Europe should weaken its austerity measures to guarantee its citizens greater social support '

In [0]:
topic = 'Europe should weaken its austerity measures to guarantee its citizens greater social support '

articles = topic_article_dict[topic]
articles

['Welfare state',
 'Austerity',
 'Greek government-debt crisis',
 'Anti-austerity protests',
 'Deficit spending']

#### Article 0 

In [0]:
with open(articles_dict[articles[0].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[0]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(1, 1332)

In [0]:
cdc_idx_dict

{15: "the state should provide citizens their demands in order to achieve people's well-being"}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 1 

In [0]:
with open(articles_dict[articles[1].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[1]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(5, 1332)

In [0]:
cdc_idx_dict

{4: 'government austerity can result in economic expansion',
 5: 'expansion from austerity is very limited',
 20: 'they tend to have an adverse impact on the poorest segments of the population',
 27: 'austerity measures tend to depress economic growth',
 28: 'austerity can engender deflation which inflates existing debt'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([df, ndf], axis=0, ignore_index=True)

#### Article 2 

In [0]:
with open(articles_dict[articles[2].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[2]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(4, 1332)

In [0]:
cdc_idx_dict

{361: 'social disruption could have a significantly negative impact on investment and growth in the longer term',
 362: 'policy makers consistently underestimated the disastrous effects of rigid spending cuts on economic growth',
 384: 'short-term sacrifices necessary for long-term success',
 418: "Governments borrowed too much, now they're paying the price, and fiscal austerity is the only answer"}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 3 

In [0]:
with open(articles_dict[articles[3].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[3]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(5, 1332)

In [0]:
cdc_idx_dict

{10: 'they tend to have an impact on the poorest segments of the population',
 22: "rather than 'punish' the banks and others truly responsible for the crisis, the government is instead 'punishing' regular people for the 'crimes' of others",
 64: 'austerity measures tend to be counterproductive',
 65: 'austerity simply depresses economic growth',
 66: 'austerity can engender deflation which inflates existing debt'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

#### Article 4 

In [0]:
with open(articles_dict[articles[4].replace(' ','_')+'.txt']) as article:
        doc = article.read()

c_doc = text_cleaner(doc)
sentences = nltk.sent_tokenize(c_doc)
sentences = [[sent, topic, articles[4]] for sent in sentences]

ndf = pd.DataFrame(sentences, columns=['text', 'topic', 'article'])

In [0]:
cdc_idx_dict = {}
cdc_idx_sent = {}
cdc_idx_art = {}

for i,claim in enumerate(ref_data['clean_Claim']):
    check_complete = False
    while check_complete == False:
        for idx,sentence in enumerate(ndf['text']):
            test = claim.lower() in sentence.lower()
            if test:
                article = ref_data['Article'][i]
                cdc_idx_dict[idx] = claim
                cdc_idx_sent[idx] = [claim, sentence]
                cdc_idx_art[idx] = [claim, article]
                check_complete = True
            elif idx == (len(ndf['text'])-1):
                check_complete = True

len(cdc_idx_dict), ref_data.shape[0]

(8, 1332)

In [0]:
cdc_idx_dict

{5: 'deficit spending is necessary',
 6: 'government should always run a balanced budget',
 10: 'one should have money before one spends it',
 17: 'deficit spending is necessary',
 18: 'deficit spending is logically necessary',
 41: 'deficit spending permits the private sector to accumulate net worth',
 59: 'deficit spending may create inflation',
 63: 'an increase in government spending will lead to inflation'}

In [0]:
cdc = []

for x in range(len(ndf)):
    if x in list(cdc_idx_dict.keys()):
        cdc.append(cdc_idx_dict.get(x))
    else:
        cdc.append('---')
ndf['cdc'] = cdc

df = pd.concat([ndf, df], axis=0, ignore_index=True)

In [0]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46086 entries, 0 to 46085
Data columns (total 4 columns):
text       46086 non-null object
topic      46086 non-null object
article    46086 non-null object
cdc        46086 non-null object
dtypes: object(4)
memory usage: 1.4+ MB


In [0]:
check = df.loc[df['cdc']!='---']

In [0]:
len(check)

1332

## Save compiled data

In [0]:
write_path = '/content/drive/My Drive/Colab Notebooks/Thinkful/Module 34 - Final Capstone/data/CDC Detection/cl_dat.csv'

df.to_csv(write_path)