In [1]:
import pandas as pd 
import numpy as np
import csv
from collections import Counter
from collections import defaultdict
import string
import nltk
from nltk import tokenize
import preprocessing as pp
import re

In [2]:
# Read the two training CSVs: the stance annotations and the article bodies.
stances = pd.read_csv('train_stances.csv')
bodies = pd.read_csv('train_bodies.csv')

In [3]:
# Preview the first five rows of the bodies table.
bodies.head(n=5)

Unnamed: 0,Body ID,articleBody
0,0,A small meteorite crashed into a wooded area i...
1,4,Last week we hinted at what was to come as Ebo...
2,5,(NEWSER) – Wonder how long a Quarter Pounder w...
3,6,"Posting photos of a gun-toting child online, I..."
4,7,At least 25 suspected Boko Haram insurgents we...


In [4]:
# Show the row(s) whose 'Body ID' equals 5.
bodies[bodies['Body ID'].eq(5)]

Unnamed: 0,Body ID,articleBody
2,5,(NEWSER) – Wonder how long a Quarter Pounder w...


## Merges the two data frames on `Body ID`

In [5]:
# Join bodies and stances on the shared 'Body ID' key and persist the result.
# how='outer' keeps rows that have no partner on the other side of the join.
# index=False: the merged frame only carries a default integer index; writing it would add
# a meaningless unnamed column that shows up as an extra 'Unnamed: ...' column when the
# file is read back with read_csv.
bodies_stances_merged = pd.merge(bodies, stances, on='Body ID', how='outer')
bodies_stances_merged.to_csv('bodies_stances.csv', sep=',', encoding='utf-8', index=False)

In [6]:
# Load the merged bodies/stances table back from disk.
bodies_stances = pd.read_csv(filepath_or_buffer='bodies_stances.csv')

## Removes 'unrelated' rows from the data frame (duplicate removal is currently commented out)

In [7]:
# Drop every row labelled 'unrelated'.
keep_mask = bodies_stances['Stance'].ne('unrelated')
bodies_stances = bodies_stances.loc[keep_mask]

In [8]:
# Persist the filtered table.
bodies_stances.to_csv(path_or_buf='bodies_stances_new.csv', encoding='utf-8', sep=',')

In [9]:
# bodies_stances = bodies_stances.drop_duplicates(subset=['articleBody'], keep=False)

## Converts article body strings to lowercase and removes special characters

In [10]:
from nltk.tokenize import sent_tokenize

In [11]:
# Lowercase each article body, collapse every run of non-letter characters into a single
# space, and trim leading/trailing whitespace.
# - The character class is [^a-z]: the text is already lowercased, and the previous range
#   "A-z" also covered the ASCII characters between 'Z' and 'a' ([ \ ] ^ _ `), so those
#   were never removed.
# - regex=True is explicit because pandas >= 2.0 treats the pattern as a literal string
#   by default.
article_bodies_clean = (
    bodies_stances['articleBody']
    .str.lower()
    .str.replace(r"[^a-z]+", " ", regex=True)
    .str.strip()
)


In [None]:
# Lowercase-only variant of the article text (no characters are removed).
# Stored under its own name: assigning it to `article_bodies_clean` would, on a
# top-to-bottom run, silently discard the character clean-up computed in the cell before
# this one.
article_bodies_lower = bodies_stances.articleBody.str.lower()

## Removes Stop Words from Article Body

In [12]:
from nltk.corpus import stopwords

# NLTK's English stop-word list.
stop = stopwords.words('english')
# Regex alternation that matches any stop word as a whole word.
pat = r'\b(?:' + '|'.join(stop) + r')\b'


In [13]:
# Remove stop words: split each body on single spaces, keep the tokens that are not stop
# words, and join them back with single spaces.
# Membership is tested against a frozenset so each lookup is constant time instead of a
# scan through the whole stop-word list; the resulting text is unchanged.
stop_words = frozenset(stop)
article_bodies_clean = article_bodies_clean.str.split(' ').apply(
    lambda tokens: ' '.join(token for token in tokens if token not in stop_words)
)


In [14]:
# Replace the articleBody column with the cleaned text (values are aligned on the row index).
# `assign` returns a new frame instead of writing through attribute access into a frame
# that came out of boolean filtering, which avoids pandas' SettingWithCopyWarning.
bodies_stances = bodies_stances.assign(articleBody=article_bodies_clean)


## Splits up article bodies based on stance 

In [16]:
# One sub-frame per stance label of interest.
stance_labels = bodies_stances['Stance']
agree_bodies_stances = bodies_stances.loc[stance_labels == 'agree']
disagree_bodies_stances = bodies_stances.loc[stance_labels == 'disagree']

In [17]:
# Concatenate all article bodies of each stance into one space-separated string:
# `a` holds the "disagree" text, `b` the "agree" text.
b = agree_bodies_stances.articleBody.str.cat(sep=' ')
a = disagree_bodies_stances.articleBody.str.cat(sep=' ')

In [18]:
# Show only the first 1000 characters: `a` is the concatenation of every "disagree"
# article, and echoing all of it floods the notebook output.
a[:1000]



In [20]:
# Tokenise the concatenated "disagree" text with the project helper (presumably returns a
# list of cleaned tokens — confirm in preprocessing.py) and count token frequencies.
# NOTE(review): a later cell assigns `disagree_words` and `disagree_word_dist` again from
# nltk's word_tokenize, so these results are replaced when the notebook runs top to
# bottom — confirm which tokeniser is intended.
disagree_words = pp.get_clean_tokens(a)
disagree_word_dist = nltk.FreqDist(disagree_words)


In [22]:
# Tokenise each stance's text with NLTK, then count how often each token occurs.
disagree_words = nltk.word_tokenize(a)
agree_words = nltk.word_tokenize(b)

disagree_word_dist = nltk.FreqDist(disagree_words)
agree_word_dist = nltk.FreqDist(agree_words)


In [23]:
# Attach a part-of-speech tag to every token from the "disagree" articles.
pos_disagree = nltk.pos_tag(tokens=disagree_words)

In [24]:
# Two-column table of the tagged tokens: the word and its part-of-speech tag.
pos_disagreedf = pd.DataFrame.from_records(pos_disagree, columns=['Word', 'POS_Tag'])


In [25]:
# Export the tagged-token table.
pos_disagreedf.to_csv(path_or_buf='pos_disagree.csv', encoding='utf-8', sep=',')

In [None]:
# Spot-check a single (word, tag) pair from the tagged list.
spot_check_position = 28641
pos_disagree[spot_check_position]

In [None]:
# Spot-check the project's noun test on the tag stored at position 28641.
spot_word, spot_tag = pos_disagree[28641]
pp.is_noun(spot_tag)

In [None]:
# Collect the word of every tagged token whose tag fails the project's noun test.
# `== False` is kept on purpose: the return type of pp.is_noun is not visible here, and
# `not ...` would treat other falsy return values differently.
non_nouns = [word for word, tag in pos_disagree if pp.is_noun(tag) == False]
        

In [26]:
# Tabulate the 7000 most frequent tokens per stance and export each table to CSV.
count_columns = ['Word', 'Frequency']

disagree_top = disagree_word_dist.most_common(7000)
disagree_res = pd.DataFrame(disagree_top, columns=count_columns)
disagree_res.to_csv('disagree_wordcounts.csv', sep=',', encoding='utf-8')

agree_top = agree_word_dist.most_common(7000)
agree_res = pd.DataFrame(agree_top, columns=count_columns)
agree_res.to_csv('agree_wordcounts.csv', sep=',', encoding='utf-8')
 

In [27]:
# Build the "disagree" word-count table from the 7000 most frequent tokens.
disagree_res = pd.DataFrame(
    data=disagree_word_dist.most_common(n=7000),
    columns=['Word', 'Frequency'],
)

In [None]:
# Add a boolean column: True where the word appears in the `non_nouns` list.
disagree_res['not_noun'] = disagree_res['Word'].isin(non_nouns)
# Re-derive the "agree" subset from the current bodies_stances frame.
agree_bodies_stances = bodies_stances.loc[bodies_stances['Stance'] == 'agree']


In [None]:
# Keep only the rows flagged as non-noun.
non_noun_mask = disagree_res['not_noun'] == True
nonnoun_disagree_res = disagree_res.loc[non_noun_mask]

In [None]:
# Export the non-noun word counts.
nonnoun_disagree_res.to_csv(path_or_buf='nonnoun_disagree_wordcounts.csv', encoding='utf-8', sep=',')
