In [1]:
import pandas as pd 
import numpy as np
import csv
from collections import Counter
from collections import defaultdict
import string
import nltk
from nltk import tokenize
import preprocessing as pp
import re

In [2]:
# Load the article bodies and the headline stances from their training CSVs.
bodies, stances = (
    pd.read_csv(csv_path)
    for csv_path in ('train_bodies.csv', 'train_stances.csv')
)

In [3]:
# Preview the first five rows of the article-body table.
bodies.head(5)

Unnamed: 0,Body ID,articleBody
0,0,A small meteorite crashed into a wooded area i...
1,4,Last week we hinted at what was to come as Ebo...
2,5,(NEWSER) – Wonder how long a Quarter Pounder w...
3,6,"Posting photos of a gun-toting child online, I..."
4,7,At least 25 suspected Boko Haram insurgents we...


In [4]:
# Show the row(s) whose 'Body ID' equals 5.
body_id_is_five = bodies['Body ID'].eq(5)
bodies.loc[body_id_is_five]

Unnamed: 0,Body ID,articleBody
2,5,(NEWSER) – Wonder how long a Quarter Pounder w...


## Merges the two data frames on Body ID

In [5]:
# Outer-join bodies and stances on 'Body ID' and write the result to disk.
# The row index is written as well (to_csv default).
merged_frame = bodies.merge(stances, on='Body ID', how='outer')
merged_frame.to_csv('bodies_stances.csv', sep=',', encoding='utf-8')

In [6]:
# Read the merged bodies/stances table back from its CSV file.
merged_csv_path = 'bodies_stances.csv'
bodies_stances = pd.read_csv(merged_csv_path)

## Removes rows with the 'unrelated' stance from the data frame and drops duplicates

In [7]:
# Keep only the rows whose Stance is something other than 'unrelated'.
is_related = bodies_stances['Stance'].ne('unrelated')
bodies_stances = bodies_stances.loc[is_related]

In [8]:
# Save the filtered table to disk.
filtered_csv_path = 'bodies_stances_new.csv'
bodies_stances.to_csv(filtered_csv_path, sep=',', encoding='utf-8')

In [None]:
# bodies_stances = bodies_stances.drop_duplicates(subset=['articleBody'], keep=False)

## Converts article body strings to lowercase and removes special characters

In [9]:
from nltk.tokenize import sent_tokenize

In [None]:
# Lowercase each article body, collapse every run of non-letter characters
# into a single space, and trim leading/trailing whitespace.
#
# The character class must end in A-Z: "A-z" also spans the ASCII punctuation
# [ \ ] ^ _ ` that sits between 'Z' and 'a', so those characters were kept.
# regex=True is passed explicitly because pandas >= 2.0 treats the pattern as
# a literal string by default, which would make the replacement a no-op.
article_bodies_clean = (
    bodies_stances.articleBody
    .str.lower()
    .str.replace(r"[^a-zA-Z]+", " ", regex=True)
    .str.strip()
)


In [None]:
# Lowercase the article bodies; punctuation and other characters are kept.
raw_bodies = bodies_stances['articleBody']
article_bodies_clean = raw_bodies.str.lower()

# Experimenting by separating sentences

In [None]:
# Overwrite the articleBody column with the cleaned text.
bodies_stances['articleBody'] = article_bodies_clean

In [10]:
# Join every article body into one long string, separated by single spaces.
all_bodies = bodies_stances['articleBody']
textbody = all_bodies.str.cat(sep=' ')

## For articles with the 'disagree' stance

In [11]:
# Split the table into the rows labelled 'agree' and those labelled 'disagree'.
stance_labels = bodies_stances['Stance']
agree_bodies_stances = bodies_stances.loc[stance_labels.eq('agree')]
disagree_bodies_stances = bodies_stances.loc[stance_labels.eq('disagree')]

In [12]:
# Concatenate all 'disagree' article bodies into one space-separated string.
disagree_bodies = disagree_bodies_stances['articleBody']
disagree_textbody = disagree_bodies.str.cat(sep=' ')

In [13]:
# Split the concatenated 'disagree' text into a list of sentences.
disagree_sentences = tokenize.sent_tokenize(disagree_textbody)

In [14]:
# Replace every sentence string with its list of word tokens.
disagree_sentences = [
    nltk.tokenize.word_tokenize(sentence)
    for sentence in disagree_sentences
]

In [15]:
# Part-of-speech tag each tokenized sentence, giving lists of (word, tag) pairs.
disagree_pos = [
    nltk.pos_tag(sentence_tokens)
    for sentence_tokens in disagree_sentences
]

In [17]:
# Normalise every tagged token: each run of non-word characters or underscores
# is replaced by a single space, then the token is lowercased. The POS tag is
# left untouched.
#
# The pattern is written as a raw string so that "\W" reaches the regex engine
# as-is instead of being parsed as an invalid string escape, which newer
# Python versions warn about.
non_word_run = re.compile(r'[\W_]+', flags=re.UNICODE)
disagree_pos = [[(non_word_run.sub(' ', word).lower(), tag) for word, tag in element]
                    for element in disagree_pos]

In [19]:
# Remove stop words and flatten the per-sentence lists into a single list of
# (word, tag) pairs.
from nltk.corpus import stopwords
stop = stopwords.words('english')

# Membership is tested against a set so each lookup is O(1); `stop` itself
# stays a list so other cells that use it keep working.
stop_lookup = set(stop)
disagree_pos = [
    tagged_word
    for tagged_sentence in disagree_pos
    for tagged_word in tagged_sentence
    if tagged_word[0] not in stop_lookup
]

In [20]:
# Keep only the pairs whose word is at least three characters long.
disagree_pos = [tagged_word for tagged_word in disagree_pos if len(tagged_word[0]) >= 3]

In [21]:
# Peek at the word of the first remaining (word, tag) pair.
first_pair = disagree_pos[0]
first_pair[0]

'rumour'

In [22]:
# Collect just the words (first element of each pair) and report how many there are.
disagree_words = [tagged_word[0] for tagged_word in disagree_pos]
len(disagree_words)

172548

In [23]:
# Count how often each word occurs in the 'disagree' articles.
disagree_word_dist = nltk.probability.FreqDist(disagree_words)


In [24]:
# Tabulate the (word, tag) pairs as a two-column data frame.
pos_columns = ['Word', 'POS_Tag']
disagreepos_df = pd.DataFrame(data=disagree_pos, columns=pos_columns)


In [25]:
# Turn the frequency distribution into a Word/Frequency table.
word_counts = [(word, count) for word, count in disagree_word_dist.items()]
disagree_word_dist_df = pd.DataFrame(word_counts, columns=['Word', 'Frequency'])

In [26]:
# Attach each word's frequency to its tagged rows via an outer join on 'Word'.
disagree_dist_freq = disagreepos_df.merge(
    disagree_word_dist_df,
    on='Word',
    how='outer',
)

In [27]:
# Preview only the first rows: the merged frame has one row per token
# occurrence, so displaying the whole frame is not useful.
disagree_dist_freq.head(10)

Unnamed: 0,Word,POS_Tag,Frequency
0,rumour,NN,10
1,rumour,NN,10
2,rumour,NN,10
3,rumour,NN,10
4,rumour,NN,10
5,rumour,NN,10
6,rumour,NN,10
7,rumour,NN,10
8,rumour,NN,10
9,rumour,NN,10


In [28]:
# Reduce the table to one row per distinct word, keeping the first occurrence
# (and therefore the POS tag recorded on that first row).
repeated_word = disagree_dist_freq.duplicated(subset=['Word'], keep='first')
disagree_dist_freq = disagree_dist_freq[~repeated_word]

In [None]:
# Write the per-word tag/frequency table to disk.
disagree_dist_path = 'disagree_dist.csv'
disagree_dist_freq.to_csv(disagree_dist_path, sep=',', encoding='utf-8')

### For all articles

In [None]:
# Split the text of all articles into a list of sentences.
sentences = tokenize.sent_tokenize(textbody)

In [None]:
# `pos` starts empty and is filled by a later cell; `tokenized` holds the
# word tokens of each sentence.
pos = []
tokenized = [nltk.tokenize.word_tokenize(sentence) for sentence in sentences]

In [None]:
# Part-of-speech tag every tokenized sentence.
# The list is rebuilt from scratch instead of appended to, so re-running this
# cell does not add every tagged sentence a second time.
pos = [nltk.pos_tag(sentence_tokens) for sentence_tokens in tokenized]

In [None]:
# Flatten the per-sentence lists in `pos` into one flat list.
# The previous version iterated over an undefined name `l` (NameError on a
# fresh kernel) and then appended every item again in an explicit loop, which
# would have duplicated the whole list.
# NOTE(review): `pos` is assumed to be the intended input - confirm.
flat_list = [item for sublist in pos for item in sublist]

In [None]:
# Flatten the tagged sentences into one list of (word, tag) pairs.
flat_pos = []
for tagged_sentence in pos:
    flat_pos.extend(tagged_sentence)

In [None]:
# Print the tagged tokens of the first sentence.
first_tagged_sentence = pos[0]
print(first_tagged_sentence)

In [None]:
# Tabulate the flattened (word, tag) pairs as a two-column data frame.
pos_columns = ['Word', 'POS_Tag']
pos_df = pd.DataFrame(data=flat_pos, columns=pos_columns)


In [None]:
# Run the first sentence through pp.clean, then split it into word tokens.
first_sentence_clean = pp.clean(sentences[0])
sentence1 = nltk.tokenize.word_tokenize(first_sentence_clean)

In [None]:
# Print the word tokens of the cleaned first sentence.
print(sentence1)

In [None]:
# Part-of-speech tag the tokens of the first sentence.
pos_sentence1 = nltk.tag.pos_tag(sentence1)

In [None]:
# Print the (word, tag) pairs produced for the first sentence.
print(pos_sentence1)

In [None]:
# Apply pp.clean to every token of the first sentence individually.
clean = [pp.clean(token) for token in sentence1]

In [None]:
# Print the per-token results of pp.clean for the first sentence.
print(clean)

## Removes Stop Words from Article Body

In [None]:
# Load NLTK's English stop-word list and build a regex that matches any of
# those words on word boundaries.
from nltk.corpus import stopwords
stop = stopwords.words('english')
stop_alternation = '|'.join(stop)
pat = r'\b(?:{})\b'.format(stop_alternation)


In [None]:
# Remove stop words from every article body: split on single spaces, drop the
# tokens found in `stop`, and join the remainder back with single spaces.
# Membership is checked against a set so each lookup is O(1) instead of a
# scan through the whole stop-word list for every token.
stop_lookup = set(stop)
article_bodies_clean = article_bodies_clean.str.split(' ').apply(
    lambda tokens: ' '.join(token for token in tokens if token not in stop_lookup)
)


In [None]:
# Store the stop-word-filtered text back into the articleBody column.
bodies_stances['articleBody'] = article_bodies_clean


## Splits up article bodies based on stance 

In [None]:
# Separate the rows labelled 'agree' from the rows labelled 'disagree'.
stance_labels = bodies_stances['Stance']
agree_bodies_stances = bodies_stances.loc[stance_labels.eq('agree')]
disagree_bodies_stances = bodies_stances.loc[stance_labels.eq('disagree')]

In [None]:
# a: all 'disagree' article bodies joined by spaces.
# b: all 'agree' article bodies joined by spaces.
disagree_bodies = disagree_bodies_stances['articleBody']
agree_bodies = agree_bodies_stances['articleBody']
a = disagree_bodies.str.cat(sep=' ')
b = agree_bodies.str.cat(sep=' ')

In [None]:
# Show only the beginning of the concatenated 'disagree' text; the full
# string contains every article body and is too large to display usefully.
a[:500]

In [None]:
# Tokenize the concatenated 'disagree' text with the project helper
# pp.get_clean_tokens (defined in the local `preprocessing` module; what it
# returns is not visible from this notebook).
disagree_words = pp.get_clean_tokens(a)
# disagree_word_dist = nltk.FreqDist(disagree_words)


In [None]:
# Display the tokens returned by pp.get_clean_tokens.
# NOTE(review): this shows the whole collection; consider a bounded preview
# once the return type of pp.get_clean_tokens is confirmed.
disagree_words

In [None]:
# Word-tokenize the 'disagree' and 'agree' texts, then count token frequencies
# for each stance.
disagree_words = tokenize.word_tokenize(a)
agree_words = tokenize.word_tokenize(b)

disagree_word_dist = nltk.probability.FreqDist(disagree_words)
agree_word_dist = nltk.probability.FreqDist(agree_words)


In [None]:
# Assign a part-of-speech tag to every token from the 'disagree' articles.
pos_disagree = nltk.tag.pos_tag(disagree_words)

In [None]:
# Tabulate the tagged 'disagree' tokens as a Word/POS_Tag data frame.
pos_columns = ['Word', 'POS_Tag']
pos_disagreedf = pd.DataFrame(data=pos_disagree, columns=pos_columns)


In [None]:
# Write the tagged 'disagree' tokens to disk.
pos_disagree_path = 'pos_disagree.csv'
pos_disagreedf.to_csv(pos_disagree_path, sep=',', encoding='utf-8')

In [None]:
# Inspect one tagged token by position.
sample_index = 28641
pos_disagree[sample_index]

In [None]:
# Run pp.is_noun on the POS tag of the same sample token.
sample_tag = pos_disagree[28641][1]
pp.is_noun(sample_tag)

In [None]:
# Collect the words whose POS tag makes pp.is_noun return False.
# The explicit `== False` comparison is kept because the return type of
# pp.is_noun is not visible here, so `not ...` might select different words.
non_nouns = [
    tagged_word[0]
    for tagged_word in pos_disagree
    if pp.is_noun(tagged_word[1]) == False
]
        

In [None]:
# Export up to the 7000 most frequent tokens of each stance as
# Word/Frequency tables.
top_n = 7000
count_columns = ['Word', 'Frequency']

disagree_top = disagree_word_dist.most_common(top_n)
disagree_res = pd.DataFrame(disagree_top, columns=count_columns)
disagree_res.to_csv('disagree_wordcounts.csv', sep=',', encoding='utf-8')

agree_top = agree_word_dist.most_common(top_n)
agree_res = pd.DataFrame(agree_top, columns=count_columns)
agree_res.to_csv('agree_wordcounts.csv', sep=',', encoding='utf-8')
 

In [None]:
# Rebuild the Word/Frequency table of the (up to) 7000 most frequent
# 'disagree' tokens.
disagree_top = disagree_word_dist.most_common(7000)
disagree_res = pd.DataFrame(data=disagree_top, columns=['Word', 'Frequency'])

In [None]:
# Flag each frequent word that also appears in the non-noun list, then
# re-select the rows labelled 'agree'.
disagree_res['not_noun'] = disagree_res['Word'].isin(non_nouns)
agree_bodies_stances = bodies_stances.loc[bodies_stances['Stance'].eq('agree')]


In [None]:
# Keep only the rows flagged as non-nouns.
flagged_not_noun = disagree_res['not_noun'].eq(True)
nonnoun_disagree_res = disagree_res.loc[flagged_not_noun]

In [None]:
# Write the non-noun word counts to disk.
nonnoun_path = 'nonnoun_disagree_wordcounts.csv'
nonnoun_disagree_res.to_csv(nonnoun_path, sep=',', encoding='utf-8')
