In [2]:
import spacy
import corenlp
import nltk
from nltk import ngrams
from nltk.tokenize import sent_tokenize, wordpunct_tokenize, word_tokenize, RegexpTokenizer, TweetTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

import sklearn
from sklearn.externals import joblib
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np

import os, math, glob, re

from collections import Counter
import re as regex
import contractions
import copy

from itertools import chain

nlp = spacy.load('en_core_web_sm')
nlp.max_length = 9999999


### Pre-Processing

In [10]:
def contract(text):
    return contractions.fix(text)


def regTokenize(text):
    tok = RegexpTokenizer('[A-Za-z0-9]*[.]?\w+')
    return tok.tokenize(text)


def lowercase(text):
    return text.lower()


def lemma(words):
    for i in range(0, len(words)):
        words[i] = WordNetLemmatizer().lemmatize(words[i])
    return words


def stemming(words):
    porter_stemmer = PorterStemmer()
    for i in range(0, len(words)):
        words[i] = porter_stemmer.stem(words[i])
    return words


def comma(text):
    text = "".join(c for c in text if c not in ('!', '.', ':', ',', '"', '?', '(', ')'))
    return text


def getBasicNorm(text):
    for i in range(0, len(text)):
        text[i] = contract(text[i])
        text[i] = lowercase(text[i])
    return text


### Feature Extraction functions

In [11]:
def getWholeString(data):
    string = ''
    for i in data:
        string += i[0] + ' '
    return string


def getNGrams(corpusSentence):
    corpusSentence = contract(corpusSentence)
    corpusSentence = lowercase(corpusSentence)
    tokenized = regTokenize(corpusSentence)
    uni = list(ngrams(tokenized, 1))
    bigram = list(ngrams(tokenized, 2))
    trigram = list(ngrams(tokenized, 3))
    print('uni:', len(uni), ' bi:', len(bigram), 'tri:', len(trigram))
    grams = [uni, bigram, trigram]
    return grams


def postagging(corpusSentence):
    corpusSentence = contract(corpusSentence)
    corpusSentence = lowercase(corpusSentence)
    doc = nlp(corpusSentence)
    return doc


def posPattern(corpusSentence):
    doc = nlp(corpusSentence)
    return doc


# -----------------------------------------POS Pattern-----------------------------------------
def getPosPattern(data_sentences):
    sentences_pos = []
    for i in data_sentences:
        sent = []
        s = postagging(i)
        for j in s:
            sent.append(j.pos_)
        sentences_pos.append(sent)
    return sentences_pos


def posGrams(sentence):
    trigram = list(ngrams(sentence, 3))
    fourgram = list(ngrams(sentence, 4))
    grams = [trigram, fourgram]
    return grams


def get34grams(sentences_pos):
    three_four_grams = []
    for sent_pos in sentences_pos:
        temp = posGrams(sent_pos)
        temp = list(chain(*temp))
        three_four_grams.append(temp)
    return three_four_grams


### Dataset Loading

In [3]:
dataset=joblib.load('dataset_with_labels.sav')
print(len(dataset))

for i in range(0,10):
    print(dataset[i])

11112
['Between one and 10 cases of Congo Fever are reported in South Africa annually, with about 20 to 25 percent of patients dying, according to statistics from the virology institute.', 1]
['Saeed said indications were that those tests would be negative too.', 0]
['A total of 158 cases of Congo Fever were diagnosed in southern Africa between 1981 and the end of 2000.', 1]
['He said it was his opinion that the patient -- a woman -- was suffering from tick bite fever.', 0]
['Early symptoms of the disease include severe headaches, red eyes, fevers and cold chills, body pain, and vomiting.', 1]
['The two have similar symptoms.', 1]
['The disease can be contracted if a person is bitten by a certain tick or if a person comes into contact with the blood of a Congo Fever sufferer.', 1]
['The woman was admitted to the hospital on Saturday after complaining of severe joint pains.', 0]
['She also had a skin rash and was vomiting.', 1]
['The patient told hospital authorities she became sick aft

In [5]:
X=np.array(dataset)[:,0] # sentences
Y=np.array(dataset)[:,1] # labels
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2,random_state=20)

print(len(X_train),' ',len(X_test),' ',len(y_train),' ',len(y_test))

11112
8889   2223   8889   2223


In [7]:
train_facts=[]
train_nonfacts=[]
for i in range(len(X_train)):
    temp=[]
    if(y_train[i]=='1'):
        temp.append(X_train[i])
        temp.append(1)
        train_facts.append(temp)
    else:
        temp.append(X_train[i])
        temp.append(0)
        train_nonfacts.append(temp)
print(len(train_facts),' ',len(train_nonfacts))

3967   4922


### getting wholeSentences for wholedataset, facts, nonfacts as one sentence

In [12]:
sent_as_string=getWholeString(dataset)

train_facts_as_string=getWholeString(train_facts)
train_nonfacts_as_string=getWholeString(train_nonfacts)

print(len(sent_as_string) ,' ',len(train_facts_as_string),' ',len(train_nonfacts_as_string))


1635553   528863   783790


### getting 'nGrams' for 'facts and nonfacts sentences' and converting to 'set'

In [13]:
train_facts_n_grams = getNGrams(train_facts_as_string)
train_facts_unigram, train_facts_bigram, train_facts_trigram = list(set(train_facts_n_grams[0])), list(set(train_facts_n_grams[1])), list(set(train_facts_n_grams[2]))
print(len(train_facts_unigram), ' ', len(train_facts_bigram), ' ', len(train_facts_trigram), '\n===================')

train_nonfacts_n_grams = getNGrams(train_nonfacts_as_string)
train_nonfacts_unigram, train_nonfacts_bigram, train_nonfacts_trigram = list(set(train_nonfacts_n_grams[0])), list(set(train_nonfacts_n_grams[1])), list(set(train_nonfacts_n_grams[2]))
print(len(train_nonfacts_unigram), ' ', len(train_nonfacts_bigram), ' ', len(train_nonfacts_trigram),'\n===================')


uni: 86806  bi: 86805 tri: 86804
10164   53161   78045 
uni: 127875  bi: 127874 tri: 127873
11310   68124   109043 


In [14]:
train_facts_wholeGrams=list(chain(*train_facts_n_grams))
train_facts_wholeGrams_list_set=list(set(train_facts_wholeGrams))
print(len(train_facts_wholeGrams_list_set))

train_nonfacts_wholeGrams=list(chain(*train_nonfacts_n_grams))
train_nonfacts_wholeGrams_list_set=list(set(train_nonfacts_wholeGrams))
print(len(train_nonfacts_wholeGrams_list_set))

141370
188477


In [15]:
grams_toVector=['facts_unigram','facts_bigram','facts_trigram','nonfacts_unigram','nonfacts_bigram','nonfacts_trigram']

### getting pos tags and convert into 'set'

In [16]:
pos=postagging(sent_as_string)

print(len(pos))
pos_tag=[]

for i in pos:
    pos_tag.append(i.pos_)
pos_tag_list_set=list(set(pos_tag))
print(len(pos_tag_list_set))

309474
16


### getting 'named entities' for 'facts and nonfacts sentences' and convert into 'set'

In [21]:
named_entity=[]
for i in pos.ents:
    named_entity.append(i.label_)
named_entity_list_set=list(set(named_entity))
print(len(named_entity),'-->', len(named_entity_list_set))
print(named_entity_list_set)

15024 --> 17
['LOC', 'DATE', 'LANGUAGE', 'PERCENT', 'MONEY', 'GPE', 'QUANTITY', 'ORG', 'ORDINAL', 'PERSON', 'PRODUCT', 'TIME', 'CARDINAL', 'NORP', 'LAW', 'EVENT', 'FAC']


### getting 'POS Pattern' and getting '3,4-grams' for each sentence and convert into 'set'

In [22]:
X_train_facts=np.array(train_facts)[:,0]
X_train_nonfacts=np.array(train_nonfacts)[:,0]
# print(X_train_facts)

In [23]:
wholeTrainFacts=copy.deepcopy(X_train_facts)
wholeTrainFacts=getBasicNorm(wholeTrainFacts)

wholeNonFacts=copy.deepcopy(X_train_nonfacts)
wholeNonFacts=getBasicNorm(wholeNonFacts)

In [24]:
facts_posPattern=getPosPattern(wholeTrainFacts)

nonfacts_posPattern=getPosPattern(wholeNonFacts)

In [25]:
for i in facts_posPattern:
    print(i)
    break
print(len(facts_posPattern))
# ----------------------------------------------------

for i in nonfacts_posPattern:
    print(i)
    break
print(len(nonfacts_posPattern))

['ADP', 'DET', 'NOUN', 'ADP', 'ADJ', 'NOUN', 'ADP', 'NUM', 'PUNCT', 'SPACE', 'DET', 'PROPN', 'PUNCT', 'NOUN', 'NOUN', 'VERB', 'DET', 'NOUN', 'PART', 'VERB', 'DET', 'NOUN', 'NOUN', 'ADP', 'SPACE', 'DET', 'NOUN', 'ADP', 'ADJ', 'NOUN', 'PUNCT']
3967
['DET', 'VERB', 'ADV', 'VERB', 'VERB', 'ADV', 'ADJ', 'ADP', 'DET', 'NUM', 'NOUN', 'NOUN', 'ADP', 'NOUN', 'NOUN', 'VERB', 'ADP', 'DET', 'NOUN', 'VERB', 'VERB', 'ADJ', 'NOUN', 'NOUN', 'PUNCT']
4922


In [54]:
facts_pos_pattern_3_4=get34grams(facts_posPattern)#*****
facts_pos_pattern_3_grams,facts_pos_pattern_4_grams=list(set(facts_pos_pattern_3_4[0])),list(set(facts_pos_pattern_3_4[1]))

nonfacts_pos_pattern_3_4=get34grams(nonfacts_posPattern)#*****
nonfacts_pos_pattern_3_grams,nonfacts_pos_pattern_4_grams=list(set(nonfacts_pos_pattern_3_4[0])),list(set(nonfacts_pos_pattern_3_4[1]))

# ----------------------------------------------------

facts_pos_pattern_3_4_list=list(chain(*facts_pos_pattern_3_4))
facts_pos_pattern_3_4_list_set=list(set(facts_pos_pattern_3_4_list))
print(len(facts_pos_pattern_3_4_list_set),'\n')

nonfacts_pos_pattern_3_4_list=list(chain(*nonfacts_pos_pattern_3_4))
nonfacts_pos_pattern_3_4_list_set=list(set(nonfacts_pos_pattern_3_4_list))
print(len(nonfacts_pos_pattern_3_4_list_set))

9569 

10396


In [27]:
pos_toVector=['facts_posPattern_3gram','facts_posPattern_4gram','nonfacts_posPattern_3gram','nonfacts_posPattern_4gram',]

### Sentiment Extractor

In [28]:
sentiment_scores=['_Sentiment_Feature_']

### TPattern

In [32]:
Tcases=[]
for i in range(1,6):
    cases='case'+str(i)
    Tcases.append(cases)
print(Tcases)

['case1', 'case2', 'case3', 'case4', 'case5']


### Adding into 'Vector'

In [33]:
vector=[]

# Ngrams features
# for i in wholeGrams_list_set:
#     vector.append(i)
# print('wholeGrams_list_set:',len(vector))

for i in grams_toVector:
    vector.append(i)
print('grams_toVector:',len(vector))

# ----------------------------------------------------

# pos Tagging features
for i in pos_tag_list_set:
    vector.append(i)
print('pos_tag_list_set:',len(vector))

# ----------------------------------------------------

# Named Entities features
for i in named_entity_list_set:
    vector.append(i)
print('named_entity_list_set:',len(vector))

# ----------------------------------------------------

# posPattern features
# for i in pos_pattern_3_4_list_set:
#     vector.append(i)
# print('pos_pattern_3_4_list_set:',len(vector))

for i in pos_toVector:
    vector.append(i)
print('pos_toVector:',len(vector))

# ----------------------------------------------------

# Tpattern cases features
for i in Tcases:
    vector.append(i)
print('Tpattern:',len(vector))

# ----------------------------------------------------

# Sentiment Scores features
for i in sentiment_scores:
    vector.append(i)
print('sentiment_scores:',len(vector))

grams_toVector: 6
pos_tag_list_set: 22
named_entity_list_set: 39
pos_toVector: 43
Tpattern: 48
sentiment_scores: 49


In [34]:
print(vector)

['facts_unigram', 'facts_bigram', 'facts_trigram', 'nonfacts_unigram', 'nonfacts_bigram', 'nonfacts_trigram', 'DET', 'PROPN', 'SPACE', 'NUM', 'NOUN', 'ADP', 'PART', 'PUNCT', 'PRON', 'SYM', 'CCONJ', 'X', 'VERB', 'ADJ', 'ADV', 'INTJ', 'LOC', 'DATE', 'LANGUAGE', 'PERCENT', 'MONEY', 'GPE', 'QUANTITY', 'ORG', 'ORDINAL', 'PERSON', 'PRODUCT', 'TIME', 'CARDINAL', 'NORP', 'LAW', 'EVENT', 'FAC', 'facts_posPattern_3gram', 'facts_posPattern_4gram', 'nonfacts_posPattern_3gram', 'nonfacts_posPattern_4gram', 'case1', 'case2', 'case3', 'case4', 'case5', '_Sentiment_Feature_']


### Saving(dumping)

In [37]:
X_file='sav_1/X_reduced.sav'#only sentences->not labels
train_facts_file='sav_1/train_facts_reduced.sav'
train_nonfacts_file='sav_1/train_nonfacts_reduced.sav'

joblib.dump(train_facts,train_facts_file)
joblib.dump(train_nonfacts,train_nonfacts_file)
joblib.dump(X,X_file)

['sav_1/X_reduced.sav']

In [38]:
sent_as_string_file='sav_1/sent_as_string_reduced.sav'
train_facts_as_string_file='sav_1/train_facts_as_string_reduced.sav'
train_nonfacts_as_string_file='sav_1/train_nonfacts_as_string_reduced.sav'

joblib.dump(sent_as_string,sent_as_string_file)
joblib.dump(train_facts_as_string,train_facts_as_string_file)
joblib.dump(train_nonfacts_as_string,train_nonfacts_as_string_file)

['sav_1/train_nonfacts_as_string_reduced.sav']

In [39]:
train_facts_unigram_file='sav_1/train_facts_unigram_reduced.sav'
train_facts_bigram_file='sav_1/train_facts_bigram_reduced.sav'
train_facts_trigram_file='sav_1/train_facts_trigram_reduced.sav'
joblib.dump(train_facts_unigram,train_facts_unigram_file)
joblib.dump(train_facts_bigram,train_facts_bigram_file)
joblib.dump(train_facts_trigram,train_facts_trigram_file)

train_nonfacts_unigram_file='sav_1/train_nonfacts_unigram_reduced.sav'
train_nonfacts_bigram_file='sav_1/train_nonfacts_bigram_reduced.sav'
train_nonfacts_trigram_file='sav_1/train_nonfacts_trigram_reduced.sav'
joblib.dump(train_nonfacts_unigram,train_nonfacts_unigram_file)
joblib.dump(train_nonfacts_bigram,train_nonfacts_bigram_file)
joblib.dump(train_nonfacts_trigram,train_nonfacts_trigram_file)

train_facts_wholeGrams_list_set_file='sav_1/train_facts_wholeGrams_list_set_reduced.sav'
train_nonfacts_wholeGrams_list_set_file='sav_1/train_nonfacts_wholeGrams_list_set_reduced.sav'
joblib.dump(train_facts_wholeGrams_list_set,train_facts_wholeGrams_list_set_file)
joblib.dump(train_nonfacts_wholeGrams_list_set,train_nonfacts_wholeGrams_list_set_file)

['sav_1/train_nonfacts_wholeGrams_list_set_reduced.sav']

In [48]:
pos_tag_list_set_file='sav_1/pos_tag_list_set_reduced.sav'
joblib.dump(pos_tag_list_set,pos_tag_list_set_file)

['sav_1/pos_tag_list_set_reduced.sav']

In [49]:
named_entity_list_set_file='sav_1/named_entity_list_set_reduced.sav'
joblib.dump(named_entity_list_set,named_entity_list_set_file)

['sav_1/named_entity_list_set_reduced.sav']

In [55]:
facts_pos_pattern_3_grams_file='sav_1/facts_pos_pattern_3_grams_reduced.sav'
facts_pos_pattern_4_grams_file='sav_1/facts_pos_pattern_4_grams_reduced.sav'
nonfacts_pos_pattern_3_grams_file='sav_1/nonfacts_pos_pattern_3_grams_reduced.sav'
nonfacts_pos_pattern_4_grams_file='sav_1/nonfacts_pos_pattern_4_grams_reduced.sav'

joblib.dump(facts_pos_pattern_3_grams,facts_pos_pattern_3_grams_file)
joblib.dump(facts_pos_pattern_4_grams,facts_pos_pattern_4_grams_file)
joblib.dump(nonfacts_pos_pattern_3_grams,nonfacts_pos_pattern_3_grams_file)
joblib.dump(nonfacts_pos_pattern_4_grams,nonfacts_pos_pattern_4_grams_file)

facts_pos_pattern_3_4_list_set_file='sav_1/facts_pos_pattern_3_4_list_set_reduced.sav'
nonfacts_pos_pattern_3_4_list_set_file='sav_1/nonfacts_pos_pattern_3_4_list_set_reduced.sav'

joblib.dump(facts_pos_pattern_3_4_list_set,facts_pos_pattern_3_4_list_set_file)
joblib.dump(nonfacts_pos_pattern_3_4_list_set,nonfacts_pos_pattern_3_4_list_set_file)

['sav_1/nonfacts_pos_pattern_3_4_list_set_reduced.sav']

In [51]:
vector_file='sav_1/vector_dimensions_reduced.sav'
joblib.dump(vector,vector_file)

['sav_1/vector_dimensions_reduced.sav']

In [52]:
vector_load=joblib.load('sav_1/vector_dimensions_reduced.sav')

In [53]:
len(vector_load)

49