In [26]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import time
import tensorflow as tf

np.random.seed(13)
tf.random.set_seed(13)

nltk.download('punkt')

from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split
from wordcloud import WordCloud
from xml.sax import ContentHandler, parse
from zipfile import ZipFile

from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, Conv1D
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras_preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.wrappers.scikit_learn import KerasClassifier
from nltk import word_tokenize
import nltk

stemmer = SnowballStemmer('english', ignore_stopwords=True)
stop = set(stopwords.words('english'))

%matplotlib inline
sns.set(rc={'figure.figsize':(11.7,8.27)})

[nltk_data] Downloading package punkt to /home/carlos/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Class that handles Excel files

In [27]:
%%time

class ExcelHandler(ContentHandler):
    """SAX content handler that collects spreadsheet XML into nested lists.

    Only elements named ``Table``, ``Row`` and ``Cell`` are acted upon.
    After parsing, ``tables`` holds one entry per ``Table`` element: a list
    of rows, where each row is a list of cell strings.
    """

    def __init__(self):
        # Initialise the base handler as well so the state it defines
        # (such as the document locator) exists before parsing starts.
        super().__init__()
        self.chars = []   # text fragments seen since the current Cell opened
        self.cells = []   # cell strings of the row being read
        self.rows = []    # rows of the table being read
        self.tables = []  # every completed table

    def characters(self, content):
        """Buffer a chunk of character data for the cell being read."""
        self.chars.append(content)

    def startElement(self, name, atts):
        """Start a fresh buffer for the Cell, Row or Table being opened."""
        if name == "Cell":
            self.chars = []
        elif name == "Row":
            self.cells = []
        elif name == "Table":
            self.rows = []

    def endElement(self, name):
        """Flush the finished Cell, Row or Table into its parent container."""
        if name == "Cell":
            # A cell's text is the concatenation of every chunk buffered
            # since the cell was opened.
            self.cells.append(''.join(self.chars))
        elif name == "Row":
            self.rows.append(self.cells)
        elif name == "Table":
            self.tables.append(self.rows)



CPU times: user 17 µs, sys: 0 ns, total: 17 µs
Wall time: 20.3 µs


In [28]:
# Run the SAX parser over the spreadsheet; the handler gathers every table
# it encounters into excelHandler.tables.
excelHandler = ExcelHandler()
parse('data/features.xls', excelHandler)

# First row of the first table supplies the column names, the rest is data.
first_table = excelHandler.tables[0]
features = pd.DataFrame(first_table[1:], columns=first_table[0])

Parse Excel file and create dataframe


In [29]:
# Binary target: 0 where 'Label' equals 'objective', 1 for every other value.
is_objective = features['Label'] == 'objective'
y = np.where(is_objective, 0, 1)

Create labels: objective = 0, subjective = 1

In [30]:
texts = []
normalized_texts = []

# The raw documents are named Text0001.txt ... Text1000.txt.
for i in range(1, 1001):
    num = str(i).zfill(4)  # zero-pad the document number to four digits

    # The context manager closes each file as soon as it has been read,
    # instead of leaving 1000 handles open.
    with open('data/raw-data/Text' + num + '.txt', 'r', encoding='latin-1') as f:
        text = f.read()

    # Tokenize with NLTK, keep purely alphabetic tokens that are not stop
    # words, and stem what remains. The stop-word test is done on the
    # lower-cased token: the stop list is lower-case, so a case-sensitive
    # test would let capitalised stop words (e.g. "The") slip through.
    normalized_text = ' '.join(
        stemmer.stem(w)
        for w in word_tokenize(text)
        if w.isalpha() and w.lower() not in stop
    )
    texts.append(text)
    normalized_texts.append(normalized_text)

Read text files and preprocess

In [31]:
# One row per document: raw text, normalized text and its label.
frame_columns = {
    'texts': np.array(texts),
    'normalized_texts': np.array(normalized_texts),
    'label': y,
}
dataframe = pd.DataFrame(frame_columns)

Create dataframe for the texts and their labels

In [32]:
# Concatenate the normalized documents of each class into one long string
# (label 0 -> obj_texts, label 1 -> sub_texts).
label_column = dataframe['label']
normalized_column = dataframe['normalized_texts']
obj_texts = ' '.join(normalized_column[label_column == 0].tolist())
sub_texts = ' '.join(normalized_column[label_column == 1].tolist())

Create two strings for the preprocessed texts: one for objective and one for subjective

In [33]:
# Stratified, seeded split of the normalized texts and their labels;
# the test size is left at the library default.
X_train, X_test, y_train, y_test = train_test_split(
    np.array(normalized_texts),
    y,
    stratify=y,
    random_state=13,
)

Splits the data into training and testing sets for use in a machine learning model.

We use pre-trained GloVe (Global Vectors for Word Representation) embeddings, derived from 6 billion tokens, to initialize the word embeddings used by our deep learning models.

In [34]:
%%time

# Settings for the text encoding
embed_size = 50       # length of each word vector
max_features = 10000  # number of unique words to keep (rows of the embedding matrix)
maxlen = 200          # number of words kept per text

# The vocabulary is learned from the training texts only.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(X_train)

# Training set: texts -> integer sequences -> fixed-length arrays
tokenized_train = tokenizer.texts_to_sequences(X_train)
X_t = pad_sequences(tokenized_train, maxlen=maxlen)

# Test set: same transformation with the already-fitted tokenizer
tokenized_test = tokenizer.texts_to_sequences(X_test)
X_te = pad_sequences(tokenized_test, maxlen=maxlen)

def get_coefs(word, *arr):
    """Turn the tokens of one embedding-file line into ``(word, vector)``.

    Parameters
    ----------
    word : str or bytes
        The first token of the line.
    *arr : str or bytes
        The remaining tokens, each the text of one vector component.

    Returns
    -------
    tuple
        ``(word, vector)`` where ``word`` is always a ``str`` and ``vector``
        is a 1-D float32 array.

    Tokens read from a binary stream (such as a zip member) arrive as bytes.
    They are decoded as UTF-8 so the returned word can be compared with
    ordinary ``str`` keys.
    """
    if isinstance(word, bytes):
        word = word.decode('utf-8')
    values = [v.decode('utf-8') if isinstance(v, bytes) else v for v in arr]
    return word, np.asarray(values, dtype='float32')
# Build a {word: vector} lookup from the pre-trained GloVe file.
# Lines read from a zip member are bytes; they are decoded first so the keys
# are str and can match the str keys of tokenizer.word_index further down.
# The context managers close the archive and the member after loading.
# (Alternative vectors: 'glove.840B.300d.zip' / 'glove.840B.300d.txt'.)
with ZipFile('glove.6B.zip') as glove_zip, glove_zip.open('glove.6B.50d.txt') as glove_file:
    embeddings_index = dict(
        get_coefs(*line.decode('utf-8').strip().split()) for line in glove_file
    )

# np.stack needs a real sequence, so materialise the dict view as a list.
all_embs = np.stack(list(embeddings_index.values()))
emb_mean, emb_std = all_embs.mean(), all_embs.std()

word_index = tokenizer.word_index
# Tokenizer indices start at 1 (0 is the padding value), so the matrix needs
# len(word_index) + 1 rows to hold the highest index whenever the vocabulary
# is smaller than max_features.
nb_words = min(max_features, len(word_index) + 1)

# Every row starts as a random vector drawn from the overall GloVe mean/std;
# rows for words found in GloVe are then overwritten with the real vector.
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
    if i >= nb_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

FileNotFoundError: [Errno 2] No such file or directory: 'glove.6B.zip'