In [20]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import time
import tensorflow as tf

# One fixed seed shared by NumPy and TensorFlow so random draws repeat across runs.
SEED = 13
np.random.seed(SEED)
tf.random.set_seed(SEED)

from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, Conv1D
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras_preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.wrappers.scikit_learn import KerasClassifier
from nltk import word_tokenize
import nltk

# Fetch NLTK's 'punkt' package (the download log shows it unpacked under
# tokenizers/); presumably required by word_tokenize, imported above.
nltk.download('punkt')

from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split
from wordcloud import WordCloud
from xml.sax import ContentHandler, parse
from zipfile import ZipFile

# `stopwords` was used below without being imported, which raises NameError on a
# fresh kernel. Import it here and make sure the corpus is available: both the
# stemmer's ignore_stopwords option and the `stop` set read from it.
from nltk.corpus import stopwords

nltk.download('stopwords')

# English Snowball stemmer; ignore_stopwords=True leaves stopwords unstemmed.
stemmer = SnowballStemmer('english', ignore_stopwords=True)
# English stopword list held as a set for fast membership tests.
stop = set(stopwords.words('english'))

# Render matplotlib figures inside the notebook output.
%matplotlib inline
# Default figure size for every plot, in inches (width, height).
sns.set(rc={'figure.figsize':(11.7,8.27)})

[nltk_data] Downloading package punkt to /home/carlos/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


### Class that handles Excel files (parsed as XML)

In [36]:
%%time

class ExcelHandler(ContentHandler):
    """SAX handler that gathers the text of Table / Row / Cell elements.

    After parsing, ``tables`` holds one entry per ``Table`` element. Each entry
    is a list of rows, and each row is a list of cell strings.

    Note: all character data seen between a ``Cell`` start tag and its end tag
    is captured, including any text that sits outside a nested element.
    """

    def __init__(self):
        # Initialise the base ContentHandler state before adding our own.
        super().__init__()
        self.chars = []   # text fragments seen since the current Cell opened
        self.cells = []   # cell strings of the row currently being read
        self.rows = []    # rows of the table currently being read
        self.tables = []  # every completed table, in document order

    def characters(self, content):
        """Buffer one chunk of character data.

        A parser may deliver a single text node in several chunks, so the
        pieces are collected and only joined when the cell closes.
        """
        self.chars.append(content)

    def startElement(self, name, atts):
        """Start a fresh buffer when a Cell, Row or Table element opens.

        A new list is created each time (rather than clearing the old one) so
        that lists already appended to a parent are never modified afterwards.
        """
        if name == "Cell":
            self.chars = []
        elif name == "Row":
            self.cells = []
        elif name == "Table":
            self.rows = []

    def endElement(self, name):
        """Attach the finished Cell, Row or Table to its parent collection."""
        if name == "Cell":
            self.cells.append(''.join(self.chars))
        elif name == "Row":
            self.rows.append(self.cells)
        elif name == "Table":
            self.tables.append(self.rows)



CPU times: user 0 ns, sys: 22 µs, total: 22 µs
Wall time: 25 µs


In [37]:
# Run the SAX parser over the feature sheet; the handler accumulates its tables.
excelHandler = ExcelHandler()
parse('data/features.xls', excelHandler)

# First row of the first table gives the column names, the remaining rows the records.
first_table = excelHandler.tables[0]
features = pd.DataFrame(first_table[1:], columns=first_table[0])

Parse Excel file and create dataframe


In [33]:
# Binary target: 0 where Label is 'objective', 1 for every other value.
is_objective = features['Label'] == 'objective'
y = np.where(is_objective, 0, 1)

Create labels: objective = 0, subjective = 1

In [35]:
texts = []
normalized_texts = []

for i in range(1, 1001):
    # Files are named Text0001.txt ... Text1000.txt: zero-pad the index to 4 digits.
    number = str(i).zfill(4)

    # The context manager closes each file; previously all 1000 handles stayed open.
    with open('data/raw-data/Text' + number + '.txt', 'r', encoding='latin-1') as f:
        text = f.read()

    # Keep alphabetic, non-stopword tokens and stem them. The stopword test uses
    # the lowercased token: the English list is lowercase, so a capitalised
    # stopword such as "The" used to pass the filter and stay in the output.
    normalized_text = ' '.join(
        stemmer.stem(w)
        for w in word_tokenize(text)
        if w.isalpha() and w.lower() not in stop
    )
    texts.append(text)
    normalized_texts.append(normalized_text)

Read text files and preprocess

In [29]:
# One row per document: raw text, normalized text and its 0/1 label.
# The lists are passed directly: wrapping them in np.array first built
# fixed-width unicode arrays (every text padded to the longest one), a large
# temporary allocation that pandas then had to convert again.
data = pd.DataFrame({'texts': texts, 'normalized_texts': normalized_texts, 'label': y})

Create dataframe for the texts and their labels

In [30]:
# Select the normalized texts of each class (label 0 and label 1) ...
objective_rows = data.loc[data['label'] == 0, 'normalized_texts']
subjective_rows = data.loc[data['label'] == 1, 'normalized_texts']

# ... and glue each selection into one space-separated string.
obj_texts = ' '.join(objective_rows.tolist())
sub_texts = ' '.join(subjective_rows.tolist())

Create two strings for the preprocessed texts: one for objective and one for subjective