In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import time
import tensorflow as tf

np.random.seed(13)
tf.random.set_seed(13)

from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split
from wordcloud import WordCloud
from xml.sax import ContentHandler, parse

from nltk import word_tokenize
import nltk

#nltk.download('punkt')

# English Snowball stemmer. NOTE(review): per the NLTK docs, ignore_stopwords=True
# makes the stemmer return stop words unstemmed -- confirm against the installed version.
stemmer = SnowballStemmer('english', ignore_stopwords=True)
# NLTK's English stop-word list as a set, for fast membership tests when filtering tokens.
# Requires the NLTK 'stopwords' corpus to be available locally.
stop = set(stopwords.words('english'))

%matplotlib inline
sns.set(rc={'figure.figsize':(11.7,8.27)})

2023-05-18 16:18:22.703492: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


### Class that handles Excel files

In [2]:
%%time

class ExcelHandler(ContentHandler):
    """SAX content handler that collects the cell text of an XML spreadsheet.

    The handler reacts to three element names -- ``Table``, ``Row`` and
    ``Cell`` -- and builds a nested list:

    * ``tables`` holds one entry per closed ``Table`` element,
    * each table is a list of rows,
    * each row is a list of cell strings.

    All character data reported between a ``Cell`` start tag and its end tag
    (including text inside nested elements) is concatenated into one string.
    """

    def __init__(self):
        # Initialise the base handler so the attributes it sets up (e.g. the
        # document locator) exist even before a parser is attached.
        super().__init__()
        self.chars = []   # text fragments of the cell currently being read
        self.cells = []   # cell strings of the row currently being read
        self.rows = []    # rows of the table currently being read
        self.tables = []  # every table that has been closed so far

    def characters(self, content):
        """Buffer a chunk of character data; SAX may deliver text in pieces."""
        self.chars.append(content)

    def startElement(self, name, atts):
        """Start a fresh buffer for the element that is being opened."""
        if name == "Cell":
            self.chars = []
        elif name == "Row":
            self.cells = []
        elif name == "Table":
            self.rows = []

    def endElement(self, name):
        """Attach the finished element's content to its parent container."""
        if name == "Cell":
            self.cells.append(''.join(self.chars))
        elif name == "Row":
            self.rows.append(self.cells)
        elif name == "Table":
            self.tables.append(self.rows)



CPU times: user 17 µs, sys: 2 µs, total: 19 µs
Wall time: 20.5 µs


In [3]:
excelHandler = ExcelHandler()
parse('data/features.xls', excelHandler)

# The first row of the first table supplies the column names;
# every following row becomes a record.
features = pd.DataFrame(
    data=excelHandler.tables[0][1:],
    columns=excelHandler.tables[0][0],
)

Parse Excel file and create dataframe


In [4]:
# Binary target: 0 where the label is exactly 'objective', 1 for every other value.
y = np.where(features['Label'].eq('objective'), 0, 1)

Create labels: objective = 0, subjective = 1

In [5]:
texts = []
normalized_texts = []

# The corpus files are named Text0001.txt ... Text1000.txt.
for i in range(1, 1001):
    # Zero-pad the running index to four digits to match the file names.
    num = str(i).zfill(4)

    # A context manager closes each file as soon as it has been read,
    # instead of leaving 1000 handles open until garbage collection.
    with open('data/raw-data/Text' + num + '.txt', 'r', encoding='latin-1') as f:
        text = f.read()

    # Tokenize with NLTK, keep only purely alphabetic tokens that are not in
    # the stop-word set, stem each remaining token and re-join with spaces.
    # NOTE(review): the stop-word test is case-sensitive; if `stop` holds only
    # lowercase entries, capitalised stop words such as "The" pass the filter.
    # Confirm whether that is intended before changing it, since it alters
    # the features used downstream.
    normalized_text = ' '.join(
        stemmer.stem(w)
        for w in word_tokenize(text)
        if w.isalpha() and w not in stop
    )
    texts.append(text)
    normalized_texts.append(normalized_text)

Read text files and preprocess

In [6]:
# One row per document: the raw text, its normalized form and the binary label.
dataframe = pd.DataFrame(
    {
        'texts': np.array(texts),
        'normalized_texts': np.array(normalized_texts),
        'label': y,
    }
)

Create dataframe for the texts and their labels

In [7]:
# Concatenate all normalized documents of each class into one long string
# (label 0 = objective, label 1 = subjective).
obj_texts = ' '.join(
    dataframe.loc[dataframe['label'] == 0, 'normalized_texts'].tolist()
)
sub_texts = ' '.join(
    dataframe.loc[dataframe['label'] == 1, 'normalized_texts'].tolist()
)

Create two strings for the preprocessed texts: one for objective and one for subjective

In [8]:
# Hold out part of the corpus for evaluation. Stratifying on y keeps the
# class ratio the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    np.array(normalized_texts),
    y,
    stratify=y,
    random_state=13,
)

Splits the data into training and testing sets for use in a machine learning model.

# Decision Trees

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

Create an instance of the TfidfVectorizer

In [None]:
# TF-IDF vectorizer with its default settings.
vectorizer = TfidfVectorizer()

Fit the vectorizer on the training data and transform the training data

In [None]:
# Fit on the training split only, so the learned vocabulary and IDF weights
# contain no information from the test split.
X_train_vectorized = vectorizer.fit_transform(X_train)

Transform the test data

In [None]:
# Only transform here: the vectorizer fitted on the training data is reused, not re-fitted.
X_test_vectorized = vectorizer.transform(X_test)

Create an instance of the DecisionTreeClassifier

In [None]:
# Pin the tree's random_state to the seed used throughout this notebook, so the
# fitted tree does not depend on the state of the global NumPy RNG at the
# moment this cell runs (e.g. after re-running cells out of order).
classifier = DecisionTreeClassifier(random_state=13)

Fit the model on the vectorized training data

In [None]:
# Train the classifier on the vectorized training texts and their labels.
classifier.fit(X_train_vectorized, y_train)

Make predictions on the vectorized test data


In [None]:
# Predicted labels for the held-out texts (0 = objective, 1 = subjective).
y_pred = classifier.predict(X_test_vectorized)

Summary of the predictions made by the classifier

In [None]:
# Text report of per-class metrics on the test split (class 0 = objective, 1 = subjective).
print(classification_report(y_test, y_pred))

#### Confusion matrix

In [None]:
# Per scikit-learn's convention, rows are true labels and columns are predicted labels.
print(confusion_matrix(y_test, y_pred))

#### Accuracy

In [None]:
# Pass the arguments in the documented (y_true, y_pred) order, consistent with
# the other metric calls in this notebook.
print('accuracy is', accuracy_score(y_test, y_pred))

# Support Vector Machines (SVM)

In [None]:
from sklearn.svm import SVC

Create an instance of the SVC (Support Vector Classifier)

In [None]:
# Support vector classifier with default hyperparameters.
# NOTE: this rebinds `classifier`, so any model previously stored under that
# name is no longer reachable through it.
classifier = SVC()

Fit the model on the vectorized training data

In [None]:
# Train the current classifier on the vectorized training texts and their labels.
classifier.fit(X_train_vectorized, y_train)

Make predictions on the vectorized test data

In [None]:
# NOTE: overwrites any predictions previously stored in `y_pred`.
y_pred = classifier.predict(X_test_vectorized)

In [None]:
# Text report of per-class metrics for the current predictions on the test split.
print(classification_report(y_test, y_pred))

#### Print the confusion matrix

In [None]:
# Per scikit-learn's convention, rows are true labels and columns are predicted labels.
print(confusion_matrix(y_test, y_pred))

In [None]:
#### Accuracy score

In [None]:
# Fraction of test texts whose predicted label matches the true label.
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy)