In [1]:
# Name: Darragh Tate

# Minor Exercise 2 - Text Classification
# This assignment is about categorising different articles from the newsgroups dataset using 3 different ML techniques:
#   - Naive Bayes Classification
#   - Support Vector Machine Classification
#   - Neural Network Classification

# Data Collection & Cleaning

! pip install sklearn
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
! pip install numpy
import numpy as np
! pip install nltk
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.tokenize import word_tokenize

nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')

dataset = fetch_20newsgroups(subset='all', random_state=17)

# Returns the tokenized version of the input string, as determined by the punkt tokenizer
def tokenize(text):
    return word_tokenize(text)

# Removes unimportant words from the sentence
def remove_stop_words(text):
    stop_words = set(stopwords.words('english'))
    return( [token.lower()for token in text if token not in stop_words] )

# Returns the string with only root versions of words (e.g. "loving" changes to "love"). Reduces the number of words in the pool without losing context
def stem_words(text):
    stemmer = WordNetLemmatizer()
    return([stemmer.lemmatize(token) for token in text])

# More agressive version of stem_words, for example removes "s" from plural words (e.g "bikes" changes to "bike")
def stem_words_more(text):
    stemmer = PorterStemmer()
    return([stemmer.stem(token) for token in text])

# Gets rid of punctuation. String in function is every commonly used punctuation mark.
def remove_punctuation(text):
    punctuation = '!"#$%&\'()*+, -./:;<=>?@[\]^_`{|}~'
    return([char for char in text if char not in punctuation])

# Calls all the above functions on each sentence in a given list
def clean_data(input_list):
    return_list = []
    for li in input_list:
        return_list.append(stem_words(remove_punctuation(remove_stop_words(tokenize(li)))))
    return return_list

def dummy(doc):
    return doc

X, y = dataset.data, dataset.target
# Used to determine number of records, as the full data set is large and can have long processing times
# Legacy code; Used for shorter test times during development of models
data_size = len(X)
X = X[:data_size]
y = y[:data_size]

# Splits the data into training & test data, random state is for consistency
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state = 23)
X_train = clean_data(X_train)
X_test = clean_data(X_test)

# Converts data into tf-idf (Term Frequency over Item Document Frequency) Matrix. This tells us the relevance of each word in a document, by seeing how often it appears proportionally.
tfidf = TfidfVectorizer(analyzer='word', tokenizer=dummy, preprocessor=dummy, token_pattern=None)

#Fits the data to a matrix as defined with the tfidf object
X_train = tfidf.fit_transform(X_train)
X_test = tfidf.transform(X_test)

In [None]:
# Naive Bayes Classification

# Naive Bayes Classification Score: 0.9198797311637779

# Naive bayes is a probabilistic classifier that assumes (hence 'naive') independence between the factors, not assuming correlation between varaibles.
# They collect average statistics on each class.
# Data is compared to the averages of each class in the training set, and whichever is closest it is classified as.
# Scikit-Learn has 3 classifiers: MultinomialNB (used for counts, such a word counts, making it ideal for this), BernouliiNB (for binary data) and GaussianNB (for continuous data)

# Source - "Introduction to Machine Learning with Python", Andreas C. Muller & Sarah Guido, O'Reilly 2017, ISBN 978-1-449-36941-5, p 70-72, Retrieved 10/04/'21

from sklearn.naive_bayes import MultinomialNB

# alpha = model smoothness. Higher alpha results in a less complex model
alpha = 0.01

nb_classifier = MultinomialNB(alpha=alpha)

# Fit the model with training data
nb_classifier.fit(X_train, y_train)

# .score calculates the r^2 values, or correlation, between the predicted and actual values
print(f'Naive Bayes Classification Score: {nb_classifier.score(X_test, y_test)}')

# Preditced classes of X_test, when fed through the trained model
y_pred = nb_classifier.predict(X_test)

print(f'Naive Bayes Classification Report:\n{classification_report(y_test, y_pred)}\n')
 
print(confusion_matrix(y_test, y_pred))

Naive Bayes Classification Score: 0.9193491333569155
Naive Bayes Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.93      0.91       242
           1       0.82      0.89      0.86       276
           2       0.89      0.86      0.87       304
           3       0.82      0.88      0.85       290
           4       0.91      0.90      0.90       297
           5       0.94      0.91      0.93       333
           6       0.91      0.81      0.86       302
           7       0.89      0.93      0.91       280
           8       0.96      0.95      0.96       286
           9       0.97      0.96      0.96       309
          10       0.95      0.98      0.97       311
          11       0.97      0.96      0.96       315
          12       0.89      0.90      0.89       293
          13       0.95      0.96      0.96       282
          14       0.96      0.96      0.96       283
          15       0.91      0.96      0.93    

In [2]:
# Support Vector Machine Classification

# SVC Classification Score: 0.9154580827732579

# SVMs work by conceptually plotting the data on a scatteplot, then trying to draw lines that separate the data.
# Data is analysed and placed on this conceptual plot
# The idea is that margins are drawn along the plot which clearly separates the data, resulting in defined categorisation.
# The support vector is the instance that lies along the margins of a class, i.e. the instance that is closest in definition to a memeber of another class. It is almost an outlier.
# If data isn't linearly separable, then we must use soft margin classification (which allows outlying instances to be misclassified for the sake of model accuracy)
# over hard margin classification (which only works if a straight line can be drawn between each class)

# Source: "Hands-on Machine Learning with Scikit-Learn, Keras & Tensorflow", Aurelien Geron, O'Reilly 2019, ISBN 978-1-492-03264-9, p153-158, Retrieved 10/04/'21



from sklearn.svm import SVC
# Polynomial kernel allows for linear classification, but with the line "bending". 
# Poly with a degree of 1 for some reason results in higher scores than linear
kernel = 'poly' 
#kernel = 'rbf'# Score = 0.814
#kernel = 'sigmoid'# Score = 0.829
#kernel = 'linear' # Score = 0.838
degree = 1
# NOTE: I theorise a higher score could be attained with a large degree value, however the training process is too slow to test (even in colab, which I was using)

# Maximum number of iterations. Not guaranteed to iterate this many times, if the data converges (no signifcant loss over x iterations) beforehand it terminates
max_iter = 5000

# C represents tolerance, or width of the margins generated by the SVM. High values results in narrower margins, which can result in better training score at the risk of overfitting
C = 100

svm_classifier = SVC(C = C, kernel=kernel, degree = degree, max_iter=max_iter)

# Fit the training data to the model
svm_classifier.fit(X_train, y_train)

# .score calculates the r^2 values, or correlation, between the predicted and actual values
print(f'SVM Classification Score: {svm_classifier.score(X_test, y_test)}')

# Preditced classes of X_test, when fed through the trained model
y_pred = svm_classifier.predict(X_test)
print(f'SVM Classification Report:\n{classification_report(y_test, y_pred)}\n')

SVM Classification Score: 0.9154580827732579
SVM Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.90      0.91       242
           1       0.76      0.88      0.82       276
           2       0.86      0.86      0.86       304
           3       0.82      0.84      0.83       290
           4       0.92      0.89      0.91       297
           5       0.92      0.93      0.93       333
           6       0.88      0.90      0.89       302
           7       0.94      0.91      0.92       280
           8       0.98      0.96      0.97       286
           9       0.97      0.94      0.96       309
          10       0.97      0.97      0.97       311
          11       0.99      0.94      0.96       315
          12       0.84      0.91      0.87       293
          13       0.92      0.94      0.93       282
          14       0.98      0.95      0.97       283
          15       0.93      0.92      0.92       286
         

In [None]:
# Neural Network Classification

# Neural Network Classification Score: 0.9257294429708223

# Neural Networks are collections of Perceptrons, which are binary classifiers that can take in multiple sources of data 
# and "activate" (return a 1) if the activation function requirements are satisified by the input data.
# The MLP models in scikit-learn are Multilayer Perceptrions, which facilitate non-binary outputs, allowing for more complex categorisation and regression.
# Each node recieves data, and then is activated or not activated. This value is given a weight and passed on to the next node, and the process repeats.
# After passing through multiple nodes, the final output can be either a classification or a calculated value (regression).
# This uses the MLPClassifier, as we are splitting the text into classes.
from sklearn.neural_network import MLPClassifier

# Artefact testing of different hidden layer structures
#hidden_layer_sizes = (22,33)
#hidden_layer_sizes = (22,33,)
#hidden_layer_sizes = (25, 50,)# 0.8892819243013795
#hidden_layer_sizes = (40, 80,) # 0.8997170145030067
#hidden_layer_sizes = (100, 200,) # 0.90484612663600994
#hidden_layer_sizes = (10,) # 0.8935267067562788
#hidden_layer_sizes = (20,) # 0.8960028298549699
#hidden_layer_sizes = (50,) # 0.8802617615847188
#hidden_layer_sizes = (100,) # 0.9078528475415635
#hidden_layer_sizes = (10, 20,) # 0.8608065086664308
#hidden_layer_sizes = (110,) # Score: 0.911743898125221
#hidden_layer_sizes = (120,) # 0.9094446409621507
#hidden_layer_sizes = (115,) # Score: 0.8990095507605235
hidden_layer_sizes = (110,)

# Maximum number of iterations. Not guaranteed to iterate this many times, if the data converges (no signifcant loss over x iterations) beforehand it terminates
max_iter = 50000

# limited memory Broyden-Fletcher-Goldfarb-Shannon algorithm
solver = 'lbfgs' #'sgd' 'adam' 'lbfgs'

# Rectified  Linear Activation
activation = 'relu' # identity' 

#alpha = model smoothness. Higher alpha results in a less complex model
alpha = 0.01
nn_classifier = MLPClassifier(solver=solver, activation=activation, hidden_layer_sizes=hidden_layer_sizes ,alpha=alpha, max_iter=max_iter, verbose=False)

# Fit the training data to the model
nn_classifier.fit(X_train, y_train)

# .score calculates the r^2 values, or correlation, between the predicted and actual values
print(f'Neural Network Classification Score: {nn_classifier.score(X_test, y_test)}')

# y_pred is what the classifier predicts the test values will be categorised as
y_pred = nn_classifier.predict(X_test)
print(f'Neural Network Classification Report:\n{classification_report(y_test, y_pred)}\n')
print(confusion_matrix(y_test, y_pred))

Neural Network Classification Score: 0.911743898125221
Neural Network Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.89      0.89       242
           1       0.85      0.86      0.85       276
           2       0.85      0.87      0.86       304
           3       0.79      0.85      0.82       290
           4       0.90      0.91      0.90       297
           5       0.91      0.92      0.91       333
           6       0.91      0.90      0.90       302
           7       0.92      0.91      0.92       280
           8       0.97      0.95      0.96       286
           9       0.93      0.97      0.95       309
          10       0.95      0.98      0.97       311
          11       0.96      0.96      0.96       315
          12       0.88      0.86      0.87       293
          13       0.95      0.94      0.94       282
          14       0.94      0.96      0.95       283
          15       0.92      0.91      0.9