In [14]:
# Data Collection & Cleaning

! pip install sklearn
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
! pip install numpy
import numpy as np
! pip install nltk
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.tokenize import word_tokenize

nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')

dataset = fetch_20newsgroups(subset='all', random_state=17)

def tokenize(text):
    return word_tokenize(text)

def remove_stop_words(text):
    stop_words = set(stopwords.words('english'))
    return( [token.lower()for token in text if token not in stop_words] )

def stem_words(text):
    stemmer = WordNetLemmatizer()
    return([stemmer.lemmatize(token) for token in text])

def stem_words_more(text):
    stemmer = PorterStemmer()
    return([stemmer.stem(token) for token in text])

def remove_punctuation(text):
    punctuation = '!"#$%&\'()*+, -./:;<=>?@[\]^_`{|}~'
    return([char for char in text if char not in punctuation])

def clean_data(input_list):
    return_list = []
    for li in input_list:
        return_list.append(stem_words(remove_punctuation(remove_stop_words(tokenize(li)))))
    return return_list

def dummy(doc):
    return doc

X, y = dataset.data, dataset.target
data_size = len(X)
X = X[:data_size]
y = y[:data_size]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state = 23)
X_train = clean_data(X_train)
X_test = clean_data(X_test)
y_train = y_train
y_test = y_test

tfidf = TfidfVectorizer(analyzer='word', tokenizer=dummy, preprocessor=dummy, token_pattern=None)

X_train = tfidf.fit_transform(X_train)
X_test = tfidf.transform(X_test)



[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\darra\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\darra\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\darra\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [6]:
# Naive Bayes Classification

# Naive Bayes Classification Score: 0.9241379310344827

from sklearn.naive_bayes import MultinomialNB
nb_classifier = MultinomialNB(alpha=0.01)
nb_classifier.fit(X_train, y_train)
print(f'Naive Bayes Classification Score: {nb_classifier.score(X_test, y_test)}')
y_pred = nb_classifier.predict(X_test)
print(f'Naive Bayes Classification Report:\n{classification_report(y_test, y_pred)}\n')
print(confusion_matrix(y_test, y_pred))

Naive Bayes Classification Score: 0.9241379310344827
Naive Bayes Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.93      0.92       167
           1       0.83      0.89      0.86       193
           2       0.88      0.87      0.88       206
           3       0.84      0.88      0.86       179
           4       0.94      0.89      0.91       193
           5       0.92      0.93      0.92       229
           6       0.91      0.83      0.87       193
           7       0.92      0.95      0.94       187
           8       0.98      0.97      0.97       199
           9       0.98      0.96      0.97       211
          10       0.95      0.99      0.97       204
          11       0.96      0.95      0.95       203
          12       0.90      0.91      0.91       206
          13       0.97      0.96      0.96       179
          14       0.96      0.96      0.96       187
          15       0.90      0.96      0.93    

In [10]:
# Support Vector Machine Classification 

# SVM Classification Score: 0.8938992042440318

from sklearn.svm import SVC
svm_classifier = SVC(kernel='poly', degree=2, max_iter = 5000)
svm_classifier.fit(X_train, y_train)
print(f'SVM Classification Score: {svm_classifier.score(X_test, y_test)}')
y_pred = svm_classifier.predict(X_test)
print(f'SVM Classification Report:\n{classification_report(y_test, y_pred)}\n')

SVM Classification Score: 0.8893899204244032
SVM Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.85      0.89       167
           1       0.67      0.90      0.77       193
           2       0.81      0.86      0.83       206
           3       0.88      0.87      0.88       179
           4       0.97      0.87      0.91       193
           5       0.95      0.91      0.93       229
           6       0.59      0.88      0.71       193
           7       0.93      0.89      0.91       187
           8       0.98      0.89      0.93       199
           9       0.99      0.93      0.96       211
          10       0.95      0.94      0.95       204
          11       1.00      0.90      0.95       203
          12       0.87      0.84      0.86       206
          13       0.93      0.94      0.94       179
          14       0.96      0.93      0.94       187
          15       0.90      0.89      0.90       196
         

In [13]:
# Neural Network Classification

# Neural Network Classification Score: 0.9257294429708223

from sklearn.neural_network import MLPClassifier
hidden_layer_sizes = 20
max_iter = 50000
solver = 'lbfgs' #'sgd' 'adam' 'lbfgs'
activation = 'relu' # identity' #'relu'
alpha = 0.1
nn_classifier = MLPClassifier(solver=solver, activation=activation, hidden_layer_sizes = hidden_layer_sizes, alpha=alpha, max_iter=max_iter, verbose=False)
nn_classifier.fit(X_train, y_train)
print(f'Neural Network Classification Score: {nn_classifier.score(X_test, y_test)}')
y_pred = nn_classifier.predict(X_test)
print(f'Neural Network Classification Report:\n{classification_report(y_test, y_pred)}\n')
print(confusion_matrix(y_test, y_pred))

Neural Network Classification Score: 0.920159151193634
Neural Network Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.87      0.89       167
           1       0.84      0.89      0.86       193
           2       0.88      0.89      0.88       206
           3       0.84      0.88      0.86       179
           4       0.94      0.91      0.92       193
           5       0.93      0.93      0.93       229
           6       0.86      0.91      0.89       193
           7       0.91      0.91      0.91       187
           8       0.94      0.93      0.94       199
           9       0.98      0.94      0.96       211
          10       0.94      0.98      0.96       204
          11       0.99      0.94      0.97       203
          12       0.90      0.88      0.89       206
          13       0.94      0.95      0.95       179
          14       0.94      0.96      0.95       187
          15       0.96      0.92      0.9