# Übung 03 - Aufgabe 2

In [1]:
!pip install progressbar2
import os
import wget
import tarfile
import re
import progressbar

import numpy as np
from sklearn.model_selection import train_test_split



In [2]:
%reload_ext version_information
%version_information numpy, sklearn

Software,Version
Python,3.6.12 64bit [GCC 9.3.0]
IPython,7.16.1
OS,Linux 5.4.0 54 generic x86_64 with debian bullseye sid
numpy,1.18.5
sklearn,0.23.2
Thu Nov 26 14:38:28 2020 CET,Thu Nov 26 14:38:28 2020 CET


## Aufgabe a.)
### Datensatz herunterladen

In [3]:
newsgroup20_tgz = '20news-18828.tar.gz'
newsgroup20_folder = os.path.join(os.getcwd(), '20news-18828')

if not os.path.isdir(newsgroup20_folder):
    print("Downloading file...\n")

    if not os.path.isfile(newsgroup20_tgz):
        wget.download('http://qwone.com/~jason/20Newsgroups/20news-18828.tar.gz', newsgroup20_tgz)

    os.mkdir(newsgroup20_folder)

    with tarfile.open(newsgroup20_tgz) as tar:
        tar.extractall(path=os.getcwd())

    print("Files extracted")

## Aufgabe b.)
### Stringvektoren & Labels vorbereiten

In [4]:
def strip_header(text):
    _befor, _blankline, after = text.partition('\n\n')
    return after

In [5]:
newsgroups = ["alt.atheism", "comp.graphics", "sci.space", "talk.religion.misc"]
raw_data = []
labels = []

for newsgroup in newsgroups:
    for subdir, _, files in os.walk(os.path.join(newsgroup20_folder, newsgroup)):
        for file in files:
            labels.append(newsgroup)
            with open(os.path.join(subdir, file), 'r', encoding="ISO-8859-1") as f:
                s = strip_header(f.read())
                raw_data.append(s)

print("Number of vectors:", len(raw_data))
print("Number of labels:", len(labels))

Number of vectors: 3387
Number of labels: 3387


## Aufgabe c.)
### Tokens generieren

In [6]:
stripped_data = [strip_header(text) for text in raw_data]
data_regex = [re.compile(r"(?u)\b[a-zA-Z]+\b").findall(s.lower()) for s in stripped_data]

In [7]:
unique_tokens = []
for tokens in data_regex:
    unique_tokens.extend(tokens)

unique_tokens = list(set(unique_tokens))
unique_tokens.sort()
print("Number of unique tokens: {}".format(len(unique_tokens)))

Number of unique tokens: 32122


In [8]:
FEATURE_VECTORS_PATH = os.path.join(os.getcwd(), 'feature_vectors.npy')

def generate_feature_vector(_data, _tokens):
    _feature_vectors = np.zeros(shape=(len(_data), len(_tokens)))

    with progressbar.ProgressBar(max_value=len(_data)) as bar:
        for i, text in enumerate(_data):
            for j, token in enumerate(_tokens):
                _feature_vectors[i][j] = text.count(token)
            bar.update(i)

    _feature_vectors = np.asarray(_feature_vectors, dtype=np.int)

    np.save(FEATURE_VECTORS_PATH, _feature_vectors)

    return _feature_vectors

### Tokens & Merkmalsvektor generieren und speichern

In [9]:
try:
    feature_vectors = np.load(FEATURE_VECTORS_PATH)
    print("Loading saved feature vectors.")
except IOError:
    print("No saved feature vectors found. Generating new ones..")
    feature_vectors = generate_feature_vector(data_regex, unique_tokens)

Loading saved feature vectors.


## Aufgabe d.)
### Train- und Testset aufteilen

In [10]:
X_train, X_test, y_train, y_test = train_test_split(feature_vectors, labels, test_size=0.4, random_state=42)

print("Complete-Set shape:", np.shape(feature_vectors))
print("Complete-Set labels shape:", np.shape(labels))
print()
print("Training-Set shape:", np.shape(X_train))
print("Training-Set labels shape:", np.shape(y_train))
print()
print("Test-Set shape:", np.shape(X_test))
print("Test-Set labels shape:", np.shape(y_test))

Complete-Set shape: (3387, 32122)
Complete-Set labels shape: (3387,)

Training-Set shape: (2032, 32122)
Training-Set labels shape: (2032,)

Test-Set shape: (1355, 32122)
Test-Set labels shape: (1355,)


### Trainieren des multinomialen naiven Bayes-Klassifikators

In [11]:
Ni = {}
nij = {}
pij = {}

for newsgroup in newsgroups:
    Ni[newsgroup] = 0
    nij[newsgroup] = 0
    pij[newsgroup] = 0

for x, y in zip(X_train, y_train):
    Ni[y] += x.sum()
    nij[y] += x

for newsgroup in newsgroups:
    pij[newsgroup] = (nij[newsgroup] + 1) / (Ni[newsgroup] + len(unique_tokens))

### Bestimmung von korrekter Klassifikationen im Testdatensatz

In [12]:
corrects = np.full(shape=(len(X_test)), fill_value=False)

for i, lable in enumerate(y_test):
    max_p = -np.inf
    max_newsgroup = ""

    for newsgroup in newsgroups:
        p = np.sum(np.log(pij[newsgroup]) * X_test[i])

        if p > max_p:
            max_p = p
            max_newsgroup = newsgroup

    corrects[i] = (lable==max_newsgroup)

print("Correct classification: {}/{}".format(corrects.sum(), len(X_test)))
print("Score: {}%".format(np.round((corrects.sum() / len(X_test))*100, decimals=2)))

Correct classification: 1084/1355
Score: 80.0%


### Verwendung von sklearn Bibliothek

In [13]:
from sklearn.naive_bayes import MultinomialNB

mnb = MultinomialNB()
mnb.fit(X_train, y_train)
score = mnb.score(X_test, y_test)

print("Score: {}%".format(np.round(score*100, decimals=2)))

Score: 79.48%
