In [2]:
import urllib.request
import tarfile
import numpy as np
import pandas as pd
import os
%load_ext version_information

#### a. Download und Entpacken der Textdatenbank

In [3]:
data_folder_name = "20news-18828"
if not os.path.isdir(data_folder_name):
    url = "http://qwone.com/~jason/20Newsgroups/20news-18828.tar.gz"
    ftpstream = urllib.request.urlopen(url)
    tar = tarfile.open(fileobj=ftpstream, mode="r|gz")
    tar.extractall()
    tar.close()
    print('Data extracted')
data_path = data_folder_name

#### b. Dateien aus  Newsgroups alt.atheism, comp.graphics, sci.spaceund talk.religion.misc in String umwandeln und Header entfernen

In [3]:
def strip_header(text) :
    _before, _blankline, after = text.partition('\n\n')
    return after

In [5]:
news_groups = ["alt.atheism", "comp.graphics", "sci.space", "talk.religion.misc"]

document_array = []
document_class = []
for root, dirs, files in os.walk(data_path):
    for dir_name in filter(news_groups.__contains__, dirs):
        path = os.path.join(data_path, dir_name)
        filesInDir = os.listdir(path)
        for file in filesInDir:
            with open(os.path.join(path, file), encoding='ISO-8859-1') as f:
                contents = f.read()
                document_class.append(dir_name)
                document_array.append(contents)
                
# strip headers
data = [strip_header(text) for text in document_array]
len(data)
            

3387

#### c. Zerlegung in Tokens und Bestimmung des Vocabulars

In [48]:
import re
voc = []
for text in data:
    l = re.compile(r"(?u)\b\w\w+\b").findall(text.lower())
    voc = list(set(voc) | set(l))
len(voc)

41777

#### d. Training und Klassifikation der Daten

Daten in Training 60% und Testdatensatz 40% aufteilen

In [49]:
data_Frame = pd.DataFrame(data, columns=["string_data"])
class_Frame = pd.DataFrame(document_class, columns=["class_name"])
data_class = pd.concat([data_Frame, class_Frame], axis=1)
data_class.set_index('class_name', inplace=True)

In [175]:
data_training_frame = pd.DataFrame()
data_test_frame = pd.DataFrame()
for g in news_groups:
    data_tmp = data_class.loc[g]["string_data"]
    data_training_frame = pd.concat([data_training_frame, data_tmp[:int(.6*len(data_tmp))]])
    data_test_frame = pd.concat([data_test_frame, data_tmp[int(.6*len(data_tmp)):len(data_tmp)]])

In [51]:
training_class = list(data_training_frame.index.values)
data_training = data_training_frame[0].values
data_test = data_test_frame[0].values
print(np.shape(data_training))
print(np.shape(data_test))

(2030,)
(1357,)


Trainieren des multinomialen naiven Bayes-Klassifikator

In [53]:
training_word_count = []
for text in data_training:
    vector_text = []
    for word in voc:
        word_count = text.count(word)
        vector_text.append(word_count)
    training_word_count.append(vector_text)

In [54]:
training_word_count_frame = pd.DataFrame(training_word_count, columns=voc)
training_class_frame = pd.DataFrame(training_class, columns=["class_name"])
training_word_count = pd.concat([training_word_count_frame, training_class_frame], axis=1)
training_word_count.set_index('class_name')

Unnamed: 0_level_0,cisc,arena,1718,vsop,underlies,theologically,glewis,genus,xabj,88m,...,heights,doctrines,complain,multitudes,bobcat,mole,easton,instinctive,484,cameras
class_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
alt.atheism,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
alt.atheism,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
alt.atheism,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
alt.atheism,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
alt.atheism,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
talk.religion.misc,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
talk.religion.misc,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
talk.religion.misc,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
talk.religion.misc,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [55]:
sum_wordcount_byclass = training_word_count.groupby(by=["class_name"]).sum()
sum_wordcount_byclass

Unnamed: 0_level_0,cisc,arena,1718,vsop,underlies,theologically,glewis,genus,xabj,88m,...,heights,doctrines,complain,multitudes,bobcat,mole,easton,instinctive,484,cameras
class_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
alt.atheism,2,1,0,0,1,1,0,0,0,0,...,0,10,10,1,0,2,0,29,6,0
comp.graphics,4,0,0,0,0,0,4,0,0,0,...,0,0,5,0,2,20,0,0,7,4
sci.space,7,2,1,0,0,0,0,0,0,0,...,4,0,30,0,0,14,0,0,2,10
talk.religion.misc,0,0,0,0,1,1,0,0,2,0,...,1,9,1,0,0,2,0,0,6,0


In [56]:
totality_word_per_class = sum_wordcount_byclass.sum(axis=1)
totality_word_per_class

class_name
alt.atheism           871857
comp.graphics         785457
sci.space             999187
talk.religion.misc    665076
dtype: int64

In [57]:
# relative Häufigkeit mit laplace Glättung
sum_wordcount_byclass_array=np.array(sum_wordcount_byclass)
totality_word_per_class_array = np.array(totality_word_per_class)

res_laplace = ((sum_wordcount_byclass_array.T + 1)/(totality_word_per_class_array + len(voc))).T
training_likelihood_v = pd.DataFrame(res_laplace, columns=voc)
classes_frame = pd.DataFrame(news_groups, columns=["classes_name"])
training_likelihood = pd.concat([training_likelihood_v, classes_frame], axis=1)
training_likelihood

Unnamed: 0,cisc,arena,1718,vsop,underlies,theologically,glewis,genus,xabj,88m,...,doctrines,complain,multitudes,bobcat,mole,easton,instinctive,484,cameras,classes_name
0,3e-06,2e-06,1e-06,1.09453e-06,2.18906e-06,2.18906e-06,1.09453e-06,1.09453e-06,1.09453e-06,1.09453e-06,...,1.203983e-05,1.2e-05,2.18906e-06,1.09453e-06,3e-06,1.09453e-06,3.283591e-05,8e-06,1e-06,alt.atheism
1,6e-06,1e-06,1e-06,1.208848e-06,1.208848e-06,1.208848e-06,6.044239e-06,1.208848e-06,1.208848e-06,1.208848e-06,...,1.208848e-06,7e-06,1.208848e-06,3.626543e-06,2.5e-05,1.208848e-06,1.208848e-06,1e-05,6e-06,comp.graphics
2,8e-06,3e-06,2e-06,9.60648e-07,9.60648e-07,9.60648e-07,9.60648e-07,9.60648e-07,9.60648e-07,9.60648e-07,...,9.60648e-07,3e-05,9.60648e-07,9.60648e-07,1.4e-05,9.60648e-07,9.60648e-07,3e-06,1.1e-05,sci.space
3,1e-06,1e-06,1e-06,1.414721e-06,2.829443e-06,2.829443e-06,1.414721e-06,1.414721e-06,4.244164e-06,1.414721e-06,...,1.414721e-05,3e-06,1.414721e-06,1.414721e-06,4e-06,1.414721e-06,1.414721e-06,1e-05,1e-06,talk.religion.misc


Klassifikation der Testdaten

In [210]:
result_array = pd.DataFrame(columns=['Newsgroup', 'Zugeordnet', 'korrekt'])
for index, row in data_test_frame.iterrows(): 
    text = re.compile(r"(?u)\b\w\w+\b").findall(row[0].lower())
    t = training_likelihood[text]
    t = t.apply(np.log)
    df1 = pd.concat([t.sum(axis=1).to_frame(), classes_frame], axis=1)
    res_class = df1[df1[0]==df1[0].max()]["classes_name"].values[0]
    correct = (index == res_class)
    result_array = result_array.append({'Newsgroup': index, 'Zugeordnet': res_class, 'korrekt': correct,}, ignore_index=True)

In [227]:
result_frame = result_array['korrekt'].value_counts().to_frame()
count = result_frame.sum().values[0]
result_frame["Anteil"] = (result_frame["korrekt"].values * 100) / count
result_frame

Unnamed: 0,korrekt,Anteil
True,1209,89.093589
False,148,10.906411


In [127]:
%version_information numpy

Software,Version
Python,3.7.6 64bit [GCC 7.3.0]
IPython,7.12.0
OS,Linux 5.4.0 53 generic x86_64 with debian bullseye sid
numpy,1.18.1
Fri Nov 20 00:01:36 2020 CET,Fri Nov 20 00:01:36 2020 CET
