In [1]:
from collections import Counter
import numpy as np
from pprint import pprint
from sklearn.datasets import fetch_20newsgroups

In [2]:
# The two newsgroup categories used for both the train and the test split.
cats = [
    'comp.os.ms-windows.misc',
    'comp.sys.ibm.pc.hardware',
]

In [3]:
# Training split restricted to `cats`; the headers, footers and quotes parts
# of each post are removed so only the message body is kept.
train_data = fetch_20newsgroups(
    subset='train',
    categories=cats,
    remove=('headers', 'footers', 'quotes'),
)

In [4]:
# Test split, fetched with the same categories and the same removed parts
# (headers, footers, quotes) as the training split.
test_data = fetch_20newsgroups(
    subset='test',
    categories=cats,
    remove=('headers', 'footers', 'quotes'),
)

In [5]:
train_data.keys()

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])

In [6]:
train_data.target_names

['comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware']

In [7]:
print(train_data.data[0])

I am looking for a WIN31 driver (or set) for my Diamond 
Speedstar 1MB video card. Does anybody know of an archive
site that has these? I looked at CICA and it had drivers for
the Stealth card and for Generic ET4000 cards but not one 
specifically for the Speedstar. Is there one? Or has Diamond
dropped the Speedstar out of the driver development loop.

Thanks for any info,

Rob
-- 


In [114]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [115]:
# Alternative kept for reference: binary (presence/absence) count features.
#vectorizer = CountVectorizer(min_df = 5, binary = True)
# Active choice: TF-IDF weighted features. Per the scikit-learn docs, an integer
# min_df drops terms whose document frequency is below that count (here 5).
vectorizer = TfidfVectorizer(min_df = 5)

In [116]:
X_train = vectorizer.fit_transform(train_data.data)

In [117]:
X_train.shape

(1181, 3409)

In [118]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Use get_feature_names_out() when it exists and fall back on older releases.
# .tolist() keeps the displayed value a plain list of str, as before.
(
    vectorizer.get_feature_names_out().tolist()
    if hasattr(vectorizer, "get_feature_names_out")
    else vectorizer.get_feature_names()
)

['00',
 '01',
 '02',
 '03',
 '04',
 '05',
 '06',
 '07',
 '08',
 '09',
 '0_',
 '0b',
 '0c',
 '0d',
 '0f',
 '0g',
 '0h',
 '0i',
 '0iv',
 '0j',
 '0k',
 '0l',
 '0m',
 '0m75u',
 '0p',
 '0q',
 '0qax',
 '0qq',
 '0r',
 '0s',
 '0t',
 '0tbxn',
 '0tbxom',
 '0tq',
 '0u',
 '0v',
 '0w',
 '0x',
 '10',
 '100',
 '1000',
 '1024',
 '1024x768',
 '1024x768x256',
 '10mb',
 '11',
 '12',
 '120',
 '120mb',
 '128',
 '1280',
 '1280x1024',
 '12mb',
 '13',
 '13p',
 '14',
 '140',
 '145',
 '146',
 '14di',
 '15',
 '1542b',
 '16',
 '1604s',
 '16450',
 '16550',
 '16m',
 '16mb',
 '17',
 '170',
 '18',
 '19',
 '199',
 '1990',
 '1992',
 '1993',
 '1_',
 '1a',
 '1c',
 '1d',
 '1d9',
 '1d9l',
 '1e',
 '1eq',
 '1eqtct',
 '1eqtm',
 '1f',
 '1f9',
 '1f9f8',
 '1f9f9f',
 '1fp',
 '1fp4',
 '1fp4u',
 '1fpl',
 '1g',
 '1h',
 '1j',
 '1k',
 '1l',
 '1m',
 '1mb',
 '1n',
 '1o',
 '1p',
 '1q',
 '1ri',
 '1s',
 '1st',
 '1t',
 '1t7',
 '1u',
 '1v',
 '1w',
 '1x',
 '1y',
 '1z4',
 '1z6e',
 '1z6ei',
 '1z6ei0l',
 '20',
 '200',
 '2000',
 '200mb',
 '2048',

In [119]:
y_train = train_data.target

In [120]:
# transform() only, not fit_transform(): the vectorizer was already fitted on
# the training texts, so the test matrix gets the same columns as X_train.
X_test = vectorizer.transform(test_data.data)

In [121]:
X_test.shape

(786, 3409)

In [122]:
y_test = test_data.target

In [123]:
from sklearn.naive_bayes import BernoulliNB

In [124]:
# NOTE(review): X_train holds TF-IDF weights, while BernoulliNB models binary
# features (scikit-learn documents a default binarize=0.0 threshold), so the
# weights are presumably reduced to presence/absence — confirm this is intended.
bnb = BernoulliNB(alpha=1)

In [125]:
bnb.fit(X_train, y_train)

BernoulliNB(alpha=1)

In [126]:
y_train_pred = bnb.predict(X_train)

In [127]:
y_test_pred = bnb.predict(X_test)

In [128]:
from sklearn.metrics import confusion_matrix, accuracy_score, \
                            precision_score, recall_score, f1_score, classification_report

In [129]:
bnb.classes_

array([0, 1], dtype=int64)

In [130]:
train_data.target_names

['comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware']

In [131]:
Counter(y_train)

Counter({0: 591, 1: 590})

In [132]:
Counter(y_train_pred)

Counter({1: 957, 0: 224})

In [133]:
confusion_matrix(y_train, y_train_pred, labels=[0, 1])

array([[223, 368],
       [  1, 589]], dtype=int64)

In [134]:
Counter(y_test)

Counter({1: 392, 0: 394})

In [135]:
Counter(y_test_pred)

Counter({1: 711, 0: 75})

In [136]:
confusion_matrix(y_test, y_test_pred, labels=[0, 1])

array([[ 73, 321],
       [  2, 390]], dtype=int64)

In [137]:
def print_metrics(y_a, y_p, labels=(0, 1)):
    """Print accuracy plus per-class precision, recall and F1.

    Each score is printed on its own line: the metric name left-aligned and
    padded to 20 characters, followed by the value with three decimals.

    Parameters
    ----------
    y_a : array-like
        Actual (ground-truth) class labels.
    y_p : array-like
        Predicted class labels, aligned with ``y_a``.
    labels : sequence, optional
        Class labels to report on. Each one is passed in turn as
        ``pos_label`` to the scikit-learn scorer. The default ``(0, 1)``
        reproduces the report this function printed when the labels were
        hard-coded.

    Returns
    -------
    None
        The report is written to standard output.
    """
    row = "{:<20} {:.3f}"
    print(row.format("Accuracy:", accuracy_score(y_a, y_p)))

    per_class_scorers = (
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("F1", f1_score),
    )
    # Metric-major order: all classes for one metric are printed together
    # before moving on to the next metric.
    for title, scorer in per_class_scorers:
        for label in labels:
            name = "{}({}):".format(title, label)
            print(row.format(name, scorer(y_a, y_p, pos_label=label)))

In [138]:
print_metrics(y_train, y_train_pred)

Accuracy:            0.688
Precision(0):        0.996
Precision(1):        0.615
Recall(0):           0.377
Recall(1):           0.998
F1(0):               0.547
F1(1):               0.761


In [139]:
print(classification_report(y_train, y_train_pred, digits=3))

              precision    recall  f1-score   support

           0      0.996     0.377     0.547       591
           1      0.615     0.998     0.761       590

    accuracy                          0.688      1181
   macro avg      0.806     0.688     0.654      1181
weighted avg      0.806     0.688     0.654      1181



In [140]:
print_metrics(y_test, y_test_pred)

Accuracy:            0.589
Precision(0):        0.973
Precision(1):        0.549
Recall(0):           0.185
Recall(1):           0.995
F1(0):               0.311
F1(1):               0.707


In [141]:
print(classification_report(y_test, y_test_pred, digits=3))

              precision    recall  f1-score   support

           0      0.973     0.185     0.311       394
           1      0.549     0.995     0.707       392

    accuracy                          0.589       786
   macro avg      0.761     0.590     0.509       786
weighted avg      0.761     0.589     0.509       786



In [142]:
from sklearn.naive_bayes import MultinomialNB

In [143]:
mnb = MultinomialNB(alpha=1)

In [144]:
mnb.fit(X_train, y_train)

MultinomialNB(alpha=1)

In [145]:
# NOTE: rebinds y_train_pred; the BernoulliNB training predictions stored under
# this name earlier in the notebook are no longer available after this cell.
y_train_pred = mnb.predict(X_train)

In [146]:
# NOTE: rebinds y_test_pred; the BernoulliNB test predictions stored under
# this name earlier in the notebook are no longer available after this cell.
y_test_pred = mnb.predict(X_test)

In [147]:
print_metrics(y_train, y_train_pred)

Accuracy:            0.923
Precision(0):        0.930
Precision(1):        0.917
Recall(0):           0.915
Recall(1):           0.931
F1(0):               0.922
F1(1):               0.923


In [148]:
print(classification_report(y_train, y_train_pred, digits=3))

              precision    recall  f1-score   support

           0      0.930     0.915     0.922       591
           1      0.917     0.931     0.923       590

    accuracy                          0.923      1181
   macro avg      0.923     0.923     0.923      1181
weighted avg      0.923     0.923     0.923      1181



In [149]:
print_metrics(y_test, y_test_pred)

Accuracy:            0.817
Precision(0):        0.853
Precision(1):        0.787
Recall(0):           0.766
Recall(1):           0.867
F1(0):               0.807
F1(1):               0.825


In [150]:
print(classification_report(y_test, y_test_pred, digits=3))

              precision    recall  f1-score   support

           0      0.853     0.766     0.807       394
           1      0.787     0.867     0.825       392

    accuracy                          0.817       786
   macro avg      0.820     0.817     0.816       786
weighted avg      0.820     0.817     0.816       786

