In [58]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer as TFIDF
from sklearn.datasets import fetch_20newsgroups
from sklearn.naive_bayes import MultinomialNB, ComplementNB, BernoulliNB
from sklearn.metrics import brier_score_loss as BS
from sklearn.calibration import CalibratedClassifierCV

In [1]:
sample = ["Machine learning is fascinating, it is wonderful"
          ,"Machine learning is a sensational techonology"
          ,"Elsa is a popular character"]

In [3]:
vec = CountVectorizer()

In [4]:
X = vec.fit_transform(sample)

In [5]:
X

<3x11 sparse matrix of type '<class 'numpy.int64'>'
	with 15 stored elements in Compressed Sparse Row format>

In [7]:
# A scipy sparse matrix cannot be turned into a DataFrame directly here, so
# densify it with toarray() first.
# get_feature_names() was removed in scikit-learn 1.2; prefer
# get_feature_names_out() and fall back to the old name on older releases.
_get_feature_names = getattr(vec, "get_feature_names_out", None) or vec.get_feature_names
CVresult = pd.DataFrame(X.toarray(), columns=_get_feature_names())

In [8]:
CVresult

Unnamed: 0,character,elsa,fascinating,is,it,learning,machine,popular,sensational,techonology,wonderful
0,0,0,1,2,1,1,1,0,0,0,1
1,0,0,0,1,0,1,1,0,1,1,0
2,1,1,0,1,0,0,0,1,0,0,0


In [11]:
vec = TFIDF()

In [12]:
X = vec.fit_transform(sample)

In [13]:
X

<3x11 sparse matrix of type '<class 'numpy.float64'>'
	with 15 stored elements in Compressed Sparse Row format>

In [14]:
# get_feature_names() was removed in scikit-learn 1.2; prefer
# get_feature_names_out() and fall back to the old name on older releases.
_get_feature_names = getattr(vec, "get_feature_names_out", None) or vec.get_feature_names
TFIDFresult = pd.DataFrame(X.toarray(), columns=_get_feature_names())

In [15]:
TFIDFresult

Unnamed: 0,character,elsa,fascinating,is,it,learning,machine,popular,sensational,techonology,wonderful
0,0.0,0.0,0.424396,0.50131,0.424396,0.322764,0.322764,0.0,0.0,0.0,0.424396
1,0.0,0.0,0.0,0.315444,0.0,0.406192,0.406192,0.0,0.534093,0.534093,0.0
2,0.546454,0.546454,0.0,0.322745,0.0,0.0,0.0,0.546454,0.0,0.0,0.0


In [16]:
CVresult.sum(axis=0) / CVresult.sum(axis=0).sum()

character      0.0625
elsa           0.0625
fascinating    0.0625
is             0.2500
it             0.0625
learning       0.1250
machine        0.1250
popular        0.0625
sensational    0.0625
techonology    0.0625
wonderful      0.0625
dtype: float64

In [18]:
CVresult.sum(axis=1)

0    7
1    5
2    4
dtype: int64

In [21]:
TFIDFresult.sum(axis=0) / TFIDFresult.sum(axis=0).sum()

character      0.083071
elsa           0.083071
fascinating    0.064516
is             0.173225
it             0.064516
learning       0.110815
machine        0.110815
popular        0.083071
sensational    0.081192
techonology    0.081192
wonderful      0.064516
dtype: float64

In [29]:
data = fetch_20newsgroups(subset="all")

In [31]:
# News posts from the different newsgroup categories (raw text, one string per post)

data.data

["From: Mamatha Devineni Ratnam <mr47+@andrew.cmu.edu>\nSubject: Pens fans reactions\nOrganization: Post Office, Carnegie Mellon, Pittsburgh, PA\nLines: 12\nNNTP-Posting-Host: po4.andrew.cmu.edu\n\n\n\nI am sure some bashers of Pens fans are pretty confused about the lack\nof any kind of posts about the recent Pens massacre of the Devils. Actually,\nI am  bit puzzled too and a bit relieved. However, I am going to put an end\nto non-PIttsburghers' relief with a bit of praise for the Pens. Man, they\nare killing those Devils worse than I thought. Jagr just showed you why\nhe is much better than his regular season stats. He is also a lot\nfo fun to watch in the playoffs. Bowman should let JAgr have a lot of\nfun in the next couple of games since the Pens are going to beat the pulp out of Jersey anyway. I was very disappointed not to see the Islanders lose the final\nregular season game.          PENS RULE!!!\n\n",
 'From: mblawson@midway.ecn.uoknor.edu (Matthew B Lawson)\nSubject: Which h

In [32]:
data.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [34]:
categories = ["sci.space"
              ,"rec.sport.hockey"
              ,"talk.politics.guns"
              ,"talk.politics.mideast"
             ]

In [35]:
train = fetch_20newsgroups(subset="train", categories=categories)
test = fetch_20newsgroups(subset="test", categories=categories)

In [36]:
train

{'data': ["From: tvartiai@vipunen.hut.fi (Tommi Vartiainen)\nSubject: Re: Finland/Sweden vs.NHL teams (WAS:Helsinki/Stockholm & NHL expansion)\nNntp-Posting-Host: vipunen.hut.fi\nOrganization: Helsinki University of Technology, Finland\nLines: 51\n\nIn <1993Apr16.195754.5476@ousrvr.oulu.fi> mep@phoenix.oulu.fi (Marko Poutiainen) writes:\n\n>: FINLAND:  \n>: \n>: D-Jyrki Lumme.......20\n>: D-Teppo Numminen....20\n>: D-Peter Ahola.......13\n>: \n>Well well, they don't like our defenders (mainly Lumme and Numminen)...\n\nAbout 25 is correct for Numminen and Lumme.\n\n\n>: R-Teemu Selanne.....27\n>: \n>Compared to Kurri, Selanne's points are too high, lets make it 25 or 26.\n\nNo, Kurri's points are too low. 27 for Kurri and 28 for Sel{nne.\n\n>: well in the Canada Cup and World Championships largely due to the efforts of\n>: Markus Ketterer (the goalie), 3-4 or the players listed above and luck. There's\n>: presumably a lot of decent players in Finland that wouldn't be superstars at\n>: t

In [37]:
len(train.data)

2303

In [38]:
train.data[0]

"From: tvartiai@vipunen.hut.fi (Tommi Vartiainen)\nSubject: Re: Finland/Sweden vs.NHL teams (WAS:Helsinki/Stockholm & NHL expansion)\nNntp-Posting-Host: vipunen.hut.fi\nOrganization: Helsinki University of Technology, Finland\nLines: 51\n\nIn <1993Apr16.195754.5476@ousrvr.oulu.fi> mep@phoenix.oulu.fi (Marko Poutiainen) writes:\n\n>: FINLAND:  \n>: \n>: D-Jyrki Lumme.......20\n>: D-Teppo Numminen....20\n>: D-Peter Ahola.......13\n>: \n>Well well, they don't like our defenders (mainly Lumme and Numminen)...\n\nAbout 25 is correct for Numminen and Lumme.\n\n\n>: R-Teemu Selanne.....27\n>: \n>Compared to Kurri, Selanne's points are too high, lets make it 25 or 26.\n\nNo, Kurri's points are too low. 27 for Kurri and 28 for Sel{nne.\n\n>: well in the Canada Cup and World Championships largely due to the efforts of\n>: Markus Ketterer (the goalie), 3-4 or the players listed above and luck. There's\n>: presumably a lot of decent players in Finland that wouldn't be superstars at\n>: the highest

In [39]:
np.unique(train.target)

array([0, 1, 2, 3])

In [42]:
# Print the share of training documents that belongs to each class label.
n_train = len(train.target)
for label in np.unique(train.target):
    n_label = (train.target == label).sum()
    print(label, n_label / n_train)

0 0.26052974381241856
1 0.25749023013460703
2 0.23708206686930092
3 0.24489795918367346


In [43]:
# TF-IDF encoding, step 1: pull the raw texts and the integer labels out of
# the train/test bunches.

Xtrain = train.data
Xtest = test.data
Ytrain = train.target
Ytest = test.target

In [44]:
# Learn the vocabulary and IDF weights from the training texts and encode them
# in a single pass (fit_transform), instead of tokenizing the training corpus
# twice with fit() followed by transform(). The test texts are encoded with the
# already-fitted vectorizer so no test information leaks into the vocabulary.
tfidf = TFIDF()
Xtrain_ = tfidf.fit_transform(Xtrain)
Xtest_ = tfidf.transform(Xtest)

In [45]:
Xtrain_

<2303x40725 sparse matrix of type '<class 'numpy.float64'>'
	with 430306 stored elements in Compressed Sparse Row format>

In [46]:
# Dense view of the TF-IDF matrix for inspection only. NOTE: toarray() on the
# 2303 x 40725 float64 matrix allocates roughly 750 MB.
# get_feature_names() was removed in scikit-learn 1.2; prefer
# get_feature_names_out() and fall back to the old name on older releases.
_get_feature_names = getattr(tfidf, "get_feature_names_out", None) or tfidf.get_feature_names
tosee = pd.DataFrame(Xtrain_.toarray(), columns=_get_feature_names())

In [47]:
tosee.head()

Unnamed: 0,00,000,0000,00000,000000,000021,000062david42,000152,000246,000256,...,zwrm,zx,zx6wre,zxp,zxqi,zy,zyg,zz,zz_g9q3,zzzzzz
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.058046,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [48]:
tosee.shape

(2303, 40725)

In [54]:
def get_brier_score(y_true, y_prob, pos_label):
    """Return the one-vs-rest Brier score for a single class.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        True class labels.
    y_prob : array-like of shape (n_samples, n_classes)
        Predicted class probabilities. Column ``pos_label`` is taken as the
        predicted probability of the positive class.
    pos_label : int
        Label treated as the positive class. It is also used as the column
        index into ``y_prob``, so labels are assumed to line up with the
        probability columns (0, 1, 2, ...).

    Returns
    -------
    The value returned by ``brier_score_loss`` for the binarized labels and
    the selected probability column.
    """
    y_prob_ = np.asarray(y_prob)[:, pos_label]
    # Binarize in one vectorized step: 1 where the label equals pos_label, else 0.
    # np.asarray also makes this work for lists and pandas Series regardless of
    # their index; the inputs are never modified.
    y_true_ = (np.asarray(y_true) == pos_label).astype(int)

    # Pass positionally: the second argument's keyword was renamed from
    # ``y_prob`` to ``y_proba`` in newer scikit-learn releases.
    score = BS(y_true_, y_prob_)
    return score

In [57]:
# Fit each naive Bayes variant on the TF-IDF features, then report the
# per-class Brier score, the mean Brier score and the test accuracy.
name = ["Multinomial", "Complement", "Bernoulli"]
models = [MultinomialNB(), ComplementNB(), BernoulliNB()]

for name_, clf in zip(name, models):
    clf.fit(Xtrain_, Ytrain)
    proba = clf.predict_proba(Xtest_)
    # clf.score() predicts internally, so no separate predict() call is needed.
    score = clf.score(Xtest_, Ytest)
    print(name_)
    Bscore = []
    # Class labels are 0..k-1, so the label doubles as the probability column
    # index and as the index into train.target_names.
    for i in range(len(np.unique(Ytrain))):
        bs = get_brier_score(Ytest, proba, i)
        Bscore.append(bs)
        print("\tBrier under {}: {:.3f}".format(train.target_names[i], bs))

    print("\tAverage Brier: {:.3f}".format(np.mean(Bscore)))
    print("\tAccuracy: {:.3f}".format(score))
    print("\n")

Multinomial
	Brier under rec.sport.hockey: 0.018
	Brier under sci.space: 0.033
	Brier under talk.politics.guns: 0.030
	Brier under talk.politics.mideast: 0.026
	Average Brier: 0.027
	Accuracy: 0.975


Complement
	Brier under rec.sport.hockey: 0.023
	Brier under sci.space: 0.039
	Brier under talk.politics.guns: 0.039
	Brier under talk.politics.mideast: 0.033
	Average Brier: 0.033
	Accuracy: 0.986


Bournulli
	Brier under rec.sport.hockey: 0.068
	Brier under sci.space: 0.025
	Brier under talk.politics.guns: 0.045
	Brier under talk.politics.mideast: 0.053
	Average Brier: 0.048
	Accuracy: 0.902




In [59]:
# Probability calibration: compare each naive Bayes variant with its
# isotonic- and sigmoid-calibrated counterparts.

# Display labels; order must match `models` below (they are zipped together).
name = ["Multinomial"
        ,"Multinomial + Isotonic"
        ,"Multinomial + Sigmoid"
        ,"Complement"
        ,"Complement + Isotonic"
        ,"Complement + Sigmoid"
        ,"Bernoulli"
        ,"Bernoulli + Isotonic"
        ,"Bernoulli + Sigmoid"
       ]

# Each base estimator, followed by the same estimator wrapped in
# CalibratedClassifierCV with 2-fold cross-validation.
models = [MultinomialNB()
          ,CalibratedClassifierCV(MultinomialNB(), cv=2, method="isotonic")
          ,CalibratedClassifierCV(MultinomialNB(), cv=2, method="sigmoid")
          ,ComplementNB()
          ,CalibratedClassifierCV(ComplementNB(), cv=2, method="isotonic")
          ,CalibratedClassifierCV(ComplementNB(), cv=2, method="sigmoid")
          ,BernoulliNB()
          ,CalibratedClassifierCV(BernoulliNB(), cv=2, method="isotonic")
          ,CalibratedClassifierCV(BernoulliNB(), cv=2, method="sigmoid")
         ]

# Fit every model on the training features and print, per model, the Brier
# score of each class, the mean Brier score and the test accuracy.
for name_, clf in zip(name, models):
    clf.fit(Xtrain_, Ytrain)
    # NOTE(review): Y_pred is not read anywhere in this cell; confirm whether a
    # later cell relies on it before removing.
    Y_pred = clf.predict(Xtest_)
    proba = clf.predict_proba(Xtest_)
    score = clf.score(Xtest_, Ytest)
    print(name_)
    Bscore = []
    # The loop index i is used both as the class label / probability column
    # passed to get_brier_score and as the index into train.target_names.
    for i in range(len(np.unique(Ytrain))):
        bs = get_brier_score(Ytest, proba, i)
        Bscore.append(bs)
        print("\tBrier under {}: {:.3f}".format(train.target_names[i], bs))
    
    print("\tAverage Brier: {:.3f}".format(np.mean(Bscore)))
    print("\tAccuracy: {:.3f}".format(score))
    print("\n")

Multinomial
	Brier under rec.sport.hockey: 0.018
	Brier under sci.space: 0.033
	Brier under talk.politics.guns: 0.030
	Brier under talk.politics.mideast: 0.026
	Average Brier: 0.027
	Accuracy: 0.975


Multinomial + Isotonic
	Brier under rec.sport.hockey: 0.006
	Brier under sci.space: 0.012
	Brier under talk.politics.guns: 0.013
	Brier under talk.politics.mideast: 0.009
	Average Brier: 0.010
	Accuracy: 0.973


Multinomial + Sigmoid
	Brier under rec.sport.hockey: 0.006
	Brier under sci.space: 0.012
	Brier under talk.politics.guns: 0.013
	Brier under talk.politics.mideast: 0.009
	Average Brier: 0.010
	Accuracy: 0.973


Complement
	Brier under rec.sport.hockey: 0.023
	Brier under sci.space: 0.039
	Brier under talk.politics.guns: 0.039
	Brier under talk.politics.mideast: 0.033
	Average Brier: 0.033
	Accuracy: 0.986


Complement + Isotonic
	Brier under rec.sport.hockey: 0.004
	Brier under sci.space: 0.007
	Brier under talk.politics.guns: 0.009
	Brier under talk.politics.mideast: 0.006
	Avera