# Text Classification

**Outline**

1. Simple text
2. 20 news group

In [1]:
import pandas as pd

import numpy as np

from scipy.special import logsumexp

In [2]:
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.naive_bayes import BernoulliNB, MultinomialNB

## Simple Text

In [3]:
d0 = 'This doc is about machine learning.'
d1 = 'This doc is about databases.'
d2 = 'We investigate deep learning.'
d3 = 'We investigate sql.'

D =[d0, d1, d2, d3]
y = ['ML', 'DB', 'ML', 'DB']

In [4]:
vectorizer = CountVectorizer(min_df = 1, binary = True)

In [5]:
X = vectorizer.fit_transform(D)

In [6]:
feature_names = vectorizer.get_feature_names_out()

In [7]:
X.todense()

matrix([[1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0],
        [1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0],
        [0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1],
        [0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1]], dtype=int64)

In [8]:
df = pd.DataFrame(X.todense(), columns = feature_names)
df['Y'] = y
df

Unnamed: 0,about,databases,deep,doc,investigate,is,learning,machine,sql,this,we,Y
0,1,0,0,1,0,1,1,1,0,1,0,ML
1,1,1,0,1,0,1,0,0,0,1,0,DB
2,0,0,1,0,1,0,1,0,0,0,1,ML
3,0,0,0,0,1,0,0,0,1,0,1,DB


In [9]:
clf = BernoulliNB(alpha=1, fit_prior=True)
clf.fit(X, y)

BernoulliNB(alpha=1)

In [10]:
clf.classes_

array(['DB', 'ML'], dtype='<U2')

In [11]:
df = pd.DataFrame(clf.class_count_, columns = ['Counts'])
df['Y'] = clf.classes_
df

Unnamed: 0,Counts,Y
0,2.0,DB
1,2.0,ML


In [12]:
df = pd.DataFrame(clf.feature_count_, columns = feature_names)
df['Y'] = clf.classes_
df

Unnamed: 0,about,databases,deep,doc,investigate,is,learning,machine,sql,this,we,Y
0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,DB
1,1.0,0.0,1.0,1.0,1.0,1.0,2.0,1.0,0.0,1.0,1.0,ML


In [13]:
df = pd.DataFrame(np.exp(clf.feature_log_prob_), columns = feature_names)
df['Y'] = clf.classes_
df

Unnamed: 0,about,databases,deep,doc,investigate,is,learning,machine,sql,this,we,Y
0,0.5,0.5,0.25,0.5,0.5,0.5,0.25,0.25,0.5,0.5,0.5,DB
1,0.5,0.25,0.5,0.5,0.5,0.5,0.75,0.5,0.25,0.5,0.5,ML


In [14]:
df = pd.DataFrame(1-np.exp(clf.feature_log_prob_), columns = feature_names)
df['Y'] = clf.classes_
df

Unnamed: 0,about,databases,deep,doc,investigate,is,learning,machine,sql,this,we,Y
0,0.5,0.5,0.75,0.5,0.5,0.5,0.75,0.75,0.5,0.5,0.5,DB
1,0.5,0.75,0.5,0.5,0.5,0.5,0.25,0.5,0.75,0.5,0.5,ML


In [15]:
df = pd.DataFrame(clf.feature_log_prob_, columns = feature_names)
df['Y'] = clf.classes_
df

Unnamed: 0,about,databases,deep,doc,investigate,is,learning,machine,sql,this,we,Y
0,-0.693147,-0.693147,-1.386294,-0.693147,-0.693147,-0.693147,-1.386294,-1.386294,-0.693147,-0.693147,-0.693147,DB
1,-0.693147,-1.386294,-0.693147,-0.693147,-0.693147,-0.693147,-0.287682,-0.693147,-1.386294,-0.693147,-0.693147,ML


In [16]:
test_docs = ['Learning.']
X_test = vectorizer.transform(test_docs)

In [17]:
clf.predict_proba(X_test)

array([[0.25, 0.75]])

In [18]:
p = np.ones(2)

p *= np.exp(clf.class_log_prior_)


print("Prior: ", p)
print()

for i in range(len(feature_names)):
    print(feature_names[i])
    if i in X_test[0].indices:
        print("Doc has it.")
        print(np.exp(clf.feature_log_prob_[0][i]), np.exp(clf.feature_log_prob_[1][i]))

        p *= np.exp(clf.feature_log_prob_[:, i])

    else:
        print("Doc doesn't have it.")
        print(1-np.exp(clf.feature_log_prob_[0][i]), 1-np.exp(clf.feature_log_prob_[1][i]))

        p *= (1-np.exp(clf.feature_log_prob_[:, i]))
    
    print(p)
    print(p/p.sum())
    
    print()

Prior:  [0.5 0.5]

about
Doc doesn't have it.
0.5 0.5
[0.25 0.25]
[0.5 0.5]

databases
Doc doesn't have it.
0.5 0.75
[0.125  0.1875]
[0.4 0.6]

deep
Doc doesn't have it.
0.75 0.5
[0.09375 0.09375]
[0.5 0.5]

doc
Doc doesn't have it.
0.5 0.5
[0.046875 0.046875]
[0.5 0.5]

investigate
Doc doesn't have it.
0.5 0.5
[0.0234375 0.0234375]
[0.5 0.5]

is
Doc doesn't have it.
0.5 0.5
[0.01171875 0.01171875]
[0.5 0.5]

learning
Doc has it.
0.25 0.7500000000000001
[0.00292969 0.00878906]
[0.25 0.75]

machine
Doc doesn't have it.
0.75 0.5
[0.00219727 0.00439453]
[0.33333333 0.66666667]

sql
Doc doesn't have it.
0.5 0.75
[0.00109863 0.0032959 ]
[0.25 0.75]

this
Doc doesn't have it.
0.5 0.5
[0.00054932 0.00164795]
[0.25 0.75]

we
Doc doesn't have it.
0.5 0.5
[0.00027466 0.00082397]
[0.25 0.75]



In [19]:
clf = MultinomialNB(alpha=1)
clf.fit(X, y)

MultinomialNB(alpha=1)

In [20]:
clf.classes_

array(['DB', 'ML'], dtype='<U2')

In [21]:
df = pd.DataFrame(clf.class_count_, columns = ['Counts'])
df['Y'] = clf.classes_
df

Unnamed: 0,Counts,Y
0,2.0,DB
1,2.0,ML


In [22]:
df = pd.DataFrame(clf.feature_count_, columns = feature_names)
df['Y'] = clf.classes_
df

Unnamed: 0,about,databases,deep,doc,investigate,is,learning,machine,sql,this,we,Y
0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,DB
1,1.0,0.0,1.0,1.0,1.0,1.0,2.0,1.0,0.0,1.0,1.0,ML


In [23]:
df = pd.DataFrame(np.exp(clf.feature_log_prob_), columns = feature_names)
df['Y'] = clf.classes_
df

Unnamed: 0,about,databases,deep,doc,investigate,is,learning,machine,sql,this,we,Y
0,0.105263,0.105263,0.052632,0.105263,0.105263,0.105263,0.052632,0.052632,0.105263,0.105263,0.105263,DB
1,0.095238,0.047619,0.095238,0.095238,0.095238,0.095238,0.142857,0.095238,0.047619,0.095238,0.095238,ML


In [24]:
np.sum(clf.feature_count_, axis=1)

array([ 8., 10.])

In [25]:
clf.predict_proba(X_test)

array([[0.26923077, 0.73076923]])

In [26]:
clf.predict_log_proba(X_test)

array([[-1.31218639, -0.31365756]])

In [27]:
p = np.ones(2)

p *= np.exp(clf.class_log_prior_)

jl = np.zeros(2)

jl += clf.class_log_prior_


print("Prior: ", p)
print(jl)
print()

for i in range(len(feature_names)):
    print(feature_names[i])
    if i in X_test[0].indices:
        print("Doc has it.")
        print(np.exp(clf.feature_log_prob_[0][i]), np.exp(clf.feature_log_prob_[1][i]))

        p *= np.exp(clf.feature_log_prob_[:, i])

        jl += clf.feature_log_prob_[:, i]

    else:
        print("Doc doesn't have it. Nothing done.")
    
    print(p)
    print(p/p.sum())
    print(jl)
    
    print()

Prior:  [0.5 0.5]
[-0.69314718 -0.69314718]

about
Doc doesn't have it. Nothing done.
[0.5 0.5]
[0.5 0.5]
[-0.69314718 -0.69314718]

databases
Doc doesn't have it. Nothing done.
[0.5 0.5]
[0.5 0.5]
[-0.69314718 -0.69314718]

deep
Doc doesn't have it. Nothing done.
[0.5 0.5]
[0.5 0.5]
[-0.69314718 -0.69314718]

doc
Doc doesn't have it. Nothing done.
[0.5 0.5]
[0.5 0.5]
[-0.69314718 -0.69314718]

investigate
Doc doesn't have it. Nothing done.
[0.5 0.5]
[0.5 0.5]
[-0.69314718 -0.69314718]

is
Doc doesn't have it. Nothing done.
[0.5 0.5]
[0.5 0.5]
[-0.69314718 -0.69314718]

learning
Doc has it.
0.05263157894736843 0.14285714285714288
[0.02631579 0.07142857]
[0.26923077 0.73076923]
[-3.63758616 -2.63905733]

machine
Doc doesn't have it. Nothing done.
[0.02631579 0.07142857]
[0.26923077 0.73076923]
[-3.63758616 -2.63905733]

sql
Doc doesn't have it. Nothing done.
[0.02631579 0.07142857]
[0.26923077 0.73076923]
[-3.63758616 -2.63905733]

this
Doc doesn't have it. Nothing done.
[0.02631579 0.0

In [28]:
jl

array([-3.63758616, -2.63905733])

In [29]:
jl-logsumexp(jl)

array([-1.31218639, -0.31365756])

In [30]:
np.exp(jl-logsumexp(jl))

array([0.26923077, 0.73076923])

## 20 News Group

In [31]:
from sklearn.datasets import fetch_20newsgroups

In [32]:
train_data = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))
#train_data = fetch_20newsgroups(subset='train')

In [33]:
test_data = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'))

In [34]:
train_data.keys()

dict_keys(['target_names', 'filenames', 'data', 'target', 'DESCR'])

In [35]:
train_data.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [36]:
print(train_data.data[0])

I was wondering if anyone out there could enlighten me on this car I saw
the other day. It was a 2-door sports car, looked to be from the late 60s/
early 70s. It was called a Bricklin. The doors were really small. In addition,
the front bumper was separate from the rest of the body. This is 
all I know. If anyone can tellme a model name, engine specs, years
of production, where this car is made, history, or whatever info you
have on this funky looking car, please e-mail.


In [37]:
vectorizer = CountVectorizer(min_df = 5, binary = True)

In [38]:
X_train = vectorizer.fit_transform(train_data.data)

In [39]:
X_train.shape

(11314, 18101)

In [40]:
X_test = vectorizer.transform(test_data.data)

In [41]:
X_test.shape

(7532, 18101)

In [42]:
y_train = train_data.target
y_test = test_data.target

In [43]:
bn_clf = BernoulliNB(alpha=1)

In [44]:
bn_clf.fit(X_train, y_train)

BernoulliNB(alpha=1)

In [45]:
from sklearn.metrics import classification_report

In [46]:
def print_classification_report(X, y, clf, labels):
    y_pred = clf.predict(X)
    print(classification_report(y, y_pred, digits=3, target_names=labels))

In [47]:
print_classification_report(X_train, y_train, bn_clf, labels = train_data.target_names)

                          precision    recall  f1-score   support

             alt.atheism      0.724     0.562     0.633       480
           comp.graphics      0.729     0.695     0.712       584
 comp.os.ms-windows.misc      0.980     0.330     0.494       591
comp.sys.ibm.pc.hardware      0.699     0.800     0.746       590
   comp.sys.mac.hardware      0.426     0.908     0.580       578
          comp.windows.x      0.886     0.577     0.699       593
            misc.forsale      0.440     0.872     0.585       585
               rec.autos      0.610     0.796     0.691       594
         rec.motorcycles      0.294     0.918     0.445       598
      rec.sport.baseball      0.773     0.841     0.806       597
        rec.sport.hockey      1.000     0.647     0.785       600
               sci.crypt      0.892     0.556     0.685       595
         sci.electronics      0.779     0.745     0.761       591
                 sci.med      0.990     0.643     0.780       594
         

In [48]:
print_classification_report(X_test, y_test, bn_clf, labels = train_data.target_names)

                          precision    recall  f1-score   support

             alt.atheism      0.390     0.295     0.336       319
           comp.graphics      0.524     0.514     0.519       389
 comp.os.ms-windows.misc      0.818     0.069     0.126       394
comp.sys.ibm.pc.hardware      0.500     0.622     0.555       392
   comp.sys.mac.hardware      0.317     0.743     0.445       385
          comp.windows.x      0.796     0.435     0.563       395
            misc.forsale      0.389     0.826     0.529       390
               rec.autos      0.478     0.659     0.554       396
         rec.motorcycles      0.246     0.847     0.381       398
      rec.sport.baseball      0.731     0.746     0.738       397
        rec.sport.hockey      0.987     0.576     0.728       399
               sci.crypt      0.680     0.386     0.493       396
         sci.electronics      0.549     0.496     0.521       393
                 sci.med      0.876     0.394     0.544       396
         

In [49]:
mn_clf = MultinomialNB(alpha=1)

In [50]:
mn_clf.fit(X_train, y_train)

MultinomialNB(alpha=1)

In [51]:
print_classification_report(X_train, y_train, mn_clf, labels = train_data.target_names)

                          precision    recall  f1-score   support

             alt.atheism      0.838     0.810     0.824       480
           comp.graphics      0.795     0.803     0.799       584
 comp.os.ms-windows.misc      0.949     0.506     0.660       591
comp.sys.ibm.pc.hardware      0.659     0.922     0.768       590
   comp.sys.mac.hardware      0.882     0.870     0.876       578
          comp.windows.x      0.824     0.899     0.860       593
            misc.forsale      0.888     0.824     0.855       585
               rec.autos      0.862     0.848     0.855       594
         rec.motorcycles      0.878     0.866     0.872       598
      rec.sport.baseball      0.941     0.881     0.910       597
        rec.sport.hockey      0.629     0.918     0.747       600
               sci.crypt      0.916     0.881     0.898       595
         sci.electronics      0.884     0.822     0.852       591
                 sci.med      0.940     0.889     0.913       594
         

In [52]:
print_classification_report(X_test, y_test, mn_clf, labels = train_data.target_names)

                          precision    recall  f1-score   support

             alt.atheism      0.526     0.382     0.443       319
           comp.graphics      0.592     0.656     0.622       389
 comp.os.ms-windows.misc      0.742     0.168     0.273       394
comp.sys.ibm.pc.hardware      0.478     0.791     0.596       392
   comp.sys.mac.hardware      0.674     0.639     0.656       385
          comp.windows.x      0.720     0.742     0.731       395
            misc.forsale      0.818     0.726     0.769       390
               rec.autos      0.732     0.710     0.721       396
         rec.motorcycles      0.720     0.719     0.719       398
      rec.sport.baseball      0.892     0.788     0.837       397
        rec.sport.hockey      0.594     0.870     0.706       399
               sci.crypt      0.722     0.720     0.721       396
         sci.electronics      0.630     0.560     0.593       393
                 sci.med      0.801     0.720     0.758       396
         

In [53]:
from sklearn.linear_model import LogisticRegression

In [54]:
lr_clf = LogisticRegression(C=1)

In [55]:
lr_clf.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression(C=1)

In [56]:
print_classification_report(X_train, y_train, lr_clf, labels = train_data.target_names)

                          precision    recall  f1-score   support

             alt.atheism      0.998     0.973     0.985       480
           comp.graphics      0.993     0.962     0.977       584
 comp.os.ms-windows.misc      0.989     0.953     0.971       591
comp.sys.ibm.pc.hardware      0.998     0.969     0.984       590
   comp.sys.mac.hardware      0.989     0.958     0.974       578
          comp.windows.x      0.998     0.987     0.992       593
            misc.forsale      0.985     0.983     0.984       585
               rec.autos      0.977     0.944     0.961       594
         rec.motorcycles      0.993     0.970     0.981       598
      rec.sport.baseball      0.668     0.995     0.799       597
        rec.sport.hockey      1.000     0.970     0.985       600
               sci.crypt      1.000     0.975     0.987       595
         sci.electronics      0.998     0.968     0.983       591
                 sci.med      1.000     0.966     0.983       594
         

In [57]:
print_classification_report(X_test, y_test, lr_clf, labels = train_data.target_names)

                          precision    recall  f1-score   support

             alt.atheism      0.409     0.420     0.414       319
           comp.graphics      0.551     0.614     0.581       389
 comp.os.ms-windows.misc      0.548     0.546     0.547       394
comp.sys.ibm.pc.hardware      0.574     0.556     0.565       392
   comp.sys.mac.hardware      0.627     0.584     0.605       385
          comp.windows.x      0.745     0.620     0.677       395
            misc.forsale      0.758     0.749     0.754       390
               rec.autos      0.608     0.626     0.617       396
         rec.motorcycles      0.606     0.673     0.638       398
      rec.sport.baseball      0.465     0.761     0.577       397
        rec.sport.hockey      0.816     0.777     0.796       399
               sci.crypt      0.789     0.606     0.686       396
         sci.electronics      0.479     0.496     0.488       393
                 sci.med      0.696     0.614     0.652       396
         

In [62]:
lr_clf = LogisticRegression(C=0.05)

In [63]:
lr_clf.fit(X_train, y_train)

LogisticRegression(C=0.05)

In [64]:
print_classification_report(X_train, y_train, lr_clf, labels = train_data.target_names)

                          precision    recall  f1-score   support

             alt.atheism      0.892     0.840     0.865       480
           comp.graphics      0.838     0.848     0.843       584
 comp.os.ms-windows.misc      0.879     0.849     0.864       591
comp.sys.ibm.pc.hardware      0.911     0.849     0.879       590
   comp.sys.mac.hardware      0.904     0.862     0.882       578
          comp.windows.x      0.928     0.887     0.907       593
            misc.forsale      0.864     0.891     0.877       585
               rec.autos      0.860     0.850     0.855       594
         rec.motorcycles      0.791     0.886     0.836       598
      rec.sport.baseball      0.518     0.955     0.672       597
        rec.sport.hockey      0.962     0.890     0.925       600
               sci.crypt      0.983     0.881     0.929       595
         sci.electronics      0.903     0.846     0.873       591
                 sci.med      0.957     0.891     0.922       594
         

In [65]:
print_classification_report(X_test, y_test, lr_clf, labels = train_data.target_names)

                          precision    recall  f1-score   support

             alt.atheism      0.441     0.448     0.445       319
           comp.graphics      0.566     0.650     0.605       389
 comp.os.ms-windows.misc      0.602     0.548     0.574       394
comp.sys.ibm.pc.hardware      0.639     0.569     0.602       392
   comp.sys.mac.hardware      0.666     0.621     0.642       385
          comp.windows.x      0.762     0.623     0.685       395
            misc.forsale      0.727     0.772     0.749       390
               rec.autos      0.651     0.616     0.633       396
         rec.motorcycles      0.577     0.688     0.628       398
      rec.sport.baseball      0.392     0.783     0.523       397
        rec.sport.hockey      0.876     0.782     0.826       399
               sci.crypt      0.831     0.609     0.703       396
         sci.electronics      0.482     0.511     0.496       393
                 sci.med      0.657     0.596     0.625       396
         