In [3]:
import sklearn
from sklearn import datasets, metrics
from sklearn.pipeline import Pipeline
from sklearn.datasets import fetch_20newsgroups, load_files
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import Perceptron
from sklearn.model_selection import train_test_split
import os

In [3]:
# The four newsgroups this notebook works with.
categories = [
    'alt.atheism',
    'soc.religion.christian',
    'comp.graphics',
    'sci.med',
]

In [4]:
# Fetch the training subset for the selected newsgroups; shuffling uses a
# fixed seed so the document order is the same on every run.
twenty_train = fetch_20newsgroups(
    subset='train',
    categories=categories,
    shuffle=True,
    random_state=42,
)

In [5]:
# Show only the first three lines of the first post.
print(*twenty_train.data[0].split("\n")[:3], sep="\n")

From: sd345@city.ac.uk (Michael Collier)
Subject: Converting images to HP LaserJet III?
Nntp-Posting-Host: hampton


In [6]:
# Translate the first post's integer label into its newsgroup name.
first_label = twenty_train.target[0]
print(twenty_train.target_names[first_label])

comp.graphics


In [7]:
# Integer class labels of the first ten training posts (shown as the cell output).
twenty_train.target[:10]

array([1, 1, 3, 3, 3, 3, 3, 2, 2, 2], dtype=int64)

In [8]:
# Print the newsgroup name behind each of the first ten labels.
for label_idx in twenty_train.target[:10]:
    newsgroup = twenty_train.target_names[label_idx]
    print(newsgroup)

comp.graphics
comp.graphics
soc.religion.christian
soc.religion.christian
soc.religion.christian
soc.religion.christian
soc.religion.christian
sci.med
sci.med
sci.med


In [9]:
# Token-count vectorizer with default settings; it is fitted in a later cell.
count_vect = CountVectorizer()

In [10]:
# Fit the vectorizer on the training posts and build the count matrix in one step.
X_train_counts = count_vect.fit_transform(twenty_train.data)
# Cell output: the (rows, columns) shape of the resulting matrix.
X_train_counts.shape

(2257, 35788)

In [11]:
# Look up the index the fitted vocabulary assigns to the term 'algorithm'
# (`.get` yields None when the term is not in the vocabulary).
count_vect.vocabulary_.get(u'algorithm')

4690

In [12]:
# Term-frequency weighting only: IDF is switched off for this transformer.
tf_transformer = TfidfTransformer(use_idf=False)
tf_transformer.fit(X_train_counts)

# Re-weight the raw counts; the shape below is displayed as the cell output.
X_train_tf = tf_transformer.transform(X_train_counts)
X_train_tf.shape

(2257, 35788)

In [13]:
# TF-IDF transformer with default settings, fitted and applied to the count
# matrix in a single call.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
# Cell output: shape of the TF-IDF matrix.
X_train_tfidf.shape

(2257, 35788)

In [14]:
# Train a multinomial naive Bayes classifier on the TF-IDF features.
# The trailing semicolon keeps the cell from echoing the fitted estimator.
clf = MultinomialNB()
clf.fit(X_train_tfidf, twenty_train.target);

In [15]:
# Two unseen snippets, pushed through the already-fitted vectorizer and
# TF-IDF transformer (transform only; nothing is re-fitted here).
docs_new = [
    "God is Love",
    "OpenGL on the GPU is fast",
]
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

In [16]:
# Predict a class index for each of the new documents.
predicted = clf.predict(X_new_tfidf)

In [17]:
# Pair every new document with the newsgroup name of its predicted label.
for text, label_idx in zip(docs_new, predicted):
    newsgroup = twenty_train.target_names[label_idx]
    print('{0} -> {1}'.format(text, newsgroup))

God is Love -> soc.religion.christian
OpenGL on the GPU is fast -> comp.graphics


In [18]:
# Chain counting, TF-IDF weighting and the classifier into a single estimator
# so raw text can be passed straight to fit / predict.
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

In [19]:
# Train the whole pipeline directly on the raw posts and their labels.
# The fitted pipeline is echoed as the cell output.
text_clf.fit(twenty_train.data, twenty_train.target)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...inear_tf=False, use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [21]:
!py "D:\Users\shaun\Documents\01 UofW\Current Classes\Astro 480\machine_learning\scikit-learn\doc\tutorial\text_analytics\solutions\exercise_01_language_train_model.py" "D:\Users\shaun\Documents\01 UofW\Current Classes\Astro 480\machine_learning\paragraphs"

              precision    recall  f1-score   support

          ar       1.00      1.00      1.00        13
          de       0.99      0.99      0.99        78
          en       0.99      1.00      0.99        72
          es       1.00      0.95      0.97        57
          fr       1.00      1.00      1.00        64
          it       1.00      0.98      0.99        44
          ja       1.00      0.97      0.99        40
          nl       1.00      0.96      0.98        23
          pl       0.87      1.00      0.93        26
          pt       0.98      1.00      0.99        45
          ru       1.00      1.00      1.00        33

   micro avg       0.99      0.99      0.99       495
   macro avg       0.98      0.99      0.98       495
weighted avg       0.99      0.99      0.99       495

[[13  0  0  0  0  0  0  0  0  0  0]
 [ 0 77  0  0  0  0  0  0  1  0  0]
 [ 0  0 72  0  0  0  0  0  0  0  0]
 [ 0  0  1 54  0  0  0  0  1  1  0]
 [ 0  0  0  0 64  0  0  0  0  0  0]
 [ 0  0

# Using the exercise skeleton as a guide

In [4]:
# Load the corpus from the `paragraphs` folder in the working directory.
# os.path.join builds the path with the platform's separator instead of
# concatenating a hard-coded "/".
dataset = load_files(os.path.join(os.getcwd(), "paragraphs"))

# Hold out half of the documents for evaluation. The seed is pinned so that
# Restart & Run All reproduces the same split and therefore the same scores;
# outputs stored from an earlier, unseeded run may differ slightly.
docs_train, docs_test, y_train, y_test = train_test_split(
    dataset.data, dataset.target, test_size=0.5, random_state=42
)

In [5]:
# Character-level vectorizer: features are 1- to 3-character n-grams rather
# than word tokens, with IDF weighting switched off.
vectorizer = TfidfVectorizer(
    analyzer='char',
    ngram_range=(1, 3),
    use_idf=False,
)

In [6]:
# Build a pipeline that feeds the character n-gram vectorizer above into a
# Perceptron classifier (stopping tolerance 10**-3).
clf = Pipeline(steps=[
    ('vec', vectorizer),
    ('clf', Perceptron(tol=10 ** -3)),
])

In [7]:
# Fit the vectorizer + Perceptron pipeline on the training half of the corpus.
# The fitted pipeline is echoed as the cell output.
clf.fit(docs_train, y_train)

Pipeline(memory=None,
     steps=[('vec', TfidfVectorizer(analyzer='char', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 3), norm='l2', preprocessor=None, smooth_idf=True,
 ...ndom_state=0, shuffle=True, tol=0.001,
      validation_fraction=0.1, verbose=0, warm_start=False))])

In [8]:
# Predict a label for every held-out document.
y_predicted = clf.predict(docs_test)

In [9]:
# Per-class precision / recall / F1 on the held-out documents, labelled with
# the dataset's class names.
report = metrics.classification_report(
    y_test,
    y_predicted,
    target_names=dataset.target_names,
)
print(report)

              precision    recall  f1-score   support

          ar       1.00      1.00      1.00        12
          de       1.00      1.00      1.00        79
          en       1.00      1.00      1.00        81
          es       1.00      0.96      0.98        51
          fr       1.00      1.00      1.00        54
          it       0.98      1.00      0.99        41
          ja       1.00      1.00      1.00        41
          nl       1.00      1.00      1.00        21
          pl       1.00      1.00      1.00        22
          pt       0.98      1.00      0.99        55
          ru       1.00      1.00      1.00        38

   micro avg       1.00      1.00      1.00       495
   macro avg       1.00      1.00      1.00       495
weighted avg       1.00      1.00      1.00       495



In [10]:
# Confusion matrix comparing the true and predicted labels of the held-out
# documents.
cm = metrics.confusion_matrix(y_true=y_test, y_pred=y_predicted)
print(cm)

[[12  0  0  0  0  0  0  0  0  0  0]
 [ 0 79  0  0  0  0  0  0  0  0  0]
 [ 0  0 81  0  0  0  0  0  0  0  0]
 [ 0  0  0 49  0  1  0  0  0  1  0]
 [ 0  0  0  0 54  0  0  0  0  0  0]
 [ 0  0  0  0  0 41  0  0  0  0  0]
 [ 0  0  0  0  0  0 41  0  0  0  0]
 [ 0  0  0  0  0  0  0 21  0  0  0]
 [ 0  0  0  0  0  0  0  0 22  0  0]
 [ 0  0  0  0  0  0  0  0  0 55  0]
 [ 0  0  0  0  0  0  0  0  0  0 38]]


In [11]:
# Sanity check on three hand-written sentences, one per language.
sentences = [
    'This is a language detection test.',
    'Ceci est un test de d\xe9tection de la langue.',
    'Dies ist ein Test, um die Sprache zu erkennen.',
]
predicted = clf.predict(sentences)

# Map each predicted class index back to its name in the dataset.
for sentence, label_idx in zip(sentences, predicted):
    language = dataset.target_names[label_idx]
    print('The language of "%s" is "%s"' % (sentence, language))

The language of "This is a language detection test." is "en"
The language of "Ceci est un test de détection de la langue." is "fr"
The language of "Dies ist ein Test, um die Sprache zu erkennen." is "de"
