In [128]:
from sklearn.datasets import fetch_20newsgroups

In [129]:
from sklearn.feature_extraction.text import CountVectorizer

In [130]:
from sklearn.linear_model import SGDClassifier

In [None]:
from sklearn.naive_bayes import MultinomialNB

In [131]:
from sklearn.pipeline import Pipeline

In [132]:
from sklearn.feature_extraction.text import TfidfTransformer

In [133]:
from sklearn.model_selection import GridSearchCV

In [134]:
import numpy as np

In [135]:
#Fetch data and store them in appropriate arrays

In [137]:
#The following portion needs to be replaced by calls to appropriate data access API.

In [138]:
#marker_train.data is an array of strings (containing as many "texts" as there are coming from Marker.)

In [139]:
#marker_train.target is an array of integers (containing the class indices for the above texts' class labels)

In [140]:
#marker_train.target_names is an array of class label names, indexed by the class indices stored in marker_train.target.

In [None]:
# Newsgroup categories used as the class labels for this experiment.
categories = [
    'alt.atheism',
    'soc.religion.christian',
    'comp.graphics',
    'sci.med',
]

In [141]:
# Load the training split restricted to the selected categories.
# shuffle=True with a fixed random_state keeps the document order reproducible.
marker_train = fetch_20newsgroups(
    subset='train',
    categories=categories,
    shuffle=True,
    random_state=42,
)

In [142]:
#flagger_new_text = fetch_20newsgroups(subset='test',
#                                      categories=categories, shuffle=True, random_state=42)

In [145]:
# Show the class label name of the first ten training documents.
# (The loop body used to start with a pasted "..." REPL prompt, which only
# runs because IPython strips prompts; plain indentation works everywhere.)
for t in marker_train.target[:10]:
    print(marker_train.target_names[t])

comp.graphics
comp.graphics
soc.religion.christian
soc.religion.christian
soc.religion.christian
soc.religion.christian
soc.religion.christian
sci.med
sci.med
sci.med


In [146]:
#Classes for model representations packaging the components in the modeling pipeline.
#For SGDClassifier and NaiveBayes, but the one for SGDClassifier is untested.

In [147]:
class ModelRepSGD:
    """Bundle of the components that make up a trained SGD text model.

    The constructor stores exactly the objects it is given, so the instance
    reflects whatever the caller fitted rather than rebuilding anything from
    notebook-level globals.

    Attributes:
        count_vect: the vectorizer supplied by the caller.
        X_train_counts: the training count matrix supplied by the caller.
        tf_transformer: the term-frequency transformer supplied by the caller.
        text_clf: the classifier (pipeline) supplied by the caller.
    """

    def __init__(self, count_vect, X_train_counts, tf_transformer, text_clf):
        # Previously every argument was ignored: a fresh vectorizer was refit on
        # the global `marker_train` and `text_clf` was replaced by an unfitted
        # pipeline, so the packaged model could not predict.
        self.count_vect = count_vect
        self.X_train_counts = X_train_counts
        self.tf_transformer = tf_transformer
        self.text_clf = text_clf

In [195]:
class ModelRepNB:
    """Bundle of the components that make up a trained Naive Bayes text model.

    The constructor stores exactly the objects it is given, so the instance
    reflects whatever the caller fitted rather than rebuilding anything from
    notebook-level globals.

    Attributes:
        count_vect: the vectorizer supplied by the caller.
        X_train_counts: the training count matrix supplied by the caller.
        tf_transformer: the term-frequency transformer supplied by the caller.
        clf: the classifier supplied by the caller.
    """

    def __init__(self, count_vect, X_train_counts, tf_transformer, clf):
        # Previously every argument was ignored: a fresh vectorizer was refit on
        # the global `marker_train` and `clf` was replaced by an unfitted
        # MultinomialNB, forcing callers to overwrite `.clf` afterwards.
        self.count_vect = count_vect
        self.X_train_counts = X_train_counts
        self.tf_transformer = tf_transformer
        self.clf = clf

In [None]:
#Functions for model training - for SGDClassifier and NaiveBayes (SGD one is untested)

In [148]:
def model_train_SGD(marker_train):
    """Train the SGD text classifier and package it as a ModelRepSGD.

    Args:
        marker_train: training set exposing ``data`` (the texts) and
            ``target`` (the class index of each text).

    Returns:
        A ModelRepSGD constructed from the fitted vectorizer, the training
        count matrix, the fitted tf transformer and the fitted pipeline.
    """
    count_vect = CountVectorizer()
    X_train_counts = count_vect.fit_transform(marker_train.data)
    tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
    # The pipeline carries its own vectorizer and tf-idf step, so it is fitted
    # on (and later predicts from) raw text rather than the matrices above.
    text_clf = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', SGDClassifier(loss='hinge', penalty='l2',
                              alpha=1e-3, random_state=42,
                              max_iter=5, tol=None)),
    ])
    text_clf.fit(marker_train.data, marker_train.target)
    # Was `ModelRep(...)`, a name that is never defined (NameError at call time).
    trained_model = ModelRepSGD(count_vect, X_train_counts, tf_transformer, text_clf)
    return trained_model

In [244]:
def model_train_NB(marker_train):
    """Train a multinomial Naive Bayes text classifier and package it.

    Args:
        marker_train: training set exposing ``data`` (the texts) and
            ``target`` (the class index of each text).

    Returns:
        A ModelRepNB whose ``clf`` attribute is the classifier fitted here.
    """
    vectorizer = CountVectorizer()
    train_counts = vectorizer.fit_transform(marker_train.data)
    tf_only = TfidfTransformer(use_idf=False).fit(train_counts)
    train_tf = tf_only.transform(train_counts)
    nb_classifier = MultinomialNB().fit(train_tf, marker_train.target)

    packaged = ModelRepNB(vectorizer, train_counts, tf_only, nb_classifier)
    # Attach the fitted classifier explicitly so the returned object always
    # exposes the model trained in this call.
    packaged.clf = nb_classifier
    return packaged

In [149]:
#Function for model testing (common for SGDClassifier and NB)

In [150]:
def _consumes_raw_text(model):
    """Return True when `model` is a pipeline whose first step vectorizes raw text."""
    steps = getattr(model, "steps", None)
    if not steps:
        return False
    # Pipeline steps are (name, estimator) pairs; text vectorizers expose
    # `build_analyzer`, which plain classifiers and transformers do not.
    first_estimator = steps[0][1]
    return hasattr(first_estimator, "build_analyzer")


def model_test(flagger_new_test, count_vect, tf_transformer, text_clf):
    """Predict class indices for new texts with a trained model.

    Args:
        flagger_new_test: iterable of text documents to classify.
        count_vect: fitted vectorizer used to turn the texts into counts.
        tf_transformer: fitted transformer applied to the count matrix.
        text_clf: fitted classifier. Either a bare classifier that expects the
            transformed features (the Naive Bayes case), or a pipeline that
            begins with its own text vectorizer (the SGD case).

    Returns:
        Whatever ``text_clf.predict`` returns for the given texts.
    """
    if _consumes_raw_text(text_clf):
        # A pipeline that vectorizes internally must receive the raw texts;
        # handing it the already-transformed matrix would fail in its first step.
        return text_clf.predict(flagger_new_test)
    X_new_counts = count_vect.transform(flagger_new_test)
    X_new_tfidf = tf_transformer.transform(X_new_counts)
    predicted = text_clf.predict(X_new_tfidf)
    return predicted

In [151]:
#Now testing model_train_NB and model_test

In [245]:
# Train the Naive Bayes model on the fetched training set; the returned object
# carries the vectorizer, tf transformer and classifier used further down.
trained_model = model_train_NB(marker_train)

In [None]:
#Inspecting the components of the trained model

In [247]:
# Display the classifier attached to the trained model.
trained_model.clf

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [248]:
# Display the vectorizer stored on the trained model.
trained_model.count_vect

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [249]:
# Display the tf transformer stored on the trained model (built with use_idf=False).
trained_model.tf_transformer

TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=False)

In [None]:
#Calling model tester/predictor on a new text from Flagger

In [250]:
# A single new document, wrapped in a list because model_test passes this
# value straight to the vectorizer's transform as a collection of texts.
flagger_new_text = ['OpenGL on the GPU is fast']

In [252]:
# Classify the new text with the components of the trained Naive Bayes model.
predicted = model_test(
    flagger_new_text,
    trained_model.count_vect,
    trained_model.tf_transformer,
    trained_model.clf,
)

In [253]:
# Display the predicted class indices; each one indexes marker_train.target_names.
predicted

array([1])