In [205]:
from sklearn.feature_extraction.text import CountVectorizer

In [206]:
from sklearn.linear_model import SGDClassifier

In [207]:
from sklearn.naive_bayes import MultinomialNB

In [208]:
from sklearn.pipeline import Pipeline

In [209]:
from sklearn.feature_extraction.text import TfidfTransformer

In [210]:
from sklearn.model_selection import GridSearchCV

In [211]:
import numpy as np

In [1]:
import requests

# Base URL of the backing API (placeholder value). It ends with "/" because
# endpoint names ("categories", "mark") are concatenated directly onto it.
API_URL = "<databaseurl>/"

In [2]:
#get the categories
response = requests.get(API_URL + "categories")
# Fail loudly on anything but HTTP 200: silently skipping the assignment would
# leave `categories` undefined (or stale from an earlier run of this cell).
if response.status_code != 200:
    raise RuntimeError(
        "GET %scategories returned HTTP %d" % (API_URL, response.status_code))
categories = [category["name"] for category in response.json()]
# "racial slur" is not served by the API, so it is added manually (only once).
if "racial slur" not in categories:
    categories.append("racial slur")

In [7]:
#Fetch the marked rows from the API as parsed JSON
response = requests.get(API_URL + "mark")
# Fail loudly on anything but HTTP 200: silently skipping the assignment would
# leave `raw_data` undefined (or stale from an earlier run of this cell).
if response.status_code != 200:
    raise RuntimeError(
        "GET %smark returned HTTP %d" % (API_URL, response.status_code))
raw_data = response.json()

# Last expression of the cell: display the status code as a visual confirmation.
response.status_code

200

In [8]:
from collections import namedtuple

# Bundle the training material in one record:
#   data         - flagged text strings
#   urls         - URL each string came from
#   target_names - class label names (the `categories` list itself)
#   target       - index into target_names for each entry of `data`
data_set = namedtuple('data_set', ["data", "urls", "target_names", "target"])(
    data=[], urls=[], target_names=categories, target=[]
)

for record in raw_data:
    # Rows whose category is not a known label are skipped.
    if record['category'] not in categories:
        continue
    data_set.data.append(record["flagged_string"])
    data_set.urls.append(record["url"])
    data_set.target.append(categories.index(record['category']))

In [214]:
#marker_train.data is an array of strings (containing as many "texts" as there are coming from Marker.)

In [215]:
#marker_train.target is an array of integers (containing the class indices for the above texts' class labels)

In [216]:
#marker_train.target_names is an array of class label names; marker_train.target holds indices into it.

In [9]:
# Alias only: marker_train is the same namedtuple object as data_set (no copy is made).
marker_train = data_set

In [219]:
#flagger_new_text = fetch_20newsgroups(subset='test',
#                                      categories=categories, shuffle=True, random_state=42)

In [10]:
# Print the label names of the first ten training examples as a sanity check.
# (The pasted "..." REPL continuation prompt was removed so the cell is valid
# plain Python and does not depend on IPython's prompt stripping.)
for t in marker_train.target[:10]:
    print(marker_train.target_names[t])

racial slur
racial slur
gaslighting
racial slur
racial slur
stereotyping
appropriation
deflection
racial slur
racial slur


In [221]:
#Classes for model representations packaging the components in the modeling pipeline.
#For SGDClassifier and NaiveBayes, but the one for SGDClassifier is untested.

In [222]:
class ModelRepSGD:
    """Container for the components of the SGD text-classification workflow.

    The constructor stores the objects it is given. It previously ignored its
    arguments, rebuilt the vectorizer/transformer from the global
    ``marker_train`` and stored a fresh, unfitted pipeline.

    Attributes:
        count_vect: the vectorizer supplied by the caller.
        X_train_counts: the count matrix supplied by the caller.
        tf_transformer: the term-frequency transformer supplied by the caller.
        text_clf: the classification pipeline supplied by the caller.
    """

    def __init__(self, count_vect, X_train_counts, tf_transformer, text_clf):
        # Keep the caller's (already fitted) objects so that what is packaged
        # here is exactly what was trained.
        self.count_vect = count_vect
        self.X_train_counts = X_train_counts
        self.tf_transformer = tf_transformer
        self.text_clf = text_clf

In [223]:
class ModelRepNB:
    """Container for the components of the Naive Bayes text-classification workflow.

    The constructor stores the objects it is given. It previously ignored its
    arguments, rebuilt the vectorizer/transformer from the global
    ``marker_train`` and replaced the classifier with an unfitted
    ``MultinomialNB()``.

    Attributes:
        count_vect: the vectorizer supplied by the caller.
        X_train_counts: the count matrix supplied by the caller.
        tf_transformer: the term-frequency transformer supplied by the caller.
        clf: the classifier supplied by the caller.
    """

    def __init__(self, count_vect, X_train_counts, tf_transformer, clf):
        # Keep the caller's (already fitted) objects so that the vectorizer,
        # transformer and classifier all come from the same training run.
        self.count_vect = count_vect
        self.X_train_counts = X_train_counts
        self.tf_transformer = tf_transformer
        self.clf = clf

In [224]:
#Functions for model training - for SGDClassifier and NaiveBayes (SGD one is untested)

In [225]:
def model_train_SGD(marker_train):
    """Train the hinge-loss SGDClassifier pipeline and package the result.

    Args:
        marker_train: object exposing ``data`` (texts) and ``target``
            (class indices for those texts).

    Returns:
        A ``ModelRepSGD`` whose ``text_clf`` is the pipeline fitted on
        ``marker_train``.
    """
    # Standalone vectorizer/transformer, fitted on the same texts and handed to
    # the model representation alongside the pipeline.
    count_vect = CountVectorizer()
    X_train_counts = count_vect.fit_transform(marker_train.data)
    tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
    # The pipeline does its own vectorizing/weighting, so it is fitted on raw text.
    text_clf = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', SGDClassifier(loss='hinge', penalty='l2',
                              alpha=1e-3, random_state=42,
                              max_iter=5, tol=None)),
    ])
    text_clf.fit(marker_train.data, marker_train.target)
    # Was `ModelRep(...)`, a name that is not defined anywhere (NameError).
    trained_model = ModelRepSGD(count_vect, X_train_counts, tf_transformer, text_clf)
    # Attach the fitted pipeline explicitly (same pattern as model_train_NB) so
    # the returned object always carries the trained estimator.
    trained_model.text_clf = text_clf
    return trained_model

In [226]:
def model_train_NB(marker_train):
    """Fit a Multinomial Naive Bayes classifier on term-frequency features.

    Args:
        marker_train: object exposing ``data`` (texts) and ``target``
            (class indices for those texts).

    Returns:
        A ``ModelRepNB`` whose ``clf`` attribute is the fitted classifier.
    """
    # Raw texts -> token counts.
    vectorizer = CountVectorizer()
    term_counts = vectorizer.fit_transform(marker_train.data)

    # Token counts -> term frequencies (idf weighting disabled).
    tf_scaler = TfidfTransformer(use_idf=False)
    tf_scaler.fit(term_counts)
    term_freqs = tf_scaler.transform(term_counts)

    nb_classifier = MultinomialNB()
    nb_classifier.fit(term_freqs, marker_train.target)

    model_rep = ModelRepNB(vectorizer, term_counts, tf_scaler, nb_classifier)
    # Make sure the packaged classifier is the fitted one.
    model_rep.clf = nb_classifier
    return model_rep

In [227]:
#Function for model testing (common for SGDClassifier and NB)

In [228]:
def model_test(flagger_new_test, trained_model):
    """Predict class indices for new texts with a trained model representation.

    Supports both model representations:
      * objects with a ``clf`` attribute (Naive Bayes): texts go through
        ``count_vect`` and ``tf_transformer`` before ``clf.predict``;
      * objects with only a ``text_clf`` pipeline (SGD): the pipeline does its
        own feature extraction, so the raw texts are passed to it directly.
        This case previously raised AttributeError on ``trained_model.clf``.

    Args:
        flagger_new_test: iterable of text strings to classify.
        trained_model: model representation returned by a training function.

    Returns:
        Whatever the underlying ``predict`` call returns (one prediction per text).
    """
    if hasattr(trained_model, "clf"):
        counts = trained_model.count_vect.transform(flagger_new_test)
        features = trained_model.tf_transformer.transform(counts)
        return trained_model.clf.predict(features)
    # No standalone classifier: use the end-to-end pipeline on the raw texts.
    return trained_model.text_clf.predict(flagger_new_test)

In [229]:
#Now testing model_train_NB and model_test

In [230]:
# Train the Naive Bayes variant on the full marker_train set.
trained_model = model_train_NB(marker_train)

In [231]:
#Inspecting the components of the trained model

In [232]:
trained_model.clf

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [233]:
trained_model.count_vect

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [234]:
trained_model.tf_transformer

TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=False)

In [235]:
#Calling model tester/predictor on a new text from Flagger

In [236]:
# One sample document, wrapped in a list because model_test hands its first
# argument straight to the vectorizer's transform().
flagger_new_text = ['OpenGL on the GPU is fast']

In [237]:
# One predicted class index per input text; the indices refer to positions in
# marker_train.target_names (the classifier was fitted on marker_train.target).
predicted = model_test(flagger_new_text, trained_model)

In [238]:
predicted

array([1])