# Special Model Comparison

---

In [20]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix, multilabel_confusion_matrix
from sklearn.metrics import roc_auc_score, roc_curve, classification_report
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
import time
import datetime
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from xgboost import XGBClassifier

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [21]:
from util import databases
from util import dataloader
from util import grid_models
from util.reddit_functions import Labeler
from util.reddit_functions import plot_confusion_matrix
from util.grid_models import custom_stop_words

In [22]:
# Alternative, broader subreddit selection (not used below; kept for reference):
# subreddit_list = ['css', 'html', 'javascript', 'php', 'perl', 'java', 'datascience', 'machinelearning', 'etl', 'python', 'dataengineering']

In [23]:
# Subreddits whose posts are loaded and classified in this notebook.
subreddit_list = [
    'datascience',
    'machinelearning',
    'dataengineering',
    'python',
    'aws',
]

In [24]:
# Load the posts for the selected subreddits through the project dataloader.
# 'sqlite' presumably selects the SQLite-backed source — TODO confirm in util.dataloader.
df = dataloader.data_selector(subreddit_list, 'sqlite')

Connection to SQLite DB successful


In [25]:
# Number of rows returned by the loader.
df.shape[0]

10298

In [7]:
# Preview ten random rows; the fixed seed (same value as the train/test split)
# keeps the displayed sample identical across Restart & Run All.
df.sample(10, random_state=7)

Unnamed: 0,title,subreddit,date
33054,Any good alternative to SAS DQMATCH?,dataengineering,2020-04-22
74298,convert text to handwriting (MyHandWriting-ver...,python,2020-05-03
55059,"[P][D] My Talk on Bias in AI/ML, Using a Keras...",machinelearning,2020-04-28
71902,DNS Health Check for custom application says o...,aws,2020-05-03
68771,[D] Using Unrolled GANs in Practice,machinelearning,2020-05-02
79144,An interview with a DataDog engineer about how...,dataengineering,2020-05-07
78664,Dell Technologies and Ververica: Analyzing Con...,dataengineering,2020-05-07
54078,running a regression in Spark: R vs. Scala vs....,datascience,2020-04-28
31081,"[D] Director of customer analytics says: ""I th...",machinelearning,2020-04-22
61378,What are some good conferences you recommend f...,dataengineering,2020-04-29


In [None]:
# Model input is the `title` column; the target is the `subreddit` column.
X, y = df['title'], df['subreddit']

In [None]:
# Fit the project Labeler on the subreddit column and replace `y` with its
# transformed values. Both calls read the raw df['subreddit'] column instead of
# `y`, so re-running this cell never fits the labeler on already-transformed
# values (which would corrupt `labeler.classes_`, used later for plotting).
labeler = Labeler()
labeler.fit(df['subreddit'])
y = labeler.transform(df['subreddit'])

In [None]:
# Hold out a test partition (default split size); the fixed seed keeps the
# partition the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=7,
)

In [None]:
# Preprocessor spec handed to the model builder: the vectorizer instance plus
# the candidate values for its parameters, keyed as '<step>__<param>'.
# (A max_df sweep over [.7, .8, .9] was tried under a "prep__" prefix and is
# currently disabled.)
tfidfvectorizer = dict(
    preprocessor=TfidfVectorizer(stop_words=custom_stop_words),
    pipe_params={
        "tfidfvectorizer__ngram_range": [(1, 2)],
        "tfidfvectorizer__use_idf": [True],
        "tfidfvectorizer__norm": ["l2"],
    },
)

In [None]:
# Candidate classifiers to compare. Each entry provides:
#   'name'        - display name (also used as the key in `fitted_models`)
#   'estimator'   - unfitted estimator instance
#   'pipe_params' - candidate values per hyper-parameter, keyed '<step>__<param>'
# NOTE(review): the step prefixes follow the lowercased-class-name convention
# already used by the 'tfidfvectorizer__' and 'logisticregression__' keys (the
# names sklearn's make_pipeline assigns) — confirm that
# TextClassifier.build_and_train_model builds its pipeline that way.
estimators = {
    'logisticregression': {
        'name': 'Logistic Regression',
        'estimator': LogisticRegression(max_iter=1000),
        'pipe_params': {
            "logisticregression__C": [5]
        }
    },
    'mlp': {
        'name': 'Multi Layer Perceptron Classifier',
        'estimator': MLPClassifier(),
        'pipe_params': {
            "mlpclassifier__hidden_layer_sizes": [50, 100, 200]
        }
    },

    'xgbclassifier': {
        'name': 'XGBoost Classifier',
        'estimator': XGBClassifier(),
        'pipe_params': {
            # hidden_layer_sizes is an MLP-only setting, so it is not searched here.
            "xgbclassifier__n_estimators": [50, 100, 200],
            "xgbclassifier__max_depth": [5, 10, 20]
        }
    }
}

In [None]:
def score_model(model):
    """Print train/test scores, one-vs-rest ROC AUC and a classification report.

    Predictions are computed here from ``model`` and ``X_test`` so the report
    always describes the model passed in, instead of whatever ``y_pred`` /
    ``y_proba`` happen to exist at notebook level (or failing with a
    ``NameError`` when they do not exist yet).

    Parameters
    ----------
    model : fitted estimator
        Must expose ``score``, ``predict`` and ``predict_proba``.

    Notes
    -----
    Reads the notebook-level ``X_train``, ``y_train``, ``X_test`` and
    ``y_test``. Returns ``None``; all results are printed.
    """
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)

    print(f'Train Score: {model.score(X_train, y_train)}')
    print(f'Test Score: {model.score(X_test, y_test)}')
    print(f'AUC Score: {roc_auc_score(y_test, y_proba, multi_class="ovr")}')
    print(classification_report(y_test, y_pred, digits=3))

In [None]:
# Registry of trained models, filled in by the training loop (display name -> model).
fitted_models = {}

In [None]:
# Helper object whose build_and_train_model method is used by the training loop.
# NOTE(review): TextClassifier is not imported in any cell visible in this
# notebook; on a fresh kernel this line raises NameError unless the name is
# brought into scope elsewhere — confirm its module and add it to the imports cell.
text = TextClassifier()

In [None]:
# Train every candidate with the shared TF-IDF preprocessor, report its scores,
# plot its confusion matrix and keep the fitted model under its display name.
for estimator in estimators.values():
    # Fit on the training partition only.
    model = text.build_and_train_model(
        text=X_train, labels=y_train,
        preprocessor=tfidfvectorizer, estimator=estimator,
        verbose=5,
    )

    # Predictions on the held-out partition, kept as notebook-level variables.
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)

    score_model(model)
    plot_confusion_matrix(
        model,
        y_true=y_test,
        y_pred=y_pred,
        classes=labeler.classes_,
    )

    fitted_models[estimator.get('name')] = model

In [None]:
# Display the mapping of estimator display name -> fitted model built by the training loop.
fitted_models