# RP-Mod-2 Analysis
In this notebook we analyse the RP-Mod-2 dataset and train our baseline classifiers.

In [None]:
#!pip install scikit-learn
import nltk
import re
import spacy
import numpy as np
from tqdm import tqdm
from nltk.corpus import stopwords  
import pandas as pd
import matplotlib.pyplot as plt


##skopt
from skopt import BayesSearchCV
from skopt import dump
from skopt.space import Real, Categorical, Integer


##sklearn
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import PredefinedSplit
from sklearn.metrics import precision_recall_curve
import sklearn.datasets
import sklearn.metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
import sklearn.model_selection
import sklearn.datasets
import sklearn.metrics
from sklearn.base import TransformerMixin
from sklearn.naive_bayes import GaussianNB

## automl
import autosklearn.classification
import six.moves.cPickle as pickle



# Preprocessing

In [None]:
## Load the dataset; each row carries its predefined fold id in the 'ten_folds' column

experiment = "abusive_min_2"

## read the CSV with texts, labels and fold ids
df = pd.read_csv("../../Dataset/Text-Data/RP-Crowd-2-folds.csv")

## labels as an integer numpy array
y_dat = df["label"].values.astype(int)

## boolean row mask: True for rows whose fold id is below 8.
## The cells below fit on the rows where the mask is True and use the remaining rows as test data.
filter_q = df['ten_folds'].lt(8)


In [None]:
class TextPreprocessingTransformer(TransformerMixin):
    """Stateless transformer that cleans and lemmatises raw text documents.

    Each document is stripped of non-word characters, digits and isolated
    single letters, whitespace-normalised, lower-cased and finally replaced
    by the space-joined spaCy lemmas of its tokens.

    Parameters
    ----------
    model_name : str, default "de_core_news_lg"
        Name of the spaCy pipeline passed to ``spacy.load`` in ``transform``.
    """

    def __init__(self, model_name="de_core_news_lg"):
        self.model_name = model_name

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    @staticmethod
    def _clean(text):
        """Apply the regex-based clean-up to one document and lower-case it."""
        # Remove all the special characters
        text = re.sub(r'\W', ' ', str(text))

        # Remove numbers
        text = re.sub(r'[0-9]', ' ', text)

        # Remove all single characters surrounded by whitespace
        text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text)

        # Remove a single character at the start of the document.
        # The caret must stay unescaped: r'\^' matches a literal '^', which
        # can no longer occur after the \W substitution above.
        text = re.sub(r'^[a-zA-Z]\s+', ' ', text)

        # Substitute multiple whitespace characters with a single space
        text = re.sub(r'\s+', ' ', text)

        # Remove a prefixed 'b' (kept from the original pipeline)
        text = re.sub(r'^b\s+', '', text)

        # Convert to lowercase
        return text.lower()

    def transform(self, X):
        """Return a list with one cleaned, lemmatised string per element of ``X``.

        ``X`` may be any sized iterable of documents (list, numpy array, ...);
        every element is converted with ``str`` before cleaning.
        """
        nlp = spacy.load(self.model_name)
        documents = []
        # Iterate over the elements themselves instead of positional indices,
        # so containers without 0..n-1 integer keys work as well.
        for raw_document in tqdm(X):
            cleaned = self._clean(raw_document)
            lemmas = [token.lemma_ for token in nlp(cleaned)]
            documents.append(' '.join(lemmas))

        return documents

In [None]:
## German stop word list from NLTK
nltk.download("stopwords")
german_stop_words = stopwords.words('german')

from sklearn.feature_extraction.text import TfidfVectorizer

tf = TfidfVectorizer(stop_words=german_stop_words, min_df = 5, ngram_range=(1,2), max_features=3224)
preprocessor = TextPreprocessingTransformer()

## clean + lemmatise every document once; reused for the embeddings further down
preprocessed = preprocessor.transform(np.array(df["text"]))

## Fit vocabulary and IDF weights on the training rows only (filter_q is True for them),
## so that no statistics of the held-out rows leak into the features.
train_documents = [doc for doc, is_train in zip(preprocessed, filter_q) if is_train]
tf.fit(train_documents)

## transform ALL rows so that tfidf_dat stays row-aligned with df / y_dat / filter_q
tfidf_dat = tf.transform(preprocessed).toarray()
tfidf_dat.shape

In [None]:
## Load the pickled AutoML model of this experiment (the file is written further down in this notebook).
## NOTE: unpickling can execute arbitrary code - only load files you created yourself.
with open("../../Evaluation/Baseline-Results/automl_model_tfidf_"+experiment, "rb") as f:
    a = pickle.load(f)

In [None]:
## inspect the loaded object via its show_models() method
a.show_models()

In [None]:
## refit the loaded model on the TF-IDF features / labels of the rows selected by filter_q
a.refit(tfidf_dat[filter_q], y_dat[filter_q])

# TFIDF-Evaluation
Now we start building models based on TF-IDF representation. We build the following models:
1. Naive Bayes
2. Logistic Regression
3. Gradient Boosted Trees
4. AutoML pipeline

### Naive Bayes

In [None]:
## Search space for the multinomial Naive Bayes baseline.
## alpha is declared explicitly as Real: a tuple with integer bounds such as (0, 1, "uniform")
## is interpreted by skopt as an Integer dimension, i.e. only alpha = 0 and alpha = 1 would be tried.
## The lower bound is a tiny positive value instead of exactly 0 to avoid zero smoothing.
params_NB = {
    "alpha": Real(1e-10, 1.0, prior="uniform"),
    "fit_prior": [True, False],
}


## Bayesian hyper-parameter search: 50 iterations, 10-fold CV, 4 points evaluated per iteration
NB_opt = BayesSearchCV(
    MultinomialNB(),
    search_spaces=params_NB,
    n_iter=50,
    cv=10,
    n_jobs=40,
    n_points = 4,
    return_train_score = True
)


def on_step(optim_result):
    """Search callback: print the current score and request a stop at 0.98 or above.

    The score is the negated ``optim_result['fun']``. Returns True (after
    printing 'Interrupting!') when the score is >= 0.98, otherwise None.
    """
    score = -optim_result['fun']
    print("best score: %s" % score)
    good_enough = score >= 0.98
    if not good_enough:
        return None
    print('Interrupting!')
    return True

   

## run the search on the rows selected by filter_q; on_step is passed as callback
NB_opt.fit(tfidf_dat[filter_q], y_dat[filter_q], callback=on_step)
## persist the fitted search object with skopt's dump
## NOTE(review): this writes to Best-Baseline-Results, while the embedding models and the
## prediction files go to Baseline-Results - confirm that this is intended
dump(NB_opt,"../../Evaluation/Best-Baseline-Results/NB_model_tfidf_"+experiment)

### Logistic Regression

In [None]:
## Search space for logistic regression on the TF-IDF features
params_LOG = dict(
    # log-uniform prior: the search moves uniformly over the exponent of C
    C=(1e-6, 1e+6, 'log-uniform'),
    solver=["liblinear", "saga", "lbfgs"],
    max_iter=Integer(100, 500, 'uniform'),
)

## Bayesian hyper-parameter search: 50 iterations, 10-fold CV, 4 points evaluated per iteration
LOGREG_opt = BayesSearchCV(
    LogisticRegression(),
    search_spaces=params_LOG,
    return_train_score=True,
    n_points=4,
    n_jobs=40,
    cv=10,
    n_iter=50,
)


def on_step(optim_result):
    """Search callback: print the current score and request a stop at 0.98 or above.

    The score is the negated ``optim_result['fun']``. Returns True (after
    printing 'Interrupting!') when the score is >= 0.98, otherwise None.
    """
    score = -optim_result['fun']
    print("best score: %s" % score)
    good_enough = score >= 0.98
    if not good_enough:
        return None
    print('Interrupting!')
    return True

   

## run the search on the rows selected by filter_q; on_step is passed as callback
LOGREG_opt.fit(tfidf_dat[filter_q], y_dat[filter_q], callback=on_step)
## persist the fitted search object with skopt's dump
dump(LOGREG_opt,"../../Evaluation/Best-Baseline-Results/logreg_model_tfidf_"+experiment)

### Gradient Boosted Trees

In [None]:
## Search space for scikit-learn's GradientBoostingClassifier on the TF-IDF features
params_XG = dict(
    max_depth=Integer(1, 20, 'uniform'),
    # log-uniform prior: the search moves uniformly over the exponent of the learning rate
    learning_rate=Real(10**-5, 10**0, "log-uniform"),
    min_samples_split=(2, 100, "uniform"),
    min_samples_leaf=(2, 100, "uniform"),
)

## Bayesian hyper-parameter search: 50 iterations, 10-fold CV, 4 points evaluated per iteration
XG_opt = BayesSearchCV(
    GradientBoostingClassifier(),
    search_spaces=params_XG,
    n_points=4,
    n_jobs=20,
    cv=10,
    n_iter=50,
)

def on_step(optim_result):
    """Search callback: print the current score and request a stop at 0.98 or above.

    The score is the negated ``optim_result['fun']``. Returns True (after
    printing 'Interrupting!') when the score is >= 0.98, otherwise None.
    """
    score = -optim_result['fun']
    print("best score: %s" % score)
    good_enough = score >= 0.98
    if not good_enough:
        return None
    print('Interrupting!')
    return True


## run the search on the rows selected by filter_q; on_step is passed as callback
XG_opt.fit(tfidf_dat[filter_q], y_dat[filter_q], callback=on_step)
## persist the fitted search object with skopt's dump
dump(XG_opt,"../../Evaluation/Best-Baseline-Results/xg_model_tfidf_"+experiment)

### AutoML

In [None]:
## Train the auto-sklearn classifier on the TF-IDF features of the rows selected by filter_q

from autosklearn.experimental.askl2 import AutoSklearn2Classifier

automl = autosklearn.classification.AutoSklearnClassifier(
    # budget for the whole search and limit for a single run
    time_left_for_this_task=18000,
    per_run_time_limit=600,
    # working and output directories
    tmp_folder='../../Evaluation/Baseline-Results/autosklearn_tfidf_min2_classification_results',
    output_folder='../../Evaluation/Baseline-Results/autosklearn_tfidf_min2_classification_results_out',
    # resources
    memory_limit=None,
    n_jobs=40,
    # optimisation target and ensemble size
    metric=autosklearn.metrics.accuracy,
    ensemble_nbest=10,
    # 10-fold cross-validation during the search
    resampling_strategy='cv',
    resampling_strategy_arguments={'folds': 10},
)

automl.fit(tfidf_dat[filter_q], y_dat[filter_q])

In [None]:
## refit on the complete training data (all rows selected by filter_q);
## the search above was run with resampling_strategy='cv'
automl.refit(tfidf_dat[filter_q], y_dat[filter_q])

In [None]:
## pickle the fitted AutoML object; the file name ends with the experiment name
with open("../../Evaluation/Baseline-Results/automl_model_tfidf_"+experiment, "wb") as f:
    pickle.dump(automl, f)

In [None]:
## collect the fitted TF-IDF models under short keys (the keys become file-name prefixes when the predictions are saved)
results = {"nb":NB_opt, "logreg":LOGREG_opt ,"xg":XG_opt,"automl":automl}

In [None]:
## specify test data: every row that is NOT selected by filter_q
## (~ is the element-wise complement of a boolean mask; unary minus is not
##  a supported way to invert boolean arrays in numpy)

test = tfidf_dat[~filter_q]
test_y = y_dat[~filter_q]

In [None]:
## write the test-set probabilities and decisions of every model to its own pickle file

for name, model in results.items():
    predictions = {'probs' : model.predict_proba(test), 'decision' : model.predict(test)}
    target = '../../Evaluation/Baseline-Results/{}.p'.format(name+"_"+experiment+"_tfidf")
    with open(target, "wb") as f:
        pickle.dump(predictions, f)

In [None]:
## the same models as a list, in the order used by the legends of the plotting helpers
results = [NB_opt, LOGREG_opt, XG_opt, automl]

In [None]:
def create_precision_recall_curve(ax,res,test):
    """Draw one precision-recall curve per model on ``ax``.

    For every model the accuracy and F1 score of its hard predictions are
    printed as well.

    Parameters
    ----------
    ax : matplotlib axes to draw on.
    res : sequence of fitted models offering ``predict_proba`` and ``predict``.
        The legend names assume the order MN-Bayes, Logistic Regression,
        Gradient Boosted Trees, AutoML; further models are drawn unlabeled.
    test : feature matrix of the held-out rows. It must be row-aligned with
        ``y_dat[~filter_q]``, which is read from the notebook namespace.
    """
    legend_names = ["MN-Bayes","Logistic Regression","Gradient Boosted Trees", "AutoML"]

    # Ground truth of the held-out rows, computed once instead of three times
    # per model. `~` inverts the boolean mask.
    y_true = y_dat[~filter_q]

    for idx, opt in enumerate(res):
        probs = opt.predict_proba(test)
        decision = opt.predict(test)

        print(sklearn.metrics.accuracy_score(y_true, decision))
        print(sklearn.metrics.f1_score(y_true, decision))

        # probability of the positive class (second column)
        probs_pos = np.asarray(probs)[:, 1]
        precision, recall, _ = sklearn.metrics.precision_recall_curve(y_true, probs_pos)

        # Label each curve directly so the legend cannot get out of sync with
        # other lines that may already exist on the axes.
        label = legend_names[idx] if idx < len(legend_names) else None
        ax.plot(recall, precision, label=label)

    ax.set_xlabel('Recall')
    ax.set_ylabel('Precision')
    ax.legend()

In [None]:
def create_auc(ax,res,test):
    """Draw one ROC curve per model plus the chance diagonal on ``ax``.

    Parameters
    ----------
    ax : matplotlib axes to draw on.
    res : sequence of fitted models offering ``predict_proba``.
        The legend names assume the order MN-Bayes, Logistic Regression,
        Gradient Boosted Trees, AutoML; further models are drawn unlabeled.
    test : feature matrix of the held-out rows. It must be row-aligned with
        the labels of ``df[~filter_q]``, which are read from the notebook namespace.
    """
    legend_names = ["MN-Bayes","Logistic Regression", "Gradient Boosted Trees", "AutoML"]

    # Ground truth of the held-out rows, computed once. `~` inverts the boolean mask.
    y_true = df[~filter_q]["label"].values

    for idx, opt in enumerate(res):
        # only the probabilities are needed here; no extra predict() call
        probs_pos = np.asarray(opt.predict_proba(test))[:, 1]
        fpr, tpr, _ = sklearn.metrics.roc_curve(y_true, probs_pos, drop_intermediate=True)

        # Label each curve with its own model name; the diagonal below stays unlabeled.
        label = legend_names[idx] if idx < len(legend_names) else None
        ax.plot(fpr, tpr, linestyle='-', label=label, linewidth=2)

    # chance level
    ax.plot([0, 1], [0, 1], color='black', linestyle='--')
    ax.set_xlabel('FPR')
    ax.set_ylabel('TPR')
    ax.legend()

In [None]:
## one figure with two panels for the TF-IDF models:
## precision-recall curves on the left axes, ROC curves on the right axes
fig, (ax1, ax2) = plt.subplots(1 ,2, figsize=(8,4))
create_precision_recall_curve(ax1,results,test)
create_auc(ax2,results,test)

fig.tight_layout()
plt.show()


# FASTTEXT EMBEDDING EVALUATION
Now we start building models based on fasttext embeddings. We build the following models:
1. Naive Bayes
2. Logistic Regression
3. Gradient Boosted Trees
4. AutoML pipeline

In [None]:
## load the fastText model from the local file 'cc.de.300.bin';
## the commented-out call is the one that would download the 'de' model if the file is missing
import fasttext.util
#fasttext.util.download_model('de', if_exists='ignore')
ft = fasttext.load_model('cc.de.300.bin')

In [None]:
## get sentence embedding: one fastText sentence vector per preprocessed document,
## stacked into a numpy array that is row-aligned with `preprocessed`
dat_embedding = np.array([ft.get_sentence_vector(x) for x in preprocessed])
## display the embeddings of the rows selected by filter_q
dat_embedding[filter_q]

In [None]:
## Search space for Gaussian Naive Bayes on the embeddings.
## var_smoothing is searched continuously between 1e-9 and 1 on a log scale.
## Passing an array of 100 grid values instead would make skopt treat it as a
## Categorical dimension with 100 unordered choices.
params_NB = {
    'var_smoothing': Real(1e-9, 1e0, prior="log-uniform")
}


# log-uniform: understand as search over p = exp(x) by varying x
NB_embed_opt = BayesSearchCV(
    GaussianNB(),
    search_spaces=params_NB,
    n_iter=50,
    cv=10,
    n_jobs=40,
    n_points = 4
)


def on_step(optim_result):
    """Search callback: print the current score and request a stop at 0.98 or above.

    The score is the negated ``optim_result['fun']``. Returns True (after
    printing 'Interrupting!') when the score is >= 0.98, otherwise None.
    """
    score = -optim_result['fun']
    print("best score: %s" % score)
    good_enough = score >= 0.98
    if not good_enough:
        return None
    print('Interrupting!')
    return True

## run the search on the embeddings of the rows selected by filter_q; on_step is passed as callback
NB_embed_opt.fit(dat_embedding[filter_q], y_dat[filter_q], callback=on_step)
## persist the fitted search object with skopt's dump
dump(NB_embed_opt,"../../Evaluation/Baseline-Results/NB_model_embed_"+experiment)





In [None]:
## Search space for logistic regression on the embeddings
params_LOG = dict(
    # log-uniform prior: the search moves uniformly over the exponent of C
    C=(1e-6, 1e+6, 'log-uniform'),
    solver=["liblinear", "saga", "lbfgs"],
)

## Bayesian hyper-parameter search: 50 iterations, 10-fold CV
LOGREG_embed_opt = BayesSearchCV(
    LogisticRegression(),
    search_spaces=params_LOG,
    return_train_score=True,
    n_jobs=20,
    cv=10,
    n_iter=50,
)


# callback handler
def on_step(optim_result):
    """Search callback: print the current score and request a stop at 0.98 or above.

    The score is the negated ``optim_result['fun']``. Returns True (after
    printing 'Interrupting!') when the score is >= 0.98, otherwise None.
    """
    score = -optim_result['fun']
    print("best score: %s" % score)
    good_enough = score >= 0.98
    if not good_enough:
        return None
    print('Interrupting!')
    return True


## run the search on the embeddings of the rows selected by filter_q; on_step is passed as callback
LOGREG_embed_opt.fit(dat_embedding[filter_q], y_dat[filter_q], callback=on_step)
## persist the fitted search object with skopt's dump
dump(LOGREG_embed_opt,"../../Evaluation/Baseline-Results/logreg_model_embed_"+experiment)

In [None]:
## Search space for scikit-learn's GradientBoostingClassifier on the embeddings
params_XG = dict(
    max_depth=Integer(1, 20, 'uniform'),
    # log-uniform prior: the search moves uniformly over the exponent of the learning rate
    learning_rate=Real(10**-5, 10**0, "log-uniform"),
    min_samples_split=(2, 100, "uniform"),
    min_samples_leaf=(2, 100, "uniform"),
)

## Bayesian hyper-parameter search: 50 iterations, 10-fold CV, 4 points evaluated per iteration
XG_embed_opt = BayesSearchCV(
    GradientBoostingClassifier(),
    search_spaces=params_XG,
    n_points=4,
    n_jobs=40,
    cv=10,
    n_iter=50,
)

    # callback handler
def on_step(optim_result):
    """Search callback: print the current score and request a stop at 0.98 or above.

    The score is the negated ``optim_result['fun']``. Returns True (after
    printing 'Interrupting!') when the score is >= 0.98, otherwise None.
    """
    score = -optim_result['fun']
    print("best score: %s" % score)
    good_enough = score >= 0.98
    if not good_enough:
        return None
    print('Interrupting!')
    return True

## run the search on the embeddings of the rows selected by filter_q; on_step is passed as callback
XG_embed_opt.fit(dat_embedding[filter_q], y_dat[filter_q], callback=on_step)
## persist the fitted search object with skopt's dump
dump(XG_embed_opt,"../../Evaluation/Baseline-Results/xg_model_embed_"+experiment)

In [None]:
## Train the auto-sklearn classifier on the embeddings of the rows selected by filter_q
automl_emb = autosklearn.classification.AutoSklearnClassifier(
    # budget for the whole search and limit for a single run
    time_left_for_this_task=18000,
    per_run_time_limit=600,
    # working and output directories
    tmp_folder='../../Evaluation/Baseline-Results/autosklearn_emb_min2_classification_results',
    output_folder='../../Evaluation/Baseline-Results/autosklearn_emb_min2_classification_results_out',
    # resources
    memory_limit=None,
    n_jobs=10,
    # optimisation target and ensemble size
    metric=autosklearn.metrics.accuracy,
    ensemble_nbest=10,
    # 10-fold cross-validation during the search
    resampling_strategy='cv',
    resampling_strategy_arguments={'folds': 10},
)

automl_emb.fit(dat_embedding[filter_q], y_dat[filter_q])

In [None]:
## refit on the complete training data (all rows selected by filter_q); copies of the arrays are passed
automl_emb.refit(dat_embedding[filter_q].copy(), y_dat[filter_q].copy())

In [None]:
## pickle the fitted embedding AutoML object; the file name ends with the experiment name
with open("../../Evaluation/Baseline-Results/autosklearn_model_embed_"+experiment, "wb") as f:
    pickle.dump(automl_emb, f)

In [None]:
## collect the fitted embedding models under short keys
results_embed = {"nb":NB_embed_opt, "logreg":LOGREG_embed_opt, "xg": XG_embed_opt, "automl":automl_emb}

## specify test data: every row that is NOT selected by filter_q
## (~ is the element-wise complement of a boolean mask; unary minus is not
##  a supported way to invert boolean arrays in numpy)
test_embed = dat_embedding[~filter_q]
test_embed_y = y_dat[~filter_q]

In [None]:
## write the test-set probabilities and decisions of every embedding model to its own pickle file
for name, model in results_embed.items():
    predictions = {'probs' : model.predict_proba(test_embed), 'decision' : model.predict(test_embed)}
    target = '../../Evaluation/Baseline-Results/{}.p'.format(name+"_"+experiment+"_embed")
    with open(target, "wb") as f:
        pickle.dump(predictions, f)



In [None]:
## the same embedding models as a list, in the order used by the legends of the plotting helpers
results_embed = [NB_embed_opt,LOGREG_embed_opt,XG_embed_opt,automl_emb]

In [None]:
## one figure with two panels for the embedding models:
## precision-recall curves on the left axes, ROC curves on the right axes
fig, (ax1, ax2) = plt.subplots(1 ,2, figsize=(8,4))
create_precision_recall_curve(ax1,results_embed,test_embed)
create_auc(ax2,results_embed,test_embed)

fig.tight_layout()
plt.show()



In [None]:
def create_auc_both(ax,res,res_embed,test,test_embed):
    """Overlay the ROC curves of the TF-IDF and the embedding models on ``ax``.

    TF-IDF curves are drawn faintly (alpha 0.3) and without legend entry,
    embedding curves are drawn solid and labeled. The color cycle is reset
    between the two groups, so the n-th model of each group shares a color.
    The AUC of every curve is printed.

    Parameters
    ----------
    ax : matplotlib axes to draw on.
    res, res_embed : sequences of fitted models offering ``predict_proba``.
        The legend names assume the order MN-Bayes, Logistic Regression,
        Gradient Boosted Trees, AutoML; further models are drawn unlabeled.
    test, test_embed : feature matrices of the held-out rows for the two
        model groups. Both must be row-aligned with the labels of
        ``df[~filter_q]``, which are read from the notebook namespace.
    """
    legend_names = ["MN-Bayes","Logistic Regression", "Gradient Boosted Trees", "AutoML"]

    # ground truth of the held-out rows, computed once for all curves
    y_true = df[~filter_q]["label"].values

    for opt in res:
        # only the probabilities are needed here; no extra predict() call
        probs_pos = np.asarray(opt.predict_proba(test))[:, 1]
        fpr, tpr, _ = sklearn.metrics.roc_curve(y_true, probs_pos, drop_intermediate=True)
        print(sklearn.metrics.auc(fpr,tpr))
        # no label: these curves do not appear in the legend
        ax.plot(fpr, tpr, linestyle='-', linewidth=1, alpha=0.3)

    ## reset color cycle
    ax.set_prop_cycle(None)

    for idx, opt in enumerate(res_embed):
        probs_pos = np.asarray(opt.predict_proba(test_embed))[:, 1]
        fpr, tpr, _ = sklearn.metrics.roc_curve(y_true, probs_pos, drop_intermediate=True)
        print(sklearn.metrics.auc(fpr,tpr))
        # Label the curve itself, so the legend does not depend on the number
        # and order of lines already present on the axes.
        label = legend_names[idx] if idx < len(legend_names) else None
        ax.plot(fpr, tpr, linestyle='-', linewidth=1, label=label)

    # chance level
    ax.plot([0, 1], [0, 1], color='black', linestyle='--')
    ax.set_xlabel('FPR')
    ax.set_ylabel('TPR')
    ax.legend()