# RP-Crowd-2 Analysis
In this notebook we analyse the RP-Crowd-2 dataset and train our baseline classifiers

In [1]:
#!pip3 install fasttext

Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/usr/bin/python3.9 -m pip install --upgrade pip' command.[0m


In [1]:
#!pip3 install autosklearn
import nltk
import re
import spacy
import numpy as np
from tqdm import tqdm
from nltk.corpus import stopwords  
import pandas as pd
import matplotlib.pyplot as plt


##skopt
from skopt import BayesSearchCV
from skopt import dump
from skopt.space import Real, Categorical, Integer


##sklearn
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import PredefinedSplit
from sklearn.metrics import precision_recall_curve
import sklearn.datasets
import sklearn.metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
import sklearn.model_selection
import sklearn.datasets
import sklearn.metrics
from sklearn.base import TransformerMixin
from sklearn.naive_bayes import GaussianNB

## automl
#import autosklearn.classification
import six.moves.cPickle as pickle



# Preprocessing

In [2]:
## first we load the dataset with the predefined folds

experiment = "abusive_min_2"
out = "experiments"
## load data
df = pd.read_csv("../../../Dataset/Text-Data/RP-Crowd-2-folds.csv")

## load labels and cast to int
y_dat = df["label"].values
y_dat = y_dat.astype(int)

## test data filter
filter_q =  df['ten_folds'] < 8 


In [3]:
class TextPreprocessingTransformer(TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        documents = []
        nlp = spacy.load("de_core_news_lg")
        for sen in tqdm(range(0, len(X))):
            # Remove all the special characters
            document = re.sub(r'\W', ' ', str(X[sen]))

            # Remove numbers
            document = re.sub(r'[0-9]', ' ', document)

            # remove all single characters
            document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)

            # Remove single characters from the start
            document = re.sub(r'\^[a-zA-Z]\s+', ' ', document) 

            # Substituting multiple spaces with single space
            document = re.sub(r'\s+', ' ', document, flags=re.I)

            # Removing prefixed 'b'
            document = re.sub(r'^b\s+', '', document)

            # Converting to Lowercase
            document = document.lower()

            # Lemmatization
            document = nlp(document)
 
            document = [word.lemma_ for word in document]
            document = ' '.join(document)
        
            documents.append(document)

        return documents

In [4]:
#!python -m spacy download de_core_news_lg
nltk.download("stopwords")
german_stop_words = stopwords.words('german')

from sklearn.feature_extraction.text import TfidfVectorizer

tf = TfidfVectorizer(stop_words=german_stop_words, min_df = 5, ngram_range=(1,2), max_features=3224)
preprocessor = TextPreprocessingTransformer()


preprocessed = preprocessor.transform(np.array(df["text"]))

tfidf_dat = tf.fit_transform(preprocessed).toarray()
tfidf_dat.shape

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/dennisassenmacher/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
100%|█████████████████████████████████████████████████████████████████████████████| 17368/17368 [03:14<00:00, 89.18it/s]


(17368, 3224)

In [6]:
## specify test data

test = tfidf_dat[-filter_q]
test_y = y_dat[-filter_q]

# TFIDF-Evaluation
Now we start building models based on TF-IDF representation. We build the following models:
1. Naive Bayes
2. Logistic Regression
3. Gradient Boosted Trees


### Naive Bayes

In [7]:
for i in range(0,5):
    params_NB = {
        "alpha":(0,1,"uniform"),
        "fit_prior":[True,False],
    }


    # log-uniform: understand as search over p = exp(x) by varying x
    NB_opt = BayesSearchCV(
        MultinomialNB(),
        search_spaces=params_NB,
        n_iter=50,
        cv=10,
        n_jobs=20,
        n_points = 4,
        return_train_score = True
    )


    def on_step(optim_result):
        score = -optim_result['fun']
        print("best score: %s" % score)
        if score >= 0.98:
            print('Interrupting!')
            return True



    NB_opt.fit(tfidf_dat[filter_q], y_dat[filter_q], callback=on_step)
    
    ## save model
    with open(out+'/{}_model.p'.format("nb_"+experiment+"_tfidf_"+str(i)), "wb") as f:
        pickle.dump(NB_opt, f)
    
    ## save predictions
    probs = NB_opt.predict_proba(test)
    decision = NB_opt.predict(test)
    with open(out+'/{}.p'.format("nb_"+experiment+"_tfidf_"+str(i)), "wb") as f:
        pickle.dump({'probs' : probs, 'decision' : decision}, f)



best score: 0.6986173998166476




best score: 0.6989053767784907




best score: 0.6989053767784907






best score: 0.6989053767784907




best score: 0.6989053767784907




best score: 0.6989053767784907




best score: 0.6989053767784907




best score: 0.6989053767784907




best score: 0.6989053767784907




best score: 0.6989053767784907




best score: 0.6989053767784907




best score: 0.6989053767784907




best score: 0.6989053767784907




best score: 0.6989053767784907




best score: 0.6989053767784907




best score: 0.6989053767784907




best score: 0.6989053767784907




best score: 0.6989053767784907




best score: 0.6989053767784907




best score: 0.6989053767784907




best score: 0.6989053767784907




best score: 0.6989053767784907




best score: 0.6989053767784907




best score: 0.6989053767784907




best score: 0.6989053767784907




best score: 0.6989053767784907




best score: 0.6989053767784907




best score: 0.6989053767784907




best score: 0.6989053767784907




best score: 0.6989053767784907




best score: 0.6989053767784907




best score: 0.6989053767784907




best score: 0.6989053767784907




best score: 0.6989053767784907




best score: 0.6989053767784907




best score: 0.6989053767784907




best score: 0.6989053767784907




best score: 0.6989053767784907




best score: 0.6989053767784907




best score: 0.6986173998166476




best score: 0.6989053767784907




best score: 0.6989053767784907




best score: 0.6989053767784907




best score: 0.6989053767784907




best score: 0.6989053767784907




best score: 0.6989053767784907




best score: 0.6989053767784907




best score: 0.6989053767784907




best score: 0.6989053767784907




best score: 0.6989053767784907




best score: 0.6989053767784907




best score: 0.6989053767784907




best score: 0.6989053767784907




best score: 0.6989053767784907




best score: 0.6989053767784907




best score: 0.6989053767784907




best score: 0.6989053767784907




best score: 0.6989053767784907




best score: 0.6989053767784907




best score: 0.6989053767784907




best score: 0.6989053767784907




best score: 0.6989053767784907




best score: 0.6989053767784907




best score: 0.6989053767784907




best score: 0.6989053767784907


### Logistic Regression

In [9]:
for i in range(0,5):
    params_LOG = {
        'C': (1e-6, 1e+6, 'log-uniform'),
        'solver':[ "liblinear", "saga", "lbfgs"],
        'max_iter':Integer(100, 500, 'uniform'),

    }


    # log-uniform: understand as search over p = exp(x) by varying x
    LOGREG_opt = BayesSearchCV(
        LogisticRegression(),
        search_spaces=params_LOG,
        n_iter=50,
        cv=10,
        n_jobs=20,
        n_points = 4,
        return_train_score = True

    )


    def on_step(optim_result):
        score = -optim_result['fun']
        print("best score: %s" % score)
        if score >= 0.98:
            print('Interrupting!')
            return True
    
    LOGREG_opt.fit(tfidf_dat[filter_q], y_dat[filter_q], callback=on_step)
    
    ## save model
    with open(out+'/{}_model.p'.format("logreg_"+experiment+"_tfidf_"+str(i)), "wb") as f:
        pickle.dump(LOGREG_opt, f)
    
    ## save predictions
    probs = LOGREG_opt.predict_proba(test)
    decision = LOGREG_opt.predict(test)
    with open(out+'/{}.p'.format("logreg_"+experiment+"_tfidf_"+str(i)), "wb") as f:
        pickle.dump({'probs' : probs, 'decision' : decision}, f)

best score: 0.6952339812814975


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

best score: 0.6952339812814975


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

best score: 0.6952339812814975




best score: 0.6952339812814975


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

best score: 0.6952339812814975
best score: 0.6952339812814975




best score: 0.6952339812814975


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

best score: 0.6952339812814975




best score: 0.6952339812814975
best score: 0.6952339812814975


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

best score: 0.6952339812814975




best score: 0.6952339812814975
best score: 0.6952339812814975
best score: 0.6926435870741852
best score: 0.6937947180052935
best score: 0.6943697396294628


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

best score: 0.6943697396294628


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

best score: 0.6943706201345619




best score: 0.6943706201345619


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


best score: 0.6944423554029348
best score: 0.6944423554029348


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

best score: 0.6947303323647777
best score: 0.6948736993126879


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

best score: 0.6949458489364015
best score: 0.6949458489364015




best score: 0.6950180503545328
best score: 0.6811296362478052


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

best score: 0.6811296362478052


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

best score: 0.6935785799006583




best score: 0.6948742172568642


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

best score: 0.6948742172568642


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

best score: 0.6948742172568642
best score: 0.6948742172568642
best score: 0.6950899410061584


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt



best score: 0.6950899410061584


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


best score: 0.6950899410061584
best score: 0.6950899410061584
best score: 0.6950899410061584
best score: 0.6950899410061584


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

best score: 0.6664485603741629




best score: 0.6939388100750501




best score: 0.6939388100750501
best score: 0.694658338124317


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

best score: 0.694658338124317
best score: 0.694658338124317


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

best score: 0.694658338124317


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

best score: 0.694658338124317
best score: 0.694658338124317


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt



best score: 0.694658338124317
best score: 0.694658338124317
best score: 0.694658338124317
best score: 0.694658338124317




best score: 0.6939389654583029


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

best score: 0.6940108043155109




best score: 0.6940108043155109


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

best score: 0.6940108043155109
best score: 0.6952338776926623




best score: 0.6952338776926623
best score: 0.6952338776926623
best score: 0.6952338776926623


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

best score: 0.6952338776926623


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

best score: 0.6952338776926623




best score: 0.6952338776926623
best score: 0.6953058201387055
best score: 0.6953058201387055


### XGBoost

In [None]:
for i in range(0,5):
    params_XG = {
        'max_depth': Integer(1, 20, 'uniform'),
        'learning_rate': Real(10**-5, 10**0,"log-uniform"),
        "min_samples_split" :(2,100,"uniform"),
        "min_samples_leaf":(2,100,"uniform")
    }


    # log-uniform: understand as search over p = exp(x) by varying x
    XG_opt = BayesSearchCV(
        GradientBoostingClassifier(),
        search_spaces=params_XG,
        n_iter=50,
        cv=10,
        n_jobs=20,
        n_points = 4
    )

    def on_step(optim_result):
        score = -optim_result['fun']
        print("best score: %s" % score)
        if score >= 0.98:
            print('Interrupting!')
            return True

    XG_opt.fit(tfidf_dat[filter_q], y_dat[filter_q], callback=on_step)
    
    ## save model
    with open(out+'/{}_model.p'.format("xg_"+experiment+"_tfidf_"+str(i)), "wb") as f:
        pickle.dump(XG_opt, f)
    
    ## save predictions
    probs = XG_opt.predict_proba(test)
    decision = XG_opt.predict(test)
    with open(out+'/{}.p'.format("xg_"+experiment+"_tfidf_"+str(i)), "wb") as f:
        pickle.dump({'probs' : probs, 'decision' : decision}, f)

best score: 0.6440701607180778
best score: 0.6440701607180778
best score: 0.6440701607180778
best score: 0.6534964339543484
best score: 0.6800495154632233
best score: 0.6800495154632233
best score: 0.6800495154632233
best score: 0.6800495154632233
best score: 0.6800495154632233
best score: 0.6800495154632233
best score: 0.6800495154632233
best score: 0.6823522434751983
best score: 0.6823522434751983
best score: 0.5903843663729924
best score: 0.6453620688762165
best score: 0.6453620688762165
best score: 0.6619146842353331
best score: 0.6789709485111695
best score: 0.6789709485111695
best score: 0.6835046174723288
best score: 0.6835046174723288
best score: 0.6835046174723288
best score: 0.6835046174723288
best score: 0.6835046174723288
best score: 0.6835046174723288
best score: 0.6835046174723288
best score: 0.6189549440361317
best score: 0.6189549440361317
best score: 0.6658006640044336
best score: 0.6765943616597004
best score: 0.6765943616597004
best score: 0.6765943616597004
best sco

# FASTTEXT EMBEDDING EVALUATION
Now we start building models based on fasttext embeddings. We build the following models:
1. Naive Bayes
2. Logistic Regression
3. Gradient Boosted Trees
4. AutoML pipeline

In [None]:
import fasttext.util
fasttext.util.download_model('de', if_exists='--overwrite')
ft = fasttext.load_model('cc.de.300.bin')

In [None]:
## get sentence embedding
dat_embedding = np.array([ft.get_sentence_vector(x) for x in preprocessed])
dat_embedding[filter_q]

In [None]:
## specify test data
test_embed = dat_embedding[-filter_q]
test_embed_y = y_dat[-filter_q]

In [None]:
for i in range(0,5):
    
    params_NB = {
        'var_smoothing': np.logspace(0,-9, num=100)
    }


    # log-uniform: understand as search over p = exp(x) by varying x
    NB_embed_opt = BayesSearchCV(
        GaussianNB(),
        search_spaces=params_NB,
        n_iter=50,
        cv=10,
        n_jobs=20,
        n_points = 4

    )


    def on_step(optim_result):
        score = -optim_result['fun']
        print("best score: %s" % score)
        if score >= 0.98:
            print('Interrupting!')
            return True

    NB_embed_opt.fit(dat_embedding[filter_q], y_dat[filter_q], callback=on_step)
    #dump(NB_embed_opt,"Evaluation/Baseline-Results/NB_model_embed_"+str(i)+"_"+experiment)
    
    ## save model
    with open(out+'/{}_model.p'.format("nb_"+experiment+"_embed_"+str(i)), "wb") as f:
        pickle.dump(NB_embed_opt, f)
        
    ## save predictions
    probs = NB_embed_opt.predict_proba(test_embed)
    decision = NB_embed_opt.predict(test_embed)
    with open(out+'/{}.p'.format("nb_"+experiment+"_embed_"+str(i)), "wb") as f:
        pickle.dump({'probs' : probs, 'decision' : decision}, f)
    



In [None]:
for i in range(0,5):
    params_LOG = {
        'C': (1e-6, 1e+6, 'log-uniform'),
        'solver':[ "liblinear", "saga", "lbfgs"],

    }

    # log-uniform: understand as search over p = exp(x) by varying x
    LOGREG_embed_opt = BayesSearchCV(
        LogisticRegression(),
        search_spaces=params_LOG,
        n_iter=50,
        cv=10,
        n_jobs=20,
        return_train_score = True

    )


    # callback handler
    def on_step(optim_result):
        score = -optim_result['fun']
        print("best score: %s" % score)
        if score >= 0.98:
            print('Interrupting!')
            return True


    LOGREG_embed_opt.fit(dat_embedding[filter_q], y_dat[filter_q], callback=on_step)
    #dump(NB_embed_opt,"Evaluation/Baseline-Results/NB_model_embed_"+str(i)+"_"+experiment)
    
    ## save model
    with open(out+'/{}_model.p'.format("logreg_"+experiment+"_embed_"+str(i)), "wb") as f:
        pickle.dump(LOGREG_embed_opt, f)
        
    ## save predictions
    probs = LOGREG_embed_opt.predict_proba(test_embed)
    decision = LOGREG_embed_opt.predict(test_embed)
    with open(out+'/{}.p'.format("logreg_"+experiment+"_embed_"+str(i)), "wb") as f:
        pickle.dump({'probs' : probs, 'decision' : decision}, f)



In [None]:
for i in range(0,5):
    params_XG = {
        'max_depth': Integer(1, 20, 'uniform'),
        'learning_rate': Real(10**-5, 10**0,"log-uniform"),
        "min_samples_split" :(2,100,"uniform"),
        "min_samples_leaf":(2,100,"uniform"),
    }


    # log-uniform: understand as search over p = exp(x) by varying x
    XG_embed_opt = BayesSearchCV(
        GradientBoostingClassifier(),
        search_spaces=params_XG,
        n_iter=50,
        cv=10,
        n_jobs=20,
        n_points = 4
    )    

        # callback handler
    def on_step(optim_result):
        score = -optim_result['fun']
        print("best score: %s" % score)
        if score >= 0.98:
            print('Interrupting!')
            return True


    XG_embed_opt.fit(dat_embedding[filter_q], y_dat[filter_q], callback=on_step)
    #dump(NB_embed_opt,"Evaluation/Baseline-Results/NB_model_embed_"+str(i)+"_"+experiment)
    
    ## save model
    with open(out+'/{}_model.p'.format("xg_"+experiment+"_embed_"+str(i)), "wb") as f:
        pickle.dump(XG_embed_opt, f)
        
    ## save predictions
    probs = XG_embed_opt.predict_proba(test_embed)
    decision = XG_embed_opt.predict(test_embed)
    with open(out+'/{}.p'.format("xg_"+experiment+"_embed_"+str(i)), "wb") as f:
        pickle.dump({'probs' : probs, 'decision' : decision}, f)

