# RP-Crowd-2 Analysis
In this notebook we analyse the RP-Crowd-2 dataset and train AutoML (auto-sklearn) classifiers on TF-IDF features and on fastText sentence embeddings.

In [2]:
#!pip install scikit-learn
import nltk
import re
import spacy
import numpy as np
from tqdm import tqdm
from nltk.corpus import stopwords  
import pandas as pd
import matplotlib.pyplot as plt


##sklearn
from sklearn.model_selection import PredefinedSplit
from sklearn.metrics import precision_recall_curve
import sklearn.datasets
import sklearn.metrics
import sklearn.model_selection
import sklearn.datasets
import sklearn.metrics


## automl
import autosklearn.classification
import six.moves.cPickle as pickle
from sklearn.base import TransformerMixin


# Preprocessing

In [None]:
## Load the dataset together with its predefined fold assignment.

# Tag appended to the file names of the pickled models written further below.
experiment = "abusive_min_2"

## read the fold-annotated corpus
df = pd.read_csv("../../../Dataset/Text-Data/RP-Crowd-2-folds.csv")

## labels as an integer array
y_dat = df["label"].values.astype(int)

## Boolean row mask: True where `ten_folds` < 8.
## The cells below fit their models on the rows where the mask is True and
## use the remaining rows as test data.
filter_q = df["ten_folds"].lt(8)


In [None]:
class TextPreprocessingTransformer(TransformerMixin):
    """Stateless text cleaner: strips noise, lower-cases and lemmatises documents.

    ``fit`` learns nothing; all of the work happens in ``transform``.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Return ``self`` unchanged; there is nothing to fit."""
        return self

    def transform(self, X):
        """Clean and lemmatise every document in ``X``.

        Parameters
        ----------
        X : iterable
            Documents to process. Each element is converted with ``str()``.
            Elements are consumed by iteration (not by positional indexing),
            so lists, NumPy arrays and pandas Series with any index work.

        Returns
        -------
        list of str
            One space-joined string of spaCy lemmas per input document, in
            input order.
        """
        # Loaded on every call; requires the "de_core_news_lg" spaCy model.
        nlp = spacy.load("de_core_news_lg")

        documents = []
        for raw in tqdm(X):
            # Replace every non-word character with a space
            document = re.sub(r'\W', ' ', str(raw))

            # Remove digits
            document = re.sub(r'[0-9]', ' ', document)

            # Remove single ASCII letters surrounded by whitespace
            document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)

            # Remove a single ASCII letter at the start of the document.
            # The anchor must be an unescaped `^`: an escaped `\^` matches a
            # literal caret, which the `\W` substitution above has already
            # removed, so the step could never match.
            document = re.sub(r'^[a-zA-Z]\s+', ' ', document)

            # Collapse runs of whitespace into a single space
            document = re.sub(r'\s+', ' ', document)

            # Remove a leading 'b' token
            document = re.sub(r'^b\s+', '', document)

            # Lower-case before lemmatisation
            document = document.lower()

            # Lemmatise with spaCy and join the lemmas back into one string
            lemmas = [token.lemma_ for token in nlp(document)]
            documents.append(' '.join(lemmas))

        return documents

In [None]:
nltk.download("stopwords")
german_stop_words = stopwords.words('german')

from sklearn.feature_extraction.text import TfidfVectorizer

tf = TfidfVectorizer(stop_words=german_stop_words, min_df = 5, ngram_range=(1,2), max_features=3224)
preprocessor = TextPreprocessingTransformer()


## clean and lemmatise every document of the corpus
preprocessed = preprocessor.transform(np.array(df["text"]))

## Learn the vocabulary and IDF weights from the training rows only
## (`filter_q` is True for them). Fitting on the whole corpus would let the
## held-out rows influence the features, i.e. leak test data into training.
train_docs = [doc for doc, in_train in zip(preprocessed, filter_q) if in_train]
tf.fit(train_docs)

## Vectorise all rows with the train-fitted vectorizer so that `tfidf_dat`
## stays row-aligned with `df`, `y_dat` and `filter_q`.
tfidf_dat = tf.transform(preprocessed).toarray()
tfidf_dat.shape

In [None]:
## Specify the test data: every row that is NOT selected by `filter_q`.
## `~` is the element-wise logical NOT for boolean masks. Unary `-` raises a
## TypeError on NumPy boolean arrays, so the previous `-filter_q` only worked
## as long as the mask stayed a pandas Series.

test = tfidf_dat[~filter_q]
test_y = y_dat[~filter_q]

### AutoML

In [None]:
## Search for a classification pipeline on the TF-IDF features with auto-sklearn.
automl = autosklearn.classification.AutoSklearnClassifier(
    # budgets and resources
    time_left_for_this_task=18000,
    per_run_time_limit=600,
    memory_limit=None,
    n_jobs=40,
    # model selection: accuracy, estimated with 10-fold cross-validation
    metric=autosklearn.metrics.accuracy,
    resampling_strategy='cv',
    resampling_strategy_arguments={'folds': 10},
    ensemble_nbest=10,
    # working and output directories of the search
    tmp_folder='../../Evaluation/Best-Baseline-Results/autosklearn_tfidf_min2_classification_results',
    output_folder='../../Evaluation/Best-Baseline-Results/autosklearn_tfidf_min2_classification_results_out',
)

## fit on the rows selected by `filter_q` only
automl.fit(
    tfidf_dat[filter_q],
    y_dat[filter_q],
)

In [None]:
## Refit on the complete training data, i.e. all rows selected by `filter_q`.
automl.refit(
    tfidf_dat[filter_q],
    y_dat[filter_q],
)

In [None]:
## Persist the fitted TF-IDF AutoML model; the file name ends with the experiment tag.
## NOTE(review): this writes to "Baseline-Results" while the embedding model
## below is written to "Best-Baseline-Results" - confirm this is intended.
tfidf_model_path = "../../Evaluation/Baseline-Results/automl_model_tfidf_" + experiment
with open(tfidf_model_path, "wb") as model_file:
    pickle.dump(automl, model_file)

# FASTTEXT EMBEDDING EVALUATION
Next, we build models based on fastText sentence embeddings. We build the following models:
1. Naive Bayes
2. Logistic Regression
3. Gradient Boosted Trees
4. AutoML pipeline

In [None]:
# Load the fastText model used to embed the documents in the next cell.
import fasttext.util
# One-off download step, kept commented out (presumably fetches the German
# model file loaded below - confirm before enabling).
#fasttext.util.download_model('de', if_exists='ignore')
# The path is relative, so 'cc.de.300.bin' must be in the current working directory.
ft = fasttext.load_model('cc.de.300.bin')

In [None]:
## Embed every preprocessed document as one fastText sentence vector;
## the rows of `dat_embedding` follow the order of `preprocessed`.
dat_embedding = np.array(list(map(ft.get_sentence_vector, preprocessed)))

## show the embeddings of the rows selected by `filter_q`
dat_embedding[filter_q]

In [None]:
## Search for a classification pipeline on the sentence embeddings with auto-sklearn.
automl_emb = autosklearn.classification.AutoSklearnClassifier(
    # budgets and resources
    time_left_for_this_task=18000,
    per_run_time_limit=600,
    memory_limit=None,
    n_jobs=10,
    # model selection: accuracy, estimated with 10-fold cross-validation
    metric=autosklearn.metrics.accuracy,
    resampling_strategy='cv',
    resampling_strategy_arguments={'folds': 10},
    ensemble_nbest=10,
    # working and output directories of the search
    tmp_folder='../../Evaluation/Best-Baseline-Results/autosklearn_emb_min2_classification_results',
    output_folder='../../Evaluation/Best-Baseline-Results/autosklearn_emb_min2_classification_results_out',
)

## fit on the rows selected by `filter_q` only
automl_emb.fit(
    dat_embedding[filter_q],
    y_dat[filter_q],
)

In [None]:
## Refit on all rows selected by `filter_q`; explicit copies of the selected
## features and labels are handed to auto-sklearn.
automl_emb.refit(
    dat_embedding[filter_q].copy(),
    y_dat[filter_q].copy(),
)

In [None]:
## Persist the fitted embedding AutoML model; the file name ends with the experiment tag.
embed_model_path = "../../Evaluation/Best-Baseline-Results/autosklearn_model_embed_" + experiment
with open(embed_model_path, "wb") as model_file:
    pickle.dump(automl_emb, model_file)