In [None]:
import os
import re

import catboost
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
nltk.download("stopwords")

In [None]:
# Seed passed to the train/validation split and to every CatBoost model.
SEED = 1

# One binary target column per toxicity category; a separate model is trained for each.
LABELS = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]
N_LABELS = len(LABELS)

# Read the training comments, the test comments and the test labels (in that order).
train, test, test_labels = map(
    pd.read_csv,
    ('../../Data/train.csv', '../../Data/test.csv', '../../Data/test_labels.csv'),
)

In [None]:
# Pattern for filtering English stopwords, adapted from https://stackoverflow.com/questions/19560498/faster-way-to-remove-stop-words-in-python
# The stopword list is lowercase, so the pattern is compiled case-insensitively; otherwise
# capitalised occurrences such as "The", "I" or "You" would survive the filter.
# Each word is passed through re.escape so it is always matched as literal text.
# A match also swallows the whitespace that follows the stopword.
stopword_pattern = re.compile(
    r'\b(' + r'|'.join(map(re.escape, stopwords.words('english'))) + r')\b\s*',
    flags=re.IGNORECASE,
)

# Preprocessing
def preprocessComments(comment, pattern=None):
    """Normalise a raw comment before it is handed to the text model.

    Steps, in order: remove stopwords, remove ASCII digits, remove every
    character that is neither a word character nor whitespace, collapse each
    run of whitespace (spaces, tabs, newlines) into a single space, and
    finally strip leading/trailing whitespace.

    Whitespace is normalised last on purpose: the earlier removals can leave
    double spaces or dangling spaces behind, so collapsing/stripping before
    them would not give a clean result.

    Args:
        comment: The raw comment text. Non-string values (e.g. ``None`` or
            ``NaN`` for a missing comment) are treated as empty text.
        pattern: Optional compiled regex matching the stopwords to remove.
            Defaults to the module-level ``stopword_pattern``.

    Returns:
        The cleaned comment as a string with single-space separators and no
        leading or trailing whitespace.
    """
    if not isinstance(comment, str):
        # Missing values would otherwise crash on the string operations below.
        return ''

    if pattern is None:
        pattern = stopword_pattern

    # Remove stopwords
    comment = pattern.sub('', comment)

    # Remove numbers
    comment = re.sub(r'[0-9]', '', comment)

    # Remove anything that is not a word character (letters, digits, underscore) or whitespace
    comment = re.sub(r'[^\w\s]', '', comment)

    # Collapse newlines and consecutive whitespace into a single space
    comment = re.sub(r'\s+', ' ', comment)

    # Remove leading and trailing spaces left over from the steps above
    return comment.strip()

# Clean the comment text of both the training and the test frame in place.
for frame in (train, test):
    frame['comment_text'] = frame['comment_text'].map(preprocessComments)

In [None]:
# Attach the label columns to the test comments; pd.merge joins on the columns both frames share.
test_filtered = pd.merge(test, test_labels)

# Discard every row whose 'toxic' label is -1
# (presumably -1 marks rows without a usable label -- TODO confirm against the dataset description).
is_unlabelled = test_filtered['toxic'] == -1
test_filtered = test_filtered.drop(test_filtered.index[is_unlabelled])
test_filtered.shape

In [None]:
# Test labels as a float32 matrix with the first column dropped
# (presumably the id column -- TODO confirm). Built from the unfiltered
# test_labels, so rows labelled -1 are still included.
# NOTE(review): not referenced by any other cell visible here.
test_numpy = test_labels.to_numpy()[:, 1:].astype('float32')

In [None]:
# Hold out 20% of the training rows as a validation set (seeded for reproducibility).
train_set, validation_set = train_test_split(
    train,
    test_size=0.2,
    random_state=SEED,
)

# Build one CatBoost pool per label from a dataset
def createPool(dataset, use_label=True):
    """Create a ``catboost.Pool`` for every entry of ``LABELS``.

    Each pool has ``comment_text`` as its only feature, declared as a text
    feature. When ``use_label`` is true, the dataset column named after the
    label is attached as the pool's target; otherwise the pools carry no
    target.

    Args:
        dataset: DataFrame with a ``comment_text`` column and, when
            ``use_label`` is true, one column per label.
        use_label: Whether to attach the label column as the target.

    Returns:
        A dict mapping each label name to its pool.
    """
    pools = {}
    for label in LABELS:
        pool_kwargs = {'text_features': ['comment_text']}
        if use_label:
            pool_kwargs['label'] = dataset[label]
        pools[label] = catboost.Pool(dataset[['comment_text']], **pool_kwargs)
    return pools


# Per-label pools for the training split, the validation split and the labelled test rows.
train_pools, val_pools, test_pools = (
    createPool(frame)
    for frame in (train_set, validation_set, test_filtered)
)

In [None]:
# Train one binary CatBoost classifier per label, evaluated on the validation pools.
EARLY_STOPPING_ROUNDS = 350  # same value is used for od_wait and early_stopping_rounds
classifier_params = dict(
    learning_rate=0.3,
    iterations=5000,
    eval_metric='F1',
    od_wait=EARLY_STOPPING_ROUNDS,
    od_type='Iter',
    random_seed=SEED,
)

models = {}
for label in LABELS:
    print(label)
    classifier = catboost.CatBoostClassifier(**classifier_params)
    models[label] = classifier

    classifier.fit(
        train_pools[label],
        eval_set=val_pools[label],
        verbose=100,
        early_stopping_rounds=EARLY_STOPPING_ROUNDS,
        use_best_model=True,
    )

In [None]:
# Uninitialised buffer: one row per labelled test comment, one column per label.
n_test_rows = len(test_filtered)
predictions = np.ndarray((n_test_rows, N_LABELS))


In [None]:
# Score every per-label model on the labelled test rows using its hard class predictions.
predictions = np.ndarray((len(test_filtered), N_LABELS))

label_scores = []
for col, label in enumerate(LABELS):
    print(label, ":")
    predictions[:, col] = models[label].predict(test_pools[label])
    label_f1 = f1_score(test_filtered[label], predictions[:, col])
    print(label_f1)
    label_scores.append(label_f1)

# Unweighted mean of the per-label F1 scores.
avg = sum(label_scores) / N_LABELS
print("Average f1-score:", avg)

In [None]:
# Save models
# Create the output directory up front so this cell also works on a fresh
# checkout where ./catboost_models/ does not exist yet.
MODEL_DIR = "./catboost_models/"
os.makedirs(MODEL_DIR, exist_ok=True)

for label in LABELS:
    models[label].save_model(MODEL_DIR + label + "_model")

In [None]:
# Reload the saved per-label models so predictions can be re-run without retraining.
models = {}

for label in LABELS:
    model_path = f"./catboost_models/{label}_model"
    models[label] = catboost.CatBoostClassifier()
    models[label].load_model(model_path)

In [None]:
# Re-score the reloaded models on the labelled test rows, this time applying
# the decision threshold to the predicted probabilities explicitly.
DECISION_THRESHOLD = 0.5

predictions = np.ndarray((test_filtered.shape[0], N_LABELS))
avg = 0
for i, label in enumerate(LABELS):
    print(label, ":")
    p = models[label].predict_proba(test_pools[label])
    # Column 1 is taken as the positive-class probability; the boolean result
    # is stored as 0.0 / 1.0 in the float buffer.
    predictions[:, i] = (p[:, 1] >= DECISION_THRESHOLD)
    # Computed once per label (the original evaluated the same call twice).
    score = f1_score(test_filtered[label], predictions[:, i])
    print(score)
    avg += score

# Unweighted mean of the per-label F1 scores.
avg /= N_LABELS
print("Average f1-score:", avg)
    

In [None]:
# Export the raw positive-class probabilities for every label.
# The probabilities go into a new frame rather than into test_filtered itself:
# test_filtered holds the ground-truth labels that the scoring cells read, so
# overwriting them would make those cells un-rerunnable. The written CSV has
# the same columns, in the same order, as before.
test_predictions = test_filtered.assign(**{
    label: models[label].predict_proba(test_pools[label])[:, 1]
    for label in LABELS
})
test_predictions.to_csv('catboost_predictions_raw.csv', index=False)