In [20]:
import pandas as pd
import numpy as np
import torch



def read_file(fname: str) -> pd.DataFrame:
    """Read a tab-separated dataset and format it for simpletransformers.

    The file is read without a header row; its three columns are named
    ``text``, ``labels`` and ``role``.

    Args:
        fname: Path to the tab-separated file.

    Returns:
        A DataFrame with two columns: ``text`` (as read from the file) and
        ``labels`` (``np.int8``), where 0 marks rows whose label is exactly
        ``"Acceptable speech"`` and 1 marks every other row. The ``role``
        column is dropped.
    """
    df = pd.read_table(fname, sep="\t", header=None, names="text,labels,role".split(","))

    # Build the binary label in one vectorized step. Chained assignment of the
    # form ``df.labels[mask] = 1`` writes through an intermediate Series: it
    # triggers SettingWithCopyWarning and does not update ``df`` at all under
    # pandas Copy-on-Write, which would leave the string labels in place.
    is_offensive = df["labels"] != "Acceptable speech"
    df["labels"] = is_offensive.astype(np.int8)

    return df.drop(columns=["role"])

# English

In [27]:
%%time
# Cell-local imports, grouped up front.
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import accuracy_score, f1_score
from sklearn.svm import SVC

# Load the English train/test splits.
train_fname = "../data/lgbt-en.train.tsv"
test_fname = "../data/lgbt-en.test.tsv"
train = read_file(train_fname)
test = read_file(test_fname)

# Count word 1- to 3-grams; the vocabulary is learned from the training texts only.
count_vect = CountVectorizer(ngram_range=(1, 3))
X_train_counts = count_vect.fit_transform(train["text"].values)

# Re-weight the raw counts with TF-IDF (weights fitted on the training counts).
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

# Train an SVM with scikit-learn's default settings.
clf = SVC()
clf.fit(X_train_tfidf, train["labels"])

# Push the test texts through the already-fitted vectorizer and transformer.
docs_new = test["text"].values.tolist()
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)
predicted = clf.predict(X_new_tfidf)

# Score the predictions against the gold labels.
y_true = test["labels"]
accuracy = accuracy_score(y_true, predicted)
print("Accuracy: ", accuracy)
f1 = f1_score(y_true, predicted)
print("F1 score: ", f1)

Accuracy:  0.7571288102261554
F1 score:  0.22082018927444794
CPU times: user 6.36 s, sys: 24 ms, total: 6.38 s
Wall time: 6.38 s


# Slovenian

In [25]:
%%time
# Cell-local imports, grouped up front.
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import accuracy_score, f1_score
from sklearn.svm import SVC

# Load the Slovenian train/test splits.
train_fname = "../data/lgbt-sl.train.tsv"
test_fname = "../data/lgbt-sl.test.tsv"
train = read_file(train_fname)
test = read_file(test_fname)

# Count word 1- to 3-grams; the vocabulary is learned from the training texts only.
count_vect = CountVectorizer(ngram_range=(1, 3))
X_train_counts = count_vect.fit_transform(train.text.values)

# Re-weight the raw counts with TF-IDF. Only these IDF-weighted features feed
# the classifier, so no separate use_idf=False transform is computed here.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

# Train an SVM with scikit-learn's default settings.
clf = SVC().fit(X=X_train_tfidf, y=train.labels)

# Push the test texts through the already-fitted vectorizer and transformer.
docs_new = test.text.values.tolist()
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)
predicted = clf.predict(X_new_tfidf)

# Score the predictions against the gold labels.
y_true = test["labels"]
accuracy = accuracy_score(y_true, predicted)
print("Accuracy: ", accuracy)
f1 = f1_score(y_true, predicted)
print("F1 score: ", f1)

Accuracy:  0.5788888888888889
F1 score:  0.5058670143415906
CPU times: user 3.16 s, sys: 3.99 ms, total: 3.17 s
Wall time: 3.17 s


# Croatian

In [26]:
%%time
# Cell-local imports, grouped up front.
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import accuracy_score, f1_score
from sklearn.svm import SVC

# Load the Croatian train/test splits.
train_fname = "../data/lgbt-hr.train.tsv"
test_fname = "../data/lgbt-hr.test.tsv"
train = read_file(train_fname)
test = read_file(test_fname)

# Count word 1- to 3-grams; the vocabulary is learned from the training texts only.
count_vect = CountVectorizer(ngram_range=(1, 3))
X_train_counts = count_vect.fit_transform(train.text.values)

# Re-weight the raw counts with TF-IDF. Only these IDF-weighted features feed
# the classifier, so no separate use_idf=False transform is computed here.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

# Train an SVM with scikit-learn's default settings.
clf = SVC().fit(X=X_train_tfidf, y=train.labels)

# Push the test texts through the already-fitted vectorizer and transformer.
docs_new = test.text.values.tolist()
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)
predicted = clf.predict(X_new_tfidf)

# Score the predictions against the gold labels.
y_true = test["labels"]
accuracy = accuracy_score(y_true, predicted)
print("Accuracy: ", accuracy)
f1 = f1_score(y_true, predicted)
print("F1 score: ", f1)

Accuracy:  0.7049036777583187
F1 score:  0.8105677346824058
CPU times: user 4.96 s, sys: 24 ms, total: 4.99 s
Wall time: 4.99 s
