This is all just for fun. The author is not a racist in any way.

In [1]:
from sklearn.model_selection import train_test_split
import torch
import pandas as pd

In [2]:
train_data_unintended_bias = pd.read_csv("./Data/jigsaw-unintended-bias-train.csv")

In [3]:
train_data_unintended_bias.dropna(inplace = True)

train_data_unintended_bias["sum"] = train_data_unintended_bias.iloc[:, 8:32].values.sum(axis = 1)
train_data_unintended_bias = train_data_unintended_bias[train_data_unintended_bias["sum"] >= 0.5]

x_test = torch.tensor(train_data_unintended_bias.iloc[:, 8:32].values, dtype = torch.float)
y_test = torch.tensor(train_data_unintended_bias.iloc[:, 2].values, dtype = torch.float)

print("toxicity fraction in test: ", y_test.round().sum() / y_test.shape[0])

toxic = train_data_unintended_bias[train_data_unintended_bias["toxic"] >= 0.5]
untoxic = train_data_unintended_bias[train_data_unintended_bias["toxic"] < 0.5].sample(10000)

data = pd.concat((toxic, toxic, untoxic))
print("toxicity fraction in train: ", data.iloc[:, 2].values.round().sum() / data.shape[0])

toxicity fraction in test:  tensor(0.1008)
toxicity fraction in train:  0.513152514651765


In [4]:
racist_x_train = torch.tensor(data.iloc[:, 8:32].values, dtype = torch.float)
racist_y_train = torch.tensor(data.iloc[:, 2].values, dtype = torch.float)

def make_labels(x):
    for i in range(x.shape[0]):
        x[i] = 0 if x[i] <= 0.5 else 1
    return x
        
racist_y_train = make_labels(racist_y_train)
y_test = make_labels(y_test)     

In [5]:
racist_x_train, racist_x_test, racist_y_train, racist_y_test = train_test_split(
    racist_x_train, 
    racist_y_train, 
    test_size = 0.3,
    shuffle = True)

In [6]:
class CatDataset(torch.utils.data.Dataset):
    def __init__(self, x, y):
        super().__init__()
        
        self.x = x
        self.y = y
        self.index_list = []
        
    def __len__(self):
        return self.x.shape[0]
    
    def __getitem__(self, i):
        return (self.x[i], self.y[i])

In [7]:
import torch.nn as nn

In [8]:
class RacistClassifier(nn.Module):
    def __init__(self):
        super().__init__()
        
        self.linear1 = nn.Linear(24, 64)
        self.softmax = nn.Softmax(dim = 1)
        self.elu = nn.ELU()
        self.linear2 = nn.Linear(64, 32)
        self.linear3 = nn.Linear(32, 2)
    
    def forward(self, x):
        x = self.linear1(x.float())
        x = self.elu(x)
        x = self.linear2(x)
        x = self.elu(x)
        x = self.linear3(x)
        
        return x
    
    def predict(self, x):
        return self.softmax(self.forward(x))

In [9]:
model = RacistClassifier()
optimizer = torch.optim.Adam(model.parameters(), lr = 3e-4)
cross_entropy = nn.CrossEntropyLoss()

In [10]:
training_set = CatDataset(racist_x_train, racist_y_train)
training_generator = torch.utils.data.DataLoader(training_set, batch_size = 512, shuffle = True)

validation_set = CatDataset(racist_x_test, racist_y_test)
validation_generator = torch.utils.data.DataLoader(validation_set)

n_iterations = 100

from sklearn.metrics import accuracy_score
from tqdm import tqdm

loss = 0

for epoch in tqdm(range(n_iterations)):
    for local_batch, local_labels in training_generator:
        optimizer.zero_grad()
        output = model(local_batch)

        loss = cross_entropy(output, local_labels.long())

        loss.backward()
        optimizer.step()

100%|██████████| 100/100 [01:29<00:00,  1.12it/s]


In [11]:
from sklearn.metrics import accuracy_score

# the first score is on train class distribution
# the second score is on initial class distribution
print(accuracy_score(torch.argmax(model.predict(racist_x_test), dim = 1), racist_y_test))
accuracy_score(torch.argmax(model.predict(x_test), dim = 1), y_test)

0.5799197395320663


0.5675350165045945

In [12]:
from sklearn.ensemble import RandomForestClassifier

random_forest = RandomForestClassifier(max_depth = 20)
random_forest.fit(racist_x_train, racist_y_train)

print(random_forest.score(racist_x_test, racist_y_test))
random_forest.score(x_test, y_test)

0.6395093511016885


0.5835043268801856

In [13]:
random_forest.predict(torch.tensor([0,1,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,1,0,0,1,0,0]).reshape(1, -1))

array([1.], dtype=float32)

In [14]:
from sklearn.ensemble import GradientBoostingClassifier

boosting = GradientBoostingClassifier()
boosting.fit(racist_x_train, racist_y_train)

print(boosting.score(racist_x_test, racist_y_test))
boosting.score(x_test, y_test)

0.5859771333383812


0.5941743242037648

In [15]:
from joblib import dump, load
dump(random_forest, 'random_forest.joblib') 

['random_forest.joblib']

In [41]:
from sklearn.metrics import confusion_matrix

tn, fp, fn, tp = confusion_matrix(torch.argmax(model.predict(x_test), dim = 1), y_test).ravel()

print((tn, fp, fn, tp), y_test.sum(), (y_test.shape[0] - y_test.sum()))

print("Custom model")
print((tp / y_test.sum()).item() * 100, "% of toxic comments were predicted correctly")
print((fn / (y_test.shape[0] - y_test.sum())).item() * 100, "% of toxic comments were marked non-toxic")
print((tn / (y_test.shape[0] - y_test.sum())).item() * 100, "% of non-toxic comments were predicted correctly")
print((fp / y_test.sum()).item() * 100, "% of non-toxic comments were marked toxic")

(55647, 3327, 45148, 7968) tensor(11295.) tensor(100795.)
Custom model
70.5444872379303 % of toxic comments were predicted correctly
44.79190707206726 % of toxic comments were marked non-toxic
55.20809888839722 % of non-toxic comments were predicted correctly
29.455509781837463 % of non-toxic comments were marked toxic


In [42]:
tn, fp, fn, tp = confusion_matrix(random_forest.predict(x_test), y_test).ravel()

print("Random Forest")
print((tp / y_test.sum()).item() * 100, "% of toxic comments were predicted correctly")
print((fn / (y_test.shape[0] - y_test.sum())).item() * 100, "% of toxic comments were marked non-toxic")
print((tn / (y_test.shape[0] - y_test.sum())).item() * 100, "% of non-toxic comments were predicted correctly")
print((fp / y_test.sum()).item() * 100, "% of non-toxic comments were marked toxic")

Random Forest
79.14121150970459 % of toxic comments were predicted correctly
43.97936463356018 % of toxic comments were marked non-toxic
56.02063536643982 % of non-toxic comments were predicted correctly
20.85878700017929 % of non-toxic comments were marked toxic


In [43]:
tn, fp, fn, tp = confusion_matrix(boosting.predict(x_test), y_test).ravel()

print("Boosting")
print((tp / y_test.sum()).item() * 100, "% of toxic comments were predicted correctly")
print((fn / (y_test.shape[0] - y_test.sum())).item() * 100, "% of toxic comments were marked non-toxic")
print((tn / (y_test.shape[0] - y_test.sum())).item() * 100, "% of non-toxic comments were predicted correctly")
print((fp / y_test.sum()).item() * 100, "% of non-toxic comments were marked toxic")

Boosting
68.64984035491943 % of toxic comments were predicted correctly
41.617146134376526 % of toxic comments were marked non-toxic
58.38285684585571 % of non-toxic comments were predicted correctly
31.35015368461609 % of non-toxic comments were marked toxic
