In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split

In [19]:
train = pd.read_csv('toxic_train.csv.zip')
test = pd.read_csv('toxic_test.csv.zip')
test_labels = pd.read_csv('toxic_test_labels.csv.zip')
submission = pd.read_csv('toxic_sample_submission.csv.zip')

In [41]:
train.sample(5)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
19394,33346e9833575b63,Statement on Senate Passage of Immigration Ref...,0,0,0,0,0,0
11790,1f28836efffe7b3a,""" June 2007 (UTC)\n\n You know what I also fou...",0,0,0,0,0,0
127665,aad027e06369b922,"""\nI'd hate to bring this up again, but this e...",0,0,0,0,0,0
82955,ddf597e49f046122,and hope action is taken on this view,0,0,0,0,0,0
114343,63942d562221da7f,"""\n\nSpeedy deletion of Shree Swami Abhiramdas...",0,0,0,0,0,0


In [42]:
train.shape

(159571, 8)

In [43]:
# Defining model output
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
y = train[list_classes].values
print(y.shape)
print(y[:10])

(159571, 6)
[[0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [1 1 1 0 1 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]]


In [45]:
# Unbalanced data
toxic_ratio = (y.sum(axis = 1) > 0).sum()/y.shape[0]
print('Share of toxic comments:', toxic_ratio)

Share of toxic comments: 0.10167887648758234


In [48]:
print(train[list_classes].sum())
print()
print(train[list_classes].sum()/y.shape[0])

toxic            15294
severe_toxic      1595
obscene           8449
threat             478
insult            7877
identity_hate     1405
dtype: int64

toxic            0.095844
severe_toxic     0.009996
obscene          0.052948
threat           0.002996
insult           0.049364
identity_hate    0.008805
dtype: float64


In [50]:
# Multilabel
for cl in list_classes[1:]:
    N = ((train['toxic'] == 0) & (train[cl] == 1)).sum()
    print(f'Is {cl} but not toxic:', N)

Is severe_toxic but not toxic: 0
Is obscene but not toxic: 523
Is threat but not toxic: 29
Is insult but not toxic: 533
Is identity_hate but not toxic: 103


In [55]:
# Baseline (assuming not toxic)
1-(train[list_classes].sum().values/len(train)).mean()

0.9633412921729722

In [52]:
((y == np.zeros_like(y)).sum(axis=0)/len(y)).mean()

0.9633412921729722

In [53]:
train[list_classes].sum().values/len(train)

array([0.09584448, 0.00999555, 0.05294822, 0.00299553, 0.04936361,
       0.00880486])

#### 
#### LogReg
#### Training

In [56]:
X_train, X_valid, Y_train, Y_valid = train_test_split(train['comment_text'], y, test_size = 0.1)

print(X_train.shape, X_valid.shape)
print(Y_train.shape, Y_valid.shape)

(143613,) (15958,)
(143613, 6) (15958, 6)


####  
##### Inputs: vocabulary size = TFIDF, Count vectorizer
##### Output: 6 elememts vector, with toxicity label
##### Loss function: binary crossentropy 

####  
##### TFIDF

In [61]:
print(X_train[:5])
print(' ')
print(Y_train[:5])

85474    WHY WOULD U DELETE MY THING \n\nhey guys, why ...
31633    .\nNo worries - you're right, of course. There...
5407     , 6 January 2013 (UTC)\n\nHi Prodigyhk. Could ...
62168    Bishzilla already have greater influence then ...
45033    "Wait, a minut, after the last RedPen message ...
Name: comment_text, dtype: object
 
[[0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]
 [0 0 0 0 0 0]]


In [62]:
# Lowering text
raw_text_train = X_train.apply(str.lower)
raw_text_valid = X_valid.apply(str.lower)
raw_text_test = test["comment_text"].apply(str.lower)

In [63]:
print(raw_text_train[:5])

85474    why would u delete my thing \n\nhey guys, why ...
31633    .\nno worries - you're right, of course. there...
5407     , 6 january 2013 (utc)\n\nhi prodigyhk. could ...
62168    bishzilla already have greater influence then ...
45033    "wait, a minut, after the last redpen message ...
Name: comment_text, dtype: object


In [64]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [65]:
max_features = 10000

tfidf_vectorizer = TfidfVectorizer(max_df=0.11, # Excluding words appearing in >11% docs (exigent)
                                   min_df=1, # Inlcuding words appearing in at least 1 doc.
                                   ngram_range=(1,3),
                                   max_features=max_features,
                                   stop_words='english')

%time tfidf_matrix_train = tfidf_vectorizer.fit_transform(raw_text_train)

# Output: matrix with 

CPU times: user 54.4 s, sys: 376 ms, total: 54.8 s
Wall time: 55 s


In [66]:
%time tfidf_matrix_valid = tfidf_vectorizer.transform(raw_text_valid)

CPU times: user 1.58 s, sys: 0 ns, total: 1.58 s
Wall time: 1.89 s


In [83]:
count_vectorizer = CountVectorizer(max_df=0.11, min_df=1,
                                   max_features=max_features,
                                   stop_words='english')

%time count_matrix_train = count_vectorizer.fit_transform(raw_text_train)

CPU times: user 7.18 s, sys: 0 ns, total: 7.18 s
Wall time: 7.31 s


In [79]:
raw_text_train[0]

"explanation\nwhy the edits made under my username hardcore metallica fan were reverted? they weren't vandalisms, just closure on some gas after i voted at new york dolls fac. and please don't remove the template from the talk page since i'm retired now.89.205.38.27"

In [81]:
print(tfidf_matrix_train[0]) # Proportion (there are logarithms, check TFIDF method)

  (0, 4392)	0.4042735754078629
  (0, 5152)	0.1419586445350702
  (0, 6795)	0.3472265715284922
  (0, 2658)	0.2336910415456068
  (0, 3464)	0.27440491149779667
  (0, 1315)	0.1906900276976022
  (0, 460)	0.30862408961283666
  (0, 9071)	0.19304619511124346
  (0, 3178)	0.29301912642050976
  (0, 4233)	0.24835889825963853
  (0, 4391)	0.2200201496291496
  (0, 8878)	0.18962822219104564
  (0, 2627)	0.40638201320155715


In [91]:
print(count_matrix_train[0]) # Counter

  (0, 2696)	2
  (0, 9019)	1
  (0, 4389)	1
  (0, 4233)	1
  (0, 3191)	2
  (0, 9261)	1
  (0, 495)	1
  (0, 1286)	1
  (0, 9718)	1
  (0, 3476)	1
  (0, 2699)	1
  (0, 6820)	1
  (0, 5192)	1


In [86]:
tfidf_matrix_train.shape, count_matrix_train.shape

((143613, 10000), (143613, 10000))

In [90]:
sparsity = 1 - (tfidf_matrix_train>0).sum()/(tfidf_matrix_train.shape[0]*tfidf_matrix_train.shape[1])
print(f' Sparsity: {sparsity}')

 Sparsity: 0.9976036744584403


In [92]:
top_10 = np.argsort(tfidf_matrix_train.sum(axis=0))[0,::-1][0,:10].tolist()[0]

feature_names = np.array(tfidf_vectorizer.get_feature_names())

print(feature_names[np.array(top_10)])

['thanks' 'think' 'know' 'edit' 'did' 'people' 'user' 'time' 'good'
 'articles']


In [93]:
dense_matrix_train = tfidf_matrix_train.todense()
dense_matrix_valid = tfidf_matrix_valid.todense()

#### 
### NNs
#### Keras

In [94]:
from keras.models import Sequential
from keras.layers import Dense
from keras import initializers

In [96]:
input_features = dense_matrix_train.shape[1]
output_size = Y_train.shape[1]

In [100]:
print(f'input_features: {input_features}')
print(f'output_size: {output_size}')

input_features: 10000
output_size: 6


In [95]:
model_rl = Sequential()
model_rl.add(Dense(output_size, input_dim=input_features, activation='sigmoid', 
                   kernel_initializer=initializers.normal(mean=0, stddev=0.001)))
model_rl.summary()
model_rl.compile('Adam', loss='binary_crossentropy', metrics=['accuracy'])

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 6)                 60006     
Total params: 60,006
Trainable params: 60,006
Non-trainable params: 0
_________________________________________________________________


In [101]:
model_rl.evaluate(dense_matrix_valid, Y_valid)



[0.6931502223014832, 0.15590925514698029]

In [102]:
batch_size = 128
epochs = 20
model_rl.fit(dense_matrix_train, 
          Y_train, 
          batch_size = batch_size,
          epochs=epochs, 
          verbose=1, 
          validation_data=(dense_matrix_valid, Y_valid))

MemoryError: Unable to allocate 5.35 GiB for an array with shape (143613, 10000) and data type float32