In [1]:
# Import libraries.
import numpy as np
import pandas as pd
import spacy
import string

from collections import Counter

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, mean_absolute_error, precision_score, recall_score
from sklearn.model_selection import train_test_split

from tensorflow import keras
from keras.callbacks import EarlyStopping

Using TensorFlow backend.


In [2]:
# Load the comment data set; the relative path is resolved against the
# notebook's working directory.
DATA_PATH = 'first_week_oct_2015_comments_by_top_400_with_scores_and_features_v2.csv'
df = pd.read_csv(DATA_PATH)

In [3]:
# Preview the first five rows to sanity-check the load.
df.head()

Unnamed: 0,id,parent,by,time,hour_posted,text,dead,ranking,text_len,pct_caps,tags_per_char,papi_toxicity,v_neg,v_neu,v_pos,v_compound,tb_polarity,tb_subjectivity,tb_nb_prob_neg,pc_prob_offensive
0,10331981,10331895,debacle,2015-10-05 14:24:42+00:00,14,US is not really scared by BRICS at all. They'...,False,0,146,0.068493,0.0,0.100881,0.0,0.744,0.256,0.7859,0.15625,0.61875,0.080001,0.238871
1,10343811,10343761,sarciszewski,2015-10-07 02:13:15+00:00,2,"I wasn't really trying to argue, they said the...",False,0,76,0.013158,0.0,0.048637,0.195,0.805,0.0,-0.3947,0.2,0.2,0.593498,0.050161
2,10331538,10331008,debacle,2015-10-05 13:08:10+00:00,13,The examples on the homepage kind of underscor...,False,12,88,0.034091,0.0,0.044777,0.0,0.864,0.136,0.2975,0.1375,0.5,0.336201,0.098511
3,10340097,10339965,debacle,2015-10-06 16:33:06+00:00,16,No mention of a critical aspect of a service l...,False,22,99,0.010101,0.0,0.035335,0.214,0.667,0.119,-0.25,0.0,0.8,0.352414,0.056323
4,10338552,10337763,debacle,2015-10-06 13:06:26+00:00,13,I think some of these points are gross exagger...,False,38,868,0.013825,0.009217,0.232577,0.081,0.902,0.017,-0.8233,0.076667,0.26,0.00079,0.001499


In [4]:
# Review the column list to see which text and score features are available.
df.columns

Index(['id', 'parent', 'by', 'time', 'hour_posted', 'text', 'dead', 'ranking',
       'text_len', 'pct_caps', 'tags_per_char', 'papi_toxicity', 'v_neg',
       'v_neu', 'v_pos', 'v_compound', 'tb_polarity', 'tb_subjectivity',
       'tb_nb_prob_neg', 'pc_prob_offensive'],
      dtype='object')

In [5]:
# Define target column: the 'papi_toxicity' score of each comment.
# It is converted to binary class labels later, after the train/test split.
y = df['papi_toxicity']

## Text processing with spaCy

In [6]:
# Load spaCy's 'en_core_web_sm' pipeline; its tokenizer is applied to the
# cleaned comment text below.
nlp = spacy.load('en_core_web_sm')

In [7]:
# Hold out 20% of the comments for testing; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    y,
    train_size=0.8,
    random_state=0,
)

In [8]:
# Convert comment text to lowercase, remove punctuation, and trim
# whitespace.
def _simplify_text(text):
    """
    Normalize a Series of raw comment strings.

    Lowercases the text, deletes every character in ``string.punctuation``,
    collapses runs of whitespace into a single space, and strips leading and
    trailing whitespace.

    Parameters
    ----------
    text : pandas.Series of str

    Returns
    -------
    pandas.Series of str, with the same index as ``text``.
    """
    # Delete punctuation with a translation table instead of a regex:
    #  * Series.str.replace treats its pattern as a literal string by default
    #    from pandas 2.0 on, so the old '[...]' pattern removed nothing there.
    #  * Inside the unescaped character class, the backslash in
    #    string.punctuation escaped the following ']' instead of being
    #    matched itself, so backslashes were never removed.
    punctuation_table = str.maketrans('', '', string.punctuation)
    return (
        text.str.lower()
            .str.translate(punctuation_table)
            # Raw string: '\s' is an invalid escape in a normal literal.
            .str.replace(r'\s+', ' ', regex=True)
            .str.strip()
    )


# Apply the identical cleaning to both splits.
simple_text_train = _simplify_text(X_train)
simple_text_test = _simplify_text(X_test)

In [9]:
# Spot-check the cleaning: print the first five training comments, each
# followed by a blank line.
for cleaned_comment in simple_text_train.head():
    print(cleaned_comment, '\n')

without the safe harbor agreement you can no longer avoid eu privacy regulations by storing the data in the us maybe im missing something here my understanding is that the safe harbour agreement wasnt a mechanism for us companies to avoid eu data protection regulations it was a certification that they did comply with eu data protection particularly in situations where that data was transmitted outside the eu now its gone eu customer data held by us companies will be governed by national data protection laws instead so may end up having to be stored within the eu the us privacy regulations are no longer considered compatible with the eu privacy regulations i dont think they ever were which is why the safe harbour needed to exist in the first place 

imagine how much a interlispd smalltalk mesacedar workstation would have cost in the 70s versus a plain pdp11 eh if theyd put altos in serial production instead of small batches at a time adding up to 2000 units it wouldnt have been vastly m

In [10]:
# Tokenize text: run only spaCy's tokenizer over each cleaned comment in
# both splits.
tokens_train, tokens_test = (
    cleaned.apply(nlp.tokenizer)
    for cleaned in (simple_text_train, simple_text_test)
)

In [11]:
# Lemmatize; remove stop words.
def _content_lemmas(doc):
    """Return the lemma of every token in `doc` that is not a stop word."""
    return [token.lemma_ for token in doc if not token.is_stop]


lemmas_train = tokens_train.apply(_content_lemmas)
lemmas_test = tokens_test.apply(_content_lemmas)

In [12]:
# # Lemmatize; leave stop words. Picks up "you" and "your" as potentially "toxic" words.
# lemmas_train = tokens_train.apply(lambda x: [token.lemma_ for token in x])
# lemmas_test = tokens_test.apply(lambda x: [token.lemma_ for token in x])

In [13]:
# Check results: each row should now hold a list of lemma strings, keyed by
# the comment's original row label.
lemmas_train.head()

1362    [safe, harbor, agreement, long, avoid, eu, pri...
7304    [imagine, interlispd, smalltalk, mesacedar, wo...
4237    [thank, link, current, defacto, maintainer, pp...
3680                                [surely, rough, idea]
4780    [disclaimer, be, browser, engine, developer, f...
Name: text, dtype: object

In [14]:
# Instantiate vectorizer object. Terms found in fewer than 0.1% or more than
# 99.9% of the training documents are dropped from the vocabulary.
tfidf = TfidfVectorizer(min_df=0.001, max_df=0.999)

# Re-join each lemma list into one space-separated string. Passing
# `astype(str)` would hand the vectorizer the *repr* of the list, where
# escaped characters (e.g. '\u200b') can turn into junk tokens.
docs_train = lemmas_train.apply(' '.join)
docs_test = lemmas_test.apply(' '.join)

# Learn the vocabulary and IDF weights on the training split only, then apply
# them to the test split. The result holds TF-IDF weights, not raw counts.
dtm_train = tfidf.fit_transform(docs_train)
dtm_test = tfidf.transform(docs_test)

# Get feature names to use as dataframe column headers.
# get_feature_names() was removed in scikit-learn 1.2; fall back to it only
# on releases that predate get_feature_names_out().
try:
    feature_names = tfidf.get_feature_names_out()
except AttributeError:
    feature_names = tfidf.get_feature_names()

dtm_train_orig = pd.DataFrame(dtm_train.toarray(), columns=feature_names)
dtm_test_orig = pd.DataFrame(dtm_test.toarray(), columns=feature_names)

# Copy document term matrices so the full-vocabulary versions stay available
# after the working copies are narrowed to the selected words.
dtm_train = dtm_train_orig.copy()
dtm_test = dtm_test_orig.copy()

# Preview feature matrix.
dtm_train.head()

Unnamed: 0,10,100,1000,101,10k,11,12,120,13,14,...,yes,yesterday,yield,york,young,youth,youtube,zero,zillow,zone
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Automated feature (word) selection

In [15]:
# Convert PerspectiveAPI toxicity scores to binary class labels:
# 1 when the score is strictly above the threshold, otherwise 0.
TOXICITY_THRESHOLD = 0.7
y_train_binary, y_test_binary = (
    (scores > TOXICITY_THRESHOLD).astype(int)
    for scores in (y_train, y_test)
)

In [16]:
# Check target distribution: number of training comments labelled 0 and 1.
np.bincount(y_train_binary)

array([7893,   83], dtype=int64)

In [17]:
def count(docs):
    """
    Build a word-frequency table for a collection of tokenized documents.

    Helper function for feature selection.

    Parameters
    ----------
    docs : sized iterable of sequences of str
        One sequence of tokens (e.g. lemmas) per document.

    Returns
    -------
    pandas.DataFrame
        One row per distinct word, sorted by ``rank``, with columns:

        word            the token
        appears_in      number of documents containing the word
        count           total occurrences across all documents
        rank            1 = most frequent (ties broken by first appearance)
        pct_total       count / sum of all counts
        cul_pct_total   running sum of pct_total in rank order
        appears_in_pct  appears_in / number of documents

        The row labels are each word's position in first-seen order.
    """
    word_counts = Counter()
    appears_in = Counter()

    total_docs = len(docs)

    for doc in docs:
        word_counts.update(doc)
        # De-duplicate with dict.fromkeys rather than set(): it keeps
        # first-seen order, so the resulting row labels are the same on every
        # run instead of depending on string hash randomization.
        appears_in.update(list(dict.fromkeys(doc)))

    # Both counters now share the same insertion order, so one frame can be
    # built directly without merging two tables on 'word'.
    wc = pd.DataFrame({
        'word': list(word_counts.keys()),
        'appears_in': [appears_in[word] for word in word_counts],
        'count': list(word_counts.values()),
    })

    wc['rank'] = wc['count'].rank(method='first', ascending=False)
    wc['pct_total'] = wc['count'] / wc['count'].sum()

    # The cumulative share only makes sense in rank order.
    wc = wc.sort_values(by='rank')
    wc['cul_pct_total'] = wc['pct_total'].cumsum()

    wc['appears_in_pct'] = wc['appears_in'] / total_docs

    return wc

In [18]:
# Find most common words in toxic comments (training rows labelled 1).
is_toxic_train = y_train_binary.astype(bool)
toxic_wc_train = count(lemmas_train[is_toxic_train])
toxic_wc_train.head(10)

Unnamed: 0,word,appears_in,count,rank,pct_total,cul_pct_total,appears_in_pct
21,not,35,46,1.0,0.019417,0.019417,0.421687
3,fuck,21,24,2.0,0.010131,0.029548,0.253012
42,people,22,24,3.0,0.010131,0.039679,0.26506
52,shit,21,23,4.0,0.009709,0.049388,0.253012
181,like,13,21,5.0,0.008864,0.058252,0.156627
50,s,15,20,6.0,0.008442,0.066695,0.180723
929,amazon,3,17,7.0,0.007176,0.073871,0.036145
44,time,13,16,8.0,0.006754,0.080625,0.156627
200,work,9,16,9.0,0.006754,0.087379,0.108434
102,have,11,15,10.0,0.006332,0.09371,0.13253


In [19]:
# Find most common words in non-toxic comments (training rows labelled 0).
is_nontoxic_train = ~y_train_binary.astype(bool)
nontoxic_wc_train = count(lemmas_train[is_nontoxic_train])
nontoxic_wc_train.head(10)

Unnamed: 0,word,appears_in,count,rank,pct_total,cul_pct_total,appears_in_pct
25,not,2984,4872,1.0,0.019678,0.019678,0.378057
230,people,1485,2272,2.0,0.009177,0.028855,0.188141
375,like,1439,1900,3.0,0.007674,0.03653,0.182313
198,s,1246,1590,4.0,0.006422,0.042952,0.157861
0,think,1196,1508,5.0,0.006091,0.049043,0.151527
233,thing,985,1287,6.0,0.005198,0.054241,0.124794
184,work,887,1245,7.0,0.005029,0.05927,0.112378
72,time,880,1147,8.0,0.004633,0.063903,0.111491
255,use,826,1088,9.0,0.004395,0.068297,0.10465
37,have,887,1078,10.0,0.004354,0.072651,0.112378


In [20]:
# Make comparison table: one row per word with the share of toxic and of
# non-toxic comments it appears in.
share_cols = ['word', 'appears_in_pct']
comp = toxic_wc_train[share_cols].merge(
    nontoxic_wc_train[share_cols],
    how='outer',
    on='word',
    suffixes=('_toxic', '_nontoxic'),
)

# A word missing from one class appears in 0% of that class's comments.
comp = comp.fillna(0)

# Absolute gap between the two shares; used to rank candidate features.
comp['diff'] = (comp['appears_in_pct_toxic'] - comp['appears_in_pct_nontoxic']).abs()

In [21]:
# Visualize the 15 words with the biggest frequency difference between
# comment categories, ordered by their share among toxic comments.
top_15 = comp.sort_values(by='diff', ascending=False).head(15)
top_15_by_toxic_share = top_15.sort_values(by='appears_in_pct_toxic', ascending=False)
share_columns = ['appears_in_pct_toxic', 'appears_in_pct_nontoxic']
top_15_by_toxic_share.plot.bar(x='word', y=share_columns);

In [22]:
# Generate feature wordlist from comparison table: the 229 words whose
# appearance rate differs most between toxic and non-toxic comments.
# NOTE(review): 229 is taken here, while the pinned list in the next cell has
# 220 entries -- confirm which words were dropped and why.
ranked_by_gap = comp.sort_values(by='diff', ascending=False)
autogen_wordlist = ranked_by_gap['word'].head(229)
autogen_wordlist.tolist()

['fuck',
 'shit',
 'shitty',
 'people',
 'feel',
 'idiot',
 'bullshit',
 'stupid',
 'hell',
 'seriously',
 'right',
 'asshole',
 'crap',
 'web',
 'comment',
 'man',
 'different',
 'ugly',
 'care',
 'read',
 'damn',
 'time',
 'honest',
 'not',
 'yes',
 'character',
 'way',
 'abuse',
 'terrible',
 'year',
 'hate',
 'begin',
 'new',
 'human',
 'want',
 'say',
 'mean',
 'interest',
 'change',
 'page',
 'search',
 'idiotic',
 'website',
 'datum',
 'product',
 'backwards',
 'come',
 'know',
 'real',
 'entire',
 'god',
 'case',
 'prime',
 'use',
 'matt',
 'criminal',
 'buy',
 'job',
 'tell',
 'fix',
 'think',
 'state',
 'font',
 'ui',
 'googles',
 'big',
 'tv',
 'black',
 'error',
 'help',
 'call',
 'complain',
 'stuff',
 'cost',
 'apple',
 'count',
 'sorry',
 'sure',
 'user',
 'support',
 'app',
 'death',
 'edit',
 'etc',
 'problem',
 'country',
 'little',
 'probably',
 'deal',
 'find',
 'happen',
 'microsoft',
 'look',
 'like',
 'leave',
 'language',
 'idea',
 'expect',
 'understand',
 'soc

In [23]:
# Feature wordlist without stop words.
# Hand-pinned list of 220 terms (10 per row) in ranked order; it replaces the
# auto-generated Series of the same name.
autogen_wordlist = [
    'fuck', 'shit', 'shitty', 'people', 'feel', 'idiot', 'bullshit', 'stupid', 'hell', 'seriously',
    'right', 'asshole', 'crap', 'web', 'comment', 'man', 'different', 'ugly', 'care', 'read',
    'damn', 'time', 'honest', 'not', 'yes', 'character', 'way', 'abuse', 'terrible', 'year',
    'hate', 'begin', 'new', 'human', 'want', 'say', 'mean', 'interest', 'change', 'page',
    'search', 'idiotic', 'website', 'datum', 'product', 'backwards', 'come', 'know', 'real', 'entire',
    'god', 'case', 'prime', 'use', 'matt', 'criminal', 'buy', 'job', 'tell', 'fix',
    'think', 'state', 'font', 'ui', 'googles', 'big', 'tv', 'black', 'error', 'help',
    'call', 'complain', 'stuff', 'cost', 'apple', 'count', 'sorry', 'sure', 'user', 'support',
    'app', 'death', 'edit', 'etc', 'problem', 'country', 'little', 'probably', 'deal', 'find',
    'happen', 'microsoft', 'look', 'like', 'leave', 'language', 'idea', 'expect', 'understand', 'society',
    'poor', 'provide', 'go', 'ask', 'amazon', 'pretty', 'target', 'include', 'stop', 'child',
    'public', 'exactly', 'term', 'spit', 'try', 'issue', 'information', 'source', 'youth', 'safari',
    'constructive', 'miss', 'skin', 'smell', 'cop', 'frustration', 'suppose', 'rest', 'attractive', 'similar',
    'sense', 'discovery', 'business', 'assume', 'low', 'word', 'true', 'place', 'guy', 'lean',
    'eg', 'reason', 'animal', 'transition', 'index', 'linus', 'small', 'shut', 'vw', 'campaign',
    'css', 'guess', 'actually', 'override', 'render', 'cancer', 'require', 'mouse', 'cheat', 'to',
    'early', 'mac', 'send', 'exclude', 'test', 'world', 'screw', 'car', 'have', 'service',
    'arm', 'firefox', 'claim', 'defend', 'wide', 'straight', 'designer', 'burn', 'file', 'limit',
    'list', 'google', 'agree', 'argument', 'murder', 'flag', 'simple', 'actual', 'make', 'hand',
    'eye', 'tiny', 'suck', 'twice', 'rarely', 'javascript', 'wall', 'mistake', 'perspective', 'allow',
    'encourage', 'feature', 'andor', 'fine', 'forget', 'logic', 'internet', 'difference', 'especially', 'damage',
    'security', 'organization', 'street', 'free', 'supply', 'group', 'display', 'young', '10', 'late',
]

In [24]:
# Number of selected feature words; the model's input width must match this.
len(autogen_wordlist)

220

In [25]:
# Narrow both feature matrices to the selected words, keeping the wordlist's
# column order. The full-vocabulary *_orig frames are left untouched.
dtm_train = dtm_train_orig.loc[:, autogen_wordlist]
dtm_test = dtm_test_orig.loc[:, autogen_wordlist]

In [26]:
# Start from an empty Sequential model; layers are added in the next cell.
model = keras.Sequential()

In [27]:
# Two hidden ReLU layers feeding a single sigmoid unit, which outputs a
# value in (0, 1) for the binary toxic / non-toxic target.
# The input width is taken from the feature matrix rather than hard-coded,
# so it stays correct if the feature wordlist changes length.
n_features = dtm_train.shape[1]
model.add(keras.layers.Dense(512, input_dim=n_features, activation='relu'))
model.add(keras.layers.Dense(256, activation='relu'))
model.add(keras.layers.Dense(1, activation='sigmoid'))

In [28]:
# Binary cross-entropy loss with the Adam optimizer (default settings).
model.compile(loss='binary_crossentropy', optimizer='adam')

In [29]:
# Stop training once validation loss has gone 2 consecutive epochs without
# improving.
# The callback is taken from the same `keras` namespace as the model
# (tensorflow.keras, per the imports); mixing in the standalone `keras`
# package's EarlyStopping with a tf.keras model is not supported and fails on
# some TensorFlow releases.
callbacks = [keras.callbacks.EarlyStopping(monitor='val_loss', patience=2)]

In [30]:
# Fit model.
# Weight each class by the *other* class's share of the training labels so
# the rare toxic class is not swamped by the majority class. Computing the
# share from the labels reproduces the previously hard-coded
# {0: 0.010406, 1: 0.989594} for the 7893 / 83 split and stays correct if
# the threshold or the split changes.
toxic_share = float(y_train_binary.mean())

# Keep the History object so the per-epoch losses can be inspected later.
history = model.fit(dtm_train, y_train_binary, epochs=25,
                    class_weight={0: toxic_share, 1: 1.0 - toxic_share},
                    callbacks=callbacks,
                    validation_split=0.2,
                    verbose=False)



<tensorflow.python.keras.callbacks.History at 0x219ea4c5708>

In [31]:
# Make predictions for both splits, then summarize the distribution of the
# predicted values on the training set.
nn_train_pred, nn_test_pred = (
    model.predict(features) for features in (dtm_train, dtm_test)
)
pd.DataFrame(nn_train_pred).describe()



Unnamed: 0,0
count,7976.0
mean,0.136391
std,0.218928
min,0.0
25%,0.001173
50%,0.015079
75%,0.244935
max,0.999579


In [32]:
# Round the predicted values to hard 0/1 labels once per split.
train_labels_pred = np.rint(nn_train_pred)
test_labels_pred = np.rint(nn_test_pred)

# Calculate metrics on training dataset.
train_accuracy, train_recall, train_precision = (
    metric(y_train_binary, train_labels_pred)
    for metric in (accuracy_score, recall_score, precision_score)
)

# Calculate metrics on test dataset.
test_accuracy, test_recall, test_precision = (
    metric(y_test_binary, test_labels_pred)
    for metric in (accuracy_score, recall_score, precision_score)
)

In [33]:
# Display metrics for training dataset, followed by a blank separator line.
train_report = [
    f'Train accuracy: {train_accuracy:.4f}',
    f'Train recall: {train_recall:.4f}',
    f'Train precision: {train_precision:.4f}',
]
print('\n'.join(train_report), '\n')

# Display metrics for test dataset.
test_report = [
    f'Test accuracy: {test_accuracy:.4f}',
    f'Test recall: {test_recall:.4f}',
    f'Test precision: {test_precision:.4f}',
]
print('\n'.join(test_report))

Train accuracy: 0.9274
Train recall: 0.8313
Train precision: 0.1088 

Test accuracy: 0.9248
Test recall: 0.1500
Test precision: 0.0221
