In [1]:
# Data Wrangling
import pandas as pd
import numpy as np
import json

# Kafka
from confluent_kafka import Consumer, TopicPartition, KafkaError, KafkaException

# Utils
import logging
import sys
from time import sleep
import re

# Data & Models
import nltk
nltk.download('stopwords')
import pickle

[nltk_data] Downloading package stopwords to /home/dan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Load the pre-trained classifier from disk.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file,
# so this must only ever be pointed at a model file from a trusted source.
model_path = 'HateSpeechClassifier.20181211-003504.pkl'
with open(model_path, 'rb') as model_file:
    model = pickle.load(model_file)

In [4]:
# Read the labelled tweets and take the text of the fifth row (position 4)
# as a fixed example for the sanity checks in the following cells.
df = pd.read_csv('data/labeled_data.csv')
test_tweet = df['tweet'].iloc[4]
test_tweet

'!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you hear about me might be true or it might be faker than the bitch who told it to ya &#57361;'

In [5]:
# Classify the example tweet; it is wrapped in a one-element list for predict().
y_pred = model.predict([test_tweet])
y_pred

  'stop_words.' % sorted(inconsistent))


array([1])

In [6]:
# Counter-example: a harmless message to compare against the tweet above.
benign_tweet = 'Happy Birthday, Joe!'
model.predict([benign_tweet])

array([2])

In [7]:
# Per-class log-probabilities for the example tweet (one row, one column per class).
model.predict_log_proba([test_tweet])

array([[-2.52721361, -0.10893327, -3.75808175]])

In [8]:
# Show the per-class probabilities as percentages, first for the example
# tweet and then for the benign one.
for sample_text in (test_tweet, benign_tweet):
    class_probabilities = model.predict_proba([sample_text])[0]
    display(['{:.1f}%'.format(p * 100) for p in class_probabilities])

['8.0%', '89.7%', '2.3%']

['14.3%', '19.5%', '66.2%']

In [9]:
# Kafka topics handed to consumer.subscribe() in the consumer cell.
topics = ['retweets-vocab', 'tweets-vocab-v2']

In [10]:
# Logger passed to the Kafka consumer; emits timestamped DEBUG-and-above
# records through a stream handler.
logger = logging.getLogger('consumer')
logger.setLevel(logging.DEBUG)
# logging.getLogger returns the same object for the same name, so re-running
# this cell would otherwise attach another handler each time and every log
# line would be printed once per run. Only attach a handler if none exists.
if not logger.handlers:
    handler = logging.StreamHandler()
    handler.setFormatter(logging.Formatter('%(asctime)-15s %(levelname)-8s %(message)s'))
    logger.addHandler(handler)

In [11]:
# Pattern for one tweet serialised as a parenthesised, comma-separated tuple.
# Every field except `text` and `location` is "anything without a comma";
# `text` is greedy and may contain commas, `location` is itself parenthesised.
# re.S lets `.` match line breaks: without it a tweet whose text contains a
# newline can never match and is reported as unparsed.
TUPLE_REGEX = re.compile(
    r'''
    ^\(
        (?P<id>[^,]+),
        (?P<screen_name>[^,]+),
        (?P<timestamp>[^,]+),
        (?P<text>.*),
        (?P<is_truncated>[^,]+),
        (?P<is_retweet>[^,]+),
        (?P<retweet_count>[^,]+),
        (?P<favorite_count>[^,]+),
        (?P<is_possibly_sensitive>[^,]+),
        (?P<location>\(.+\)),
        (?P<vocab_match_count>[^,]+)
    \)$
    ''', re.X | re.S)

def parse_tweet_tuple(tuple_str):
    """Split a serialised tweet tuple into a dict of its fields.

    Parameters
    ----------
    tuple_str : str
        The tuple text, e.g. ``'(id,screen_name,timestamp,text,...)'``.

    Returns
    -------
    dict
        On success, one string value per named group of ``TUPLE_REGEX``
        (``id``, ``screen_name``, ``timestamp``, ``text``, ...).
        If the input does not match, the problem is printed and
        ``{'unparsed': tuple_str}`` is returned instead, so callers must not
        assume the ``text`` key is always present.
    """
    match = TUPLE_REGEX.match(tuple_str)
    if match is None:
        print('Can\'t match: {}'.format(tuple_str))
        return {'unparsed': tuple_str}
    return match.groupdict()

In [12]:
def print_assignment(consumer, partitions):
    """Print the given partition assignment.

    `consumer` is accepted but not used.
    NOTE(review): the (consumer, partitions) signature looks like an
    on_assign callback for Consumer.subscribe, but no visible cell registers
    it - confirm whether it is still needed.
    """
    label = 'Assignment:'
    print(label, partitions)

In [13]:
# Delete JSON files left in data/ by earlier runs; -f keeps rm quiet when none exist.
!rm -f data/*.json

In [14]:
def write_to_json(data, index):
    """Serialise `data` as JSON into data/tweets-<index>.json.

    Parameters
    ----------
    data : JSON-serialisable object
        Written as a single JSON document.
    index : int
        File sequence number, zero-padded to four digits in the file name.

    The file name is printed before writing. The data is serialised before
    the file is opened, so a serialisation error leaves no truncated file
    behind. Returns None.
    """
    payload = json.dumps(data)
    name = 'tweets-{:04d}.json'.format(index)
    print('Writing {}'.format(name))
    target = 'data/{}'.format(name)
    with open(target, 'w') as handle:
        handle.write(payload)

In [15]:
%%time

consumer = Consumer({
    'bootstrap.servers': 'kafka.rasbonics.com:29092',
    'group.id': 'test-python-consumer',
    'auto.offset.reset': 'earliest',
    'session.timeout.ms': 6000,
}, logger=logger)

# Subscribe to topics
consumer.subscribe(topics)

max_tweets = 10000000
tweets_per_file = 10000000
tweet_sample = []
index = 0

try:
    while True:
        if len(tweet_sample) >= tweets_per_file:
            write_to_json(tweet_sample, index)
            index += 1
            tweet_sample = []
            
        msg = consumer.poll(timeout=1.0)
        if msg is None:
            continue
        if msg.error():
            # Error or event
            if msg.error().code() == KafkaError._PARTITION_EOF:
                # End of partition event
                sys.stderr.write('%% %s [%d] reached end at offset %d\n' %
                                 (msg.topic(), msg.partition(), msg.offset()))
                break
            else:
                # Error
                raise KafkaException(msg.error())
        else:
            tweet_sample.append(parse_tweet_tuple(msg.value().decode('utf-8')))
            if len(tweet_sample) >= max_tweets: 
                break
    
    write_to_json(tweet_sample, index)

except KeyboardInterrupt:
    sys.stderr.write('%% Aborted by user\n')

finally:
    # Close down consumer to commit final offsets.
    consumer.close()

Writing tweets-0000.json
CPU times: user 2.1 s, sys: 535 ms, total: 2.63 s
Wall time: 8.59 s


% tweets-vocab-v2 [0] reached end at offset 78234


In [16]:
# Number of tweets held in memory after the consumer loop.
len(tweet_sample)

153724

In [17]:
# Inspect the first parsed record; all field values are strings.
tweet_sample[0]

{'id': '1071329402909270017',
 'screen_name': 'Pizzazz_Books',
 'timestamp': 'Sat Dec 08 03:03:28 CST 2018',
 'text': 'History mystery action & romance!★SEWING CAN BE DANGEROUS★🦉https://t.co/nZkhO5QKXx@SarahMallery1… https://t.co/wQfO82hx82',
 'is_truncated': 'true',
 'is_retweet': 'false',
 'retweet_count': '15',
 'favorite_count': '7',
 'is_possibly_sensitive': 'false',
 'location': '(,)',
 'vocab_match_count': '1'}

In [18]:
# Collect the raw text of every tweet for classification.
# parse_tweet_tuple returns {'unparsed': ...} (no 'text' key) for messages it
# could not match, so those records are skipped here rather than letting a
# single malformed message abort the cell with a KeyError.
tweets_text = [tweet['text'] for tweet in tweet_sample if 'text' in tweet]
tweets_text[0]

'History mystery action & romance!★SEWING CAN BE DANGEROUS★🦉https://t.co/nZkhO5QKXx@SarahMallery1… https://t.co/wQfO82hx82'

In [27]:
def class_name(probabilities):
    """Return the label of the most probable class.

    Parameters
    ----------
    probabilities : sequence of float
        Scores ordered as (hate speech, offensive language, neither).

    Returns
    -------
    str
        'HATE SPEECH', 'OFFENSIVE LANGUAGE' or 'NEITHER', chosen by the
        position of the largest score (the first one on ties).
    """
    labels_by_position = dict(enumerate((
        'HATE SPEECH',
        'OFFENSIVE LANGUAGE',
        'NEITHER',
    )))
    winner = np.argmax(probabilities)
    return labels_by_position[winner]

In [20]:
%%time

# Score every collected tweet in a single call. This is the slow step of the
# notebook (about 12 minutes of wall time in the recorded run).
predicted_probabilities = model.predict_proba(tweets_text)

CPU times: user 11min 49s, sys: 18.3 s, total: 12min 8s
Wall time: 12min 5s


In [21]:
# Start the results table with one row per tweet text; prediction columns
# are attached in a later cell.
predicted = pd.DataFrame({'text': tweets_text})
predicted.head()

Unnamed: 0,text
0,History mystery action & romance!★SEWING CAN B...
1,This jobs propaganda clip tells me that the BJ...
2,This Nigga Nick Cannon said everyone going dow...
3,I hate that I lie to my mom about going to eat...
4,"pretending to be on the left or ""just a libera..."


In [28]:
# Attach the winning label and the three per-class probabilities (rounded to
# three decimals) to the results table, one row per tweet.
predicted['class'] = list(map(class_name, predicted_probabilities))
for position, suffix in enumerate(('hate', 'offensive', 'neither')):
    predicted['probability_' + suffix] = [
        round(row[position], 3) for row in predicted_probabilities
    ]

In [29]:
def display_random_rows(df, n=20):
    """Display `n` randomly chosen rows of `df` with untruncated cell text.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to sample from.
    n : int, default 20
        Number of rows to show. Rows are drawn with replacement, so the same
        row can appear more than once and `n` may exceed ``len(df)``.

    Side effect: sets the pandas option ``display.max_colwidth`` to
    "unlimited" globally, so later cells also render full text.
    An empty frame is displayed as-is. Returns None.
    """
    try:
        # None is the supported spelling of "no limit" in current pandas,
        # which rejects the negative value used previously.
        pd.set_option('display.max_colwidth', None)
    except ValueError:
        # pandas < 1.0 only accepts integers here; -1 means "no limit" there.
        pd.set_option('display.max_colwidth', -1)

    if len(df) == 0:
        # np.random.randint(0, 0, n) raises; there is nothing to sample.
        display(df)
        return

    rand_indices = np.random.randint(0, len(df), n)
    display(df.iloc[rand_indices])

In [30]:
# Spot-check a random sample of classified tweets (default sample size).
display_random_rows(predicted)

Unnamed: 0,text,class,probability_hate,probability_offensive,probability_neither
124588,@nonamejustheree @Isaidwhatisaid_ @MadMattChicago @Adore_dess @CustomReality ugly ass white people,HATE SPEECH,0.537,0.115,0.348
53373,i bet everyone miss their old skin,OFFENSIVE LANGUAGE,0.129,0.561,0.31
109940,. @Olacabs @ola_supports where are you when i need you the mostlonely nervous lost in the crowd those eyes on… https://t.co/SHZn31IAu7,HATE SPEECH,0.473,0.328,0.199
135871,@RailMinIndia pnr no.6422208686 train no 12203coche-g7 seatn no.1112 sir watering not supply in this traincharg… https://t.co/OsIPh3LrXf,NEITHER,0.12,0.346,0.535
54281,Latin American government in the 1890s sponsoring European migration in an attempt to whiten their populations. https://t.co/reGlaKyBkN,NEITHER,0.273,0.288,0.438
15054,The latest Book Of Quotes And Poetry! https://t.co/vs9gqwQW4o #dianamarysharpton #musicvideo,NEITHER,0.232,0.128,0.64
138296,@casaliteratura Misery de Stephen king,OFFENSIVE LANGUAGE,0.234,0.409,0.357
143529,LETS FUCKING HEAR IT!!!!!!!!,OFFENSIVE LANGUAGE,0.311,0.644,0.045
113642,nigga what we do https://t.co/0s6uGS2mrS,OFFENSIVE LANGUAGE,0.422,0.439,0.139
55086,@DianaPenty @deespeak No wonder that urban naxal dia mirza shares her birthday with mother of all urban naxals Sonia Gandhi.,NEITHER,0.166,0.077,0.757


In [31]:
# Keep only the tweets whose winning label is hate speech, report how many
# there are, and spot-check a random sample of them.
is_hate_speech = predicted['class'] == 'HATE SPEECH'
hate_examples = predicted[is_hate_speech]
display(len(hate_examples))
display_random_rows(hate_examples)

6568

Unnamed: 0,text,class,probability_hate,probability_offensive,probability_neither
10579,That nigga built so different,HATE SPEECH,0.5,0.382,0.117
8184,So a communist cannot be a hero? So whoever compliments Castro cannot be a hero? So rising up against the oppressor… https://t.co/YBXoP85an4,HATE SPEECH,0.465,0.153,0.381
118376,My brothers got a burd in and she’s brought her fucking dog ?!?!??? All I can hear is the poor wee thing scratching to get out,HATE SPEECH,0.661,0.338,0.001
137188,@xufeitop Whatever or wherever that is it is flat shit cool.,HATE SPEECH,0.434,0.214,0.352
141494,Really not a fan of green walls https://t.co/xcC7RNmPLl,HATE SPEECH,0.43,0.337,0.233
66457,officially ended last night I did my first solo show here a year and a half ago at summer bash where I sang an unf… https://t.co/soNuDadXDx,HATE SPEECH,0.427,0.177,0.396
86482,@iGregJackson Lmfao nigga sickening man,HATE SPEECH,0.527,0.331,0.142
98994,Lol at gay people defending the str8s rights to say faggot. Lmao like okay cool most adults won't give a fuck but… https://t.co/VigAIAcI7r,HATE SPEECH,0.662,0.33,0.008
56710,Niggas be like 🗣️ GIVE ME T-MAC FROM '04 FROM THEM EXACT 33 SECONDS OVER ANYONE 😭😭It doesn't work like that 😂,HATE SPEECH,0.412,0.368,0.22
88308,destiny fans liked killer queen before it was cool,HATE SPEECH,0.419,0.373,0.208


In [35]:
# Narrow down to tweets whose hate-speech probability exceeds 0.7, show the
# first few and report the total.
above_threshold = predicted['probability_hate'] > 0.7
very_probably_hate = predicted[above_threshold]
display(very_probably_hate.head())
print('Very probably hate count: {}'.format(len(very_probably_hate)))

Unnamed: 0,text,class,probability_hate,probability_offensive,probability_neither
469,Good morning faggots how much do you wanna gag on my strap on?!😈Amazon/circle pay Hotkiten1234@outlook.com#VRT… https://t.co/zw8qX61Ye6,HATE SPEECH,0.807,0.171,0.022
2848,Lonzo Ball held Opposing Grizzly Point Guards to 1-6 (16.67%) FG%Conley 1-4Mack 0-22 Steals https://t.co/HuB2ouqyPb,HATE SPEECH,0.707,0.188,0.105
3313,@ngb_lilbreezy man fuck you guys. you harassed a worker for doing his fucking job. he obviously has a social disord… https://t.co/ZGz2lwz32T,HATE SPEECH,0.701,0.287,0.012
4184,@patel4witham There is no “out of context” - your comments are appalling and tantamount to an act of aggression aga… https://t.co/hSkVRzsPl0,HATE SPEECH,0.746,0.177,0.077
8029,I fucking HATE when people blame delivery people as if it’s their fault the restaurant fucked up the order. Deliver… https://t.co/GgNJDc2QJJ,HATE SPEECH,0.72,0.274,0.006


Very probably hate count: 138


In [36]:
# Persist the full results table (row index included) for later use.
predicted.to_csv('data/classified-tweets.csv')